diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,236095 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 33723, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 2.9653352311478814e-05, + "grad_norm": 6.782041072845459, + "learning_rate": 0.0, + "loss": 12.1604, + "step": 1 + }, + { + "epoch": 5.930670462295763e-05, + "grad_norm": 6.99527645111084, + "learning_rate": 2.9585798816568047e-06, + "loss": 12.1562, + "step": 2 + }, + { + "epoch": 8.896005693443644e-05, + "grad_norm": 6.74578857421875, + "learning_rate": 5.917159763313609e-06, + "loss": 12.1575, + "step": 3 + }, + { + "epoch": 0.00011861340924591526, + "grad_norm": 6.8893280029296875, + "learning_rate": 8.875739644970414e-06, + "loss": 12.0568, + "step": 4 + }, + { + "epoch": 0.00014826676155739406, + "grad_norm": 7.043562412261963, + "learning_rate": 1.1834319526627219e-05, + "loss": 11.777, + "step": 5 + }, + { + "epoch": 0.00017792011386887289, + "grad_norm": 6.812330722808838, + "learning_rate": 1.4792899408284024e-05, + "loss": 11.6846, + "step": 6 + }, + { + "epoch": 0.00020757346618035169, + "grad_norm": 6.539841651916504, + "learning_rate": 1.7751479289940828e-05, + "loss": 11.1177, + "step": 7 + }, + { + "epoch": 0.00023722681849183051, + "grad_norm": 7.624247074127197, + "learning_rate": 2.0710059171597635e-05, + "loss": 10.7519, + "step": 8 + }, + { + "epoch": 0.0002668801708033093, + "grad_norm": 6.015682220458984, + "learning_rate": 2.3668639053254438e-05, + "loss": 10.504, + "step": 9 + }, + { + "epoch": 0.0002965335231147881, + "grad_norm": 3.882702112197876, + "learning_rate": 2.6627218934911244e-05, + "loss": 10.3032, + "step": 10 + }, + { + "epoch": 0.0003261868754262669, + "grad_norm": 2.9844086170196533, + "learning_rate": 2.9585798816568047e-05, + "loss": 10.1951, + "step": 11 + }, + { + "epoch": 0.00035584022773774577, + "grad_norm": 2.9160287380218506, + "learning_rate": 3.254437869822485e-05, + "loss": 10.0697, + "step": 12 + }, + { + "epoch": 0.00038549358004922457, + "grad_norm": 2.2919814586639404, + "learning_rate": 3.5502958579881656e-05, + "loss": 9.933, + "step": 13 + }, + { + "epoch": 0.00041514693236070337, + "grad_norm": 2.428372383117676, + "learning_rate": 3.846153846153846e-05, + "loss": 9.8169, + "step": 14 + }, + { + "epoch": 0.00044480028467218217, + "grad_norm": 2.171254873275757, + "learning_rate": 4.142011834319527e-05, + "loss": 9.6786, + "step": 15 + }, + { + "epoch": 0.00047445363698366103, + "grad_norm": 1.7822846174240112, + "learning_rate": 4.437869822485207e-05, + "loss": 9.5993, + "step": 16 + }, + { + "epoch": 0.0005041069892951398, + "grad_norm": 1.7479466199874878, + "learning_rate": 4.7337278106508875e-05, + "loss": 9.5204, + "step": 17 + }, + { + "epoch": 0.0005337603416066186, + "grad_norm": 1.9169152975082397, + "learning_rate": 5.029585798816568e-05, + "loss": 9.4398, + "step": 18 + }, + { + "epoch": 0.0005634136939180975, + "grad_norm": 1.4479023218154907, + "learning_rate": 5.325443786982249e-05, + "loss": 9.3371, + "step": 19 + }, + { + "epoch": 0.0005930670462295762, + "grad_norm": 1.6577495336532593, + "learning_rate": 5.6213017751479294e-05, + "loss": 9.2246, + "step": 20 + }, + { + "epoch": 0.0006227203985410551, + "grad_norm": 1.4400572776794434, + "learning_rate": 5.9171597633136094e-05, + "loss": 9.1712, + "step": 21 + }, + { + "epoch": 0.0006523737508525338, + "grad_norm": 1.2785083055496216, + "learning_rate": 6.21301775147929e-05, + "loss": 9.035, + "step": 22 + }, + { + "epoch": 0.0006820271031640127, + "grad_norm": 1.4833413362503052, + "learning_rate": 6.50887573964497e-05, + "loss": 8.9701, + "step": 23 + }, + { + "epoch": 0.0007116804554754915, + "grad_norm": 1.1467076539993286, + "learning_rate": 6.80473372781065e-05, + "loss": 8.8574, + "step": 24 + }, + { + "epoch": 0.0007413338077869703, + "grad_norm": 1.1965774297714233, + "learning_rate": 7.100591715976331e-05, + "loss": 8.7304, + "step": 25 + }, + { + "epoch": 0.0007709871600984491, + "grad_norm": 1.3073712587356567, + "learning_rate": 7.396449704142012e-05, + "loss": 8.6942, + "step": 26 + }, + { + "epoch": 0.0008006405124099279, + "grad_norm": 0.8901578783988953, + "learning_rate": 7.692307692307693e-05, + "loss": 8.5384, + "step": 27 + }, + { + "epoch": 0.0008302938647214067, + "grad_norm": 1.086179494857788, + "learning_rate": 7.988165680473373e-05, + "loss": 8.4391, + "step": 28 + }, + { + "epoch": 0.0008599472170328856, + "grad_norm": 1.3572951555252075, + "learning_rate": 8.284023668639054e-05, + "loss": 8.3604, + "step": 29 + }, + { + "epoch": 0.0008896005693443643, + "grad_norm": 0.8596665263175964, + "learning_rate": 8.579881656804733e-05, + "loss": 8.2355, + "step": 30 + }, + { + "epoch": 0.0009192539216558432, + "grad_norm": 1.7226903438568115, + "learning_rate": 8.875739644970414e-05, + "loss": 8.186, + "step": 31 + }, + { + "epoch": 0.0009489072739673221, + "grad_norm": 1.527985692024231, + "learning_rate": 9.171597633136094e-05, + "loss": 8.1481, + "step": 32 + }, + { + "epoch": 0.0009785606262788008, + "grad_norm": 0.9979639649391174, + "learning_rate": 9.467455621301775e-05, + "loss": 8.0434, + "step": 33 + }, + { + "epoch": 0.0010082139785902795, + "grad_norm": 3.4863460063934326, + "learning_rate": 9.763313609467456e-05, + "loss": 8.0467, + "step": 34 + }, + { + "epoch": 0.0010378673309017585, + "grad_norm": 1.955718755722046, + "learning_rate": 0.00010059171597633136, + "loss": 7.9757, + "step": 35 + }, + { + "epoch": 0.0010675206832132373, + "grad_norm": 1.7017618417739868, + "learning_rate": 0.00010355029585798817, + "loss": 7.9526, + "step": 36 + }, + { + "epoch": 0.001097174035524716, + "grad_norm": 2.2920162677764893, + "learning_rate": 0.00010650887573964498, + "loss": 7.9226, + "step": 37 + }, + { + "epoch": 0.001126827387836195, + "grad_norm": 1.3648720979690552, + "learning_rate": 0.00010946745562130178, + "loss": 7.8576, + "step": 38 + }, + { + "epoch": 0.0011564807401476737, + "grad_norm": 2.395550012588501, + "learning_rate": 0.00011242603550295859, + "loss": 7.8651, + "step": 39 + }, + { + "epoch": 0.0011861340924591525, + "grad_norm": 1.5992865562438965, + "learning_rate": 0.0001153846153846154, + "loss": 7.8336, + "step": 40 + }, + { + "epoch": 0.0012157874447706314, + "grad_norm": 1.6818935871124268, + "learning_rate": 0.00011834319526627219, + "loss": 7.7865, + "step": 41 + }, + { + "epoch": 0.0012454407970821102, + "grad_norm": 2.012242317199707, + "learning_rate": 0.000121301775147929, + "loss": 7.7786, + "step": 42 + }, + { + "epoch": 0.001275094149393589, + "grad_norm": 1.7051700353622437, + "learning_rate": 0.0001242603550295858, + "loss": 7.7689, + "step": 43 + }, + { + "epoch": 0.0013047475017050677, + "grad_norm": 1.69509756565094, + "learning_rate": 0.00012721893491124262, + "loss": 7.7612, + "step": 44 + }, + { + "epoch": 0.0013344008540165466, + "grad_norm": 1.6454294919967651, + "learning_rate": 0.0001301775147928994, + "loss": 7.629, + "step": 45 + }, + { + "epoch": 0.0013640542063280254, + "grad_norm": 2.390256404876709, + "learning_rate": 0.00013313609467455623, + "loss": 7.6878, + "step": 46 + }, + { + "epoch": 0.0013937075586395041, + "grad_norm": 1.4744431972503662, + "learning_rate": 0.000136094674556213, + "loss": 7.6433, + "step": 47 + }, + { + "epoch": 0.001423360910950983, + "grad_norm": 1.0472729206085205, + "learning_rate": 0.00013905325443786982, + "loss": 7.6137, + "step": 48 + }, + { + "epoch": 0.0014530142632624618, + "grad_norm": 3.3697683811187744, + "learning_rate": 0.00014201183431952663, + "loss": 7.6264, + "step": 49 + }, + { + "epoch": 0.0014826676155739406, + "grad_norm": 1.7601678371429443, + "learning_rate": 0.00014497041420118343, + "loss": 7.614, + "step": 50 + }, + { + "epoch": 0.0015123209678854195, + "grad_norm": 1.8846670389175415, + "learning_rate": 0.00014792899408284024, + "loss": 7.5802, + "step": 51 + }, + { + "epoch": 0.0015419743201968983, + "grad_norm": 1.3987630605697632, + "learning_rate": 0.00015088757396449705, + "loss": 7.5625, + "step": 52 + }, + { + "epoch": 0.001571627672508377, + "grad_norm": 2.0262248516082764, + "learning_rate": 0.00015384615384615385, + "loss": 7.4549, + "step": 53 + }, + { + "epoch": 0.0016012810248198558, + "grad_norm": 1.5666464567184448, + "learning_rate": 0.00015680473372781066, + "loss": 7.4664, + "step": 54 + }, + { + "epoch": 0.0016309343771313347, + "grad_norm": 2.301426410675049, + "learning_rate": 0.00015976331360946746, + "loss": 7.4643, + "step": 55 + }, + { + "epoch": 0.0016605877294428135, + "grad_norm": 1.7119866609573364, + "learning_rate": 0.00016272189349112427, + "loss": 7.4106, + "step": 56 + }, + { + "epoch": 0.0016902410817542922, + "grad_norm": 1.6556282043457031, + "learning_rate": 0.00016568047337278108, + "loss": 7.3669, + "step": 57 + }, + { + "epoch": 0.0017198944340657712, + "grad_norm": 1.0545799732208252, + "learning_rate": 0.00016863905325443788, + "loss": 7.3281, + "step": 58 + }, + { + "epoch": 0.00174954778637725, + "grad_norm": 1.1120377779006958, + "learning_rate": 0.00017159763313609466, + "loss": 7.2633, + "step": 59 + }, + { + "epoch": 0.0017792011386887287, + "grad_norm": 1.5273922681808472, + "learning_rate": 0.0001745562130177515, + "loss": 7.2453, + "step": 60 + }, + { + "epoch": 0.0018088544910002077, + "grad_norm": 1.2063521146774292, + "learning_rate": 0.00017751479289940828, + "loss": 7.2271, + "step": 61 + }, + { + "epoch": 0.0018385078433116864, + "grad_norm": 1.3371572494506836, + "learning_rate": 0.0001804733727810651, + "loss": 7.228, + "step": 62 + }, + { + "epoch": 0.0018681611956231651, + "grad_norm": 1.0827168226242065, + "learning_rate": 0.0001834319526627219, + "loss": 7.135, + "step": 63 + }, + { + "epoch": 0.0018978145479346441, + "grad_norm": 1.1399023532867432, + "learning_rate": 0.00018639053254437872, + "loss": 7.1218, + "step": 64 + }, + { + "epoch": 0.0019274679002461229, + "grad_norm": 1.3876287937164307, + "learning_rate": 0.0001893491124260355, + "loss": 7.104, + "step": 65 + }, + { + "epoch": 0.0019571212525576016, + "grad_norm": 1.3557099103927612, + "learning_rate": 0.00019230769230769233, + "loss": 7.0791, + "step": 66 + }, + { + "epoch": 0.0019867746048690803, + "grad_norm": 1.3249162435531616, + "learning_rate": 0.00019526627218934911, + "loss": 7.0875, + "step": 67 + }, + { + "epoch": 0.002016427957180559, + "grad_norm": 1.4407223463058472, + "learning_rate": 0.00019822485207100595, + "loss": 7.0394, + "step": 68 + }, + { + "epoch": 0.0020460813094920383, + "grad_norm": 1.267762303352356, + "learning_rate": 0.00020118343195266273, + "loss": 7.0184, + "step": 69 + }, + { + "epoch": 0.002075734661803517, + "grad_norm": 1.7767499685287476, + "learning_rate": 0.0002041420118343195, + "loss": 6.9762, + "step": 70 + }, + { + "epoch": 0.0021053880141149958, + "grad_norm": 1.582594633102417, + "learning_rate": 0.00020710059171597634, + "loss": 6.9459, + "step": 71 + }, + { + "epoch": 0.0021350413664264745, + "grad_norm": 1.3706927299499512, + "learning_rate": 0.00021005917159763312, + "loss": 6.9599, + "step": 72 + }, + { + "epoch": 0.0021646947187379533, + "grad_norm": 0.970565915107727, + "learning_rate": 0.00021301775147928995, + "loss": 6.8961, + "step": 73 + }, + { + "epoch": 0.002194348071049432, + "grad_norm": 2.7387280464172363, + "learning_rate": 0.00021597633136094673, + "loss": 6.9167, + "step": 74 + }, + { + "epoch": 0.0022240014233609108, + "grad_norm": 1.2715474367141724, + "learning_rate": 0.00021893491124260357, + "loss": 6.8902, + "step": 75 + }, + { + "epoch": 0.00225365477567239, + "grad_norm": 1.9923373460769653, + "learning_rate": 0.00022189349112426034, + "loss": 6.8822, + "step": 76 + }, + { + "epoch": 0.0022833081279838687, + "grad_norm": 1.6264853477478027, + "learning_rate": 0.00022485207100591718, + "loss": 6.8251, + "step": 77 + }, + { + "epoch": 0.0023129614802953474, + "grad_norm": 1.6880940198898315, + "learning_rate": 0.00022781065088757396, + "loss": 6.8438, + "step": 78 + }, + { + "epoch": 0.002342614832606826, + "grad_norm": 2.4152703285217285, + "learning_rate": 0.0002307692307692308, + "loss": 6.8382, + "step": 79 + }, + { + "epoch": 0.002372268184918305, + "grad_norm": 1.2049732208251953, + "learning_rate": 0.00023372781065088757, + "loss": 6.7788, + "step": 80 + }, + { + "epoch": 0.0024019215372297837, + "grad_norm": 1.4897282123565674, + "learning_rate": 0.00023668639053254438, + "loss": 6.7587, + "step": 81 + }, + { + "epoch": 0.002431574889541263, + "grad_norm": 1.1186306476593018, + "learning_rate": 0.00023964497041420118, + "loss": 6.7323, + "step": 82 + }, + { + "epoch": 0.0024612282418527416, + "grad_norm": 2.681396007537842, + "learning_rate": 0.000242603550295858, + "loss": 6.7472, + "step": 83 + }, + { + "epoch": 0.0024908815941642203, + "grad_norm": 1.214967131614685, + "learning_rate": 0.0002455621301775148, + "loss": 6.7255, + "step": 84 + }, + { + "epoch": 0.002520534946475699, + "grad_norm": 1.3106650114059448, + "learning_rate": 0.0002485207100591716, + "loss": 6.7464, + "step": 85 + }, + { + "epoch": 0.002550188298787178, + "grad_norm": 1.8390547037124634, + "learning_rate": 0.00025147928994082844, + "loss": 6.7048, + "step": 86 + }, + { + "epoch": 0.0025798416510986566, + "grad_norm": 1.1644833087921143, + "learning_rate": 0.00025443786982248524, + "loss": 6.6962, + "step": 87 + }, + { + "epoch": 0.0026094950034101353, + "grad_norm": 1.073466420173645, + "learning_rate": 0.000257396449704142, + "loss": 6.6287, + "step": 88 + }, + { + "epoch": 0.0026391483557216145, + "grad_norm": 2.172978401184082, + "learning_rate": 0.0002603550295857988, + "loss": 6.6957, + "step": 89 + }, + { + "epoch": 0.0026688017080330933, + "grad_norm": 1.737131953239441, + "learning_rate": 0.00026331360946745566, + "loss": 6.66, + "step": 90 + }, + { + "epoch": 0.002698455060344572, + "grad_norm": 1.3316679000854492, + "learning_rate": 0.00026627218934911247, + "loss": 6.6562, + "step": 91 + }, + { + "epoch": 0.0027281084126560507, + "grad_norm": 1.402278184890747, + "learning_rate": 0.0002692307692307692, + "loss": 6.6248, + "step": 92 + }, + { + "epoch": 0.0027577617649675295, + "grad_norm": 1.3297146558761597, + "learning_rate": 0.000272189349112426, + "loss": 6.6077, + "step": 93 + }, + { + "epoch": 0.0027874151172790082, + "grad_norm": 1.3293187618255615, + "learning_rate": 0.0002751479289940829, + "loss": 6.5674, + "step": 94 + }, + { + "epoch": 0.0028170684695904874, + "grad_norm": 1.3541898727416992, + "learning_rate": 0.00027810650887573964, + "loss": 6.5586, + "step": 95 + }, + { + "epoch": 0.002846721821901966, + "grad_norm": 1.067320704460144, + "learning_rate": 0.00028106508875739645, + "loss": 6.5346, + "step": 96 + }, + { + "epoch": 0.002876375174213445, + "grad_norm": 0.9450618028640747, + "learning_rate": 0.00028402366863905325, + "loss": 6.5358, + "step": 97 + }, + { + "epoch": 0.0029060285265249237, + "grad_norm": 1.2999441623687744, + "learning_rate": 0.0002869822485207101, + "loss": 6.506, + "step": 98 + }, + { + "epoch": 0.0029356818788364024, + "grad_norm": 1.1129587888717651, + "learning_rate": 0.00028994082840236686, + "loss": 6.4779, + "step": 99 + }, + { + "epoch": 0.002965335231147881, + "grad_norm": 0.9055272340774536, + "learning_rate": 0.00029289940828402367, + "loss": 6.4481, + "step": 100 + }, + { + "epoch": 0.00299498858345936, + "grad_norm": 0.8659515380859375, + "learning_rate": 0.0002958579881656805, + "loss": 6.4683, + "step": 101 + }, + { + "epoch": 0.003024641935770839, + "grad_norm": 1.2182223796844482, + "learning_rate": 0.00029881656804733734, + "loss": 6.4458, + "step": 102 + }, + { + "epoch": 0.003054295288082318, + "grad_norm": 1.2510238885879517, + "learning_rate": 0.0003017751479289941, + "loss": 6.4366, + "step": 103 + }, + { + "epoch": 0.0030839486403937966, + "grad_norm": 0.7147052884101868, + "learning_rate": 0.0003047337278106509, + "loss": 6.3871, + "step": 104 + }, + { + "epoch": 0.0031136019927052753, + "grad_norm": 1.8880711793899536, + "learning_rate": 0.0003076923076923077, + "loss": 6.3891, + "step": 105 + }, + { + "epoch": 0.003143255345016754, + "grad_norm": 1.0333633422851562, + "learning_rate": 0.00031065088757396446, + "loss": 6.4088, + "step": 106 + }, + { + "epoch": 0.003172908697328233, + "grad_norm": 1.6473525762557983, + "learning_rate": 0.0003136094674556213, + "loss": 6.4315, + "step": 107 + }, + { + "epoch": 0.0032025620496397116, + "grad_norm": 0.8508673906326294, + "learning_rate": 0.0003165680473372781, + "loss": 6.3296, + "step": 108 + }, + { + "epoch": 0.0032322154019511907, + "grad_norm": 0.8800122141838074, + "learning_rate": 0.00031952662721893493, + "loss": 6.2921, + "step": 109 + }, + { + "epoch": 0.0032618687542626695, + "grad_norm": 0.994353711605072, + "learning_rate": 0.0003224852071005917, + "loss": 6.2687, + "step": 110 + }, + { + "epoch": 0.0032915221065741482, + "grad_norm": 1.7332899570465088, + "learning_rate": 0.00032544378698224854, + "loss": 6.3918, + "step": 111 + }, + { + "epoch": 0.003321175458885627, + "grad_norm": 0.9144511818885803, + "learning_rate": 0.00032840236686390535, + "loss": 6.302, + "step": 112 + }, + { + "epoch": 0.0033508288111971057, + "grad_norm": 1.074702501296997, + "learning_rate": 0.00033136094674556215, + "loss": 6.2408, + "step": 113 + }, + { + "epoch": 0.0033804821635085845, + "grad_norm": 2.122992515563965, + "learning_rate": 0.0003343195266272189, + "loss": 6.3185, + "step": 114 + }, + { + "epoch": 0.0034101355158200636, + "grad_norm": 1.1567546129226685, + "learning_rate": 0.00033727810650887577, + "loss": 6.259, + "step": 115 + }, + { + "epoch": 0.0034397888681315424, + "grad_norm": 1.8046022653579712, + "learning_rate": 0.0003402366863905326, + "loss": 6.2663, + "step": 116 + }, + { + "epoch": 0.003469442220443021, + "grad_norm": 1.071861982345581, + "learning_rate": 0.0003431952662721893, + "loss": 6.2506, + "step": 117 + }, + { + "epoch": 0.0034990955727545, + "grad_norm": 1.1966499090194702, + "learning_rate": 0.00034615384615384613, + "loss": 6.221, + "step": 118 + }, + { + "epoch": 0.0035287489250659786, + "grad_norm": 2.0666215419769287, + "learning_rate": 0.000349112426035503, + "loss": 6.2234, + "step": 119 + }, + { + "epoch": 0.0035584022773774574, + "grad_norm": 1.21734619140625, + "learning_rate": 0.0003520710059171598, + "loss": 6.1789, + "step": 120 + }, + { + "epoch": 0.003588055629688936, + "grad_norm": 1.216431736946106, + "learning_rate": 0.00035502958579881655, + "loss": 6.2299, + "step": 121 + }, + { + "epoch": 0.0036177089820004153, + "grad_norm": 1.3999985456466675, + "learning_rate": 0.00035798816568047336, + "loss": 6.2183, + "step": 122 + }, + { + "epoch": 0.003647362334311894, + "grad_norm": 0.8166043162345886, + "learning_rate": 0.0003609467455621302, + "loss": 6.1695, + "step": 123 + }, + { + "epoch": 0.003677015686623373, + "grad_norm": 1.4444960355758667, + "learning_rate": 0.000363905325443787, + "loss": 6.1409, + "step": 124 + }, + { + "epoch": 0.0037066690389348515, + "grad_norm": 1.1505333185195923, + "learning_rate": 0.0003668639053254438, + "loss": 6.1371, + "step": 125 + }, + { + "epoch": 0.0037363223912463303, + "grad_norm": 1.5520890951156616, + "learning_rate": 0.0003698224852071006, + "loss": 6.1274, + "step": 126 + }, + { + "epoch": 0.003765975743557809, + "grad_norm": 1.1516132354736328, + "learning_rate": 0.00037278106508875744, + "loss": 6.1249, + "step": 127 + }, + { + "epoch": 0.0037956290958692882, + "grad_norm": 1.3683756589889526, + "learning_rate": 0.0003757396449704142, + "loss": 6.1391, + "step": 128 + }, + { + "epoch": 0.003825282448180767, + "grad_norm": 1.1006982326507568, + "learning_rate": 0.000378698224852071, + "loss": 6.1209, + "step": 129 + }, + { + "epoch": 0.0038549358004922457, + "grad_norm": 1.5147583484649658, + "learning_rate": 0.0003816568047337278, + "loss": 6.0772, + "step": 130 + }, + { + "epoch": 0.0038845891528037245, + "grad_norm": 1.6919981241226196, + "learning_rate": 0.00038461538461538467, + "loss": 6.1039, + "step": 131 + }, + { + "epoch": 0.003914242505115203, + "grad_norm": 0.8398289084434509, + "learning_rate": 0.0003875739644970414, + "loss": 6.0688, + "step": 132 + }, + { + "epoch": 0.003943895857426682, + "grad_norm": 1.4211941957473755, + "learning_rate": 0.00039053254437869823, + "loss": 6.0581, + "step": 133 + }, + { + "epoch": 0.003973549209738161, + "grad_norm": 1.1077960729599, + "learning_rate": 0.00039349112426035503, + "loss": 6.0183, + "step": 134 + }, + { + "epoch": 0.0040032025620496394, + "grad_norm": 1.4566738605499268, + "learning_rate": 0.0003964497041420119, + "loss": 6.0496, + "step": 135 + }, + { + "epoch": 0.004032855914361118, + "grad_norm": 1.1909221410751343, + "learning_rate": 0.00039940828402366865, + "loss": 6.0201, + "step": 136 + }, + { + "epoch": 0.004062509266672597, + "grad_norm": 0.9814589619636536, + "learning_rate": 0.00040236686390532545, + "loss": 6.0156, + "step": 137 + }, + { + "epoch": 0.0040921626189840766, + "grad_norm": 1.0915955305099487, + "learning_rate": 0.00040532544378698226, + "loss": 6.0222, + "step": 138 + }, + { + "epoch": 0.004121815971295555, + "grad_norm": 1.0183378458023071, + "learning_rate": 0.000408284023668639, + "loss": 5.967, + "step": 139 + }, + { + "epoch": 0.004151469323607034, + "grad_norm": 1.0965499877929688, + "learning_rate": 0.00041124260355029587, + "loss": 5.9536, + "step": 140 + }, + { + "epoch": 0.004181122675918513, + "grad_norm": 1.2478625774383545, + "learning_rate": 0.0004142011834319527, + "loss": 5.9541, + "step": 141 + }, + { + "epoch": 0.0042107760282299915, + "grad_norm": 1.098069190979004, + "learning_rate": 0.0004171597633136095, + "loss": 5.9624, + "step": 142 + }, + { + "epoch": 0.00424042938054147, + "grad_norm": 1.1460800170898438, + "learning_rate": 0.00042011834319526624, + "loss": 5.9109, + "step": 143 + }, + { + "epoch": 0.004270082732852949, + "grad_norm": 0.9549556970596313, + "learning_rate": 0.0004230769230769231, + "loss": 5.8842, + "step": 144 + }, + { + "epoch": 0.004299736085164428, + "grad_norm": 0.923005223274231, + "learning_rate": 0.0004260355029585799, + "loss": 5.8857, + "step": 145 + }, + { + "epoch": 0.0043293894374759065, + "grad_norm": 0.7595265507698059, + "learning_rate": 0.0004289940828402367, + "loss": 5.8614, + "step": 146 + }, + { + "epoch": 0.004359042789787385, + "grad_norm": 0.765347421169281, + "learning_rate": 0.00043195266272189346, + "loss": 5.841, + "step": 147 + }, + { + "epoch": 0.004388696142098864, + "grad_norm": 0.9857151508331299, + "learning_rate": 0.0004349112426035503, + "loss": 5.8084, + "step": 148 + }, + { + "epoch": 0.004418349494410343, + "grad_norm": 1.168884038925171, + "learning_rate": 0.00043786982248520713, + "loss": 5.7993, + "step": 149 + }, + { + "epoch": 0.0044480028467218215, + "grad_norm": 1.3082650899887085, + "learning_rate": 0.0004408284023668639, + "loss": 5.8509, + "step": 150 + }, + { + "epoch": 0.004477656199033301, + "grad_norm": 1.3945202827453613, + "learning_rate": 0.0004437869822485207, + "loss": 5.8321, + "step": 151 + }, + { + "epoch": 0.00450730955134478, + "grad_norm": 0.9984548687934875, + "learning_rate": 0.00044674556213017755, + "loss": 5.8291, + "step": 152 + }, + { + "epoch": 0.004536962903656259, + "grad_norm": 1.125320315361023, + "learning_rate": 0.00044970414201183436, + "loss": 5.799, + "step": 153 + }, + { + "epoch": 0.004566616255967737, + "grad_norm": 1.398929238319397, + "learning_rate": 0.0004526627218934911, + "loss": 5.7898, + "step": 154 + }, + { + "epoch": 0.004596269608279216, + "grad_norm": 0.8768879175186157, + "learning_rate": 0.0004556213017751479, + "loss": 5.7692, + "step": 155 + }, + { + "epoch": 0.004625922960590695, + "grad_norm": 0.9811862707138062, + "learning_rate": 0.0004585798816568048, + "loss": 5.7413, + "step": 156 + }, + { + "epoch": 0.004655576312902174, + "grad_norm": 1.634334921836853, + "learning_rate": 0.0004615384615384616, + "loss": 5.7582, + "step": 157 + }, + { + "epoch": 0.004685229665213652, + "grad_norm": 0.6890387535095215, + "learning_rate": 0.00046449704142011833, + "loss": 5.7428, + "step": 158 + }, + { + "epoch": 0.004714883017525131, + "grad_norm": 0.7336459159851074, + "learning_rate": 0.00046745562130177514, + "loss": 5.6778, + "step": 159 + }, + { + "epoch": 0.00474453636983661, + "grad_norm": 1.142289161682129, + "learning_rate": 0.000470414201183432, + "loss": 5.7211, + "step": 160 + }, + { + "epoch": 0.004774189722148089, + "grad_norm": 1.9119774103164673, + "learning_rate": 0.00047337278106508875, + "loss": 5.7463, + "step": 161 + }, + { + "epoch": 0.004803843074459567, + "grad_norm": 0.8100204467773438, + "learning_rate": 0.00047633136094674556, + "loss": 5.6859, + "step": 162 + }, + { + "epoch": 0.004833496426771046, + "grad_norm": 1.3659331798553467, + "learning_rate": 0.00047928994082840237, + "loss": 5.7026, + "step": 163 + }, + { + "epoch": 0.004863149779082526, + "grad_norm": 1.0362147092819214, + "learning_rate": 0.0004822485207100592, + "loss": 5.683, + "step": 164 + }, + { + "epoch": 0.0048928031313940044, + "grad_norm": 1.5060149431228638, + "learning_rate": 0.000485207100591716, + "loss": 5.7247, + "step": 165 + }, + { + "epoch": 0.004922456483705483, + "grad_norm": 1.5911206007003784, + "learning_rate": 0.0004881656804733728, + "loss": 5.7272, + "step": 166 + }, + { + "epoch": 0.004952109836016962, + "grad_norm": 0.8080172538757324, + "learning_rate": 0.0004911242603550296, + "loss": 5.6667, + "step": 167 + }, + { + "epoch": 0.004981763188328441, + "grad_norm": 0.9610570669174194, + "learning_rate": 0.0004940828402366864, + "loss": 5.6278, + "step": 168 + }, + { + "epoch": 0.005011416540639919, + "grad_norm": 1.0105317831039429, + "learning_rate": 0.0004970414201183431, + "loss": 5.6553, + "step": 169 + }, + { + "epoch": 0.005041069892951398, + "grad_norm": 0.9905191659927368, + "learning_rate": 0.0005, + "loss": 5.6357, + "step": 170 + }, + { + "epoch": 0.005070723245262877, + "grad_norm": 1.0099610090255737, + "learning_rate": 0.0005029585798816569, + "loss": 5.6409, + "step": 171 + }, + { + "epoch": 0.005100376597574356, + "grad_norm": 0.8011944890022278, + "learning_rate": 0.0005059171597633136, + "loss": 5.6309, + "step": 172 + }, + { + "epoch": 0.005130029949885834, + "grad_norm": 0.8999521136283875, + "learning_rate": 0.0005088757396449705, + "loss": 5.5819, + "step": 173 + }, + { + "epoch": 0.005159683302197313, + "grad_norm": 0.8857150077819824, + "learning_rate": 0.0005118343195266271, + "loss": 5.5386, + "step": 174 + }, + { + "epoch": 0.005189336654508792, + "grad_norm": 0.9367064833641052, + "learning_rate": 0.000514792899408284, + "loss": 5.5701, + "step": 175 + }, + { + "epoch": 0.005218990006820271, + "grad_norm": 1.4709664583206177, + "learning_rate": 0.0005177514792899408, + "loss": 5.6095, + "step": 176 + }, + { + "epoch": 0.00524864335913175, + "grad_norm": 1.0006425380706787, + "learning_rate": 0.0005207100591715976, + "loss": 5.59, + "step": 177 + }, + { + "epoch": 0.005278296711443229, + "grad_norm": 1.3179266452789307, + "learning_rate": 0.0005236686390532545, + "loss": 5.5842, + "step": 178 + }, + { + "epoch": 0.005307950063754708, + "grad_norm": 1.029981255531311, + "learning_rate": 0.0005266272189349113, + "loss": 5.5351, + "step": 179 + }, + { + "epoch": 0.0053376034160661865, + "grad_norm": 0.9946054816246033, + "learning_rate": 0.0005295857988165681, + "loss": 5.5274, + "step": 180 + }, + { + "epoch": 0.005367256768377665, + "grad_norm": 0.9739015698432922, + "learning_rate": 0.0005325443786982249, + "loss": 5.5329, + "step": 181 + }, + { + "epoch": 0.005396910120689144, + "grad_norm": 0.6429661512374878, + "learning_rate": 0.0005355029585798816, + "loss": 5.4962, + "step": 182 + }, + { + "epoch": 0.005426563473000623, + "grad_norm": 0.7007419466972351, + "learning_rate": 0.0005384615384615384, + "loss": 5.4875, + "step": 183 + }, + { + "epoch": 0.0054562168253121015, + "grad_norm": 0.6527716517448425, + "learning_rate": 0.0005414201183431953, + "loss": 5.444, + "step": 184 + }, + { + "epoch": 0.00548587017762358, + "grad_norm": 0.9280970692634583, + "learning_rate": 0.000544378698224852, + "loss": 5.4936, + "step": 185 + }, + { + "epoch": 0.005515523529935059, + "grad_norm": 0.9991222023963928, + "learning_rate": 0.0005473372781065089, + "loss": 5.492, + "step": 186 + }, + { + "epoch": 0.005545176882246538, + "grad_norm": 1.0214622020721436, + "learning_rate": 0.0005502958579881658, + "loss": 5.4479, + "step": 187 + }, + { + "epoch": 0.0055748302345580165, + "grad_norm": 0.8486016392707825, + "learning_rate": 0.0005532544378698225, + "loss": 5.4255, + "step": 188 + }, + { + "epoch": 0.005604483586869495, + "grad_norm": 0.6443628668785095, + "learning_rate": 0.0005562130177514793, + "loss": 5.4614, + "step": 189 + }, + { + "epoch": 0.005634136939180975, + "grad_norm": 0.6247698664665222, + "learning_rate": 0.000559171597633136, + "loss": 5.3917, + "step": 190 + }, + { + "epoch": 0.005663790291492454, + "grad_norm": 0.49396994709968567, + "learning_rate": 0.0005621301775147929, + "loss": 5.3968, + "step": 191 + }, + { + "epoch": 0.005693443643803932, + "grad_norm": 0.5287873148918152, + "learning_rate": 0.0005650887573964498, + "loss": 5.4077, + "step": 192 + }, + { + "epoch": 0.005723096996115411, + "grad_norm": 0.6019405722618103, + "learning_rate": 0.0005680473372781065, + "loss": 5.3379, + "step": 193 + }, + { + "epoch": 0.00575275034842689, + "grad_norm": 0.6707992553710938, + "learning_rate": 0.0005710059171597634, + "loss": 5.3594, + "step": 194 + }, + { + "epoch": 0.005782403700738369, + "grad_norm": 0.5250041484832764, + "learning_rate": 0.0005739644970414202, + "loss": 5.3393, + "step": 195 + }, + { + "epoch": 0.005812057053049847, + "grad_norm": 0.6677492260932922, + "learning_rate": 0.0005769230769230769, + "loss": 5.2943, + "step": 196 + }, + { + "epoch": 0.005841710405361326, + "grad_norm": 0.8785022497177124, + "learning_rate": 0.0005798816568047337, + "loss": 5.3388, + "step": 197 + }, + { + "epoch": 0.005871363757672805, + "grad_norm": 0.9521197080612183, + "learning_rate": 0.0005828402366863905, + "loss": 5.3181, + "step": 198 + }, + { + "epoch": 0.0059010171099842836, + "grad_norm": 1.242186427116394, + "learning_rate": 0.0005857988165680473, + "loss": 5.3316, + "step": 199 + }, + { + "epoch": 0.005930670462295762, + "grad_norm": 0.6594260931015015, + "learning_rate": 0.0005887573964497042, + "loss": 5.2753, + "step": 200 + }, + { + "epoch": 0.005960323814607241, + "grad_norm": 0.60521399974823, + "learning_rate": 0.000591715976331361, + "loss": 5.2857, + "step": 201 + }, + { + "epoch": 0.00598997716691872, + "grad_norm": 0.8038555383682251, + "learning_rate": 0.0005946745562130178, + "loss": 5.2981, + "step": 202 + }, + { + "epoch": 0.006019630519230199, + "grad_norm": 1.045690894126892, + "learning_rate": 0.0005976331360946747, + "loss": 5.3251, + "step": 203 + }, + { + "epoch": 0.006049283871541678, + "grad_norm": 0.746450662612915, + "learning_rate": 0.0006005917159763313, + "loss": 5.2872, + "step": 204 + }, + { + "epoch": 0.006078937223853157, + "grad_norm": 0.7744626998901367, + "learning_rate": 0.0006035502958579882, + "loss": 5.2691, + "step": 205 + }, + { + "epoch": 0.006108590576164636, + "grad_norm": 0.7122390270233154, + "learning_rate": 0.0006065088757396449, + "loss": 5.253, + "step": 206 + }, + { + "epoch": 0.006138243928476114, + "grad_norm": 1.075055718421936, + "learning_rate": 0.0006094674556213018, + "loss": 5.2714, + "step": 207 + }, + { + "epoch": 0.006167897280787593, + "grad_norm": 0.9268677234649658, + "learning_rate": 0.0006124260355029587, + "loss": 5.3177, + "step": 208 + }, + { + "epoch": 0.006197550633099072, + "grad_norm": 0.7479074597358704, + "learning_rate": 0.0006153846153846154, + "loss": 5.2541, + "step": 209 + }, + { + "epoch": 0.006227203985410551, + "grad_norm": 1.1970747709274292, + "learning_rate": 0.0006183431952662723, + "loss": 5.2729, + "step": 210 + }, + { + "epoch": 0.006256857337722029, + "grad_norm": 0.9651538729667664, + "learning_rate": 0.0006213017751479289, + "loss": 5.2819, + "step": 211 + }, + { + "epoch": 0.006286510690033508, + "grad_norm": 0.7983511686325073, + "learning_rate": 0.0006242603550295858, + "loss": 5.2224, + "step": 212 + }, + { + "epoch": 0.006316164042344987, + "grad_norm": 0.6978031396865845, + "learning_rate": 0.0006272189349112426, + "loss": 5.2529, + "step": 213 + }, + { + "epoch": 0.006345817394656466, + "grad_norm": 0.8586859107017517, + "learning_rate": 0.0006301775147928994, + "loss": 5.2064, + "step": 214 + }, + { + "epoch": 0.006375470746967944, + "grad_norm": 0.9872927069664001, + "learning_rate": 0.0006331360946745562, + "loss": 5.2164, + "step": 215 + }, + { + "epoch": 0.006405124099279423, + "grad_norm": 0.7056111097335815, + "learning_rate": 0.0006360946745562131, + "loss": 5.174, + "step": 216 + }, + { + "epoch": 0.006434777451590903, + "grad_norm": 0.5802350044250488, + "learning_rate": 0.0006390532544378699, + "loss": 5.1768, + "step": 217 + }, + { + "epoch": 0.0064644308039023815, + "grad_norm": 0.672815203666687, + "learning_rate": 0.0006420118343195266, + "loss": 5.1629, + "step": 218 + }, + { + "epoch": 0.00649408415621386, + "grad_norm": 0.8444591760635376, + "learning_rate": 0.0006449704142011834, + "loss": 5.127, + "step": 219 + }, + { + "epoch": 0.006523737508525339, + "grad_norm": 0.7703789472579956, + "learning_rate": 0.0006479289940828402, + "loss": 5.1444, + "step": 220 + }, + { + "epoch": 0.006553390860836818, + "grad_norm": 0.9393143057823181, + "learning_rate": 0.0006508875739644971, + "loss": 5.1647, + "step": 221 + }, + { + "epoch": 0.0065830442131482965, + "grad_norm": 0.771821141242981, + "learning_rate": 0.0006538461538461538, + "loss": 5.1087, + "step": 222 + }, + { + "epoch": 0.006612697565459775, + "grad_norm": 0.605191171169281, + "learning_rate": 0.0006568047337278107, + "loss": 5.11, + "step": 223 + }, + { + "epoch": 0.006642350917771254, + "grad_norm": 0.6809538006782532, + "learning_rate": 0.0006597633136094676, + "loss": 5.1087, + "step": 224 + }, + { + "epoch": 0.006672004270082733, + "grad_norm": 0.6699666976928711, + "learning_rate": 0.0006627218934911243, + "loss": 5.0983, + "step": 225 + }, + { + "epoch": 0.0067016576223942114, + "grad_norm": 0.5897895097732544, + "learning_rate": 0.0006656804733727811, + "loss": 5.0839, + "step": 226 + }, + { + "epoch": 0.00673131097470569, + "grad_norm": 0.7744308710098267, + "learning_rate": 0.0006686390532544378, + "loss": 5.0913, + "step": 227 + }, + { + "epoch": 0.006760964327017169, + "grad_norm": 0.6826775670051575, + "learning_rate": 0.0006715976331360947, + "loss": 5.0736, + "step": 228 + }, + { + "epoch": 0.006790617679328648, + "grad_norm": 0.6877195239067078, + "learning_rate": 0.0006745562130177515, + "loss": 5.0295, + "step": 229 + }, + { + "epoch": 0.006820271031640127, + "grad_norm": 0.8488063216209412, + "learning_rate": 0.0006775147928994083, + "loss": 5.0654, + "step": 230 + }, + { + "epoch": 0.006849924383951606, + "grad_norm": 0.6234027147293091, + "learning_rate": 0.0006804733727810651, + "loss": 5.0686, + "step": 231 + }, + { + "epoch": 0.006879577736263085, + "grad_norm": 0.686409592628479, + "learning_rate": 0.000683431952662722, + "loss": 5.0489, + "step": 232 + }, + { + "epoch": 0.0069092310885745635, + "grad_norm": 0.7324587106704712, + "learning_rate": 0.0006863905325443787, + "loss": 5.0691, + "step": 233 + }, + { + "epoch": 0.006938884440886042, + "grad_norm": 0.627479076385498, + "learning_rate": 0.0006893491124260355, + "loss": 5.038, + "step": 234 + }, + { + "epoch": 0.006968537793197521, + "grad_norm": 0.8999791145324707, + "learning_rate": 0.0006923076923076923, + "loss": 4.9851, + "step": 235 + }, + { + "epoch": 0.006998191145509, + "grad_norm": 0.674656331539154, + "learning_rate": 0.0006952662721893491, + "loss": 5.0279, + "step": 236 + }, + { + "epoch": 0.0070278444978204785, + "grad_norm": 0.5172945261001587, + "learning_rate": 0.000698224852071006, + "loss": 5.0228, + "step": 237 + }, + { + "epoch": 0.007057497850131957, + "grad_norm": 0.7305448651313782, + "learning_rate": 0.0007011834319526627, + "loss": 4.9873, + "step": 238 + }, + { + "epoch": 0.007087151202443436, + "grad_norm": 0.7272880673408508, + "learning_rate": 0.0007041420118343196, + "loss": 5.016, + "step": 239 + }, + { + "epoch": 0.007116804554754915, + "grad_norm": 0.5988402962684631, + "learning_rate": 0.0007071005917159762, + "loss": 4.9675, + "step": 240 + }, + { + "epoch": 0.0071464579070663935, + "grad_norm": 0.7123491764068604, + "learning_rate": 0.0007100591715976331, + "loss": 4.9686, + "step": 241 + }, + { + "epoch": 0.007176111259377872, + "grad_norm": 0.5863490104675293, + "learning_rate": 0.00071301775147929, + "loss": 4.9767, + "step": 242 + }, + { + "epoch": 0.007205764611689352, + "grad_norm": 0.717117428779602, + "learning_rate": 0.0007159763313609467, + "loss": 4.9271, + "step": 243 + }, + { + "epoch": 0.007235417964000831, + "grad_norm": 0.7154814600944519, + "learning_rate": 0.0007189349112426036, + "loss": 4.9606, + "step": 244 + }, + { + "epoch": 0.007265071316312309, + "grad_norm": 0.9186888337135315, + "learning_rate": 0.0007218934911242604, + "loss": 4.9352, + "step": 245 + }, + { + "epoch": 0.007294724668623788, + "grad_norm": 0.8325491547584534, + "learning_rate": 0.0007248520710059172, + "loss": 4.9586, + "step": 246 + }, + { + "epoch": 0.007324378020935267, + "grad_norm": 0.7382215261459351, + "learning_rate": 0.000727810650887574, + "loss": 4.9314, + "step": 247 + }, + { + "epoch": 0.007354031373246746, + "grad_norm": 0.6561602354049683, + "learning_rate": 0.0007307692307692307, + "loss": 4.9464, + "step": 248 + }, + { + "epoch": 0.007383684725558224, + "grad_norm": 0.6498631834983826, + "learning_rate": 0.0007337278106508876, + "loss": 4.9218, + "step": 249 + }, + { + "epoch": 0.007413338077869703, + "grad_norm": 0.6634749174118042, + "learning_rate": 0.0007366863905325444, + "loss": 4.932, + "step": 250 + }, + { + "epoch": 0.007442991430181182, + "grad_norm": 0.6483098268508911, + "learning_rate": 0.0007396449704142012, + "loss": 4.9204, + "step": 251 + }, + { + "epoch": 0.007472644782492661, + "grad_norm": 0.6428969502449036, + "learning_rate": 0.000742603550295858, + "loss": 4.8889, + "step": 252 + }, + { + "epoch": 0.007502298134804139, + "grad_norm": 0.6477037668228149, + "learning_rate": 0.0007455621301775149, + "loss": 4.9126, + "step": 253 + }, + { + "epoch": 0.007531951487115618, + "grad_norm": 0.5465149879455566, + "learning_rate": 0.0007485207100591716, + "loss": 4.872, + "step": 254 + }, + { + "epoch": 0.007561604839427097, + "grad_norm": 0.5257192254066467, + "learning_rate": 0.0007514792899408284, + "loss": 4.8666, + "step": 255 + }, + { + "epoch": 0.0075912581917385764, + "grad_norm": 0.5593119263648987, + "learning_rate": 0.0007544378698224851, + "loss": 4.8377, + "step": 256 + }, + { + "epoch": 0.007620911544050055, + "grad_norm": 0.6386788487434387, + "learning_rate": 0.000757396449704142, + "loss": 4.8403, + "step": 257 + }, + { + "epoch": 0.007650564896361534, + "grad_norm": 0.7664703726768494, + "learning_rate": 0.0007603550295857989, + "loss": 4.8458, + "step": 258 + }, + { + "epoch": 0.007680218248673013, + "grad_norm": 0.6819597482681274, + "learning_rate": 0.0007633136094674556, + "loss": 4.8539, + "step": 259 + }, + { + "epoch": 0.007709871600984491, + "grad_norm": 0.7141460180282593, + "learning_rate": 0.0007662721893491125, + "loss": 4.8664, + "step": 260 + }, + { + "epoch": 0.00773952495329597, + "grad_norm": 0.8965128064155579, + "learning_rate": 0.0007692307692307693, + "loss": 4.8472, + "step": 261 + }, + { + "epoch": 0.007769178305607449, + "grad_norm": 0.7742786407470703, + "learning_rate": 0.000772189349112426, + "loss": 4.8204, + "step": 262 + }, + { + "epoch": 0.007798831657918928, + "grad_norm": 0.73893141746521, + "learning_rate": 0.0007751479289940828, + "loss": 4.8147, + "step": 263 + }, + { + "epoch": 0.007828485010230406, + "grad_norm": 0.744563102722168, + "learning_rate": 0.0007781065088757396, + "loss": 4.8027, + "step": 264 + }, + { + "epoch": 0.007858138362541886, + "grad_norm": 0.7650623321533203, + "learning_rate": 0.0007810650887573965, + "loss": 4.7878, + "step": 265 + }, + { + "epoch": 0.007887791714853364, + "grad_norm": 0.6093603372573853, + "learning_rate": 0.0007840236686390533, + "loss": 4.7842, + "step": 266 + }, + { + "epoch": 0.007917445067164844, + "grad_norm": 0.5600751042366028, + "learning_rate": 0.0007869822485207101, + "loss": 4.7703, + "step": 267 + }, + { + "epoch": 0.007947098419476321, + "grad_norm": 0.4956977069377899, + "learning_rate": 0.0007899408284023669, + "loss": 4.8173, + "step": 268 + }, + { + "epoch": 0.007976751771787801, + "grad_norm": 0.4526093602180481, + "learning_rate": 0.0007928994082840238, + "loss": 4.7476, + "step": 269 + }, + { + "epoch": 0.008006405124099279, + "grad_norm": 0.46059271693229675, + "learning_rate": 0.0007958579881656804, + "loss": 4.7217, + "step": 270 + }, + { + "epoch": 0.008036058476410759, + "grad_norm": 0.515108585357666, + "learning_rate": 0.0007988165680473373, + "loss": 4.7418, + "step": 271 + }, + { + "epoch": 0.008065711828722236, + "grad_norm": 0.5636969208717346, + "learning_rate": 0.000801775147928994, + "loss": 4.7251, + "step": 272 + }, + { + "epoch": 0.008095365181033716, + "grad_norm": 0.4838902950286865, + "learning_rate": 0.0008047337278106509, + "loss": 4.6948, + "step": 273 + }, + { + "epoch": 0.008125018533345194, + "grad_norm": 0.5041856169700623, + "learning_rate": 0.0008076923076923078, + "loss": 4.6563, + "step": 274 + }, + { + "epoch": 0.008154671885656673, + "grad_norm": 0.5743809938430786, + "learning_rate": 0.0008106508875739645, + "loss": 4.7035, + "step": 275 + }, + { + "epoch": 0.008184325237968153, + "grad_norm": 0.5451339483261108, + "learning_rate": 0.0008136094674556214, + "loss": 4.6922, + "step": 276 + }, + { + "epoch": 0.008213978590279631, + "grad_norm": 0.4593498408794403, + "learning_rate": 0.000816568047337278, + "loss": 4.677, + "step": 277 + }, + { + "epoch": 0.00824363194259111, + "grad_norm": 0.4799157679080963, + "learning_rate": 0.0008195266272189349, + "loss": 4.6524, + "step": 278 + }, + { + "epoch": 0.008273285294902588, + "grad_norm": 0.5510650277137756, + "learning_rate": 0.0008224852071005917, + "loss": 4.6565, + "step": 279 + }, + { + "epoch": 0.008302938647214068, + "grad_norm": 0.5808246731758118, + "learning_rate": 0.0008254437869822485, + "loss": 4.6568, + "step": 280 + }, + { + "epoch": 0.008332591999525546, + "grad_norm": 0.606224775314331, + "learning_rate": 0.0008284023668639054, + "loss": 4.6316, + "step": 281 + }, + { + "epoch": 0.008362245351837026, + "grad_norm": 0.5638919472694397, + "learning_rate": 0.0008313609467455622, + "loss": 4.6457, + "step": 282 + }, + { + "epoch": 0.008391898704148503, + "grad_norm": 0.5725050568580627, + "learning_rate": 0.000834319526627219, + "loss": 4.6326, + "step": 283 + }, + { + "epoch": 0.008421552056459983, + "grad_norm": 0.4239210784435272, + "learning_rate": 0.0008372781065088757, + "loss": 4.6133, + "step": 284 + }, + { + "epoch": 0.008451205408771461, + "grad_norm": 0.46953466534614563, + "learning_rate": 0.0008402366863905325, + "loss": 4.6029, + "step": 285 + }, + { + "epoch": 0.00848085876108294, + "grad_norm": 0.43134114146232605, + "learning_rate": 0.0008431952662721893, + "loss": 4.5652, + "step": 286 + }, + { + "epoch": 0.008510512113394418, + "grad_norm": 0.5223537087440491, + "learning_rate": 0.0008461538461538462, + "loss": 4.5969, + "step": 287 + }, + { + "epoch": 0.008540165465705898, + "grad_norm": 0.4775165617465973, + "learning_rate": 0.000849112426035503, + "loss": 4.5484, + "step": 288 + }, + { + "epoch": 0.008569818818017378, + "grad_norm": 0.634803831577301, + "learning_rate": 0.0008520710059171598, + "loss": 4.5701, + "step": 289 + }, + { + "epoch": 0.008599472170328856, + "grad_norm": 0.7227613925933838, + "learning_rate": 0.0008550295857988167, + "loss": 4.5761, + "step": 290 + }, + { + "epoch": 0.008629125522640335, + "grad_norm": 0.8712694644927979, + "learning_rate": 0.0008579881656804734, + "loss": 4.6276, + "step": 291 + }, + { + "epoch": 0.008658778874951813, + "grad_norm": 0.6303057670593262, + "learning_rate": 0.0008609467455621302, + "loss": 4.615, + "step": 292 + }, + { + "epoch": 0.008688432227263293, + "grad_norm": 0.6628982424736023, + "learning_rate": 0.0008639053254437869, + "loss": 4.5876, + "step": 293 + }, + { + "epoch": 0.00871808557957477, + "grad_norm": 0.7094428539276123, + "learning_rate": 0.0008668639053254438, + "loss": 4.5778, + "step": 294 + }, + { + "epoch": 0.00874773893188625, + "grad_norm": 0.7644968628883362, + "learning_rate": 0.0008698224852071006, + "loss": 4.612, + "step": 295 + }, + { + "epoch": 0.008777392284197728, + "grad_norm": 0.8258724212646484, + "learning_rate": 0.0008727810650887574, + "loss": 4.5937, + "step": 296 + }, + { + "epoch": 0.008807045636509208, + "grad_norm": 0.6633610129356384, + "learning_rate": 0.0008757396449704143, + "loss": 4.5562, + "step": 297 + }, + { + "epoch": 0.008836698988820686, + "grad_norm": 0.6630558371543884, + "learning_rate": 0.0008786982248520711, + "loss": 4.5481, + "step": 298 + }, + { + "epoch": 0.008866352341132165, + "grad_norm": 0.6891627311706543, + "learning_rate": 0.0008816568047337278, + "loss": 4.5363, + "step": 299 + }, + { + "epoch": 0.008896005693443643, + "grad_norm": 0.906449556350708, + "learning_rate": 0.0008846153846153846, + "loss": 4.5861, + "step": 300 + }, + { + "epoch": 0.008925659045755123, + "grad_norm": 0.7529159784317017, + "learning_rate": 0.0008875739644970414, + "loss": 4.609, + "step": 301 + }, + { + "epoch": 0.008955312398066602, + "grad_norm": 0.5625036358833313, + "learning_rate": 0.0008905325443786982, + "loss": 4.5347, + "step": 302 + }, + { + "epoch": 0.00898496575037808, + "grad_norm": 0.6193239092826843, + "learning_rate": 0.0008934911242603551, + "loss": 4.5554, + "step": 303 + }, + { + "epoch": 0.00901461910268956, + "grad_norm": 0.7130552530288696, + "learning_rate": 0.0008964497041420119, + "loss": 4.5503, + "step": 304 + }, + { + "epoch": 0.009044272455001038, + "grad_norm": 0.7082216739654541, + "learning_rate": 0.0008994082840236687, + "loss": 4.5485, + "step": 305 + }, + { + "epoch": 0.009073925807312517, + "grad_norm": 0.5553739070892334, + "learning_rate": 0.0009023668639053254, + "loss": 4.4841, + "step": 306 + }, + { + "epoch": 0.009103579159623995, + "grad_norm": 0.505854606628418, + "learning_rate": 0.0009053254437869822, + "loss": 4.5406, + "step": 307 + }, + { + "epoch": 0.009133232511935475, + "grad_norm": 0.5582791566848755, + "learning_rate": 0.0009082840236686391, + "loss": 4.466, + "step": 308 + }, + { + "epoch": 0.009162885864246953, + "grad_norm": 0.5420180559158325, + "learning_rate": 0.0009112426035502958, + "loss": 4.5008, + "step": 309 + }, + { + "epoch": 0.009192539216558432, + "grad_norm": 0.4393915832042694, + "learning_rate": 0.0009142011834319527, + "loss": 4.4825, + "step": 310 + }, + { + "epoch": 0.00922219256886991, + "grad_norm": 0.43087637424468994, + "learning_rate": 0.0009171597633136096, + "loss": 4.5033, + "step": 311 + }, + { + "epoch": 0.00925184592118139, + "grad_norm": 0.4802876114845276, + "learning_rate": 0.0009201183431952663, + "loss": 4.4594, + "step": 312 + }, + { + "epoch": 0.009281499273492868, + "grad_norm": 0.37921059131622314, + "learning_rate": 0.0009230769230769232, + "loss": 4.4532, + "step": 313 + }, + { + "epoch": 0.009311152625804347, + "grad_norm": 0.3548833429813385, + "learning_rate": 0.0009260355029585798, + "loss": 4.4135, + "step": 314 + }, + { + "epoch": 0.009340805978115827, + "grad_norm": 0.383232057094574, + "learning_rate": 0.0009289940828402367, + "loss": 4.4163, + "step": 315 + }, + { + "epoch": 0.009370459330427305, + "grad_norm": 0.40537846088409424, + "learning_rate": 0.0009319526627218935, + "loss": 4.4419, + "step": 316 + }, + { + "epoch": 0.009400112682738784, + "grad_norm": 0.4190423786640167, + "learning_rate": 0.0009349112426035503, + "loss": 4.4266, + "step": 317 + }, + { + "epoch": 0.009429766035050262, + "grad_norm": 0.43807893991470337, + "learning_rate": 0.0009378698224852071, + "loss": 4.3424, + "step": 318 + }, + { + "epoch": 0.009459419387361742, + "grad_norm": 0.4100263714790344, + "learning_rate": 0.000940828402366864, + "loss": 4.392, + "step": 319 + }, + { + "epoch": 0.00948907273967322, + "grad_norm": 0.4128643870353699, + "learning_rate": 0.0009437869822485208, + "loss": 4.3914, + "step": 320 + }, + { + "epoch": 0.0095187260919847, + "grad_norm": 0.42840689420700073, + "learning_rate": 0.0009467455621301775, + "loss": 4.3534, + "step": 321 + }, + { + "epoch": 0.009548379444296177, + "grad_norm": 0.48047712445259094, + "learning_rate": 0.0009497041420118343, + "loss": 4.3835, + "step": 322 + }, + { + "epoch": 0.009578032796607657, + "grad_norm": 0.458128958940506, + "learning_rate": 0.0009526627218934911, + "loss": 4.3781, + "step": 323 + }, + { + "epoch": 0.009607686148919135, + "grad_norm": 0.4183042645454407, + "learning_rate": 0.000955621301775148, + "loss": 4.3963, + "step": 324 + }, + { + "epoch": 0.009637339501230614, + "grad_norm": 0.49821943044662476, + "learning_rate": 0.0009585798816568047, + "loss": 4.3421, + "step": 325 + }, + { + "epoch": 0.009666992853542092, + "grad_norm": 0.4602010250091553, + "learning_rate": 0.0009615384615384616, + "loss": 4.3618, + "step": 326 + }, + { + "epoch": 0.009696646205853572, + "grad_norm": 0.5034216642379761, + "learning_rate": 0.0009644970414201185, + "loss": 4.3877, + "step": 327 + }, + { + "epoch": 0.009726299558165051, + "grad_norm": 0.5651377439498901, + "learning_rate": 0.0009674556213017751, + "loss": 4.3542, + "step": 328 + }, + { + "epoch": 0.00975595291047653, + "grad_norm": 0.4499555230140686, + "learning_rate": 0.000970414201183432, + "loss": 4.3128, + "step": 329 + }, + { + "epoch": 0.009785606262788009, + "grad_norm": 0.6124625205993652, + "learning_rate": 0.0009733727810650887, + "loss": 4.3707, + "step": 330 + }, + { + "epoch": 0.009815259615099487, + "grad_norm": 0.6219717860221863, + "learning_rate": 0.0009763313609467456, + "loss": 4.3903, + "step": 331 + }, + { + "epoch": 0.009844912967410966, + "grad_norm": 0.46527138352394104, + "learning_rate": 0.0009792899408284023, + "loss": 4.332, + "step": 332 + }, + { + "epoch": 0.009874566319722444, + "grad_norm": 0.5260749459266663, + "learning_rate": 0.0009822485207100593, + "loss": 4.3811, + "step": 333 + }, + { + "epoch": 0.009904219672033924, + "grad_norm": 0.4670645296573639, + "learning_rate": 0.000985207100591716, + "loss": 4.3447, + "step": 334 + }, + { + "epoch": 0.009933873024345402, + "grad_norm": 0.38384053111076355, + "learning_rate": 0.0009881656804733728, + "loss": 4.3466, + "step": 335 + }, + { + "epoch": 0.009963526376656881, + "grad_norm": 0.42785319685935974, + "learning_rate": 0.0009911242603550295, + "loss": 4.3416, + "step": 336 + }, + { + "epoch": 0.00999317972896836, + "grad_norm": 0.545905351638794, + "learning_rate": 0.0009940828402366863, + "loss": 4.3358, + "step": 337 + }, + { + "epoch": 0.010022833081279839, + "grad_norm": 0.4625833332538605, + "learning_rate": 0.0009970414201183433, + "loss": 4.3371, + "step": 338 + }, + { + "epoch": 0.010052486433591317, + "grad_norm": 0.3906095325946808, + "learning_rate": 0.001, + "loss": 4.2617, + "step": 339 + }, + { + "epoch": 0.010082139785902796, + "grad_norm": 0.5965687036514282, + "learning_rate": 0.000999999997786207, + "loss": 4.3566, + "step": 340 + }, + { + "epoch": 0.010111793138214276, + "grad_norm": 0.653741717338562, + "learning_rate": 0.0009999999911448282, + "loss": 4.3453, + "step": 341 + }, + { + "epoch": 0.010141446490525754, + "grad_norm": 0.5435784459114075, + "learning_rate": 0.000999999980075864, + "loss": 4.3222, + "step": 342 + }, + { + "epoch": 0.010171099842837233, + "grad_norm": 0.4084891974925995, + "learning_rate": 0.0009999999645793139, + "loss": 4.3094, + "step": 343 + }, + { + "epoch": 0.010200753195148711, + "grad_norm": 0.4589015245437622, + "learning_rate": 0.0009999999446551782, + "loss": 4.2985, + "step": 344 + }, + { + "epoch": 0.010230406547460191, + "grad_norm": 0.5070710778236389, + "learning_rate": 0.0009999999203034573, + "loss": 4.3203, + "step": 345 + }, + { + "epoch": 0.010260059899771669, + "grad_norm": 0.4824785590171814, + "learning_rate": 0.000999999891524151, + "loss": 4.3401, + "step": 346 + }, + { + "epoch": 0.010289713252083148, + "grad_norm": 0.40560251474380493, + "learning_rate": 0.0009999998583172603, + "loss": 4.2848, + "step": 347 + }, + { + "epoch": 0.010319366604394626, + "grad_norm": 0.33495843410491943, + "learning_rate": 0.0009999998206827846, + "loss": 4.284, + "step": 348 + }, + { + "epoch": 0.010349019956706106, + "grad_norm": 0.3513641357421875, + "learning_rate": 0.000999999778620725, + "loss": 4.2701, + "step": 349 + }, + { + "epoch": 0.010378673309017584, + "grad_norm": 0.322648286819458, + "learning_rate": 0.0009999997321310814, + "loss": 4.244, + "step": 350 + }, + { + "epoch": 0.010408326661329063, + "grad_norm": 0.35637590289115906, + "learning_rate": 0.0009999996812138543, + "loss": 4.2182, + "step": 351 + }, + { + "epoch": 0.010437980013640541, + "grad_norm": 0.37419793009757996, + "learning_rate": 0.0009999996258690442, + "loss": 4.22, + "step": 352 + }, + { + "epoch": 0.010467633365952021, + "grad_norm": 0.39671680331230164, + "learning_rate": 0.0009999995660966517, + "loss": 4.2545, + "step": 353 + }, + { + "epoch": 0.0104972867182635, + "grad_norm": 0.38925281167030334, + "learning_rate": 0.0009999995018966771, + "loss": 4.247, + "step": 354 + }, + { + "epoch": 0.010526940070574978, + "grad_norm": 0.29580289125442505, + "learning_rate": 0.0009999994332691212, + "loss": 4.2462, + "step": 355 + }, + { + "epoch": 0.010556593422886458, + "grad_norm": 0.3648681342601776, + "learning_rate": 0.0009999993602139846, + "loss": 4.2253, + "step": 356 + }, + { + "epoch": 0.010586246775197936, + "grad_norm": 0.34971582889556885, + "learning_rate": 0.0009999992827312675, + "loss": 4.2105, + "step": 357 + }, + { + "epoch": 0.010615900127509416, + "grad_norm": 0.38109642267227173, + "learning_rate": 0.000999999200820971, + "loss": 4.2014, + "step": 358 + }, + { + "epoch": 0.010645553479820893, + "grad_norm": 0.3783658742904663, + "learning_rate": 0.000999999114483096, + "loss": 4.2024, + "step": 359 + }, + { + "epoch": 0.010675206832132373, + "grad_norm": 0.4066912531852722, + "learning_rate": 0.0009999990237176428, + "loss": 4.2184, + "step": 360 + }, + { + "epoch": 0.010704860184443851, + "grad_norm": 0.437334269285202, + "learning_rate": 0.0009999989285246124, + "loss": 4.2223, + "step": 361 + }, + { + "epoch": 0.01073451353675533, + "grad_norm": 0.3744148910045624, + "learning_rate": 0.0009999988289040058, + "loss": 4.2092, + "step": 362 + }, + { + "epoch": 0.010764166889066808, + "grad_norm": 0.35699713230133057, + "learning_rate": 0.0009999987248558238, + "loss": 4.1938, + "step": 363 + }, + { + "epoch": 0.010793820241378288, + "grad_norm": 0.34668460488319397, + "learning_rate": 0.0009999986163800672, + "loss": 4.1652, + "step": 364 + }, + { + "epoch": 0.010823473593689766, + "grad_norm": 0.32405146956443787, + "learning_rate": 0.0009999985034767369, + "loss": 4.1893, + "step": 365 + }, + { + "epoch": 0.010853126946001245, + "grad_norm": 0.3061670660972595, + "learning_rate": 0.0009999983861458343, + "loss": 4.1663, + "step": 366 + }, + { + "epoch": 0.010882780298312725, + "grad_norm": 0.35230252146720886, + "learning_rate": 0.0009999982643873599, + "loss": 4.18, + "step": 367 + }, + { + "epoch": 0.010912433650624203, + "grad_norm": 0.34471285343170166, + "learning_rate": 0.0009999981382013152, + "loss": 4.1633, + "step": 368 + }, + { + "epoch": 0.010942087002935683, + "grad_norm": 0.3418842852115631, + "learning_rate": 0.000999998007587701, + "loss": 4.178, + "step": 369 + }, + { + "epoch": 0.01097174035524716, + "grad_norm": 0.3562449514865875, + "learning_rate": 0.0009999978725465188, + "loss": 4.1404, + "step": 370 + }, + { + "epoch": 0.01100139370755864, + "grad_norm": 0.3401280343532562, + "learning_rate": 0.0009999977330777694, + "loss": 4.1945, + "step": 371 + }, + { + "epoch": 0.011031047059870118, + "grad_norm": 0.3568212389945984, + "learning_rate": 0.0009999975891814546, + "loss": 4.1812, + "step": 372 + }, + { + "epoch": 0.011060700412181598, + "grad_norm": 0.3503057658672333, + "learning_rate": 0.000999997440857575, + "loss": 4.1458, + "step": 373 + }, + { + "epoch": 0.011090353764493075, + "grad_norm": 0.41381824016571045, + "learning_rate": 0.0009999972881061323, + "loss": 4.1601, + "step": 374 + }, + { + "epoch": 0.011120007116804555, + "grad_norm": 0.36365509033203125, + "learning_rate": 0.000999997130927128, + "loss": 4.1591, + "step": 375 + }, + { + "epoch": 0.011149660469116033, + "grad_norm": 0.34165236353874207, + "learning_rate": 0.000999996969320563, + "loss": 4.1513, + "step": 376 + }, + { + "epoch": 0.011179313821427513, + "grad_norm": 0.4387838840484619, + "learning_rate": 0.000999996803286439, + "loss": 4.1611, + "step": 377 + }, + { + "epoch": 0.01120896717373899, + "grad_norm": 0.4464419484138489, + "learning_rate": 0.0009999966328247578, + "loss": 4.1258, + "step": 378 + }, + { + "epoch": 0.01123862052605047, + "grad_norm": 0.41620227694511414, + "learning_rate": 0.0009999964579355202, + "loss": 4.1674, + "step": 379 + }, + { + "epoch": 0.01126827387836195, + "grad_norm": 0.4234539270401001, + "learning_rate": 0.0009999962786187285, + "loss": 4.1224, + "step": 380 + }, + { + "epoch": 0.011297927230673428, + "grad_norm": 0.5273023247718811, + "learning_rate": 0.000999996094874384, + "loss": 4.1336, + "step": 381 + }, + { + "epoch": 0.011327580582984907, + "grad_norm": 0.47751298546791077, + "learning_rate": 0.0009999959067024879, + "loss": 4.1487, + "step": 382 + }, + { + "epoch": 0.011357233935296385, + "grad_norm": 0.38251346349716187, + "learning_rate": 0.0009999957141030422, + "loss": 4.169, + "step": 383 + }, + { + "epoch": 0.011386887287607865, + "grad_norm": 0.3440977931022644, + "learning_rate": 0.000999995517076049, + "loss": 4.1311, + "step": 384 + }, + { + "epoch": 0.011416540639919343, + "grad_norm": 0.36738553643226624, + "learning_rate": 0.0009999953156215094, + "loss": 4.1168, + "step": 385 + }, + { + "epoch": 0.011446193992230822, + "grad_norm": 0.36051100492477417, + "learning_rate": 0.0009999951097394255, + "loss": 4.1002, + "step": 386 + }, + { + "epoch": 0.0114758473445423, + "grad_norm": 0.38050055503845215, + "learning_rate": 0.0009999948994297992, + "loss": 4.1392, + "step": 387 + }, + { + "epoch": 0.01150550069685378, + "grad_norm": 0.37160706520080566, + "learning_rate": 0.000999994684692632, + "loss": 4.1316, + "step": 388 + }, + { + "epoch": 0.011535154049165258, + "grad_norm": 0.365360289812088, + "learning_rate": 0.0009999944655279262, + "loss": 4.1077, + "step": 389 + }, + { + "epoch": 0.011564807401476737, + "grad_norm": 0.33757781982421875, + "learning_rate": 0.0009999942419356837, + "loss": 4.1054, + "step": 390 + }, + { + "epoch": 0.011594460753788215, + "grad_norm": 0.3202689290046692, + "learning_rate": 0.0009999940139159061, + "loss": 4.0711, + "step": 391 + }, + { + "epoch": 0.011624114106099695, + "grad_norm": 0.26455968618392944, + "learning_rate": 0.0009999937814685958, + "loss": 4.0674, + "step": 392 + }, + { + "epoch": 0.011653767458411174, + "grad_norm": 0.28427112102508545, + "learning_rate": 0.0009999935445937549, + "loss": 4.0551, + "step": 393 + }, + { + "epoch": 0.011683420810722652, + "grad_norm": 0.27101826667785645, + "learning_rate": 0.000999993303291385, + "loss": 4.0827, + "step": 394 + }, + { + "epoch": 0.011713074163034132, + "grad_norm": 0.2638905942440033, + "learning_rate": 0.000999993057561489, + "loss": 4.0318, + "step": 395 + }, + { + "epoch": 0.01174272751534561, + "grad_norm": 0.21026727557182312, + "learning_rate": 0.0009999928074040682, + "loss": 4.0466, + "step": 396 + }, + { + "epoch": 0.01177238086765709, + "grad_norm": 0.23528684675693512, + "learning_rate": 0.0009999925528191257, + "loss": 4.0254, + "step": 397 + }, + { + "epoch": 0.011802034219968567, + "grad_norm": 0.27079248428344727, + "learning_rate": 0.000999992293806663, + "loss": 4.0528, + "step": 398 + }, + { + "epoch": 0.011831687572280047, + "grad_norm": 0.29700976610183716, + "learning_rate": 0.000999992030366683, + "loss": 4.0416, + "step": 399 + }, + { + "epoch": 0.011861340924591525, + "grad_norm": 0.3121340870857239, + "learning_rate": 0.0009999917624991875, + "loss": 4.0483, + "step": 400 + }, + { + "epoch": 0.011890994276903004, + "grad_norm": 0.2996945083141327, + "learning_rate": 0.0009999914902041793, + "loss": 4.0257, + "step": 401 + }, + { + "epoch": 0.011920647629214482, + "grad_norm": 0.31001463532447815, + "learning_rate": 0.0009999912134816605, + "loss": 4.0236, + "step": 402 + }, + { + "epoch": 0.011950300981525962, + "grad_norm": 0.30074065923690796, + "learning_rate": 0.0009999909323316336, + "loss": 4.0796, + "step": 403 + }, + { + "epoch": 0.01197995433383744, + "grad_norm": 0.31781014800071716, + "learning_rate": 0.0009999906467541013, + "loss": 4.0643, + "step": 404 + }, + { + "epoch": 0.01200960768614892, + "grad_norm": 0.35011255741119385, + "learning_rate": 0.000999990356749066, + "loss": 4.0639, + "step": 405 + }, + { + "epoch": 0.012039261038460399, + "grad_norm": 0.37993836402893066, + "learning_rate": 0.0009999900623165303, + "loss": 4.0458, + "step": 406 + }, + { + "epoch": 0.012068914390771877, + "grad_norm": 0.3963221311569214, + "learning_rate": 0.0009999897634564968, + "loss": 4.064, + "step": 407 + }, + { + "epoch": 0.012098567743083356, + "grad_norm": 0.3778785169124603, + "learning_rate": 0.000999989460168968, + "loss": 4.0277, + "step": 408 + }, + { + "epoch": 0.012128221095394834, + "grad_norm": 0.3316337466239929, + "learning_rate": 0.0009999891524539468, + "loss": 4.0408, + "step": 409 + }, + { + "epoch": 0.012157874447706314, + "grad_norm": 0.31473711133003235, + "learning_rate": 0.000999988840311436, + "loss": 4.0571, + "step": 410 + }, + { + "epoch": 0.012187527800017792, + "grad_norm": 0.28396672010421753, + "learning_rate": 0.0009999885237414379, + "loss": 4.0387, + "step": 411 + }, + { + "epoch": 0.012217181152329271, + "grad_norm": 0.25462257862091064, + "learning_rate": 0.0009999882027439556, + "loss": 4.0354, + "step": 412 + }, + { + "epoch": 0.01224683450464075, + "grad_norm": 0.3061962127685547, + "learning_rate": 0.0009999878773189921, + "loss": 3.9781, + "step": 413 + }, + { + "epoch": 0.012276487856952229, + "grad_norm": 0.3174237608909607, + "learning_rate": 0.00099998754746655, + "loss": 4.0185, + "step": 414 + }, + { + "epoch": 0.012306141209263707, + "grad_norm": 0.29349493980407715, + "learning_rate": 0.0009999872131866323, + "loss": 4.0186, + "step": 415 + }, + { + "epoch": 0.012335794561575186, + "grad_norm": 0.3176197409629822, + "learning_rate": 0.0009999868744792423, + "loss": 4.0594, + "step": 416 + }, + { + "epoch": 0.012365447913886664, + "grad_norm": 0.31254521012306213, + "learning_rate": 0.0009999865313443826, + "loss": 4.0126, + "step": 417 + }, + { + "epoch": 0.012395101266198144, + "grad_norm": 0.3208905756473541, + "learning_rate": 0.000999986183782056, + "loss": 4.0075, + "step": 418 + }, + { + "epoch": 0.012424754618509622, + "grad_norm": 0.32246077060699463, + "learning_rate": 0.0009999858317922663, + "loss": 4.0294, + "step": 419 + }, + { + "epoch": 0.012454407970821101, + "grad_norm": 0.3044556677341461, + "learning_rate": 0.0009999854753750162, + "loss": 4.0231, + "step": 420 + }, + { + "epoch": 0.012484061323132581, + "grad_norm": 0.2835943102836609, + "learning_rate": 0.0009999851145303087, + "loss": 3.9923, + "step": 421 + }, + { + "epoch": 0.012513714675444059, + "grad_norm": 0.301717072725296, + "learning_rate": 0.0009999847492581475, + "loss": 3.9821, + "step": 422 + }, + { + "epoch": 0.012543368027755538, + "grad_norm": 0.31555911898612976, + "learning_rate": 0.0009999843795585352, + "loss": 3.9895, + "step": 423 + }, + { + "epoch": 0.012573021380067016, + "grad_norm": 0.38951554894447327, + "learning_rate": 0.0009999840054314756, + "loss": 3.9867, + "step": 424 + }, + { + "epoch": 0.012602674732378496, + "grad_norm": 0.3297612965106964, + "learning_rate": 0.0009999836268769719, + "loss": 3.981, + "step": 425 + }, + { + "epoch": 0.012632328084689974, + "grad_norm": 0.33894607424736023, + "learning_rate": 0.000999983243895027, + "loss": 4.0174, + "step": 426 + }, + { + "epoch": 0.012661981437001453, + "grad_norm": 0.31165191531181335, + "learning_rate": 0.000999982856485645, + "loss": 3.9761, + "step": 427 + }, + { + "epoch": 0.012691634789312931, + "grad_norm": 0.32579144835472107, + "learning_rate": 0.0009999824646488287, + "loss": 3.9964, + "step": 428 + }, + { + "epoch": 0.01272128814162441, + "grad_norm": 0.4051938056945801, + "learning_rate": 0.000999982068384582, + "loss": 3.965, + "step": 429 + }, + { + "epoch": 0.012750941493935889, + "grad_norm": 0.40017515420913696, + "learning_rate": 0.0009999816676929084, + "loss": 3.9926, + "step": 430 + }, + { + "epoch": 0.012780594846247368, + "grad_norm": 0.299189031124115, + "learning_rate": 0.000999981262573811, + "loss": 3.9842, + "step": 431 + }, + { + "epoch": 0.012810248198558846, + "grad_norm": 0.37934762239456177, + "learning_rate": 0.0009999808530272941, + "loss": 4.0111, + "step": 432 + }, + { + "epoch": 0.012839901550870326, + "grad_norm": 0.33244410157203674, + "learning_rate": 0.0009999804390533606, + "loss": 3.9671, + "step": 433 + }, + { + "epoch": 0.012869554903181805, + "grad_norm": 0.31052860617637634, + "learning_rate": 0.0009999800206520145, + "loss": 4.0174, + "step": 434 + }, + { + "epoch": 0.012899208255493283, + "grad_norm": 0.3069404363632202, + "learning_rate": 0.0009999795978232597, + "loss": 3.9777, + "step": 435 + }, + { + "epoch": 0.012928861607804763, + "grad_norm": 0.35886797308921814, + "learning_rate": 0.0009999791705670995, + "loss": 3.9677, + "step": 436 + }, + { + "epoch": 0.01295851496011624, + "grad_norm": 0.35406163334846497, + "learning_rate": 0.0009999787388835382, + "loss": 3.9747, + "step": 437 + }, + { + "epoch": 0.01298816831242772, + "grad_norm": 0.3420802056789398, + "learning_rate": 0.000999978302772579, + "loss": 3.9788, + "step": 438 + }, + { + "epoch": 0.013017821664739198, + "grad_norm": 0.32319486141204834, + "learning_rate": 0.0009999778622342263, + "loss": 3.9542, + "step": 439 + }, + { + "epoch": 0.013047475017050678, + "grad_norm": 0.2600070536136627, + "learning_rate": 0.0009999774172684839, + "loss": 3.963, + "step": 440 + }, + { + "epoch": 0.013077128369362156, + "grad_norm": 0.24246595799922943, + "learning_rate": 0.0009999769678753557, + "loss": 3.9562, + "step": 441 + }, + { + "epoch": 0.013106781721673635, + "grad_norm": 0.25597333908081055, + "learning_rate": 0.0009999765140548454, + "loss": 3.9321, + "step": 442 + }, + { + "epoch": 0.013136435073985113, + "grad_norm": 0.23100632429122925, + "learning_rate": 0.0009999760558069572, + "loss": 3.9333, + "step": 443 + }, + { + "epoch": 0.013166088426296593, + "grad_norm": 0.24366579949855804, + "learning_rate": 0.0009999755931316954, + "loss": 3.9256, + "step": 444 + }, + { + "epoch": 0.01319574177860807, + "grad_norm": 0.21958202123641968, + "learning_rate": 0.0009999751260290639, + "loss": 3.9473, + "step": 445 + }, + { + "epoch": 0.01322539513091955, + "grad_norm": 0.25764936208724976, + "learning_rate": 0.000999974654499067, + "loss": 3.9212, + "step": 446 + }, + { + "epoch": 0.01325504848323103, + "grad_norm": 0.296039879322052, + "learning_rate": 0.0009999741785417084, + "loss": 3.9282, + "step": 447 + }, + { + "epoch": 0.013284701835542508, + "grad_norm": 0.2541691064834595, + "learning_rate": 0.0009999736981569926, + "loss": 3.9236, + "step": 448 + }, + { + "epoch": 0.013314355187853988, + "grad_norm": 0.3705298602581024, + "learning_rate": 0.0009999732133449241, + "loss": 3.9275, + "step": 449 + }, + { + "epoch": 0.013344008540165465, + "grad_norm": 0.34862202405929565, + "learning_rate": 0.000999972724105507, + "loss": 3.92, + "step": 450 + }, + { + "epoch": 0.013373661892476945, + "grad_norm": 0.2771366834640503, + "learning_rate": 0.0009999722304387456, + "loss": 3.9106, + "step": 451 + }, + { + "epoch": 0.013403315244788423, + "grad_norm": 0.31935548782348633, + "learning_rate": 0.000999971732344644, + "loss": 3.9271, + "step": 452 + }, + { + "epoch": 0.013432968597099902, + "grad_norm": 0.25303077697753906, + "learning_rate": 0.000999971229823207, + "loss": 3.9153, + "step": 453 + }, + { + "epoch": 0.01346262194941138, + "grad_norm": 0.2576387822628021, + "learning_rate": 0.0009999707228744391, + "loss": 3.8849, + "step": 454 + }, + { + "epoch": 0.01349227530172286, + "grad_norm": 0.25466468930244446, + "learning_rate": 0.0009999702114983446, + "loss": 3.9155, + "step": 455 + }, + { + "epoch": 0.013521928654034338, + "grad_norm": 0.2547924220561981, + "learning_rate": 0.000999969695694928, + "loss": 3.8711, + "step": 456 + }, + { + "epoch": 0.013551582006345817, + "grad_norm": 0.2918775677680969, + "learning_rate": 0.000999969175464194, + "loss": 3.9139, + "step": 457 + }, + { + "epoch": 0.013581235358657295, + "grad_norm": 0.31676024198532104, + "learning_rate": 0.000999968650806147, + "loss": 3.8724, + "step": 458 + }, + { + "epoch": 0.013610888710968775, + "grad_norm": 0.3002021610736847, + "learning_rate": 0.0009999681217207918, + "loss": 3.9146, + "step": 459 + }, + { + "epoch": 0.013640542063280255, + "grad_norm": 0.23750793933868408, + "learning_rate": 0.000999967588208133, + "loss": 3.9334, + "step": 460 + }, + { + "epoch": 0.013670195415591732, + "grad_norm": 0.2635066509246826, + "learning_rate": 0.0009999670502681757, + "loss": 3.8586, + "step": 461 + }, + { + "epoch": 0.013699848767903212, + "grad_norm": 0.21599382162094116, + "learning_rate": 0.0009999665079009241, + "loss": 3.8995, + "step": 462 + }, + { + "epoch": 0.01372950212021469, + "grad_norm": 0.24119694530963898, + "learning_rate": 0.0009999659611063833, + "loss": 3.9062, + "step": 463 + }, + { + "epoch": 0.01375915547252617, + "grad_norm": 0.2596946656703949, + "learning_rate": 0.0009999654098845582, + "loss": 3.8905, + "step": 464 + }, + { + "epoch": 0.013788808824837647, + "grad_norm": 0.2807733714580536, + "learning_rate": 0.0009999648542354533, + "loss": 3.8776, + "step": 465 + }, + { + "epoch": 0.013818462177149127, + "grad_norm": 0.23506085574626923, + "learning_rate": 0.0009999642941590742, + "loss": 3.8833, + "step": 466 + }, + { + "epoch": 0.013848115529460605, + "grad_norm": 0.21226254105567932, + "learning_rate": 0.0009999637296554253, + "loss": 3.8985, + "step": 467 + }, + { + "epoch": 0.013877768881772085, + "grad_norm": 0.21485280990600586, + "learning_rate": 0.0009999631607245114, + "loss": 3.8956, + "step": 468 + }, + { + "epoch": 0.013907422234083562, + "grad_norm": 0.278178334236145, + "learning_rate": 0.0009999625873663384, + "loss": 3.8883, + "step": 469 + }, + { + "epoch": 0.013937075586395042, + "grad_norm": 0.2827528715133667, + "learning_rate": 0.0009999620095809106, + "loss": 3.9032, + "step": 470 + }, + { + "epoch": 0.01396672893870652, + "grad_norm": 0.2536941468715668, + "learning_rate": 0.0009999614273682334, + "loss": 3.8481, + "step": 471 + }, + { + "epoch": 0.013996382291018, + "grad_norm": 0.31736648082733154, + "learning_rate": 0.000999960840728312, + "loss": 3.848, + "step": 472 + }, + { + "epoch": 0.01402603564332948, + "grad_norm": 0.25291433930397034, + "learning_rate": 0.0009999602496611516, + "loss": 3.8557, + "step": 473 + }, + { + "epoch": 0.014055688995640957, + "grad_norm": 0.3029123842716217, + "learning_rate": 0.0009999596541667574, + "loss": 3.8414, + "step": 474 + }, + { + "epoch": 0.014085342347952437, + "grad_norm": 0.3427191376686096, + "learning_rate": 0.0009999590542451343, + "loss": 3.882, + "step": 475 + }, + { + "epoch": 0.014114995700263915, + "grad_norm": 0.27078092098236084, + "learning_rate": 0.0009999584498962882, + "loss": 3.8636, + "step": 476 + }, + { + "epoch": 0.014144649052575394, + "grad_norm": 0.27660036087036133, + "learning_rate": 0.0009999578411202244, + "loss": 3.8324, + "step": 477 + }, + { + "epoch": 0.014174302404886872, + "grad_norm": 0.2870725393295288, + "learning_rate": 0.0009999572279169478, + "loss": 3.8518, + "step": 478 + }, + { + "epoch": 0.014203955757198352, + "grad_norm": 0.34520018100738525, + "learning_rate": 0.0009999566102864641, + "loss": 3.8526, + "step": 479 + }, + { + "epoch": 0.01423360910950983, + "grad_norm": 0.3587929904460907, + "learning_rate": 0.000999955988228779, + "loss": 3.9028, + "step": 480 + }, + { + "epoch": 0.01426326246182131, + "grad_norm": 0.3173093795776367, + "learning_rate": 0.0009999553617438977, + "loss": 3.8466, + "step": 481 + }, + { + "epoch": 0.014292915814132787, + "grad_norm": 0.31210431456565857, + "learning_rate": 0.000999954730831826, + "loss": 3.8653, + "step": 482 + }, + { + "epoch": 0.014322569166444267, + "grad_norm": 0.33784714341163635, + "learning_rate": 0.0009999540954925693, + "loss": 3.8595, + "step": 483 + }, + { + "epoch": 0.014352222518755745, + "grad_norm": 0.21881985664367676, + "learning_rate": 0.000999953455726133, + "loss": 3.8352, + "step": 484 + }, + { + "epoch": 0.014381875871067224, + "grad_norm": 0.22868354618549347, + "learning_rate": 0.0009999528115325234, + "loss": 3.8434, + "step": 485 + }, + { + "epoch": 0.014411529223378704, + "grad_norm": 0.21475939452648163, + "learning_rate": 0.0009999521629117456, + "loss": 3.8432, + "step": 486 + }, + { + "epoch": 0.014441182575690182, + "grad_norm": 0.23613600432872772, + "learning_rate": 0.0009999515098638057, + "loss": 3.8506, + "step": 487 + }, + { + "epoch": 0.014470835928001661, + "grad_norm": 0.24045728147029877, + "learning_rate": 0.0009999508523887094, + "loss": 3.8397, + "step": 488 + }, + { + "epoch": 0.014500489280313139, + "grad_norm": 0.21935909986495972, + "learning_rate": 0.0009999501904864624, + "loss": 3.8402, + "step": 489 + }, + { + "epoch": 0.014530142632624619, + "grad_norm": 0.2790547013282776, + "learning_rate": 0.0009999495241570706, + "loss": 3.8334, + "step": 490 + }, + { + "epoch": 0.014559795984936097, + "grad_norm": 0.28960883617401123, + "learning_rate": 0.0009999488534005402, + "loss": 3.8311, + "step": 491 + }, + { + "epoch": 0.014589449337247576, + "grad_norm": 0.2536050081253052, + "learning_rate": 0.0009999481782168767, + "loss": 3.829, + "step": 492 + }, + { + "epoch": 0.014619102689559054, + "grad_norm": 0.21416603028774261, + "learning_rate": 0.000999947498606086, + "loss": 3.816, + "step": 493 + }, + { + "epoch": 0.014648756041870534, + "grad_norm": 0.24665690958499908, + "learning_rate": 0.0009999468145681749, + "loss": 3.8107, + "step": 494 + }, + { + "epoch": 0.014678409394182012, + "grad_norm": 0.2142617255449295, + "learning_rate": 0.0009999461261031486, + "loss": 3.8416, + "step": 495 + }, + { + "epoch": 0.014708062746493491, + "grad_norm": 0.1745952069759369, + "learning_rate": 0.0009999454332110136, + "loss": 3.8265, + "step": 496 + }, + { + "epoch": 0.014737716098804969, + "grad_norm": 0.17090438306331635, + "learning_rate": 0.0009999447358917761, + "loss": 3.7739, + "step": 497 + }, + { + "epoch": 0.014767369451116449, + "grad_norm": 0.2062888890504837, + "learning_rate": 0.000999944034145442, + "loss": 3.7873, + "step": 498 + }, + { + "epoch": 0.014797022803427928, + "grad_norm": 0.20627492666244507, + "learning_rate": 0.000999943327972018, + "loss": 3.8265, + "step": 499 + }, + { + "epoch": 0.014826676155739406, + "grad_norm": 0.20979200303554535, + "learning_rate": 0.0009999426173715096, + "loss": 3.837, + "step": 500 + }, + { + "epoch": 0.014856329508050886, + "grad_norm": 0.2242320030927658, + "learning_rate": 0.0009999419023439236, + "loss": 3.8178, + "step": 501 + }, + { + "epoch": 0.014885982860362364, + "grad_norm": 0.274331659078598, + "learning_rate": 0.000999941182889266, + "loss": 3.8214, + "step": 502 + }, + { + "epoch": 0.014915636212673843, + "grad_norm": 0.298446387052536, + "learning_rate": 0.000999940459007544, + "loss": 3.8111, + "step": 503 + }, + { + "epoch": 0.014945289564985321, + "grad_norm": 0.22323623299598694, + "learning_rate": 0.000999939730698763, + "loss": 3.7967, + "step": 504 + }, + { + "epoch": 0.0149749429172968, + "grad_norm": 0.27514633536338806, + "learning_rate": 0.00099993899796293, + "loss": 3.7959, + "step": 505 + }, + { + "epoch": 0.015004596269608279, + "grad_norm": 0.27645307779312134, + "learning_rate": 0.0009999382608000514, + "loss": 3.7846, + "step": 506 + }, + { + "epoch": 0.015034249621919758, + "grad_norm": 0.2688368260860443, + "learning_rate": 0.0009999375192101337, + "loss": 3.8324, + "step": 507 + }, + { + "epoch": 0.015063902974231236, + "grad_norm": 0.3251209557056427, + "learning_rate": 0.0009999367731931834, + "loss": 3.8128, + "step": 508 + }, + { + "epoch": 0.015093556326542716, + "grad_norm": 0.305534690618515, + "learning_rate": 0.0009999360227492071, + "loss": 3.8111, + "step": 509 + }, + { + "epoch": 0.015123209678854194, + "grad_norm": 0.20553569495677948, + "learning_rate": 0.0009999352678782116, + "loss": 3.83, + "step": 510 + }, + { + "epoch": 0.015152863031165673, + "grad_norm": 0.2286054939031601, + "learning_rate": 0.0009999345085802034, + "loss": 3.7681, + "step": 511 + }, + { + "epoch": 0.015182516383477153, + "grad_norm": 0.21042442321777344, + "learning_rate": 0.0009999337448551894, + "loss": 3.798, + "step": 512 + }, + { + "epoch": 0.01521216973578863, + "grad_norm": 0.2589910328388214, + "learning_rate": 0.0009999329767031763, + "loss": 3.812, + "step": 513 + }, + { + "epoch": 0.01524182308810011, + "grad_norm": 0.3070850372314453, + "learning_rate": 0.0009999322041241707, + "loss": 3.7841, + "step": 514 + }, + { + "epoch": 0.015271476440411588, + "grad_norm": 0.27719423174858093, + "learning_rate": 0.00099993142711818, + "loss": 3.769, + "step": 515 + }, + { + "epoch": 0.015301129792723068, + "grad_norm": 0.2615205943584442, + "learning_rate": 0.0009999306456852102, + "loss": 3.7893, + "step": 516 + }, + { + "epoch": 0.015330783145034546, + "grad_norm": 0.25093749165534973, + "learning_rate": 0.000999929859825269, + "loss": 3.7383, + "step": 517 + }, + { + "epoch": 0.015360436497346025, + "grad_norm": 0.17652225494384766, + "learning_rate": 0.000999929069538363, + "loss": 3.7825, + "step": 518 + }, + { + "epoch": 0.015390089849657503, + "grad_norm": 0.22382497787475586, + "learning_rate": 0.0009999282748244993, + "loss": 3.7529, + "step": 519 + }, + { + "epoch": 0.015419743201968983, + "grad_norm": 0.18385636806488037, + "learning_rate": 0.000999927475683685, + "loss": 3.7585, + "step": 520 + }, + { + "epoch": 0.01544939655428046, + "grad_norm": 0.199680358171463, + "learning_rate": 0.000999926672115927, + "loss": 3.7602, + "step": 521 + }, + { + "epoch": 0.01547904990659194, + "grad_norm": 0.20612305402755737, + "learning_rate": 0.0009999258641212325, + "loss": 3.7803, + "step": 522 + }, + { + "epoch": 0.015508703258903418, + "grad_norm": 0.19767141342163086, + "learning_rate": 0.0009999250516996088, + "loss": 3.7672, + "step": 523 + }, + { + "epoch": 0.015538356611214898, + "grad_norm": 0.20173102617263794, + "learning_rate": 0.0009999242348510628, + "loss": 3.7449, + "step": 524 + }, + { + "epoch": 0.015568009963526377, + "grad_norm": 0.2516862154006958, + "learning_rate": 0.000999923413575602, + "loss": 3.7521, + "step": 525 + }, + { + "epoch": 0.015597663315837855, + "grad_norm": 0.2902933657169342, + "learning_rate": 0.0009999225878732335, + "loss": 3.8085, + "step": 526 + }, + { + "epoch": 0.015627316668149333, + "grad_norm": 0.29449960589408875, + "learning_rate": 0.0009999217577439645, + "loss": 3.7735, + "step": 527 + }, + { + "epoch": 0.015656970020460813, + "grad_norm": 0.29081204533576965, + "learning_rate": 0.0009999209231878027, + "loss": 3.7784, + "step": 528 + }, + { + "epoch": 0.015686623372772292, + "grad_norm": 0.22114267945289612, + "learning_rate": 0.0009999200842047554, + "loss": 3.7821, + "step": 529 + }, + { + "epoch": 0.015716276725083772, + "grad_norm": 0.24999374151229858, + "learning_rate": 0.00099991924079483, + "loss": 3.7224, + "step": 530 + }, + { + "epoch": 0.015745930077395248, + "grad_norm": 0.2441588044166565, + "learning_rate": 0.0009999183929580338, + "loss": 3.7604, + "step": 531 + }, + { + "epoch": 0.015775583429706728, + "grad_norm": 0.21299861371517181, + "learning_rate": 0.0009999175406943744, + "loss": 3.7713, + "step": 532 + }, + { + "epoch": 0.015805236782018207, + "grad_norm": 0.21011117100715637, + "learning_rate": 0.0009999166840038592, + "loss": 3.7672, + "step": 533 + }, + { + "epoch": 0.015834890134329687, + "grad_norm": 0.220332071185112, + "learning_rate": 0.0009999158228864962, + "loss": 3.7383, + "step": 534 + }, + { + "epoch": 0.015864543486641163, + "grad_norm": 0.2506074905395508, + "learning_rate": 0.0009999149573422926, + "loss": 3.7247, + "step": 535 + }, + { + "epoch": 0.015894196838952643, + "grad_norm": 0.23833119869232178, + "learning_rate": 0.0009999140873712565, + "loss": 3.7508, + "step": 536 + }, + { + "epoch": 0.015923850191264122, + "grad_norm": 0.22063113749027252, + "learning_rate": 0.000999913212973395, + "loss": 3.7524, + "step": 537 + }, + { + "epoch": 0.015953503543575602, + "grad_norm": 0.20277726650238037, + "learning_rate": 0.0009999123341487164, + "loss": 3.767, + "step": 538 + }, + { + "epoch": 0.01598315689588708, + "grad_norm": 0.22834521532058716, + "learning_rate": 0.0009999114508972282, + "loss": 3.7439, + "step": 539 + }, + { + "epoch": 0.016012810248198558, + "grad_norm": 0.21704134345054626, + "learning_rate": 0.0009999105632189384, + "loss": 3.7316, + "step": 540 + }, + { + "epoch": 0.016042463600510037, + "grad_norm": 0.2102942168712616, + "learning_rate": 0.0009999096711138546, + "loss": 3.7447, + "step": 541 + }, + { + "epoch": 0.016072116952821517, + "grad_norm": 0.19012093544006348, + "learning_rate": 0.000999908774581985, + "loss": 3.7031, + "step": 542 + }, + { + "epoch": 0.016101770305132997, + "grad_norm": 0.22729933261871338, + "learning_rate": 0.0009999078736233373, + "loss": 3.7374, + "step": 543 + }, + { + "epoch": 0.016131423657444473, + "grad_norm": 0.22881406545639038, + "learning_rate": 0.0009999069682379198, + "loss": 3.7206, + "step": 544 + }, + { + "epoch": 0.016161077009755952, + "grad_norm": 0.19213604927062988, + "learning_rate": 0.00099990605842574, + "loss": 3.6999, + "step": 545 + }, + { + "epoch": 0.016190730362067432, + "grad_norm": 0.18918301165103912, + "learning_rate": 0.0009999051441868063, + "loss": 3.7323, + "step": 546 + }, + { + "epoch": 0.01622038371437891, + "grad_norm": 0.20557555556297302, + "learning_rate": 0.0009999042255211268, + "loss": 3.7388, + "step": 547 + }, + { + "epoch": 0.016250037066690388, + "grad_norm": 0.2317773997783661, + "learning_rate": 0.0009999033024287096, + "loss": 3.7288, + "step": 548 + }, + { + "epoch": 0.016279690419001867, + "grad_norm": 0.22924141585826874, + "learning_rate": 0.0009999023749095628, + "loss": 3.7248, + "step": 549 + }, + { + "epoch": 0.016309343771313347, + "grad_norm": 0.20797361433506012, + "learning_rate": 0.000999901442963695, + "loss": 3.7053, + "step": 550 + }, + { + "epoch": 0.016338997123624827, + "grad_norm": 0.21064043045043945, + "learning_rate": 0.0009999005065911136, + "loss": 3.7134, + "step": 551 + }, + { + "epoch": 0.016368650475936306, + "grad_norm": 0.22733019292354584, + "learning_rate": 0.0009998995657918275, + "loss": 3.7423, + "step": 552 + }, + { + "epoch": 0.016398303828247782, + "grad_norm": 0.22448378801345825, + "learning_rate": 0.0009998986205658452, + "loss": 3.6996, + "step": 553 + }, + { + "epoch": 0.016427957180559262, + "grad_norm": 0.24074044823646545, + "learning_rate": 0.0009998976709131744, + "loss": 3.7305, + "step": 554 + }, + { + "epoch": 0.01645761053287074, + "grad_norm": 0.27693960070610046, + "learning_rate": 0.0009998967168338242, + "loss": 3.7341, + "step": 555 + }, + { + "epoch": 0.01648726388518222, + "grad_norm": 0.3017973005771637, + "learning_rate": 0.0009998957583278027, + "loss": 3.7455, + "step": 556 + }, + { + "epoch": 0.016516917237493697, + "grad_norm": 0.2436131238937378, + "learning_rate": 0.0009998947953951186, + "loss": 3.7514, + "step": 557 + }, + { + "epoch": 0.016546570589805177, + "grad_norm": 0.2503582835197449, + "learning_rate": 0.00099989382803578, + "loss": 3.7319, + "step": 558 + }, + { + "epoch": 0.016576223942116657, + "grad_norm": 0.23194116353988647, + "learning_rate": 0.000999892856249796, + "loss": 3.7333, + "step": 559 + }, + { + "epoch": 0.016605877294428136, + "grad_norm": 0.16133129596710205, + "learning_rate": 0.000999891880037175, + "loss": 3.7207, + "step": 560 + }, + { + "epoch": 0.016635530646739612, + "grad_norm": 0.1639849692583084, + "learning_rate": 0.0009998908993979255, + "loss": 3.7065, + "step": 561 + }, + { + "epoch": 0.016665183999051092, + "grad_norm": 0.17051786184310913, + "learning_rate": 0.0009998899143320561, + "loss": 3.6828, + "step": 562 + }, + { + "epoch": 0.01669483735136257, + "grad_norm": 0.21575185656547546, + "learning_rate": 0.0009998889248395758, + "loss": 3.7175, + "step": 563 + }, + { + "epoch": 0.01672449070367405, + "grad_norm": 0.24211153388023376, + "learning_rate": 0.0009998879309204933, + "loss": 3.7271, + "step": 564 + }, + { + "epoch": 0.01675414405598553, + "grad_norm": 0.20792734622955322, + "learning_rate": 0.0009998869325748175, + "loss": 3.7182, + "step": 565 + }, + { + "epoch": 0.016783797408297007, + "grad_norm": 0.16386137902736664, + "learning_rate": 0.0009998859298025568, + "loss": 3.6844, + "step": 566 + }, + { + "epoch": 0.016813450760608487, + "grad_norm": 0.16484054923057556, + "learning_rate": 0.0009998849226037208, + "loss": 3.6861, + "step": 567 + }, + { + "epoch": 0.016843104112919966, + "grad_norm": 0.172918438911438, + "learning_rate": 0.0009998839109783178, + "loss": 3.7174, + "step": 568 + }, + { + "epoch": 0.016872757465231446, + "grad_norm": 0.194429412484169, + "learning_rate": 0.0009998828949263567, + "loss": 3.6719, + "step": 569 + }, + { + "epoch": 0.016902410817542922, + "grad_norm": 0.23298491537570953, + "learning_rate": 0.000999881874447847, + "loss": 3.7144, + "step": 570 + }, + { + "epoch": 0.0169320641698544, + "grad_norm": 0.25031912326812744, + "learning_rate": 0.0009998808495427975, + "loss": 3.7029, + "step": 571 + }, + { + "epoch": 0.01696171752216588, + "grad_norm": 0.25793445110321045, + "learning_rate": 0.0009998798202112175, + "loss": 3.7061, + "step": 572 + }, + { + "epoch": 0.01699137087447736, + "grad_norm": 0.24031586945056915, + "learning_rate": 0.0009998787864531156, + "loss": 3.7425, + "step": 573 + }, + { + "epoch": 0.017021024226788837, + "grad_norm": 0.22607839107513428, + "learning_rate": 0.0009998777482685013, + "loss": 3.7245, + "step": 574 + }, + { + "epoch": 0.017050677579100317, + "grad_norm": 0.22475478053092957, + "learning_rate": 0.0009998767056573837, + "loss": 3.709, + "step": 575 + }, + { + "epoch": 0.017080330931411796, + "grad_norm": 0.22092778980731964, + "learning_rate": 0.0009998756586197721, + "loss": 3.7303, + "step": 576 + }, + { + "epoch": 0.017109984283723276, + "grad_norm": 0.2715263068675995, + "learning_rate": 0.0009998746071556758, + "loss": 3.6861, + "step": 577 + }, + { + "epoch": 0.017139637636034755, + "grad_norm": 0.27918022871017456, + "learning_rate": 0.000999873551265104, + "loss": 3.6992, + "step": 578 + }, + { + "epoch": 0.01716929098834623, + "grad_norm": 0.20460131764411926, + "learning_rate": 0.000999872490948066, + "loss": 3.7035, + "step": 579 + }, + { + "epoch": 0.01719894434065771, + "grad_norm": 0.18100522458553314, + "learning_rate": 0.0009998714262045715, + "loss": 3.7059, + "step": 580 + }, + { + "epoch": 0.01722859769296919, + "grad_norm": 0.17877250909805298, + "learning_rate": 0.0009998703570346297, + "loss": 3.6478, + "step": 581 + }, + { + "epoch": 0.01725825104528067, + "grad_norm": 0.15353919565677643, + "learning_rate": 0.0009998692834382499, + "loss": 3.6847, + "step": 582 + }, + { + "epoch": 0.017287904397592146, + "grad_norm": 0.1629875749349594, + "learning_rate": 0.000999868205415442, + "loss": 3.6851, + "step": 583 + }, + { + "epoch": 0.017317557749903626, + "grad_norm": 0.1989017277956009, + "learning_rate": 0.000999867122966215, + "loss": 3.6775, + "step": 584 + }, + { + "epoch": 0.017347211102215106, + "grad_norm": 0.2283223569393158, + "learning_rate": 0.0009998660360905792, + "loss": 3.6688, + "step": 585 + }, + { + "epoch": 0.017376864454526585, + "grad_norm": 0.24673070013523102, + "learning_rate": 0.0009998649447885437, + "loss": 3.6954, + "step": 586 + }, + { + "epoch": 0.01740651780683806, + "grad_norm": 0.22602105140686035, + "learning_rate": 0.0009998638490601184, + "loss": 3.6567, + "step": 587 + }, + { + "epoch": 0.01743617115914954, + "grad_norm": 0.20002318918704987, + "learning_rate": 0.0009998627489053128, + "loss": 3.6647, + "step": 588 + }, + { + "epoch": 0.01746582451146102, + "grad_norm": 0.1921076476573944, + "learning_rate": 0.000999861644324137, + "loss": 3.6502, + "step": 589 + }, + { + "epoch": 0.0174954778637725, + "grad_norm": 0.19527462124824524, + "learning_rate": 0.0009998605353166, + "loss": 3.6907, + "step": 590 + }, + { + "epoch": 0.01752513121608398, + "grad_norm": 0.14299041032791138, + "learning_rate": 0.0009998594218827127, + "loss": 3.6657, + "step": 591 + }, + { + "epoch": 0.017554784568395456, + "grad_norm": 0.16619686782360077, + "learning_rate": 0.0009998583040224842, + "loss": 3.6264, + "step": 592 + }, + { + "epoch": 0.017584437920706936, + "grad_norm": 0.178636372089386, + "learning_rate": 0.0009998571817359245, + "loss": 3.6625, + "step": 593 + }, + { + "epoch": 0.017614091273018415, + "grad_norm": 0.1931566298007965, + "learning_rate": 0.0009998560550230438, + "loss": 3.6568, + "step": 594 + }, + { + "epoch": 0.017643744625329895, + "grad_norm": 0.2266320288181305, + "learning_rate": 0.000999854923883852, + "loss": 3.6364, + "step": 595 + }, + { + "epoch": 0.01767339797764137, + "grad_norm": 0.2373763471841812, + "learning_rate": 0.000999853788318359, + "loss": 3.6809, + "step": 596 + }, + { + "epoch": 0.01770305132995285, + "grad_norm": 0.20357264578342438, + "learning_rate": 0.0009998526483265748, + "loss": 3.6357, + "step": 597 + }, + { + "epoch": 0.01773270468226433, + "grad_norm": 0.20505361258983612, + "learning_rate": 0.0009998515039085097, + "loss": 3.6763, + "step": 598 + }, + { + "epoch": 0.01776235803457581, + "grad_norm": 0.17590385675430298, + "learning_rate": 0.0009998503550641739, + "loss": 3.6512, + "step": 599 + }, + { + "epoch": 0.017792011386887286, + "grad_norm": 0.20785042643547058, + "learning_rate": 0.000999849201793577, + "loss": 3.6673, + "step": 600 + }, + { + "epoch": 0.017821664739198766, + "grad_norm": 0.2230292409658432, + "learning_rate": 0.00099984804409673, + "loss": 3.6259, + "step": 601 + }, + { + "epoch": 0.017851318091510245, + "grad_norm": 0.17691822350025177, + "learning_rate": 0.0009998468819736425, + "loss": 3.6541, + "step": 602 + }, + { + "epoch": 0.017880971443821725, + "grad_norm": 0.2072225660085678, + "learning_rate": 0.0009998457154243253, + "loss": 3.68, + "step": 603 + }, + { + "epoch": 0.017910624796133204, + "grad_norm": 0.24672237038612366, + "learning_rate": 0.0009998445444487883, + "loss": 3.6784, + "step": 604 + }, + { + "epoch": 0.01794027814844468, + "grad_norm": 0.19877678155899048, + "learning_rate": 0.000999843369047042, + "loss": 3.6585, + "step": 605 + }, + { + "epoch": 0.01796993150075616, + "grad_norm": 0.17338234186172485, + "learning_rate": 0.0009998421892190971, + "loss": 3.6389, + "step": 606 + }, + { + "epoch": 0.01799958485306764, + "grad_norm": 0.21789860725402832, + "learning_rate": 0.0009998410049649638, + "loss": 3.6841, + "step": 607 + }, + { + "epoch": 0.01802923820537912, + "grad_norm": 0.21810981631278992, + "learning_rate": 0.0009998398162846525, + "loss": 3.6799, + "step": 608 + }, + { + "epoch": 0.018058891557690596, + "grad_norm": 0.26651737093925476, + "learning_rate": 0.0009998386231781738, + "loss": 3.6447, + "step": 609 + }, + { + "epoch": 0.018088544910002075, + "grad_norm": 0.2975062131881714, + "learning_rate": 0.0009998374256455383, + "loss": 3.6819, + "step": 610 + }, + { + "epoch": 0.018118198262313555, + "grad_norm": 0.28345635533332825, + "learning_rate": 0.0009998362236867567, + "loss": 3.6465, + "step": 611 + }, + { + "epoch": 0.018147851614625034, + "grad_norm": 0.28810352087020874, + "learning_rate": 0.0009998350173018393, + "loss": 3.6395, + "step": 612 + }, + { + "epoch": 0.01817750496693651, + "grad_norm": 0.2776890993118286, + "learning_rate": 0.0009998338064907974, + "loss": 3.6437, + "step": 613 + }, + { + "epoch": 0.01820715831924799, + "grad_norm": 0.22165432572364807, + "learning_rate": 0.0009998325912536413, + "loss": 3.6329, + "step": 614 + }, + { + "epoch": 0.01823681167155947, + "grad_norm": 0.16228803992271423, + "learning_rate": 0.0009998313715903816, + "loss": 3.6622, + "step": 615 + }, + { + "epoch": 0.01826646502387095, + "grad_norm": 0.16328348219394684, + "learning_rate": 0.0009998301475010293, + "loss": 3.6452, + "step": 616 + }, + { + "epoch": 0.01829611837618243, + "grad_norm": 0.19883689284324646, + "learning_rate": 0.0009998289189855954, + "loss": 3.6217, + "step": 617 + }, + { + "epoch": 0.018325771728493905, + "grad_norm": 0.1599595844745636, + "learning_rate": 0.0009998276860440905, + "loss": 3.652, + "step": 618 + }, + { + "epoch": 0.018355425080805385, + "grad_norm": 0.15226277709007263, + "learning_rate": 0.0009998264486765257, + "loss": 3.6568, + "step": 619 + }, + { + "epoch": 0.018385078433116864, + "grad_norm": 0.14889279007911682, + "learning_rate": 0.0009998252068829118, + "loss": 3.6468, + "step": 620 + }, + { + "epoch": 0.018414731785428344, + "grad_norm": 0.14513222873210907, + "learning_rate": 0.0009998239606632603, + "loss": 3.6272, + "step": 621 + }, + { + "epoch": 0.01844438513773982, + "grad_norm": 0.15401995182037354, + "learning_rate": 0.0009998227100175814, + "loss": 3.6306, + "step": 622 + }, + { + "epoch": 0.0184740384900513, + "grad_norm": 0.1511073261499405, + "learning_rate": 0.0009998214549458869, + "loss": 3.6088, + "step": 623 + }, + { + "epoch": 0.01850369184236278, + "grad_norm": 0.1374017894268036, + "learning_rate": 0.0009998201954481874, + "loss": 3.6062, + "step": 624 + }, + { + "epoch": 0.01853334519467426, + "grad_norm": 0.13538800179958344, + "learning_rate": 0.0009998189315244942, + "loss": 3.635, + "step": 625 + }, + { + "epoch": 0.018562998546985735, + "grad_norm": 0.1560511589050293, + "learning_rate": 0.0009998176631748187, + "loss": 3.6396, + "step": 626 + }, + { + "epoch": 0.018592651899297215, + "grad_norm": 0.17474891245365143, + "learning_rate": 0.0009998163903991721, + "loss": 3.6274, + "step": 627 + }, + { + "epoch": 0.018622305251608694, + "grad_norm": 0.183576762676239, + "learning_rate": 0.0009998151131975655, + "loss": 3.6104, + "step": 628 + }, + { + "epoch": 0.018651958603920174, + "grad_norm": 0.19350311160087585, + "learning_rate": 0.0009998138315700103, + "loss": 3.6177, + "step": 629 + }, + { + "epoch": 0.018681611956231654, + "grad_norm": 0.22118298709392548, + "learning_rate": 0.000999812545516518, + "loss": 3.6073, + "step": 630 + }, + { + "epoch": 0.01871126530854313, + "grad_norm": 0.245102658867836, + "learning_rate": 0.0009998112550370995, + "loss": 3.6203, + "step": 631 + }, + { + "epoch": 0.01874091866085461, + "grad_norm": 0.22878398001194, + "learning_rate": 0.0009998099601317666, + "loss": 3.6381, + "step": 632 + }, + { + "epoch": 0.01877057201316609, + "grad_norm": 0.22269171476364136, + "learning_rate": 0.0009998086608005309, + "loss": 3.6219, + "step": 633 + }, + { + "epoch": 0.01880022536547757, + "grad_norm": 0.21724428236484528, + "learning_rate": 0.0009998073570434034, + "loss": 3.6425, + "step": 634 + }, + { + "epoch": 0.018829878717789045, + "grad_norm": 0.20017968118190765, + "learning_rate": 0.0009998060488603962, + "loss": 3.6331, + "step": 635 + }, + { + "epoch": 0.018859532070100524, + "grad_norm": 0.23651909828186035, + "learning_rate": 0.0009998047362515207, + "loss": 3.6004, + "step": 636 + }, + { + "epoch": 0.018889185422412004, + "grad_norm": 0.2664756178855896, + "learning_rate": 0.0009998034192167883, + "loss": 3.6466, + "step": 637 + }, + { + "epoch": 0.018918838774723484, + "grad_norm": 0.2388830929994583, + "learning_rate": 0.000999802097756211, + "loss": 3.6533, + "step": 638 + }, + { + "epoch": 0.01894849212703496, + "grad_norm": 0.21945089101791382, + "learning_rate": 0.0009998007718698002, + "loss": 3.6345, + "step": 639 + }, + { + "epoch": 0.01897814547934644, + "grad_norm": 0.19748640060424805, + "learning_rate": 0.0009997994415575679, + "loss": 3.6255, + "step": 640 + }, + { + "epoch": 0.01900779883165792, + "grad_norm": 0.21388499438762665, + "learning_rate": 0.0009997981068195255, + "loss": 3.5991, + "step": 641 + }, + { + "epoch": 0.0190374521839694, + "grad_norm": 0.21577303111553192, + "learning_rate": 0.0009997967676556854, + "loss": 3.6187, + "step": 642 + }, + { + "epoch": 0.019067105536280878, + "grad_norm": 0.20433413982391357, + "learning_rate": 0.000999795424066059, + "loss": 3.627, + "step": 643 + }, + { + "epoch": 0.019096758888592354, + "grad_norm": 0.171546071767807, + "learning_rate": 0.0009997940760506582, + "loss": 3.6271, + "step": 644 + }, + { + "epoch": 0.019126412240903834, + "grad_norm": 0.1822800636291504, + "learning_rate": 0.0009997927236094952, + "loss": 3.6181, + "step": 645 + }, + { + "epoch": 0.019156065593215314, + "grad_norm": 0.1536330282688141, + "learning_rate": 0.0009997913667425817, + "loss": 3.6153, + "step": 646 + }, + { + "epoch": 0.019185718945526793, + "grad_norm": 0.1685560941696167, + "learning_rate": 0.00099979000544993, + "loss": 3.628, + "step": 647 + }, + { + "epoch": 0.01921537229783827, + "grad_norm": 0.18734249472618103, + "learning_rate": 0.0009997886397315522, + "loss": 3.6383, + "step": 648 + }, + { + "epoch": 0.01924502565014975, + "grad_norm": 0.14095339179039001, + "learning_rate": 0.0009997872695874598, + "loss": 3.6311, + "step": 649 + }, + { + "epoch": 0.01927467900246123, + "grad_norm": 0.13983258605003357, + "learning_rate": 0.0009997858950176657, + "loss": 3.5776, + "step": 650 + }, + { + "epoch": 0.019304332354772708, + "grad_norm": 0.16155283153057098, + "learning_rate": 0.0009997845160221815, + "loss": 3.6104, + "step": 651 + }, + { + "epoch": 0.019333985707084184, + "grad_norm": 0.16923168301582336, + "learning_rate": 0.0009997831326010198, + "loss": 3.6007, + "step": 652 + }, + { + "epoch": 0.019363639059395664, + "grad_norm": 0.16669802367687225, + "learning_rate": 0.0009997817447541925, + "loss": 3.6132, + "step": 653 + }, + { + "epoch": 0.019393292411707144, + "grad_norm": 0.180235356092453, + "learning_rate": 0.000999780352481712, + "loss": 3.5734, + "step": 654 + }, + { + "epoch": 0.019422945764018623, + "grad_norm": 0.20015954971313477, + "learning_rate": 0.000999778955783591, + "loss": 3.6192, + "step": 655 + }, + { + "epoch": 0.019452599116330103, + "grad_norm": 0.22775231301784515, + "learning_rate": 0.0009997775546598414, + "loss": 3.6091, + "step": 656 + }, + { + "epoch": 0.01948225246864158, + "grad_norm": 0.22583726048469543, + "learning_rate": 0.0009997761491104754, + "loss": 3.6111, + "step": 657 + }, + { + "epoch": 0.01951190582095306, + "grad_norm": 0.1656685620546341, + "learning_rate": 0.0009997747391355064, + "loss": 3.5847, + "step": 658 + }, + { + "epoch": 0.019541559173264538, + "grad_norm": 0.1502220779657364, + "learning_rate": 0.000999773324734946, + "loss": 3.6125, + "step": 659 + }, + { + "epoch": 0.019571212525576018, + "grad_norm": 0.15196026861667633, + "learning_rate": 0.0009997719059088072, + "loss": 3.5512, + "step": 660 + }, + { + "epoch": 0.019600865877887494, + "grad_norm": 0.1476306915283203, + "learning_rate": 0.000999770482657102, + "loss": 3.5604, + "step": 661 + }, + { + "epoch": 0.019630519230198974, + "grad_norm": 0.15439793467521667, + "learning_rate": 0.0009997690549798438, + "loss": 3.5724, + "step": 662 + }, + { + "epoch": 0.019660172582510453, + "grad_norm": 0.1855437159538269, + "learning_rate": 0.0009997676228770448, + "loss": 3.5588, + "step": 663 + }, + { + "epoch": 0.019689825934821933, + "grad_norm": 0.2037259191274643, + "learning_rate": 0.0009997661863487175, + "loss": 3.5662, + "step": 664 + }, + { + "epoch": 0.01971947928713341, + "grad_norm": 0.18087166547775269, + "learning_rate": 0.0009997647453948752, + "loss": 3.6068, + "step": 665 + }, + { + "epoch": 0.01974913263944489, + "grad_norm": 0.229839488863945, + "learning_rate": 0.0009997633000155299, + "loss": 3.5712, + "step": 666 + }, + { + "epoch": 0.019778785991756368, + "grad_norm": 0.24083003401756287, + "learning_rate": 0.0009997618502106949, + "loss": 3.5289, + "step": 667 + }, + { + "epoch": 0.019808439344067848, + "grad_norm": 0.18935555219650269, + "learning_rate": 0.000999760395980383, + "loss": 3.5862, + "step": 668 + }, + { + "epoch": 0.019838092696379327, + "grad_norm": 0.1982816755771637, + "learning_rate": 0.000999758937324607, + "loss": 3.5834, + "step": 669 + }, + { + "epoch": 0.019867746048690803, + "grad_norm": 0.1870570331811905, + "learning_rate": 0.0009997574742433798, + "loss": 3.573, + "step": 670 + }, + { + "epoch": 0.019897399401002283, + "grad_norm": 0.18169671297073364, + "learning_rate": 0.0009997560067367144, + "loss": 3.6053, + "step": 671 + }, + { + "epoch": 0.019927052753313763, + "grad_norm": 0.1621391326189041, + "learning_rate": 0.0009997545348046238, + "loss": 3.5517, + "step": 672 + }, + { + "epoch": 0.019956706105625242, + "grad_norm": 0.1860952377319336, + "learning_rate": 0.0009997530584471208, + "loss": 3.5505, + "step": 673 + }, + { + "epoch": 0.01998635945793672, + "grad_norm": 0.17361754179000854, + "learning_rate": 0.000999751577664219, + "loss": 3.5669, + "step": 674 + }, + { + "epoch": 0.020016012810248198, + "grad_norm": 0.19670997560024261, + "learning_rate": 0.0009997500924559311, + "loss": 3.6089, + "step": 675 + }, + { + "epoch": 0.020045666162559678, + "grad_norm": 0.218978613615036, + "learning_rate": 0.0009997486028222701, + "loss": 3.5748, + "step": 676 + }, + { + "epoch": 0.020075319514871157, + "grad_norm": 0.21822677552700043, + "learning_rate": 0.0009997471087632498, + "loss": 3.5796, + "step": 677 + }, + { + "epoch": 0.020104972867182633, + "grad_norm": 0.216500386595726, + "learning_rate": 0.0009997456102788828, + "loss": 3.5669, + "step": 678 + }, + { + "epoch": 0.020134626219494113, + "grad_norm": 0.23164702951908112, + "learning_rate": 0.0009997441073691829, + "loss": 3.5776, + "step": 679 + }, + { + "epoch": 0.020164279571805593, + "grad_norm": 0.17819428443908691, + "learning_rate": 0.000999742600034163, + "loss": 3.5823, + "step": 680 + }, + { + "epoch": 0.020193932924117072, + "grad_norm": 0.1679418683052063, + "learning_rate": 0.0009997410882738368, + "loss": 3.5337, + "step": 681 + }, + { + "epoch": 0.020223586276428552, + "grad_norm": 0.15975414216518402, + "learning_rate": 0.0009997395720882172, + "loss": 3.5478, + "step": 682 + }, + { + "epoch": 0.020253239628740028, + "grad_norm": 0.17038871347904205, + "learning_rate": 0.000999738051477318, + "loss": 3.5382, + "step": 683 + }, + { + "epoch": 0.020282892981051508, + "grad_norm": 0.14635218679904938, + "learning_rate": 0.0009997365264411526, + "loss": 3.5988, + "step": 684 + }, + { + "epoch": 0.020312546333362987, + "grad_norm": 0.13924624025821686, + "learning_rate": 0.0009997349969797344, + "loss": 3.5775, + "step": 685 + }, + { + "epoch": 0.020342199685674467, + "grad_norm": 0.15894494950771332, + "learning_rate": 0.000999733463093077, + "loss": 3.5605, + "step": 686 + }, + { + "epoch": 0.020371853037985943, + "grad_norm": 0.17498679459095, + "learning_rate": 0.0009997319247811941, + "loss": 3.5484, + "step": 687 + }, + { + "epoch": 0.020401506390297423, + "grad_norm": 0.19984745979309082, + "learning_rate": 0.0009997303820440994, + "loss": 3.5755, + "step": 688 + }, + { + "epoch": 0.020431159742608902, + "grad_norm": 0.22320115566253662, + "learning_rate": 0.0009997288348818061, + "loss": 3.5427, + "step": 689 + }, + { + "epoch": 0.020460813094920382, + "grad_norm": 0.22272485494613647, + "learning_rate": 0.0009997272832943283, + "loss": 3.5734, + "step": 690 + }, + { + "epoch": 0.020490466447231858, + "grad_norm": 0.2156044989824295, + "learning_rate": 0.0009997257272816797, + "loss": 3.5715, + "step": 691 + }, + { + "epoch": 0.020520119799543338, + "grad_norm": 0.18025052547454834, + "learning_rate": 0.0009997241668438738, + "loss": 3.544, + "step": 692 + }, + { + "epoch": 0.020549773151854817, + "grad_norm": 0.18209435045719147, + "learning_rate": 0.000999722601980925, + "loss": 3.5457, + "step": 693 + }, + { + "epoch": 0.020579426504166297, + "grad_norm": 0.15791988372802734, + "learning_rate": 0.0009997210326928463, + "loss": 3.5925, + "step": 694 + }, + { + "epoch": 0.020609079856477776, + "grad_norm": 0.15157760679721832, + "learning_rate": 0.0009997194589796525, + "loss": 3.5803, + "step": 695 + }, + { + "epoch": 0.020638733208789253, + "grad_norm": 0.1708373874425888, + "learning_rate": 0.000999717880841357, + "loss": 3.5914, + "step": 696 + }, + { + "epoch": 0.020668386561100732, + "grad_norm": 0.1613413691520691, + "learning_rate": 0.0009997162982779738, + "loss": 3.5849, + "step": 697 + }, + { + "epoch": 0.020698039913412212, + "grad_norm": 0.14587031304836273, + "learning_rate": 0.000999714711289517, + "loss": 3.534, + "step": 698 + }, + { + "epoch": 0.02072769326572369, + "grad_norm": 0.14741060137748718, + "learning_rate": 0.0009997131198760006, + "loss": 3.57, + "step": 699 + }, + { + "epoch": 0.020757346618035168, + "grad_norm": 0.15466360747814178, + "learning_rate": 0.0009997115240374388, + "loss": 3.5454, + "step": 700 + }, + { + "epoch": 0.020786999970346647, + "grad_norm": 0.16865938901901245, + "learning_rate": 0.000999709923773846, + "loss": 3.5669, + "step": 701 + }, + { + "epoch": 0.020816653322658127, + "grad_norm": 0.1614600569009781, + "learning_rate": 0.0009997083190852356, + "loss": 3.549, + "step": 702 + }, + { + "epoch": 0.020846306674969606, + "grad_norm": 0.17156904935836792, + "learning_rate": 0.0009997067099716225, + "loss": 3.5558, + "step": 703 + }, + { + "epoch": 0.020875960027281083, + "grad_norm": 0.1877913922071457, + "learning_rate": 0.0009997050964330205, + "loss": 3.5377, + "step": 704 + }, + { + "epoch": 0.020905613379592562, + "grad_norm": 0.16411730647087097, + "learning_rate": 0.0009997034784694444, + "loss": 3.5372, + "step": 705 + }, + { + "epoch": 0.020935266731904042, + "grad_norm": 0.18330174684524536, + "learning_rate": 0.000999701856080908, + "loss": 3.5873, + "step": 706 + }, + { + "epoch": 0.02096492008421552, + "grad_norm": 0.20142702758312225, + "learning_rate": 0.000999700229267426, + "loss": 3.5522, + "step": 707 + }, + { + "epoch": 0.020994573436527, + "grad_norm": 0.19612948596477509, + "learning_rate": 0.0009996985980290126, + "loss": 3.5304, + "step": 708 + }, + { + "epoch": 0.021024226788838477, + "grad_norm": 0.19792966544628143, + "learning_rate": 0.0009996969623656824, + "loss": 3.5832, + "step": 709 + }, + { + "epoch": 0.021053880141149957, + "grad_norm": 0.1931608021259308, + "learning_rate": 0.0009996953222774498, + "loss": 3.5479, + "step": 710 + }, + { + "epoch": 0.021083533493461436, + "grad_norm": 0.23456889390945435, + "learning_rate": 0.0009996936777643293, + "loss": 3.5712, + "step": 711 + }, + { + "epoch": 0.021113186845772916, + "grad_norm": 0.25511789321899414, + "learning_rate": 0.0009996920288263358, + "loss": 3.5566, + "step": 712 + }, + { + "epoch": 0.021142840198084392, + "grad_norm": 0.23638685047626495, + "learning_rate": 0.0009996903754634833, + "loss": 3.5526, + "step": 713 + }, + { + "epoch": 0.021172493550395872, + "grad_norm": 0.31733331084251404, + "learning_rate": 0.0009996887176757867, + "loss": 3.5641, + "step": 714 + }, + { + "epoch": 0.02120214690270735, + "grad_norm": 0.24183064699172974, + "learning_rate": 0.000999687055463261, + "loss": 3.5768, + "step": 715 + }, + { + "epoch": 0.02123180025501883, + "grad_norm": 0.16342438757419586, + "learning_rate": 0.0009996853888259206, + "loss": 3.5342, + "step": 716 + }, + { + "epoch": 0.021261453607330307, + "grad_norm": 0.1768781989812851, + "learning_rate": 0.0009996837177637802, + "loss": 3.5145, + "step": 717 + }, + { + "epoch": 0.021291106959641787, + "grad_norm": 0.15693984925746918, + "learning_rate": 0.0009996820422768548, + "loss": 3.5194, + "step": 718 + }, + { + "epoch": 0.021320760311953266, + "grad_norm": 0.14506345987319946, + "learning_rate": 0.0009996803623651591, + "loss": 3.5185, + "step": 719 + }, + { + "epoch": 0.021350413664264746, + "grad_norm": 0.1649334728717804, + "learning_rate": 0.000999678678028708, + "loss": 3.5268, + "step": 720 + }, + { + "epoch": 0.021380067016576226, + "grad_norm": 0.14910240471363068, + "learning_rate": 0.0009996769892675166, + "loss": 3.5296, + "step": 721 + }, + { + "epoch": 0.021409720368887702, + "grad_norm": 0.17633451521396637, + "learning_rate": 0.0009996752960815996, + "loss": 3.4957, + "step": 722 + }, + { + "epoch": 0.02143937372119918, + "grad_norm": 0.13824544847011566, + "learning_rate": 0.000999673598470972, + "loss": 3.5488, + "step": 723 + }, + { + "epoch": 0.02146902707351066, + "grad_norm": 0.1393604874610901, + "learning_rate": 0.0009996718964356487, + "loss": 3.5303, + "step": 724 + }, + { + "epoch": 0.02149868042582214, + "grad_norm": 0.1311102956533432, + "learning_rate": 0.0009996701899756455, + "loss": 3.5165, + "step": 725 + }, + { + "epoch": 0.021528333778133617, + "grad_norm": 0.1338406801223755, + "learning_rate": 0.0009996684790909767, + "loss": 3.5242, + "step": 726 + }, + { + "epoch": 0.021557987130445096, + "grad_norm": 0.15562273561954498, + "learning_rate": 0.0009996667637816577, + "loss": 3.5182, + "step": 727 + }, + { + "epoch": 0.021587640482756576, + "grad_norm": 0.15311294794082642, + "learning_rate": 0.000999665044047704, + "loss": 3.4812, + "step": 728 + }, + { + "epoch": 0.021617293835068056, + "grad_norm": 0.14122548699378967, + "learning_rate": 0.0009996633198891302, + "loss": 3.5527, + "step": 729 + }, + { + "epoch": 0.021646947187379532, + "grad_norm": 0.19162343442440033, + "learning_rate": 0.0009996615913059523, + "loss": 3.5316, + "step": 730 + }, + { + "epoch": 0.02167660053969101, + "grad_norm": 0.17660751938819885, + "learning_rate": 0.000999659858298185, + "loss": 3.5287, + "step": 731 + }, + { + "epoch": 0.02170625389200249, + "grad_norm": 0.1579160988330841, + "learning_rate": 0.000999658120865844, + "loss": 3.5238, + "step": 732 + }, + { + "epoch": 0.02173590724431397, + "grad_norm": 0.16022881865501404, + "learning_rate": 0.0009996563790089445, + "loss": 3.5, + "step": 733 + }, + { + "epoch": 0.02176556059662545, + "grad_norm": 0.1661415696144104, + "learning_rate": 0.0009996546327275023, + "loss": 3.519, + "step": 734 + }, + { + "epoch": 0.021795213948936926, + "grad_norm": 0.17836429178714752, + "learning_rate": 0.0009996528820215322, + "loss": 3.5005, + "step": 735 + }, + { + "epoch": 0.021824867301248406, + "grad_norm": 0.17647109925746918, + "learning_rate": 0.0009996511268910502, + "loss": 3.5016, + "step": 736 + }, + { + "epoch": 0.021854520653559886, + "grad_norm": 0.1629297435283661, + "learning_rate": 0.0009996493673360717, + "loss": 3.5152, + "step": 737 + }, + { + "epoch": 0.021884174005871365, + "grad_norm": 0.21193838119506836, + "learning_rate": 0.0009996476033566123, + "loss": 3.5178, + "step": 738 + }, + { + "epoch": 0.02191382735818284, + "grad_norm": 0.20944926142692566, + "learning_rate": 0.0009996458349526877, + "loss": 3.5051, + "step": 739 + }, + { + "epoch": 0.02194348071049432, + "grad_norm": 0.1639397144317627, + "learning_rate": 0.0009996440621243133, + "loss": 3.5265, + "step": 740 + }, + { + "epoch": 0.0219731340628058, + "grad_norm": 0.17510846257209778, + "learning_rate": 0.000999642284871505, + "loss": 3.5167, + "step": 741 + }, + { + "epoch": 0.02200278741511728, + "grad_norm": 0.20723675191402435, + "learning_rate": 0.0009996405031942786, + "loss": 3.5449, + "step": 742 + }, + { + "epoch": 0.022032440767428756, + "grad_norm": 0.23030900955200195, + "learning_rate": 0.0009996387170926495, + "loss": 3.5285, + "step": 743 + }, + { + "epoch": 0.022062094119740236, + "grad_norm": 0.22752036154270172, + "learning_rate": 0.0009996369265666341, + "loss": 3.4958, + "step": 744 + }, + { + "epoch": 0.022091747472051716, + "grad_norm": 0.2065182775259018, + "learning_rate": 0.0009996351316162479, + "loss": 3.5149, + "step": 745 + }, + { + "epoch": 0.022121400824363195, + "grad_norm": 0.20583876967430115, + "learning_rate": 0.0009996333322415069, + "loss": 3.5113, + "step": 746 + }, + { + "epoch": 0.022151054176674675, + "grad_norm": 0.21880759298801422, + "learning_rate": 0.0009996315284424267, + "loss": 3.5138, + "step": 747 + }, + { + "epoch": 0.02218070752898615, + "grad_norm": 0.17893339693546295, + "learning_rate": 0.0009996297202190239, + "loss": 3.5313, + "step": 748 + }, + { + "epoch": 0.02221036088129763, + "grad_norm": 0.16601034998893738, + "learning_rate": 0.000999627907571314, + "loss": 3.4939, + "step": 749 + }, + { + "epoch": 0.02224001423360911, + "grad_norm": 0.17073054611682892, + "learning_rate": 0.000999626090499313, + "loss": 3.5092, + "step": 750 + }, + { + "epoch": 0.02226966758592059, + "grad_norm": 0.13628989458084106, + "learning_rate": 0.0009996242690030377, + "loss": 3.5142, + "step": 751 + }, + { + "epoch": 0.022299320938232066, + "grad_norm": 0.15466533601284027, + "learning_rate": 0.0009996224430825033, + "loss": 3.4729, + "step": 752 + }, + { + "epoch": 0.022328974290543546, + "grad_norm": 0.17830219864845276, + "learning_rate": 0.0009996206127377268, + "loss": 3.4944, + "step": 753 + }, + { + "epoch": 0.022358627642855025, + "grad_norm": 0.18991881608963013, + "learning_rate": 0.0009996187779687236, + "loss": 3.5167, + "step": 754 + }, + { + "epoch": 0.022388280995166505, + "grad_norm": 0.1815270334482193, + "learning_rate": 0.0009996169387755107, + "loss": 3.53, + "step": 755 + }, + { + "epoch": 0.02241793434747798, + "grad_norm": 0.14263178408145905, + "learning_rate": 0.000999615095158104, + "loss": 3.5177, + "step": 756 + }, + { + "epoch": 0.02244758769978946, + "grad_norm": 0.15624845027923584, + "learning_rate": 0.0009996132471165196, + "loss": 3.5129, + "step": 757 + }, + { + "epoch": 0.02247724105210094, + "grad_norm": 0.1188562735915184, + "learning_rate": 0.0009996113946507744, + "loss": 3.4557, + "step": 758 + }, + { + "epoch": 0.02250689440441242, + "grad_norm": 0.13332806527614594, + "learning_rate": 0.0009996095377608845, + "loss": 3.4941, + "step": 759 + }, + { + "epoch": 0.0225365477567239, + "grad_norm": 0.15251486003398895, + "learning_rate": 0.0009996076764468664, + "loss": 3.494, + "step": 760 + }, + { + "epoch": 0.022566201109035375, + "grad_norm": 0.1489959955215454, + "learning_rate": 0.0009996058107087365, + "loss": 3.4852, + "step": 761 + }, + { + "epoch": 0.022595854461346855, + "grad_norm": 0.14063328504562378, + "learning_rate": 0.0009996039405465113, + "loss": 3.4853, + "step": 762 + }, + { + "epoch": 0.022625507813658335, + "grad_norm": 0.15284483134746552, + "learning_rate": 0.0009996020659602076, + "loss": 3.5067, + "step": 763 + }, + { + "epoch": 0.022655161165969814, + "grad_norm": 0.1744374930858612, + "learning_rate": 0.000999600186949842, + "loss": 3.4854, + "step": 764 + }, + { + "epoch": 0.02268481451828129, + "grad_norm": 0.21002833545207977, + "learning_rate": 0.0009995983035154307, + "loss": 3.4709, + "step": 765 + }, + { + "epoch": 0.02271446787059277, + "grad_norm": 0.14653858542442322, + "learning_rate": 0.0009995964156569908, + "loss": 3.5273, + "step": 766 + }, + { + "epoch": 0.02274412122290425, + "grad_norm": 0.16911382973194122, + "learning_rate": 0.0009995945233745387, + "loss": 3.4869, + "step": 767 + }, + { + "epoch": 0.02277377457521573, + "grad_norm": 0.2048460692167282, + "learning_rate": 0.0009995926266680917, + "loss": 3.4654, + "step": 768 + }, + { + "epoch": 0.022803427927527205, + "grad_norm": 0.2009643167257309, + "learning_rate": 0.000999590725537666, + "loss": 3.5243, + "step": 769 + }, + { + "epoch": 0.022833081279838685, + "grad_norm": 0.17927633225917816, + "learning_rate": 0.0009995888199832786, + "loss": 3.4727, + "step": 770 + }, + { + "epoch": 0.022862734632150165, + "grad_norm": 0.238255113363266, + "learning_rate": 0.0009995869100049466, + "loss": 3.4664, + "step": 771 + }, + { + "epoch": 0.022892387984461644, + "grad_norm": 0.28245481848716736, + "learning_rate": 0.0009995849956026869, + "loss": 3.5307, + "step": 772 + }, + { + "epoch": 0.022922041336773124, + "grad_norm": 0.29932355880737305, + "learning_rate": 0.000999583076776516, + "loss": 3.5374, + "step": 773 + }, + { + "epoch": 0.0229516946890846, + "grad_norm": 0.2639029324054718, + "learning_rate": 0.0009995811535264514, + "loss": 3.4776, + "step": 774 + }, + { + "epoch": 0.02298134804139608, + "grad_norm": 0.17879533767700195, + "learning_rate": 0.0009995792258525099, + "loss": 3.4974, + "step": 775 + }, + { + "epoch": 0.02301100139370756, + "grad_norm": 0.21875394880771637, + "learning_rate": 0.0009995772937547085, + "loss": 3.5131, + "step": 776 + }, + { + "epoch": 0.02304065474601904, + "grad_norm": 0.28344517946243286, + "learning_rate": 0.0009995753572330645, + "loss": 3.4808, + "step": 777 + }, + { + "epoch": 0.023070308098330515, + "grad_norm": 0.2152918577194214, + "learning_rate": 0.0009995734162875948, + "loss": 3.5274, + "step": 778 + }, + { + "epoch": 0.023099961450641995, + "grad_norm": 0.15658840537071228, + "learning_rate": 0.000999571470918317, + "loss": 3.5158, + "step": 779 + }, + { + "epoch": 0.023129614802953474, + "grad_norm": 0.1546323597431183, + "learning_rate": 0.0009995695211252478, + "loss": 3.5322, + "step": 780 + }, + { + "epoch": 0.023159268155264954, + "grad_norm": 0.13771232962608337, + "learning_rate": 0.000999567566908405, + "loss": 3.4954, + "step": 781 + }, + { + "epoch": 0.02318892150757643, + "grad_norm": 0.14686641097068787, + "learning_rate": 0.0009995656082678055, + "loss": 3.4931, + "step": 782 + }, + { + "epoch": 0.02321857485988791, + "grad_norm": 0.16688497364521027, + "learning_rate": 0.0009995636452034668, + "loss": 3.4927, + "step": 783 + }, + { + "epoch": 0.02324822821219939, + "grad_norm": 0.1425934135913849, + "learning_rate": 0.0009995616777154063, + "loss": 3.4794, + "step": 784 + }, + { + "epoch": 0.02327788156451087, + "grad_norm": 0.15148918330669403, + "learning_rate": 0.0009995597058036416, + "loss": 3.4859, + "step": 785 + }, + { + "epoch": 0.02330753491682235, + "grad_norm": 0.13534720242023468, + "learning_rate": 0.0009995577294681897, + "loss": 3.4856, + "step": 786 + }, + { + "epoch": 0.023337188269133825, + "grad_norm": 0.11610002815723419, + "learning_rate": 0.0009995557487090683, + "loss": 3.4765, + "step": 787 + }, + { + "epoch": 0.023366841621445304, + "grad_norm": 0.10793238133192062, + "learning_rate": 0.0009995537635262952, + "loss": 3.4695, + "step": 788 + }, + { + "epoch": 0.023396494973756784, + "grad_norm": 0.11465749889612198, + "learning_rate": 0.0009995517739198878, + "loss": 3.4837, + "step": 789 + }, + { + "epoch": 0.023426148326068263, + "grad_norm": 0.11703135073184967, + "learning_rate": 0.0009995497798898636, + "loss": 3.4913, + "step": 790 + }, + { + "epoch": 0.02345580167837974, + "grad_norm": 0.10588518530130386, + "learning_rate": 0.0009995477814362403, + "loss": 3.4858, + "step": 791 + }, + { + "epoch": 0.02348545503069122, + "grad_norm": 0.12496661394834518, + "learning_rate": 0.0009995457785590355, + "loss": 3.4335, + "step": 792 + }, + { + "epoch": 0.0235151083830027, + "grad_norm": 0.1190372109413147, + "learning_rate": 0.0009995437712582674, + "loss": 3.4355, + "step": 793 + }, + { + "epoch": 0.02354476173531418, + "grad_norm": 0.12190520018339157, + "learning_rate": 0.0009995417595339534, + "loss": 3.4915, + "step": 794 + }, + { + "epoch": 0.023574415087625655, + "grad_norm": 0.1305021196603775, + "learning_rate": 0.0009995397433861114, + "loss": 3.4809, + "step": 795 + }, + { + "epoch": 0.023604068439937134, + "grad_norm": 0.1582127809524536, + "learning_rate": 0.000999537722814759, + "loss": 3.4788, + "step": 796 + }, + { + "epoch": 0.023633721792248614, + "grad_norm": 0.19858476519584656, + "learning_rate": 0.0009995356978199144, + "loss": 3.4754, + "step": 797 + }, + { + "epoch": 0.023663375144560093, + "grad_norm": 0.21356894075870514, + "learning_rate": 0.0009995336684015957, + "loss": 3.4893, + "step": 798 + }, + { + "epoch": 0.023693028496871573, + "grad_norm": 0.26238659024238586, + "learning_rate": 0.0009995316345598204, + "loss": 3.4976, + "step": 799 + }, + { + "epoch": 0.02372268184918305, + "grad_norm": 0.2157573103904724, + "learning_rate": 0.0009995295962946067, + "loss": 3.4745, + "step": 800 + }, + { + "epoch": 0.02375233520149453, + "grad_norm": 0.1777871698141098, + "learning_rate": 0.0009995275536059728, + "loss": 3.4828, + "step": 801 + }, + { + "epoch": 0.02378198855380601, + "grad_norm": 0.21195323765277863, + "learning_rate": 0.0009995255064939367, + "loss": 3.5112, + "step": 802 + }, + { + "epoch": 0.023811641906117488, + "grad_norm": 0.26790615916252136, + "learning_rate": 0.0009995234549585162, + "loss": 3.5021, + "step": 803 + }, + { + "epoch": 0.023841295258428964, + "grad_norm": 0.24591320753097534, + "learning_rate": 0.0009995213989997301, + "loss": 3.5098, + "step": 804 + }, + { + "epoch": 0.023870948610740444, + "grad_norm": 0.18133743107318878, + "learning_rate": 0.0009995193386175961, + "loss": 3.4603, + "step": 805 + }, + { + "epoch": 0.023900601963051923, + "grad_norm": 0.1383960247039795, + "learning_rate": 0.0009995172738121326, + "loss": 3.4529, + "step": 806 + }, + { + "epoch": 0.023930255315363403, + "grad_norm": 0.1607171595096588, + "learning_rate": 0.000999515204583358, + "loss": 3.4451, + "step": 807 + }, + { + "epoch": 0.02395990866767488, + "grad_norm": 0.14086303114891052, + "learning_rate": 0.0009995131309312904, + "loss": 3.4612, + "step": 808 + }, + { + "epoch": 0.02398956201998636, + "grad_norm": 0.1433851420879364, + "learning_rate": 0.0009995110528559484, + "loss": 3.4534, + "step": 809 + }, + { + "epoch": 0.02401921537229784, + "grad_norm": 0.12848323583602905, + "learning_rate": 0.0009995089703573503, + "loss": 3.478, + "step": 810 + }, + { + "epoch": 0.024048868724609318, + "grad_norm": 0.11757951229810715, + "learning_rate": 0.0009995068834355145, + "loss": 3.5082, + "step": 811 + }, + { + "epoch": 0.024078522076920798, + "grad_norm": 0.14078503847122192, + "learning_rate": 0.0009995047920904594, + "loss": 3.4108, + "step": 812 + }, + { + "epoch": 0.024108175429232274, + "grad_norm": 0.13240627944469452, + "learning_rate": 0.0009995026963222039, + "loss": 3.4711, + "step": 813 + }, + { + "epoch": 0.024137828781543753, + "grad_norm": 0.13502393662929535, + "learning_rate": 0.000999500596130766, + "loss": 3.4395, + "step": 814 + }, + { + "epoch": 0.024167482133855233, + "grad_norm": 0.11705522984266281, + "learning_rate": 0.0009994984915161647, + "loss": 3.4637, + "step": 815 + }, + { + "epoch": 0.024197135486166713, + "grad_norm": 0.1680763065814972, + "learning_rate": 0.0009994963824784184, + "loss": 3.4341, + "step": 816 + }, + { + "epoch": 0.02422678883847819, + "grad_norm": 0.2928238809108734, + "learning_rate": 0.0009994942690175462, + "loss": 3.441, + "step": 817 + }, + { + "epoch": 0.02425644219078967, + "grad_norm": 0.14053966104984283, + "learning_rate": 0.0009994921511335662, + "loss": 3.4634, + "step": 818 + }, + { + "epoch": 0.024286095543101148, + "grad_norm": 0.13428661227226257, + "learning_rate": 0.0009994900288264976, + "loss": 3.4697, + "step": 819 + }, + { + "epoch": 0.024315748895412628, + "grad_norm": 0.11459262669086456, + "learning_rate": 0.000999487902096359, + "loss": 3.4441, + "step": 820 + }, + { + "epoch": 0.024345402247724104, + "grad_norm": 0.13305845856666565, + "learning_rate": 0.0009994857709431694, + "loss": 3.4401, + "step": 821 + }, + { + "epoch": 0.024375055600035583, + "grad_norm": 0.1537901908159256, + "learning_rate": 0.0009994836353669474, + "loss": 3.4371, + "step": 822 + }, + { + "epoch": 0.024404708952347063, + "grad_norm": 0.1714632660150528, + "learning_rate": 0.0009994814953677123, + "loss": 3.4523, + "step": 823 + }, + { + "epoch": 0.024434362304658543, + "grad_norm": 0.19433526694774628, + "learning_rate": 0.0009994793509454827, + "loss": 3.506, + "step": 824 + }, + { + "epoch": 0.02446401565697002, + "grad_norm": 0.22160497307777405, + "learning_rate": 0.0009994772021002776, + "loss": 3.4285, + "step": 825 + }, + { + "epoch": 0.0244936690092815, + "grad_norm": 0.20578637719154358, + "learning_rate": 0.0009994750488321162, + "loss": 3.4735, + "step": 826 + }, + { + "epoch": 0.024523322361592978, + "grad_norm": 0.1645437777042389, + "learning_rate": 0.0009994728911410175, + "loss": 3.451, + "step": 827 + }, + { + "epoch": 0.024552975713904458, + "grad_norm": 0.1736719310283661, + "learning_rate": 0.0009994707290270008, + "loss": 3.4752, + "step": 828 + }, + { + "epoch": 0.024582629066215937, + "grad_norm": 0.15550890564918518, + "learning_rate": 0.000999468562490085, + "loss": 3.4311, + "step": 829 + }, + { + "epoch": 0.024612282418527413, + "grad_norm": 0.1673474758863449, + "learning_rate": 0.0009994663915302894, + "loss": 3.4761, + "step": 830 + }, + { + "epoch": 0.024641935770838893, + "grad_norm": 0.17850887775421143, + "learning_rate": 0.0009994642161476328, + "loss": 3.4425, + "step": 831 + }, + { + "epoch": 0.024671589123150373, + "grad_norm": 0.1626821905374527, + "learning_rate": 0.0009994620363421353, + "loss": 3.4696, + "step": 832 + }, + { + "epoch": 0.024701242475461852, + "grad_norm": 0.17699427902698517, + "learning_rate": 0.0009994598521138153, + "loss": 3.4374, + "step": 833 + }, + { + "epoch": 0.02473089582777333, + "grad_norm": 0.19510869681835175, + "learning_rate": 0.0009994576634626928, + "loss": 3.4665, + "step": 834 + }, + { + "epoch": 0.024760549180084808, + "grad_norm": 0.14776422083377838, + "learning_rate": 0.000999455470388787, + "loss": 3.4613, + "step": 835 + }, + { + "epoch": 0.024790202532396288, + "grad_norm": 0.1737036556005478, + "learning_rate": 0.0009994532728921173, + "loss": 3.4472, + "step": 836 + }, + { + "epoch": 0.024819855884707767, + "grad_norm": 0.15682005882263184, + "learning_rate": 0.000999451070972703, + "loss": 3.3929, + "step": 837 + }, + { + "epoch": 0.024849509237019243, + "grad_norm": 0.18332526087760925, + "learning_rate": 0.0009994488646305638, + "loss": 3.4706, + "step": 838 + }, + { + "epoch": 0.024879162589330723, + "grad_norm": 0.21191582083702087, + "learning_rate": 0.0009994466538657191, + "loss": 3.4296, + "step": 839 + }, + { + "epoch": 0.024908815941642203, + "grad_norm": 0.18952326476573944, + "learning_rate": 0.0009994444386781888, + "loss": 3.4359, + "step": 840 + }, + { + "epoch": 0.024938469293953682, + "grad_norm": 0.14797569811344147, + "learning_rate": 0.000999442219067992, + "loss": 3.4657, + "step": 841 + }, + { + "epoch": 0.024968122646265162, + "grad_norm": 0.2600487470626831, + "learning_rate": 0.0009994399950351486, + "loss": 3.48, + "step": 842 + }, + { + "epoch": 0.024997775998576638, + "grad_norm": 0.14705456793308258, + "learning_rate": 0.0009994377665796786, + "loss": 3.4243, + "step": 843 + }, + { + "epoch": 0.025027429350888118, + "grad_norm": 0.17900070548057556, + "learning_rate": 0.0009994355337016013, + "loss": 3.4624, + "step": 844 + }, + { + "epoch": 0.025057082703199597, + "grad_norm": 0.19516892731189728, + "learning_rate": 0.0009994332964009367, + "loss": 3.4401, + "step": 845 + }, + { + "epoch": 0.025086736055511077, + "grad_norm": 0.16290034353733063, + "learning_rate": 0.0009994310546777043, + "loss": 3.4685, + "step": 846 + }, + { + "epoch": 0.025116389407822553, + "grad_norm": 0.16387757658958435, + "learning_rate": 0.0009994288085319243, + "loss": 3.4395, + "step": 847 + }, + { + "epoch": 0.025146042760134033, + "grad_norm": 0.14633533358573914, + "learning_rate": 0.0009994265579636166, + "loss": 3.459, + "step": 848 + }, + { + "epoch": 0.025175696112445512, + "grad_norm": 0.15188589692115784, + "learning_rate": 0.000999424302972801, + "loss": 3.4577, + "step": 849 + }, + { + "epoch": 0.02520534946475699, + "grad_norm": 0.14591626822948456, + "learning_rate": 0.0009994220435594972, + "loss": 3.395, + "step": 850 + }, + { + "epoch": 0.025235002817068468, + "grad_norm": 0.13420212268829346, + "learning_rate": 0.0009994197797237256, + "loss": 3.4138, + "step": 851 + }, + { + "epoch": 0.025264656169379947, + "grad_norm": 0.15434718132019043, + "learning_rate": 0.0009994175114655065, + "loss": 3.4636, + "step": 852 + }, + { + "epoch": 0.025294309521691427, + "grad_norm": 0.18841923773288727, + "learning_rate": 0.000999415238784859, + "loss": 3.4478, + "step": 853 + }, + { + "epoch": 0.025323962874002907, + "grad_norm": 0.2085038423538208, + "learning_rate": 0.0009994129616818044, + "loss": 3.4222, + "step": 854 + }, + { + "epoch": 0.025353616226314386, + "grad_norm": 0.17029227316379547, + "learning_rate": 0.0009994106801563618, + "loss": 3.4209, + "step": 855 + }, + { + "epoch": 0.025383269578625862, + "grad_norm": 0.17727376520633698, + "learning_rate": 0.0009994083942085523, + "loss": 3.4763, + "step": 856 + }, + { + "epoch": 0.025412922930937342, + "grad_norm": 0.15831786394119263, + "learning_rate": 0.0009994061038383956, + "loss": 3.4423, + "step": 857 + }, + { + "epoch": 0.02544257628324882, + "grad_norm": 0.17576071619987488, + "learning_rate": 0.0009994038090459121, + "loss": 3.4559, + "step": 858 + }, + { + "epoch": 0.0254722296355603, + "grad_norm": 0.15584194660186768, + "learning_rate": 0.000999401509831122, + "loss": 3.428, + "step": 859 + }, + { + "epoch": 0.025501882987871777, + "grad_norm": 0.130891814827919, + "learning_rate": 0.0009993992061940462, + "loss": 3.4319, + "step": 860 + }, + { + "epoch": 0.025531536340183257, + "grad_norm": 0.14059291779994965, + "learning_rate": 0.0009993968981347045, + "loss": 3.4566, + "step": 861 + }, + { + "epoch": 0.025561189692494737, + "grad_norm": 0.1609935313463211, + "learning_rate": 0.0009993945856531174, + "loss": 3.4231, + "step": 862 + }, + { + "epoch": 0.025590843044806216, + "grad_norm": 0.16397549211978912, + "learning_rate": 0.0009993922687493056, + "loss": 3.4522, + "step": 863 + }, + { + "epoch": 0.025620496397117692, + "grad_norm": 0.13765175640583038, + "learning_rate": 0.0009993899474232896, + "loss": 3.4486, + "step": 864 + }, + { + "epoch": 0.025650149749429172, + "grad_norm": 0.1479179859161377, + "learning_rate": 0.00099938762167509, + "loss": 3.47, + "step": 865 + }, + { + "epoch": 0.02567980310174065, + "grad_norm": 0.15456388890743256, + "learning_rate": 0.000999385291504727, + "loss": 3.4816, + "step": 866 + }, + { + "epoch": 0.02570945645405213, + "grad_norm": 0.19142664968967438, + "learning_rate": 0.0009993829569122218, + "loss": 3.4307, + "step": 867 + }, + { + "epoch": 0.02573910980636361, + "grad_norm": 0.20329008996486664, + "learning_rate": 0.0009993806178975949, + "loss": 3.4666, + "step": 868 + }, + { + "epoch": 0.025768763158675087, + "grad_norm": 0.1731116622686386, + "learning_rate": 0.0009993782744608666, + "loss": 3.4513, + "step": 869 + }, + { + "epoch": 0.025798416510986567, + "grad_norm": 0.16149261593818665, + "learning_rate": 0.0009993759266020581, + "loss": 3.4553, + "step": 870 + }, + { + "epoch": 0.025828069863298046, + "grad_norm": 0.1774822622537613, + "learning_rate": 0.00099937357432119, + "loss": 3.4372, + "step": 871 + }, + { + "epoch": 0.025857723215609526, + "grad_norm": 0.18413682281970978, + "learning_rate": 0.0009993712176182832, + "loss": 3.4432, + "step": 872 + }, + { + "epoch": 0.025887376567921002, + "grad_norm": 0.16395269334316254, + "learning_rate": 0.0009993688564933585, + "loss": 3.4264, + "step": 873 + }, + { + "epoch": 0.02591702992023248, + "grad_norm": 0.1883307695388794, + "learning_rate": 0.0009993664909464372, + "loss": 3.4538, + "step": 874 + }, + { + "epoch": 0.02594668327254396, + "grad_norm": 0.1655484437942505, + "learning_rate": 0.0009993641209775394, + "loss": 3.406, + "step": 875 + }, + { + "epoch": 0.02597633662485544, + "grad_norm": 0.15677280724048615, + "learning_rate": 0.0009993617465866868, + "loss": 3.4532, + "step": 876 + }, + { + "epoch": 0.026005989977166917, + "grad_norm": 0.1254117637872696, + "learning_rate": 0.0009993593677739003, + "loss": 3.4165, + "step": 877 + }, + { + "epoch": 0.026035643329478397, + "grad_norm": 0.11642273515462875, + "learning_rate": 0.0009993569845392009, + "loss": 3.4297, + "step": 878 + }, + { + "epoch": 0.026065296681789876, + "grad_norm": 0.12752379477024078, + "learning_rate": 0.0009993545968826096, + "loss": 3.3763, + "step": 879 + }, + { + "epoch": 0.026094950034101356, + "grad_norm": 0.1324237883090973, + "learning_rate": 0.0009993522048041476, + "loss": 3.3951, + "step": 880 + }, + { + "epoch": 0.026124603386412835, + "grad_norm": 0.12477660924196243, + "learning_rate": 0.000999349808303836, + "loss": 3.4197, + "step": 881 + }, + { + "epoch": 0.02615425673872431, + "grad_norm": 0.13342803716659546, + "learning_rate": 0.0009993474073816966, + "loss": 3.4428, + "step": 882 + }, + { + "epoch": 0.02618391009103579, + "grad_norm": 0.12535899877548218, + "learning_rate": 0.0009993450020377498, + "loss": 3.4328, + "step": 883 + }, + { + "epoch": 0.02621356344334727, + "grad_norm": 0.14100883901119232, + "learning_rate": 0.0009993425922720173, + "loss": 3.4423, + "step": 884 + }, + { + "epoch": 0.02624321679565875, + "grad_norm": 0.15354037284851074, + "learning_rate": 0.0009993401780845207, + "loss": 3.4333, + "step": 885 + }, + { + "epoch": 0.026272870147970227, + "grad_norm": 0.19369323551654816, + "learning_rate": 0.0009993377594752807, + "loss": 3.4555, + "step": 886 + }, + { + "epoch": 0.026302523500281706, + "grad_norm": 0.18816789984703064, + "learning_rate": 0.0009993353364443195, + "loss": 3.4285, + "step": 887 + }, + { + "epoch": 0.026332176852593186, + "grad_norm": 0.17926491796970367, + "learning_rate": 0.0009993329089916581, + "loss": 3.4443, + "step": 888 + }, + { + "epoch": 0.026361830204904665, + "grad_norm": 0.18845754861831665, + "learning_rate": 0.000999330477117318, + "loss": 3.4116, + "step": 889 + }, + { + "epoch": 0.02639148355721614, + "grad_norm": 0.1966557800769806, + "learning_rate": 0.000999328040821321, + "loss": 3.4151, + "step": 890 + }, + { + "epoch": 0.02642113690952762, + "grad_norm": 0.15556176006793976, + "learning_rate": 0.0009993256001036882, + "loss": 3.4396, + "step": 891 + }, + { + "epoch": 0.0264507902618391, + "grad_norm": 0.16122651100158691, + "learning_rate": 0.0009993231549644418, + "loss": 3.409, + "step": 892 + }, + { + "epoch": 0.02648044361415058, + "grad_norm": 0.144083634018898, + "learning_rate": 0.0009993207054036029, + "loss": 3.4382, + "step": 893 + }, + { + "epoch": 0.02651009696646206, + "grad_norm": 0.1255359649658203, + "learning_rate": 0.0009993182514211937, + "loss": 3.4359, + "step": 894 + }, + { + "epoch": 0.026539750318773536, + "grad_norm": 0.13498471677303314, + "learning_rate": 0.0009993157930172354, + "loss": 3.4161, + "step": 895 + }, + { + "epoch": 0.026569403671085016, + "grad_norm": 0.15262062847614288, + "learning_rate": 0.0009993133301917502, + "loss": 3.4304, + "step": 896 + }, + { + "epoch": 0.026599057023396495, + "grad_norm": 0.19517424702644348, + "learning_rate": 0.0009993108629447595, + "loss": 3.446, + "step": 897 + }, + { + "epoch": 0.026628710375707975, + "grad_norm": 0.2073994129896164, + "learning_rate": 0.0009993083912762859, + "loss": 3.3708, + "step": 898 + }, + { + "epoch": 0.02665836372801945, + "grad_norm": 0.19396139681339264, + "learning_rate": 0.0009993059151863503, + "loss": 3.4537, + "step": 899 + }, + { + "epoch": 0.02668801708033093, + "grad_norm": 0.20467451214790344, + "learning_rate": 0.0009993034346749755, + "loss": 3.4338, + "step": 900 + }, + { + "epoch": 0.02671767043264241, + "grad_norm": 0.16975277662277222, + "learning_rate": 0.000999300949742183, + "loss": 3.4521, + "step": 901 + }, + { + "epoch": 0.02674732378495389, + "grad_norm": 0.13981637358665466, + "learning_rate": 0.0009992984603879947, + "loss": 3.3993, + "step": 902 + }, + { + "epoch": 0.026776977137265366, + "grad_norm": 0.15933892130851746, + "learning_rate": 0.0009992959666124328, + "loss": 3.4569, + "step": 903 + }, + { + "epoch": 0.026806630489576846, + "grad_norm": 0.1286165863275528, + "learning_rate": 0.0009992934684155198, + "loss": 3.4177, + "step": 904 + }, + { + "epoch": 0.026836283841888325, + "grad_norm": 0.15500207245349884, + "learning_rate": 0.0009992909657972771, + "loss": 3.4112, + "step": 905 + }, + { + "epoch": 0.026865937194199805, + "grad_norm": 0.14730295538902283, + "learning_rate": 0.0009992884587577272, + "loss": 3.3898, + "step": 906 + }, + { + "epoch": 0.026895590546511285, + "grad_norm": 0.1390581727027893, + "learning_rate": 0.0009992859472968923, + "loss": 3.3844, + "step": 907 + }, + { + "epoch": 0.02692524389882276, + "grad_norm": 0.14831165969371796, + "learning_rate": 0.0009992834314147946, + "loss": 3.4359, + "step": 908 + }, + { + "epoch": 0.02695489725113424, + "grad_norm": 0.11020945012569427, + "learning_rate": 0.0009992809111114566, + "loss": 3.3862, + "step": 909 + }, + { + "epoch": 0.02698455060344572, + "grad_norm": 0.11670216917991638, + "learning_rate": 0.0009992783863869005, + "loss": 3.4353, + "step": 910 + }, + { + "epoch": 0.0270142039557572, + "grad_norm": 0.11578277498483658, + "learning_rate": 0.0009992758572411485, + "loss": 3.3988, + "step": 911 + }, + { + "epoch": 0.027043857308068676, + "grad_norm": 0.12403073161840439, + "learning_rate": 0.000999273323674223, + "loss": 3.4433, + "step": 912 + }, + { + "epoch": 0.027073510660380155, + "grad_norm": 0.13855759799480438, + "learning_rate": 0.0009992707856861466, + "loss": 3.4272, + "step": 913 + }, + { + "epoch": 0.027103164012691635, + "grad_norm": 0.12674005329608917, + "learning_rate": 0.0009992682432769415, + "loss": 3.3973, + "step": 914 + }, + { + "epoch": 0.027132817365003115, + "grad_norm": 0.14288844168186188, + "learning_rate": 0.0009992656964466307, + "loss": 3.4223, + "step": 915 + }, + { + "epoch": 0.02716247071731459, + "grad_norm": 0.15534715354442596, + "learning_rate": 0.0009992631451952363, + "loss": 3.409, + "step": 916 + }, + { + "epoch": 0.02719212406962607, + "grad_norm": 0.17284570634365082, + "learning_rate": 0.000999260589522781, + "loss": 3.4076, + "step": 917 + }, + { + "epoch": 0.02722177742193755, + "grad_norm": 0.16283130645751953, + "learning_rate": 0.0009992580294292874, + "loss": 3.4128, + "step": 918 + }, + { + "epoch": 0.02725143077424903, + "grad_norm": 0.11888277530670166, + "learning_rate": 0.0009992554649147784, + "loss": 3.4114, + "step": 919 + }, + { + "epoch": 0.02728108412656051, + "grad_norm": 0.17518319189548492, + "learning_rate": 0.0009992528959792766, + "loss": 3.4439, + "step": 920 + }, + { + "epoch": 0.027310737478871985, + "grad_norm": 0.16875606775283813, + "learning_rate": 0.0009992503226228047, + "loss": 3.3957, + "step": 921 + }, + { + "epoch": 0.027340390831183465, + "grad_norm": 0.184597909450531, + "learning_rate": 0.0009992477448453854, + "loss": 3.3753, + "step": 922 + }, + { + "epoch": 0.027370044183494945, + "grad_norm": 0.207902729511261, + "learning_rate": 0.0009992451626470418, + "loss": 3.4202, + "step": 923 + }, + { + "epoch": 0.027399697535806424, + "grad_norm": 0.19347083568572998, + "learning_rate": 0.0009992425760277964, + "loss": 3.4258, + "step": 924 + }, + { + "epoch": 0.0274293508881179, + "grad_norm": 0.23026995360851288, + "learning_rate": 0.0009992399849876724, + "loss": 3.434, + "step": 925 + }, + { + "epoch": 0.02745900424042938, + "grad_norm": 0.21197979152202606, + "learning_rate": 0.0009992373895266926, + "loss": 3.3976, + "step": 926 + }, + { + "epoch": 0.02748865759274086, + "grad_norm": 0.18382684886455536, + "learning_rate": 0.0009992347896448802, + "loss": 3.4232, + "step": 927 + }, + { + "epoch": 0.02751831094505234, + "grad_norm": 0.18143543601036072, + "learning_rate": 0.0009992321853422579, + "loss": 3.4099, + "step": 928 + }, + { + "epoch": 0.027547964297363815, + "grad_norm": 0.1875055879354477, + "learning_rate": 0.000999229576618849, + "loss": 3.3996, + "step": 929 + }, + { + "epoch": 0.027577617649675295, + "grad_norm": 0.1948208212852478, + "learning_rate": 0.0009992269634746763, + "loss": 3.4208, + "step": 930 + }, + { + "epoch": 0.027607271001986775, + "grad_norm": 0.17612384259700775, + "learning_rate": 0.0009992243459097632, + "loss": 3.3818, + "step": 931 + }, + { + "epoch": 0.027636924354298254, + "grad_norm": 0.14720790088176727, + "learning_rate": 0.0009992217239241329, + "loss": 3.4331, + "step": 932 + }, + { + "epoch": 0.027666577706609734, + "grad_norm": 0.1649421751499176, + "learning_rate": 0.0009992190975178085, + "loss": 3.3993, + "step": 933 + }, + { + "epoch": 0.02769623105892121, + "grad_norm": 0.18896211683750153, + "learning_rate": 0.0009992164666908132, + "loss": 3.4187, + "step": 934 + }, + { + "epoch": 0.02772588441123269, + "grad_norm": 0.15838801860809326, + "learning_rate": 0.0009992138314431705, + "loss": 3.4082, + "step": 935 + }, + { + "epoch": 0.02775553776354417, + "grad_norm": 0.16059467196464539, + "learning_rate": 0.0009992111917749037, + "loss": 3.4298, + "step": 936 + }, + { + "epoch": 0.02778519111585565, + "grad_norm": 0.15692181885242462, + "learning_rate": 0.000999208547686036, + "loss": 3.3699, + "step": 937 + }, + { + "epoch": 0.027814844468167125, + "grad_norm": 0.1372152715921402, + "learning_rate": 0.000999205899176591, + "loss": 3.4259, + "step": 938 + }, + { + "epoch": 0.027844497820478604, + "grad_norm": 0.1573851853609085, + "learning_rate": 0.000999203246246592, + "loss": 3.3706, + "step": 939 + }, + { + "epoch": 0.027874151172790084, + "grad_norm": 0.1543709933757782, + "learning_rate": 0.0009992005888960628, + "loss": 3.4246, + "step": 940 + }, + { + "epoch": 0.027903804525101564, + "grad_norm": 0.18460766971111298, + "learning_rate": 0.0009991979271250263, + "loss": 3.4214, + "step": 941 + }, + { + "epoch": 0.02793345787741304, + "grad_norm": 0.17602510750293732, + "learning_rate": 0.0009991952609335068, + "loss": 3.4098, + "step": 942 + }, + { + "epoch": 0.02796311122972452, + "grad_norm": 0.18420009315013885, + "learning_rate": 0.0009991925903215276, + "loss": 3.3928, + "step": 943 + }, + { + "epoch": 0.027992764582036, + "grad_norm": 0.1356619894504547, + "learning_rate": 0.000999189915289112, + "loss": 3.4121, + "step": 944 + }, + { + "epoch": 0.02802241793434748, + "grad_norm": 0.17004960775375366, + "learning_rate": 0.0009991872358362844, + "loss": 3.3995, + "step": 945 + }, + { + "epoch": 0.02805207128665896, + "grad_norm": 0.1575985699892044, + "learning_rate": 0.0009991845519630679, + "loss": 3.3665, + "step": 946 + }, + { + "epoch": 0.028081724638970434, + "grad_norm": 0.12375177443027496, + "learning_rate": 0.0009991818636694864, + "loss": 3.3801, + "step": 947 + }, + { + "epoch": 0.028111377991281914, + "grad_norm": 0.11298052221536636, + "learning_rate": 0.0009991791709555642, + "loss": 3.3614, + "step": 948 + }, + { + "epoch": 0.028141031343593394, + "grad_norm": 0.12766651809215546, + "learning_rate": 0.0009991764738213245, + "loss": 3.371, + "step": 949 + }, + { + "epoch": 0.028170684695904873, + "grad_norm": 0.11765781790018082, + "learning_rate": 0.0009991737722667914, + "loss": 3.4038, + "step": 950 + }, + { + "epoch": 0.02820033804821635, + "grad_norm": 0.13131293654441833, + "learning_rate": 0.000999171066291989, + "loss": 3.4168, + "step": 951 + }, + { + "epoch": 0.02822999140052783, + "grad_norm": 0.13796773552894592, + "learning_rate": 0.000999168355896941, + "loss": 3.4289, + "step": 952 + }, + { + "epoch": 0.02825964475283931, + "grad_norm": 0.1334906369447708, + "learning_rate": 0.0009991656410816717, + "loss": 3.3765, + "step": 953 + }, + { + "epoch": 0.02828929810515079, + "grad_norm": 0.1260908991098404, + "learning_rate": 0.000999162921846205, + "loss": 3.3791, + "step": 954 + }, + { + "epoch": 0.028318951457462264, + "grad_norm": 0.11944028735160828, + "learning_rate": 0.0009991601981905647, + "loss": 3.3736, + "step": 955 + }, + { + "epoch": 0.028348604809773744, + "grad_norm": 0.14735393226146698, + "learning_rate": 0.000999157470114775, + "loss": 3.4047, + "step": 956 + }, + { + "epoch": 0.028378258162085224, + "grad_norm": 0.20263487100601196, + "learning_rate": 0.0009991547376188607, + "loss": 3.4142, + "step": 957 + }, + { + "epoch": 0.028407911514396703, + "grad_norm": 0.24621687829494476, + "learning_rate": 0.0009991520007028452, + "loss": 3.3572, + "step": 958 + }, + { + "epoch": 0.028437564866708183, + "grad_norm": 0.23366869986057281, + "learning_rate": 0.0009991492593667533, + "loss": 3.4206, + "step": 959 + }, + { + "epoch": 0.02846721821901966, + "grad_norm": 0.21197810769081116, + "learning_rate": 0.000999146513610609, + "loss": 3.3761, + "step": 960 + }, + { + "epoch": 0.02849687157133114, + "grad_norm": 0.1928301304578781, + "learning_rate": 0.0009991437634344364, + "loss": 3.3978, + "step": 961 + }, + { + "epoch": 0.02852652492364262, + "grad_norm": 0.15580929815769196, + "learning_rate": 0.0009991410088382603, + "loss": 3.3944, + "step": 962 + }, + { + "epoch": 0.028556178275954098, + "grad_norm": 0.1395106464624405, + "learning_rate": 0.0009991382498221047, + "loss": 3.3878, + "step": 963 + }, + { + "epoch": 0.028585831628265574, + "grad_norm": 0.16585183143615723, + "learning_rate": 0.0009991354863859946, + "loss": 3.3686, + "step": 964 + }, + { + "epoch": 0.028615484980577054, + "grad_norm": 0.16844509541988373, + "learning_rate": 0.0009991327185299536, + "loss": 3.3854, + "step": 965 + }, + { + "epoch": 0.028645138332888533, + "grad_norm": 0.18855324387550354, + "learning_rate": 0.000999129946254007, + "loss": 3.3806, + "step": 966 + }, + { + "epoch": 0.028674791685200013, + "grad_norm": 0.19917811453342438, + "learning_rate": 0.000999127169558179, + "loss": 3.3748, + "step": 967 + }, + { + "epoch": 0.02870444503751149, + "grad_norm": 0.16288860142230988, + "learning_rate": 0.0009991243884424944, + "loss": 3.3685, + "step": 968 + }, + { + "epoch": 0.02873409838982297, + "grad_norm": 0.15356512367725372, + "learning_rate": 0.0009991216029069773, + "loss": 3.4297, + "step": 969 + }, + { + "epoch": 0.028763751742134448, + "grad_norm": 0.1661999225616455, + "learning_rate": 0.000999118812951653, + "loss": 3.3786, + "step": 970 + }, + { + "epoch": 0.028793405094445928, + "grad_norm": 0.17119944095611572, + "learning_rate": 0.000999116018576546, + "loss": 3.4337, + "step": 971 + }, + { + "epoch": 0.028823058446757407, + "grad_norm": 0.13042542338371277, + "learning_rate": 0.0009991132197816807, + "loss": 3.4042, + "step": 972 + }, + { + "epoch": 0.028852711799068884, + "grad_norm": 0.13592076301574707, + "learning_rate": 0.0009991104165670824, + "loss": 3.3597, + "step": 973 + }, + { + "epoch": 0.028882365151380363, + "grad_norm": 0.1673053801059723, + "learning_rate": 0.0009991076089327757, + "loss": 3.3622, + "step": 974 + }, + { + "epoch": 0.028912018503691843, + "grad_norm": 0.14556460082530975, + "learning_rate": 0.0009991047968787854, + "loss": 3.3404, + "step": 975 + }, + { + "epoch": 0.028941671856003322, + "grad_norm": 0.14809955656528473, + "learning_rate": 0.0009991019804051363, + "loss": 3.3625, + "step": 976 + }, + { + "epoch": 0.0289713252083148, + "grad_norm": 0.15520627796649933, + "learning_rate": 0.0009990991595118536, + "loss": 3.3738, + "step": 977 + }, + { + "epoch": 0.029000978560626278, + "grad_norm": 0.13072098791599274, + "learning_rate": 0.0009990963341989622, + "loss": 3.3892, + "step": 978 + }, + { + "epoch": 0.029030631912937758, + "grad_norm": 0.1459171324968338, + "learning_rate": 0.0009990935044664872, + "loss": 3.3602, + "step": 979 + }, + { + "epoch": 0.029060285265249237, + "grad_norm": 0.14466765522956848, + "learning_rate": 0.0009990906703144533, + "loss": 3.3712, + "step": 980 + }, + { + "epoch": 0.029089938617560714, + "grad_norm": 0.13773871958255768, + "learning_rate": 0.000999087831742886, + "loss": 3.3835, + "step": 981 + }, + { + "epoch": 0.029119591969872193, + "grad_norm": 0.12249275296926498, + "learning_rate": 0.0009990849887518104, + "loss": 3.3747, + "step": 982 + }, + { + "epoch": 0.029149245322183673, + "grad_norm": 0.11859538406133652, + "learning_rate": 0.0009990821413412515, + "loss": 3.3233, + "step": 983 + }, + { + "epoch": 0.029178898674495152, + "grad_norm": 0.13092906773090363, + "learning_rate": 0.0009990792895112344, + "loss": 3.3454, + "step": 984 + }, + { + "epoch": 0.029208552026806632, + "grad_norm": 0.1395421326160431, + "learning_rate": 0.0009990764332617845, + "loss": 3.3614, + "step": 985 + }, + { + "epoch": 0.029238205379118108, + "grad_norm": 0.11083746701478958, + "learning_rate": 0.0009990735725929273, + "loss": 3.3145, + "step": 986 + }, + { + "epoch": 0.029267858731429588, + "grad_norm": 0.1351291686296463, + "learning_rate": 0.0009990707075046878, + "loss": 3.3777, + "step": 987 + }, + { + "epoch": 0.029297512083741067, + "grad_norm": 0.11636842787265778, + "learning_rate": 0.0009990678379970916, + "loss": 3.389, + "step": 988 + }, + { + "epoch": 0.029327165436052547, + "grad_norm": 0.16228042542934418, + "learning_rate": 0.0009990649640701642, + "loss": 3.3808, + "step": 989 + }, + { + "epoch": 0.029356818788364023, + "grad_norm": 0.1958370804786682, + "learning_rate": 0.0009990620857239308, + "loss": 3.3815, + "step": 990 + }, + { + "epoch": 0.029386472140675503, + "grad_norm": 0.16409574449062347, + "learning_rate": 0.0009990592029584168, + "loss": 3.3643, + "step": 991 + }, + { + "epoch": 0.029416125492986982, + "grad_norm": 0.15479068458080292, + "learning_rate": 0.000999056315773648, + "loss": 3.3622, + "step": 992 + }, + { + "epoch": 0.029445778845298462, + "grad_norm": 0.2064674198627472, + "learning_rate": 0.0009990534241696499, + "loss": 3.3702, + "step": 993 + }, + { + "epoch": 0.029475432197609938, + "grad_norm": 0.19268183410167694, + "learning_rate": 0.0009990505281464478, + "loss": 3.3606, + "step": 994 + }, + { + "epoch": 0.029505085549921418, + "grad_norm": 0.15210480988025665, + "learning_rate": 0.0009990476277040678, + "loss": 3.3656, + "step": 995 + }, + { + "epoch": 0.029534738902232897, + "grad_norm": 0.1374867856502533, + "learning_rate": 0.0009990447228425355, + "loss": 3.3702, + "step": 996 + }, + { + "epoch": 0.029564392254544377, + "grad_norm": 0.13579338788986206, + "learning_rate": 0.0009990418135618765, + "loss": 3.355, + "step": 997 + }, + { + "epoch": 0.029594045606855857, + "grad_norm": 0.14936432242393494, + "learning_rate": 0.0009990388998621165, + "loss": 3.362, + "step": 998 + }, + { + "epoch": 0.029623698959167333, + "grad_norm": 0.17579972743988037, + "learning_rate": 0.0009990359817432814, + "loss": 3.3037, + "step": 999 + }, + { + "epoch": 0.029653352311478812, + "grad_norm": 0.20174360275268555, + "learning_rate": 0.0009990330592053972, + "loss": 3.3519, + "step": 1000 + }, + { + "epoch": 0.029683005663790292, + "grad_norm": 0.20294705033302307, + "learning_rate": 0.0009990301322484894, + "loss": 3.3823, + "step": 1001 + }, + { + "epoch": 0.02971265901610177, + "grad_norm": 0.14644396305084229, + "learning_rate": 0.000999027200872584, + "loss": 3.3615, + "step": 1002 + }, + { + "epoch": 0.029742312368413248, + "grad_norm": 0.1392974853515625, + "learning_rate": 0.0009990242650777072, + "loss": 3.3713, + "step": 1003 + }, + { + "epoch": 0.029771965720724727, + "grad_norm": 0.1563488245010376, + "learning_rate": 0.000999021324863885, + "loss": 3.3562, + "step": 1004 + }, + { + "epoch": 0.029801619073036207, + "grad_norm": 0.15654148161411285, + "learning_rate": 0.0009990183802311432, + "loss": 3.3886, + "step": 1005 + }, + { + "epoch": 0.029831272425347687, + "grad_norm": 0.13660003244876862, + "learning_rate": 0.0009990154311795081, + "loss": 3.3658, + "step": 1006 + }, + { + "epoch": 0.029860925777659163, + "grad_norm": 0.1616012454032898, + "learning_rate": 0.0009990124777090055, + "loss": 3.352, + "step": 1007 + }, + { + "epoch": 0.029890579129970642, + "grad_norm": 0.1833736002445221, + "learning_rate": 0.0009990095198196618, + "loss": 3.4105, + "step": 1008 + }, + { + "epoch": 0.029920232482282122, + "grad_norm": 0.17853493988513947, + "learning_rate": 0.0009990065575115033, + "loss": 3.3311, + "step": 1009 + }, + { + "epoch": 0.0299498858345936, + "grad_norm": 0.17110998928546906, + "learning_rate": 0.000999003590784556, + "loss": 3.3636, + "step": 1010 + }, + { + "epoch": 0.02997953918690508, + "grad_norm": 0.17940178513526917, + "learning_rate": 0.0009990006196388462, + "loss": 3.3568, + "step": 1011 + }, + { + "epoch": 0.030009192539216557, + "grad_norm": 0.138361856341362, + "learning_rate": 0.0009989976440744003, + "loss": 3.3091, + "step": 1012 + }, + { + "epoch": 0.030038845891528037, + "grad_norm": 0.13654614984989166, + "learning_rate": 0.0009989946640912447, + "loss": 3.3418, + "step": 1013 + }, + { + "epoch": 0.030068499243839517, + "grad_norm": 0.1399814784526825, + "learning_rate": 0.0009989916796894055, + "loss": 3.3609, + "step": 1014 + }, + { + "epoch": 0.030098152596150996, + "grad_norm": 0.13507311046123505, + "learning_rate": 0.0009989886908689095, + "loss": 3.3784, + "step": 1015 + }, + { + "epoch": 0.030127805948462472, + "grad_norm": 0.1904987245798111, + "learning_rate": 0.000998985697629783, + "loss": 3.3363, + "step": 1016 + }, + { + "epoch": 0.030157459300773952, + "grad_norm": 0.19803470373153687, + "learning_rate": 0.0009989826999720524, + "loss": 3.388, + "step": 1017 + }, + { + "epoch": 0.03018711265308543, + "grad_norm": 0.18163329362869263, + "learning_rate": 0.0009989796978957443, + "loss": 3.3039, + "step": 1018 + }, + { + "epoch": 0.03021676600539691, + "grad_norm": 0.14719076454639435, + "learning_rate": 0.0009989766914008855, + "loss": 3.3346, + "step": 1019 + }, + { + "epoch": 0.030246419357708387, + "grad_norm": 0.17520882189273834, + "learning_rate": 0.0009989736804875023, + "loss": 3.35, + "step": 1020 + }, + { + "epoch": 0.030276072710019867, + "grad_norm": 0.1531393975019455, + "learning_rate": 0.0009989706651556216, + "loss": 3.3148, + "step": 1021 + }, + { + "epoch": 0.030305726062331347, + "grad_norm": 0.15307554602622986, + "learning_rate": 0.00099896764540527, + "loss": 3.3303, + "step": 1022 + }, + { + "epoch": 0.030335379414642826, + "grad_norm": 0.1718251258134842, + "learning_rate": 0.0009989646212364743, + "loss": 3.3988, + "step": 1023 + }, + { + "epoch": 0.030365032766954306, + "grad_norm": 0.16382542252540588, + "learning_rate": 0.000998961592649261, + "loss": 3.3381, + "step": 1024 + }, + { + "epoch": 0.030394686119265782, + "grad_norm": 0.1374882459640503, + "learning_rate": 0.0009989585596436572, + "loss": 3.345, + "step": 1025 + }, + { + "epoch": 0.03042433947157726, + "grad_norm": 0.16234079003334045, + "learning_rate": 0.00099895552221969, + "loss": 3.3588, + "step": 1026 + }, + { + "epoch": 0.03045399282388874, + "grad_norm": 0.16714729368686676, + "learning_rate": 0.0009989524803773857, + "loss": 3.3553, + "step": 1027 + }, + { + "epoch": 0.03048364617620022, + "grad_norm": 0.1446862816810608, + "learning_rate": 0.0009989494341167717, + "loss": 3.3929, + "step": 1028 + }, + { + "epoch": 0.030513299528511697, + "grad_norm": 0.15732957422733307, + "learning_rate": 0.0009989463834378749, + "loss": 3.3501, + "step": 1029 + }, + { + "epoch": 0.030542952880823176, + "grad_norm": 0.13050588965415955, + "learning_rate": 0.0009989433283407222, + "loss": 3.334, + "step": 1030 + }, + { + "epoch": 0.030572606233134656, + "grad_norm": 0.13028709590435028, + "learning_rate": 0.0009989402688253405, + "loss": 3.3324, + "step": 1031 + }, + { + "epoch": 0.030602259585446136, + "grad_norm": 0.12160800397396088, + "learning_rate": 0.000998937204891757, + "loss": 3.3417, + "step": 1032 + }, + { + "epoch": 0.030631912937757612, + "grad_norm": 0.11876342445611954, + "learning_rate": 0.0009989341365399993, + "loss": 3.3174, + "step": 1033 + }, + { + "epoch": 0.03066156629006909, + "grad_norm": 0.1173582673072815, + "learning_rate": 0.0009989310637700938, + "loss": 3.3467, + "step": 1034 + }, + { + "epoch": 0.03069121964238057, + "grad_norm": 0.15256018936634064, + "learning_rate": 0.0009989279865820684, + "loss": 3.3216, + "step": 1035 + }, + { + "epoch": 0.03072087299469205, + "grad_norm": 0.19201652705669403, + "learning_rate": 0.0009989249049759499, + "loss": 3.3727, + "step": 1036 + }, + { + "epoch": 0.03075052634700353, + "grad_norm": 0.20195382833480835, + "learning_rate": 0.0009989218189517656, + "loss": 3.3607, + "step": 1037 + }, + { + "epoch": 0.030780179699315006, + "grad_norm": 0.22422738373279572, + "learning_rate": 0.0009989187285095432, + "loss": 3.3363, + "step": 1038 + }, + { + "epoch": 0.030809833051626486, + "grad_norm": 0.21712474524974823, + "learning_rate": 0.0009989156336493096, + "loss": 3.3058, + "step": 1039 + }, + { + "epoch": 0.030839486403937966, + "grad_norm": 0.1764804869890213, + "learning_rate": 0.0009989125343710925, + "loss": 3.3474, + "step": 1040 + }, + { + "epoch": 0.030869139756249445, + "grad_norm": 0.136272132396698, + "learning_rate": 0.0009989094306749194, + "loss": 3.3682, + "step": 1041 + }, + { + "epoch": 0.03089879310856092, + "grad_norm": 0.13105617463588715, + "learning_rate": 0.0009989063225608174, + "loss": 3.3356, + "step": 1042 + }, + { + "epoch": 0.0309284464608724, + "grad_norm": 0.13549314439296722, + "learning_rate": 0.0009989032100288146, + "loss": 3.3635, + "step": 1043 + }, + { + "epoch": 0.03095809981318388, + "grad_norm": 0.13478679955005646, + "learning_rate": 0.000998900093078938, + "loss": 3.3332, + "step": 1044 + }, + { + "epoch": 0.03098775316549536, + "grad_norm": 0.14672693610191345, + "learning_rate": 0.0009988969717112156, + "loss": 3.331, + "step": 1045 + }, + { + "epoch": 0.031017406517806836, + "grad_norm": 0.1738240122795105, + "learning_rate": 0.0009988938459256746, + "loss": 3.3617, + "step": 1046 + }, + { + "epoch": 0.031047059870118316, + "grad_norm": 0.14559821784496307, + "learning_rate": 0.0009988907157223433, + "loss": 3.3415, + "step": 1047 + }, + { + "epoch": 0.031076713222429796, + "grad_norm": 0.15308025479316711, + "learning_rate": 0.0009988875811012489, + "loss": 3.3561, + "step": 1048 + }, + { + "epoch": 0.031106366574741275, + "grad_norm": 0.13334397971630096, + "learning_rate": 0.0009988844420624195, + "loss": 3.3758, + "step": 1049 + }, + { + "epoch": 0.031136019927052755, + "grad_norm": 0.1323356330394745, + "learning_rate": 0.0009988812986058825, + "loss": 3.3611, + "step": 1050 + }, + { + "epoch": 0.03116567327936423, + "grad_norm": 0.12457907944917679, + "learning_rate": 0.000998878150731666, + "loss": 3.3369, + "step": 1051 + }, + { + "epoch": 0.03119532663167571, + "grad_norm": 0.11913447827100754, + "learning_rate": 0.000998874998439798, + "loss": 3.3006, + "step": 1052 + }, + { + "epoch": 0.03122497998398719, + "grad_norm": 0.14749464392662048, + "learning_rate": 0.000998871841730306, + "loss": 3.3437, + "step": 1053 + }, + { + "epoch": 0.031254633336298666, + "grad_norm": 0.1910131871700287, + "learning_rate": 0.0009988686806032185, + "loss": 3.3331, + "step": 1054 + }, + { + "epoch": 0.03128428668861015, + "grad_norm": 0.20547601580619812, + "learning_rate": 0.0009988655150585631, + "loss": 3.3269, + "step": 1055 + }, + { + "epoch": 0.031313940040921626, + "grad_norm": 0.16925400495529175, + "learning_rate": 0.0009988623450963678, + "loss": 3.3558, + "step": 1056 + }, + { + "epoch": 0.0313435933932331, + "grad_norm": 0.15093067288398743, + "learning_rate": 0.000998859170716661, + "loss": 3.367, + "step": 1057 + }, + { + "epoch": 0.031373246745544585, + "grad_norm": 0.20205602049827576, + "learning_rate": 0.0009988559919194707, + "loss": 3.3502, + "step": 1058 + }, + { + "epoch": 0.03140290009785606, + "grad_norm": 0.16259817779064178, + "learning_rate": 0.0009988528087048248, + "loss": 3.3339, + "step": 1059 + }, + { + "epoch": 0.031432553450167544, + "grad_norm": 0.15040993690490723, + "learning_rate": 0.0009988496210727516, + "loss": 3.2668, + "step": 1060 + }, + { + "epoch": 0.03146220680247902, + "grad_norm": 0.14104129374027252, + "learning_rate": 0.0009988464290232794, + "loss": 3.3355, + "step": 1061 + }, + { + "epoch": 0.031491860154790496, + "grad_norm": 0.15340903401374817, + "learning_rate": 0.0009988432325564365, + "loss": 3.3295, + "step": 1062 + }, + { + "epoch": 0.03152151350710198, + "grad_norm": 0.12461686879396439, + "learning_rate": 0.000998840031672251, + "loss": 3.3701, + "step": 1063 + }, + { + "epoch": 0.031551166859413456, + "grad_norm": 0.14545580744743347, + "learning_rate": 0.0009988368263707517, + "loss": 3.3244, + "step": 1064 + }, + { + "epoch": 0.03158082021172494, + "grad_norm": 0.16092830896377563, + "learning_rate": 0.0009988336166519664, + "loss": 3.3296, + "step": 1065 + }, + { + "epoch": 0.031610473564036415, + "grad_norm": 0.15279744565486908, + "learning_rate": 0.0009988304025159238, + "loss": 3.3151, + "step": 1066 + }, + { + "epoch": 0.03164012691634789, + "grad_norm": 0.15908300876617432, + "learning_rate": 0.0009988271839626525, + "loss": 3.3577, + "step": 1067 + }, + { + "epoch": 0.031669780268659374, + "grad_norm": 0.1814371645450592, + "learning_rate": 0.000998823960992181, + "loss": 3.3358, + "step": 1068 + }, + { + "epoch": 0.03169943362097085, + "grad_norm": 0.20290109515190125, + "learning_rate": 0.0009988207336045375, + "loss": 3.3189, + "step": 1069 + }, + { + "epoch": 0.031729086973282326, + "grad_norm": 0.1747296303510666, + "learning_rate": 0.0009988175017997508, + "loss": 3.3145, + "step": 1070 + }, + { + "epoch": 0.03175874032559381, + "grad_norm": 0.13760963082313538, + "learning_rate": 0.0009988142655778494, + "loss": 3.3348, + "step": 1071 + }, + { + "epoch": 0.031788393677905286, + "grad_norm": 0.13494150340557098, + "learning_rate": 0.0009988110249388622, + "loss": 3.323, + "step": 1072 + }, + { + "epoch": 0.03181804703021677, + "grad_norm": 0.13324330747127533, + "learning_rate": 0.0009988077798828178, + "loss": 3.3391, + "step": 1073 + }, + { + "epoch": 0.031847700382528245, + "grad_norm": 0.125082865357399, + "learning_rate": 0.0009988045304097448, + "loss": 3.3003, + "step": 1074 + }, + { + "epoch": 0.03187735373483972, + "grad_norm": 0.1126454621553421, + "learning_rate": 0.000998801276519672, + "loss": 3.317, + "step": 1075 + }, + { + "epoch": 0.031907007087151204, + "grad_norm": 0.1309182345867157, + "learning_rate": 0.0009987980182126284, + "loss": 3.3605, + "step": 1076 + }, + { + "epoch": 0.03193666043946268, + "grad_norm": 0.13478223979473114, + "learning_rate": 0.0009987947554886427, + "loss": 3.3213, + "step": 1077 + }, + { + "epoch": 0.03196631379177416, + "grad_norm": 0.14128144085407257, + "learning_rate": 0.0009987914883477437, + "loss": 3.3188, + "step": 1078 + }, + { + "epoch": 0.03199596714408564, + "grad_norm": 0.15783387422561646, + "learning_rate": 0.0009987882167899608, + "loss": 3.3285, + "step": 1079 + }, + { + "epoch": 0.032025620496397116, + "grad_norm": 0.13681860268115997, + "learning_rate": 0.0009987849408153223, + "loss": 3.3514, + "step": 1080 + }, + { + "epoch": 0.0320552738487086, + "grad_norm": 0.14964190125465393, + "learning_rate": 0.0009987816604238575, + "loss": 3.3465, + "step": 1081 + }, + { + "epoch": 0.032084927201020075, + "grad_norm": 0.16151084005832672, + "learning_rate": 0.0009987783756155958, + "loss": 3.3588, + "step": 1082 + }, + { + "epoch": 0.03211458055333155, + "grad_norm": 0.15999139845371246, + "learning_rate": 0.0009987750863905658, + "loss": 3.3078, + "step": 1083 + }, + { + "epoch": 0.032144233905643034, + "grad_norm": 0.19426235556602478, + "learning_rate": 0.0009987717927487968, + "loss": 3.2811, + "step": 1084 + }, + { + "epoch": 0.03217388725795451, + "grad_norm": 0.18917734920978546, + "learning_rate": 0.000998768494690318, + "loss": 3.3403, + "step": 1085 + }, + { + "epoch": 0.03220354061026599, + "grad_norm": 0.16947659850120544, + "learning_rate": 0.0009987651922151585, + "loss": 3.3216, + "step": 1086 + }, + { + "epoch": 0.03223319396257747, + "grad_norm": 0.16568775475025177, + "learning_rate": 0.0009987618853233475, + "loss": 3.3253, + "step": 1087 + }, + { + "epoch": 0.032262847314888946, + "grad_norm": 0.15306630730628967, + "learning_rate": 0.0009987585740149146, + "loss": 3.316, + "step": 1088 + }, + { + "epoch": 0.03229250066720043, + "grad_norm": 0.1569882482290268, + "learning_rate": 0.0009987552582898887, + "loss": 3.3367, + "step": 1089 + }, + { + "epoch": 0.032322154019511905, + "grad_norm": 0.1827508509159088, + "learning_rate": 0.0009987519381482995, + "loss": 3.3245, + "step": 1090 + }, + { + "epoch": 0.03235180737182339, + "grad_norm": 0.1879909336566925, + "learning_rate": 0.0009987486135901763, + "loss": 3.3247, + "step": 1091 + }, + { + "epoch": 0.032381460724134864, + "grad_norm": 0.1745413988828659, + "learning_rate": 0.0009987452846155485, + "loss": 3.3333, + "step": 1092 + }, + { + "epoch": 0.03241111407644634, + "grad_norm": 0.15445344150066376, + "learning_rate": 0.0009987419512244456, + "loss": 3.3022, + "step": 1093 + }, + { + "epoch": 0.03244076742875782, + "grad_norm": 0.12734057009220123, + "learning_rate": 0.0009987386134168972, + "loss": 3.3124, + "step": 1094 + }, + { + "epoch": 0.0324704207810693, + "grad_norm": 0.14649274945259094, + "learning_rate": 0.0009987352711929326, + "loss": 3.3631, + "step": 1095 + }, + { + "epoch": 0.032500074133380775, + "grad_norm": 0.13812966644763947, + "learning_rate": 0.0009987319245525817, + "loss": 3.2954, + "step": 1096 + }, + { + "epoch": 0.03252972748569226, + "grad_norm": 0.1579577773809433, + "learning_rate": 0.000998728573495874, + "loss": 3.3555, + "step": 1097 + }, + { + "epoch": 0.032559380838003735, + "grad_norm": 0.17296075820922852, + "learning_rate": 0.000998725218022839, + "loss": 3.3223, + "step": 1098 + }, + { + "epoch": 0.03258903419031522, + "grad_norm": 0.1603272259235382, + "learning_rate": 0.0009987218581335067, + "loss": 3.3104, + "step": 1099 + }, + { + "epoch": 0.032618687542626694, + "grad_norm": 0.17921245098114014, + "learning_rate": 0.0009987184938279067, + "loss": 3.3315, + "step": 1100 + }, + { + "epoch": 0.03264834089493817, + "grad_norm": 0.16884484887123108, + "learning_rate": 0.000998715125106069, + "loss": 3.3189, + "step": 1101 + }, + { + "epoch": 0.03267799424724965, + "grad_norm": 0.16542035341262817, + "learning_rate": 0.000998711751968023, + "loss": 3.3084, + "step": 1102 + }, + { + "epoch": 0.03270764759956113, + "grad_norm": 0.16732650995254517, + "learning_rate": 0.000998708374413799, + "loss": 3.3211, + "step": 1103 + }, + { + "epoch": 0.03273730095187261, + "grad_norm": 0.1824609488248825, + "learning_rate": 0.000998704992443427, + "loss": 3.3223, + "step": 1104 + }, + { + "epoch": 0.03276695430418409, + "grad_norm": 0.13073886930942535, + "learning_rate": 0.0009987016060569362, + "loss": 3.3531, + "step": 1105 + }, + { + "epoch": 0.032796607656495565, + "grad_norm": 0.1385257989168167, + "learning_rate": 0.0009986982152543574, + "loss": 3.2788, + "step": 1106 + }, + { + "epoch": 0.03282626100880705, + "grad_norm": 0.15115046501159668, + "learning_rate": 0.0009986948200357202, + "loss": 3.3487, + "step": 1107 + }, + { + "epoch": 0.032855914361118524, + "grad_norm": 0.14457787573337555, + "learning_rate": 0.0009986914204010548, + "loss": 3.3294, + "step": 1108 + }, + { + "epoch": 0.03288556771343, + "grad_norm": 0.1274213045835495, + "learning_rate": 0.0009986880163503913, + "loss": 3.2976, + "step": 1109 + }, + { + "epoch": 0.03291522106574148, + "grad_norm": 0.12270127981901169, + "learning_rate": 0.0009986846078837597, + "loss": 3.3269, + "step": 1110 + }, + { + "epoch": 0.03294487441805296, + "grad_norm": 0.13571977615356445, + "learning_rate": 0.0009986811950011903, + "loss": 3.357, + "step": 1111 + }, + { + "epoch": 0.03297452777036444, + "grad_norm": 0.13858433067798615, + "learning_rate": 0.0009986777777027133, + "loss": 3.2903, + "step": 1112 + }, + { + "epoch": 0.03300418112267592, + "grad_norm": 0.13173320889472961, + "learning_rate": 0.0009986743559883592, + "loss": 3.2902, + "step": 1113 + }, + { + "epoch": 0.033033834474987395, + "grad_norm": 0.14811553061008453, + "learning_rate": 0.0009986709298581578, + "loss": 3.3125, + "step": 1114 + }, + { + "epoch": 0.03306348782729888, + "grad_norm": 0.1543726623058319, + "learning_rate": 0.00099866749931214, + "loss": 3.3198, + "step": 1115 + }, + { + "epoch": 0.033093141179610354, + "grad_norm": 0.16919025778770447, + "learning_rate": 0.0009986640643503358, + "loss": 3.2943, + "step": 1116 + }, + { + "epoch": 0.03312279453192184, + "grad_norm": 0.19615085422992706, + "learning_rate": 0.0009986606249727757, + "loss": 3.3661, + "step": 1117 + }, + { + "epoch": 0.03315244788423331, + "grad_norm": 0.18044164776802063, + "learning_rate": 0.00099865718117949, + "loss": 3.3234, + "step": 1118 + }, + { + "epoch": 0.03318210123654479, + "grad_norm": 0.14874958992004395, + "learning_rate": 0.0009986537329705098, + "loss": 3.2978, + "step": 1119 + }, + { + "epoch": 0.03321175458885627, + "grad_norm": 0.15448588132858276, + "learning_rate": 0.0009986502803458646, + "loss": 3.3038, + "step": 1120 + }, + { + "epoch": 0.03324140794116775, + "grad_norm": 0.17124629020690918, + "learning_rate": 0.000998646823305586, + "loss": 3.3418, + "step": 1121 + }, + { + "epoch": 0.033271061293479225, + "grad_norm": 0.19051948189735413, + "learning_rate": 0.000998643361849704, + "loss": 3.305, + "step": 1122 + }, + { + "epoch": 0.03330071464579071, + "grad_norm": 0.15571105480194092, + "learning_rate": 0.0009986398959782497, + "loss": 3.2963, + "step": 1123 + }, + { + "epoch": 0.033330367998102184, + "grad_norm": 0.14054855704307556, + "learning_rate": 0.0009986364256912533, + "loss": 3.32, + "step": 1124 + }, + { + "epoch": 0.03336002135041367, + "grad_norm": 0.12594832479953766, + "learning_rate": 0.0009986329509887458, + "loss": 3.3023, + "step": 1125 + }, + { + "epoch": 0.03338967470272514, + "grad_norm": 0.12498734146356583, + "learning_rate": 0.000998629471870758, + "loss": 3.3148, + "step": 1126 + }, + { + "epoch": 0.03341932805503662, + "grad_norm": 0.12582840025424957, + "learning_rate": 0.0009986259883373206, + "loss": 3.316, + "step": 1127 + }, + { + "epoch": 0.0334489814073481, + "grad_norm": 0.1399635225534439, + "learning_rate": 0.0009986225003884644, + "loss": 3.3134, + "step": 1128 + }, + { + "epoch": 0.03347863475965958, + "grad_norm": 0.12241680175065994, + "learning_rate": 0.0009986190080242202, + "loss": 3.2617, + "step": 1129 + }, + { + "epoch": 0.03350828811197106, + "grad_norm": 0.13458748161792755, + "learning_rate": 0.0009986155112446196, + "loss": 3.3318, + "step": 1130 + }, + { + "epoch": 0.03353794146428254, + "grad_norm": 0.13544143736362457, + "learning_rate": 0.0009986120100496927, + "loss": 3.3409, + "step": 1131 + }, + { + "epoch": 0.033567594816594014, + "grad_norm": 0.1244361400604248, + "learning_rate": 0.000998608504439471, + "loss": 3.2878, + "step": 1132 + }, + { + "epoch": 0.0335972481689055, + "grad_norm": 0.1241643875837326, + "learning_rate": 0.0009986049944139853, + "loss": 3.2885, + "step": 1133 + }, + { + "epoch": 0.03362690152121697, + "grad_norm": 0.1203351840376854, + "learning_rate": 0.0009986014799732669, + "loss": 3.2974, + "step": 1134 + }, + { + "epoch": 0.03365655487352845, + "grad_norm": 0.1367395669221878, + "learning_rate": 0.0009985979611173469, + "loss": 3.3199, + "step": 1135 + }, + { + "epoch": 0.03368620822583993, + "grad_norm": 0.16242288053035736, + "learning_rate": 0.0009985944378462562, + "loss": 3.2981, + "step": 1136 + }, + { + "epoch": 0.03371586157815141, + "grad_norm": 0.18292765319347382, + "learning_rate": 0.000998590910160026, + "loss": 3.2879, + "step": 1137 + }, + { + "epoch": 0.03374551493046289, + "grad_norm": 0.18435773253440857, + "learning_rate": 0.000998587378058688, + "loss": 3.2908, + "step": 1138 + }, + { + "epoch": 0.03377516828277437, + "grad_norm": 0.1717274934053421, + "learning_rate": 0.0009985838415422733, + "loss": 3.2911, + "step": 1139 + }, + { + "epoch": 0.033804821635085844, + "grad_norm": 0.17733153700828552, + "learning_rate": 0.0009985803006108127, + "loss": 3.3042, + "step": 1140 + }, + { + "epoch": 0.03383447498739733, + "grad_norm": 0.18396557867527008, + "learning_rate": 0.0009985767552643382, + "loss": 3.3204, + "step": 1141 + }, + { + "epoch": 0.0338641283397088, + "grad_norm": 0.18847046792507172, + "learning_rate": 0.000998573205502881, + "loss": 3.3251, + "step": 1142 + }, + { + "epoch": 0.033893781692020286, + "grad_norm": 0.20578326284885406, + "learning_rate": 0.0009985696513264723, + "loss": 3.2974, + "step": 1143 + }, + { + "epoch": 0.03392343504433176, + "grad_norm": 0.19052313268184662, + "learning_rate": 0.0009985660927351438, + "loss": 3.332, + "step": 1144 + }, + { + "epoch": 0.03395308839664324, + "grad_norm": 0.1762562096118927, + "learning_rate": 0.000998562529728927, + "loss": 3.2932, + "step": 1145 + }, + { + "epoch": 0.03398274174895472, + "grad_norm": 0.1608876734972, + "learning_rate": 0.0009985589623078535, + "loss": 3.2994, + "step": 1146 + }, + { + "epoch": 0.0340123951012662, + "grad_norm": 0.15478192269802094, + "learning_rate": 0.0009985553904719548, + "loss": 3.316, + "step": 1147 + }, + { + "epoch": 0.034042048453577674, + "grad_norm": 0.145472452044487, + "learning_rate": 0.0009985518142212625, + "loss": 3.3142, + "step": 1148 + }, + { + "epoch": 0.03407170180588916, + "grad_norm": 0.18035681545734406, + "learning_rate": 0.0009985482335558085, + "loss": 3.2735, + "step": 1149 + }, + { + "epoch": 0.03410135515820063, + "grad_norm": 0.19270533323287964, + "learning_rate": 0.000998544648475624, + "loss": 3.3043, + "step": 1150 + }, + { + "epoch": 0.034131008510512116, + "grad_norm": 0.1379501223564148, + "learning_rate": 0.0009985410589807412, + "loss": 3.3347, + "step": 1151 + }, + { + "epoch": 0.03416066186282359, + "grad_norm": 0.12201102823019028, + "learning_rate": 0.0009985374650711917, + "loss": 3.2767, + "step": 1152 + }, + { + "epoch": 0.03419031521513507, + "grad_norm": 0.12263856083154678, + "learning_rate": 0.0009985338667470075, + "loss": 3.285, + "step": 1153 + }, + { + "epoch": 0.03421996856744655, + "grad_norm": 0.13254107534885406, + "learning_rate": 0.0009985302640082203, + "loss": 3.3142, + "step": 1154 + }, + { + "epoch": 0.03424962191975803, + "grad_norm": 0.14229363203048706, + "learning_rate": 0.000998526656854862, + "loss": 3.3179, + "step": 1155 + }, + { + "epoch": 0.03427927527206951, + "grad_norm": 0.13499978184700012, + "learning_rate": 0.0009985230452869646, + "loss": 3.3191, + "step": 1156 + }, + { + "epoch": 0.03430892862438099, + "grad_norm": 0.11768639832735062, + "learning_rate": 0.00099851942930456, + "loss": 3.2595, + "step": 1157 + }, + { + "epoch": 0.03433858197669246, + "grad_norm": 0.11292122304439545, + "learning_rate": 0.0009985158089076804, + "loss": 3.3021, + "step": 1158 + }, + { + "epoch": 0.034368235329003946, + "grad_norm": 0.14030654728412628, + "learning_rate": 0.0009985121840963575, + "loss": 3.3144, + "step": 1159 + }, + { + "epoch": 0.03439788868131542, + "grad_norm": 0.19377990067005157, + "learning_rate": 0.000998508554870624, + "loss": 3.3098, + "step": 1160 + }, + { + "epoch": 0.0344275420336269, + "grad_norm": 0.16929931938648224, + "learning_rate": 0.0009985049212305115, + "loss": 3.33, + "step": 1161 + }, + { + "epoch": 0.03445719538593838, + "grad_norm": 0.15381009876728058, + "learning_rate": 0.0009985012831760522, + "loss": 3.3025, + "step": 1162 + }, + { + "epoch": 0.03448684873824986, + "grad_norm": 0.15330329537391663, + "learning_rate": 0.0009984976407072788, + "loss": 3.2898, + "step": 1163 + }, + { + "epoch": 0.03451650209056134, + "grad_norm": 0.14523157477378845, + "learning_rate": 0.000998493993824223, + "loss": 3.2813, + "step": 1164 + }, + { + "epoch": 0.03454615544287282, + "grad_norm": 0.157217338681221, + "learning_rate": 0.0009984903425269173, + "loss": 3.2933, + "step": 1165 + }, + { + "epoch": 0.03457580879518429, + "grad_norm": 0.15900510549545288, + "learning_rate": 0.000998486686815394, + "loss": 3.3013, + "step": 1166 + }, + { + "epoch": 0.034605462147495776, + "grad_norm": 0.15883080661296844, + "learning_rate": 0.000998483026689686, + "loss": 3.3056, + "step": 1167 + }, + { + "epoch": 0.03463511549980725, + "grad_norm": 0.15795861184597015, + "learning_rate": 0.0009984793621498247, + "loss": 3.2801, + "step": 1168 + }, + { + "epoch": 0.034664768852118735, + "grad_norm": 0.15574131906032562, + "learning_rate": 0.0009984756931958431, + "loss": 3.2994, + "step": 1169 + }, + { + "epoch": 0.03469442220443021, + "grad_norm": 0.12283702194690704, + "learning_rate": 0.000998472019827774, + "loss": 3.2683, + "step": 1170 + }, + { + "epoch": 0.03472407555674169, + "grad_norm": 0.12808606028556824, + "learning_rate": 0.0009984683420456496, + "loss": 3.2851, + "step": 1171 + }, + { + "epoch": 0.03475372890905317, + "grad_norm": 0.15359346568584442, + "learning_rate": 0.0009984646598495022, + "loss": 3.252, + "step": 1172 + }, + { + "epoch": 0.03478338226136465, + "grad_norm": 0.18579630553722382, + "learning_rate": 0.0009984609732393648, + "loss": 3.2928, + "step": 1173 + }, + { + "epoch": 0.03481303561367612, + "grad_norm": 0.20440632104873657, + "learning_rate": 0.00099845728221527, + "loss": 3.2706, + "step": 1174 + }, + { + "epoch": 0.034842688965987606, + "grad_norm": 0.20564450323581696, + "learning_rate": 0.0009984535867772501, + "loss": 3.2924, + "step": 1175 + }, + { + "epoch": 0.03487234231829908, + "grad_norm": 0.19930849969387054, + "learning_rate": 0.0009984498869253385, + "loss": 3.283, + "step": 1176 + }, + { + "epoch": 0.034901995670610565, + "grad_norm": 0.1776721179485321, + "learning_rate": 0.0009984461826595674, + "loss": 3.311, + "step": 1177 + }, + { + "epoch": 0.03493164902292204, + "grad_norm": 0.17234157025814056, + "learning_rate": 0.0009984424739799698, + "loss": 3.2678, + "step": 1178 + }, + { + "epoch": 0.03496130237523352, + "grad_norm": 0.14205694198608398, + "learning_rate": 0.0009984387608865785, + "loss": 3.2763, + "step": 1179 + }, + { + "epoch": 0.034990955727545, + "grad_norm": 0.14800933003425598, + "learning_rate": 0.0009984350433794266, + "loss": 3.3182, + "step": 1180 + }, + { + "epoch": 0.03502060907985648, + "grad_norm": 0.16076980531215668, + "learning_rate": 0.0009984313214585468, + "loss": 3.2454, + "step": 1181 + }, + { + "epoch": 0.03505026243216796, + "grad_norm": 0.1611899584531784, + "learning_rate": 0.0009984275951239719, + "loss": 3.2528, + "step": 1182 + }, + { + "epoch": 0.035079915784479436, + "grad_norm": 0.14336656033992767, + "learning_rate": 0.0009984238643757353, + "loss": 3.2947, + "step": 1183 + }, + { + "epoch": 0.03510956913679091, + "grad_norm": 0.1708669513463974, + "learning_rate": 0.0009984201292138697, + "loss": 3.2994, + "step": 1184 + }, + { + "epoch": 0.035139222489102395, + "grad_norm": 0.12990067899227142, + "learning_rate": 0.0009984163896384084, + "loss": 3.3092, + "step": 1185 + }, + { + "epoch": 0.03516887584141387, + "grad_norm": 0.13291899859905243, + "learning_rate": 0.0009984126456493842, + "loss": 3.2665, + "step": 1186 + }, + { + "epoch": 0.03519852919372535, + "grad_norm": 0.15190526843070984, + "learning_rate": 0.0009984088972468308, + "loss": 3.2706, + "step": 1187 + }, + { + "epoch": 0.03522818254603683, + "grad_norm": 0.1484754979610443, + "learning_rate": 0.0009984051444307809, + "loss": 3.3033, + "step": 1188 + }, + { + "epoch": 0.03525783589834831, + "grad_norm": 0.14188642799854279, + "learning_rate": 0.000998401387201268, + "loss": 3.2677, + "step": 1189 + }, + { + "epoch": 0.03528748925065979, + "grad_norm": 0.13993458449840546, + "learning_rate": 0.000998397625558325, + "loss": 3.2829, + "step": 1190 + }, + { + "epoch": 0.035317142602971266, + "grad_norm": 0.1457710713148117, + "learning_rate": 0.0009983938595019856, + "loss": 3.2366, + "step": 1191 + }, + { + "epoch": 0.03534679595528274, + "grad_norm": 0.13349732756614685, + "learning_rate": 0.000998390089032283, + "loss": 3.2845, + "step": 1192 + }, + { + "epoch": 0.035376449307594225, + "grad_norm": 0.13364729285240173, + "learning_rate": 0.0009983863141492506, + "loss": 3.3235, + "step": 1193 + }, + { + "epoch": 0.0354061026599057, + "grad_norm": 0.14987614750862122, + "learning_rate": 0.000998382534852922, + "loss": 3.2941, + "step": 1194 + }, + { + "epoch": 0.035435756012217184, + "grad_norm": 0.16358329355716705, + "learning_rate": 0.0009983787511433303, + "loss": 3.2507, + "step": 1195 + }, + { + "epoch": 0.03546540936452866, + "grad_norm": 0.19246752560138702, + "learning_rate": 0.0009983749630205095, + "loss": 3.268, + "step": 1196 + }, + { + "epoch": 0.03549506271684014, + "grad_norm": 0.18890410661697388, + "learning_rate": 0.0009983711704844927, + "loss": 3.2912, + "step": 1197 + }, + { + "epoch": 0.03552471606915162, + "grad_norm": 0.22735057771205902, + "learning_rate": 0.0009983673735353136, + "loss": 3.2807, + "step": 1198 + }, + { + "epoch": 0.035554369421463096, + "grad_norm": 0.20899879932403564, + "learning_rate": 0.000998363572173006, + "loss": 3.3153, + "step": 1199 + }, + { + "epoch": 0.03558402277377457, + "grad_norm": 0.1985611766576767, + "learning_rate": 0.0009983597663976032, + "loss": 3.2902, + "step": 1200 + }, + { + "epoch": 0.035613676126086055, + "grad_norm": 0.1939896047115326, + "learning_rate": 0.0009983559562091392, + "loss": 3.3119, + "step": 1201 + }, + { + "epoch": 0.03564332947839753, + "grad_norm": 0.14265739917755127, + "learning_rate": 0.0009983521416076478, + "loss": 3.2906, + "step": 1202 + }, + { + "epoch": 0.035672982830709014, + "grad_norm": 0.1582029163837433, + "learning_rate": 0.0009983483225931625, + "loss": 3.2665, + "step": 1203 + }, + { + "epoch": 0.03570263618302049, + "grad_norm": 0.14525839686393738, + "learning_rate": 0.0009983444991657174, + "loss": 3.2486, + "step": 1204 + }, + { + "epoch": 0.03573228953533197, + "grad_norm": 0.13567465543746948, + "learning_rate": 0.0009983406713253461, + "loss": 3.2676, + "step": 1205 + }, + { + "epoch": 0.03576194288764345, + "grad_norm": 0.12761691212654114, + "learning_rate": 0.0009983368390720827, + "loss": 3.2695, + "step": 1206 + }, + { + "epoch": 0.035791596239954926, + "grad_norm": 0.1179099753499031, + "learning_rate": 0.000998333002405961, + "loss": 3.2521, + "step": 1207 + }, + { + "epoch": 0.03582124959226641, + "grad_norm": 0.1322491616010666, + "learning_rate": 0.000998329161327015, + "loss": 3.2825, + "step": 1208 + }, + { + "epoch": 0.035850902944577885, + "grad_norm": 0.1282850056886673, + "learning_rate": 0.0009983253158352787, + "loss": 3.2721, + "step": 1209 + }, + { + "epoch": 0.03588055629688936, + "grad_norm": 0.12417730689048767, + "learning_rate": 0.0009983214659307865, + "loss": 3.2634, + "step": 1210 + }, + { + "epoch": 0.035910209649200844, + "grad_norm": 0.13575567305088043, + "learning_rate": 0.0009983176116135717, + "loss": 3.2834, + "step": 1211 + }, + { + "epoch": 0.03593986300151232, + "grad_norm": 0.13724413514137268, + "learning_rate": 0.0009983137528836693, + "loss": 3.2778, + "step": 1212 + }, + { + "epoch": 0.0359695163538238, + "grad_norm": 0.15397842228412628, + "learning_rate": 0.000998309889741113, + "loss": 3.2694, + "step": 1213 + }, + { + "epoch": 0.03599916970613528, + "grad_norm": 0.14653156697750092, + "learning_rate": 0.000998306022185937, + "loss": 3.2545, + "step": 1214 + }, + { + "epoch": 0.036028823058446756, + "grad_norm": 0.15764033794403076, + "learning_rate": 0.0009983021502181757, + "loss": 3.2798, + "step": 1215 + }, + { + "epoch": 0.03605847641075824, + "grad_norm": 0.17976851761341095, + "learning_rate": 0.0009982982738378633, + "loss": 3.3031, + "step": 1216 + }, + { + "epoch": 0.036088129763069715, + "grad_norm": 0.15977974236011505, + "learning_rate": 0.000998294393045034, + "loss": 3.2853, + "step": 1217 + }, + { + "epoch": 0.03611778311538119, + "grad_norm": 0.1580987274646759, + "learning_rate": 0.0009982905078397227, + "loss": 3.2623, + "step": 1218 + }, + { + "epoch": 0.036147436467692674, + "grad_norm": 0.1343582421541214, + "learning_rate": 0.0009982866182219631, + "loss": 3.2523, + "step": 1219 + }, + { + "epoch": 0.03617708982000415, + "grad_norm": 0.14424677193164825, + "learning_rate": 0.0009982827241917902, + "loss": 3.2801, + "step": 1220 + }, + { + "epoch": 0.036206743172315634, + "grad_norm": 0.14698562026023865, + "learning_rate": 0.000998278825749238, + "loss": 3.3149, + "step": 1221 + }, + { + "epoch": 0.03623639652462711, + "grad_norm": 0.1464674025774002, + "learning_rate": 0.0009982749228943414, + "loss": 3.2598, + "step": 1222 + }, + { + "epoch": 0.036266049876938586, + "grad_norm": 0.17159350216388702, + "learning_rate": 0.0009982710156271348, + "loss": 3.2804, + "step": 1223 + }, + { + "epoch": 0.03629570322925007, + "grad_norm": 0.21752230823040009, + "learning_rate": 0.000998267103947653, + "loss": 3.3009, + "step": 1224 + }, + { + "epoch": 0.036325356581561545, + "grad_norm": 0.2240111082792282, + "learning_rate": 0.0009982631878559303, + "loss": 3.2904, + "step": 1225 + }, + { + "epoch": 0.03635500993387302, + "grad_norm": 0.20685036480426788, + "learning_rate": 0.0009982592673520015, + "loss": 3.2832, + "step": 1226 + }, + { + "epoch": 0.036384663286184504, + "grad_norm": 0.22207878530025482, + "learning_rate": 0.0009982553424359012, + "loss": 3.2631, + "step": 1227 + }, + { + "epoch": 0.03641431663849598, + "grad_norm": 0.17259229719638824, + "learning_rate": 0.0009982514131076647, + "loss": 3.3167, + "step": 1228 + }, + { + "epoch": 0.036443969990807464, + "grad_norm": 0.15339523553848267, + "learning_rate": 0.0009982474793673263, + "loss": 3.2516, + "step": 1229 + }, + { + "epoch": 0.03647362334311894, + "grad_norm": 0.1534426212310791, + "learning_rate": 0.000998243541214921, + "loss": 3.2625, + "step": 1230 + }, + { + "epoch": 0.036503276695430416, + "grad_norm": 0.1451827585697174, + "learning_rate": 0.0009982395986504835, + "loss": 3.2779, + "step": 1231 + }, + { + "epoch": 0.0365329300477419, + "grad_norm": 0.1488625705242157, + "learning_rate": 0.0009982356516740488, + "loss": 3.2918, + "step": 1232 + }, + { + "epoch": 0.036562583400053375, + "grad_norm": 0.15239018201828003, + "learning_rate": 0.000998231700285652, + "loss": 3.3114, + "step": 1233 + }, + { + "epoch": 0.03659223675236486, + "grad_norm": 0.1376861333847046, + "learning_rate": 0.0009982277444853277, + "loss": 3.2586, + "step": 1234 + }, + { + "epoch": 0.036621890104676334, + "grad_norm": 0.14986158907413483, + "learning_rate": 0.0009982237842731116, + "loss": 3.268, + "step": 1235 + }, + { + "epoch": 0.03665154345698781, + "grad_norm": 0.16020646691322327, + "learning_rate": 0.0009982198196490382, + "loss": 3.2755, + "step": 1236 + }, + { + "epoch": 0.036681196809299293, + "grad_norm": 0.14761695265769958, + "learning_rate": 0.0009982158506131426, + "loss": 3.2595, + "step": 1237 + }, + { + "epoch": 0.03671085016161077, + "grad_norm": 0.15076595544815063, + "learning_rate": 0.0009982118771654604, + "loss": 3.2714, + "step": 1238 + }, + { + "epoch": 0.036740503513922246, + "grad_norm": 0.13587453961372375, + "learning_rate": 0.0009982078993060264, + "loss": 3.2802, + "step": 1239 + }, + { + "epoch": 0.03677015686623373, + "grad_norm": 0.15622973442077637, + "learning_rate": 0.000998203917034876, + "loss": 3.2674, + "step": 1240 + }, + { + "epoch": 0.036799810218545205, + "grad_norm": 0.1519775092601776, + "learning_rate": 0.0009981999303520443, + "loss": 3.2637, + "step": 1241 + }, + { + "epoch": 0.03682946357085669, + "grad_norm": 0.14545674622058868, + "learning_rate": 0.0009981959392575666, + "loss": 3.2492, + "step": 1242 + }, + { + "epoch": 0.036859116923168164, + "grad_norm": 0.15287357568740845, + "learning_rate": 0.0009981919437514785, + "loss": 3.2763, + "step": 1243 + }, + { + "epoch": 0.03688877027547964, + "grad_norm": 0.13498282432556152, + "learning_rate": 0.0009981879438338153, + "loss": 3.2488, + "step": 1244 + }, + { + "epoch": 0.03691842362779112, + "grad_norm": 0.1602298766374588, + "learning_rate": 0.0009981839395046123, + "loss": 3.2554, + "step": 1245 + }, + { + "epoch": 0.0369480769801026, + "grad_norm": 0.14671257138252258, + "learning_rate": 0.0009981799307639048, + "loss": 3.2769, + "step": 1246 + }, + { + "epoch": 0.03697773033241408, + "grad_norm": 0.17551331222057343, + "learning_rate": 0.0009981759176117288, + "loss": 3.251, + "step": 1247 + }, + { + "epoch": 0.03700738368472556, + "grad_norm": 0.19934992492198944, + "learning_rate": 0.0009981719000481193, + "loss": 3.2923, + "step": 1248 + }, + { + "epoch": 0.037037037037037035, + "grad_norm": 0.17933784425258636, + "learning_rate": 0.0009981678780731124, + "loss": 3.2427, + "step": 1249 + }, + { + "epoch": 0.03706669038934852, + "grad_norm": 0.1619875431060791, + "learning_rate": 0.000998163851686743, + "loss": 3.2582, + "step": 1250 + }, + { + "epoch": 0.037096343741659994, + "grad_norm": 0.16055172681808472, + "learning_rate": 0.0009981598208890475, + "loss": 3.2374, + "step": 1251 + }, + { + "epoch": 0.03712599709397147, + "grad_norm": 0.14235369861125946, + "learning_rate": 0.000998155785680061, + "loss": 3.2876, + "step": 1252 + }, + { + "epoch": 0.03715565044628295, + "grad_norm": 0.16503888368606567, + "learning_rate": 0.00099815174605982, + "loss": 3.2222, + "step": 1253 + }, + { + "epoch": 0.03718530379859443, + "grad_norm": 0.17198407649993896, + "learning_rate": 0.0009981477020283593, + "loss": 3.2726, + "step": 1254 + }, + { + "epoch": 0.03721495715090591, + "grad_norm": 0.16605716943740845, + "learning_rate": 0.0009981436535857157, + "loss": 3.2651, + "step": 1255 + }, + { + "epoch": 0.03724461050321739, + "grad_norm": 0.1704363226890564, + "learning_rate": 0.0009981396007319242, + "loss": 3.2212, + "step": 1256 + }, + { + "epoch": 0.037274263855528865, + "grad_norm": 0.15988388657569885, + "learning_rate": 0.000998135543467021, + "loss": 3.2609, + "step": 1257 + }, + { + "epoch": 0.03730391720784035, + "grad_norm": 0.1697203814983368, + "learning_rate": 0.0009981314817910421, + "loss": 3.2716, + "step": 1258 + }, + { + "epoch": 0.037333570560151824, + "grad_norm": 0.16878293454647064, + "learning_rate": 0.0009981274157040234, + "loss": 3.2488, + "step": 1259 + }, + { + "epoch": 0.03736322391246331, + "grad_norm": 0.15087458491325378, + "learning_rate": 0.000998123345206001, + "loss": 3.2518, + "step": 1260 + }, + { + "epoch": 0.03739287726477478, + "grad_norm": 0.13975460827350616, + "learning_rate": 0.0009981192702970107, + "loss": 3.2675, + "step": 1261 + }, + { + "epoch": 0.03742253061708626, + "grad_norm": 0.13470585644245148, + "learning_rate": 0.0009981151909770891, + "loss": 3.2705, + "step": 1262 + }, + { + "epoch": 0.03745218396939774, + "grad_norm": 0.11410683393478394, + "learning_rate": 0.0009981111072462716, + "loss": 3.2489, + "step": 1263 + }, + { + "epoch": 0.03748183732170922, + "grad_norm": 0.130640909075737, + "learning_rate": 0.000998107019104595, + "loss": 3.2437, + "step": 1264 + }, + { + "epoch": 0.037511490674020695, + "grad_norm": 0.13667018711566925, + "learning_rate": 0.0009981029265520953, + "loss": 3.2882, + "step": 1265 + }, + { + "epoch": 0.03754114402633218, + "grad_norm": 0.14673349261283875, + "learning_rate": 0.0009980988295888085, + "loss": 3.2682, + "step": 1266 + }, + { + "epoch": 0.037570797378643654, + "grad_norm": 0.15469008684158325, + "learning_rate": 0.0009980947282147712, + "loss": 3.2705, + "step": 1267 + }, + { + "epoch": 0.03760045073095514, + "grad_norm": 0.1509637087583542, + "learning_rate": 0.0009980906224300195, + "loss": 3.2551, + "step": 1268 + }, + { + "epoch": 0.03763010408326661, + "grad_norm": 0.18037179112434387, + "learning_rate": 0.0009980865122345898, + "loss": 3.2516, + "step": 1269 + }, + { + "epoch": 0.03765975743557809, + "grad_norm": 0.1921384036540985, + "learning_rate": 0.0009980823976285186, + "loss": 3.2376, + "step": 1270 + }, + { + "epoch": 0.03768941078788957, + "grad_norm": 0.20808595418930054, + "learning_rate": 0.0009980782786118423, + "loss": 3.2703, + "step": 1271 + }, + { + "epoch": 0.03771906414020105, + "grad_norm": 0.19840198755264282, + "learning_rate": 0.0009980741551845972, + "loss": 3.2559, + "step": 1272 + }, + { + "epoch": 0.03774871749251253, + "grad_norm": 0.16524310410022736, + "learning_rate": 0.0009980700273468203, + "loss": 3.243, + "step": 1273 + }, + { + "epoch": 0.03777837084482401, + "grad_norm": 0.11579440534114838, + "learning_rate": 0.0009980658950985476, + "loss": 3.2395, + "step": 1274 + }, + { + "epoch": 0.037808024197135484, + "grad_norm": 0.15125927329063416, + "learning_rate": 0.000998061758439816, + "loss": 3.2526, + "step": 1275 + }, + { + "epoch": 0.03783767754944697, + "grad_norm": 0.16875167191028595, + "learning_rate": 0.0009980576173706619, + "loss": 3.2174, + "step": 1276 + }, + { + "epoch": 0.03786733090175844, + "grad_norm": 0.18401576578617096, + "learning_rate": 0.0009980534718911221, + "loss": 3.2494, + "step": 1277 + }, + { + "epoch": 0.03789698425406992, + "grad_norm": 0.17436330020427704, + "learning_rate": 0.0009980493220012334, + "loss": 3.2568, + "step": 1278 + }, + { + "epoch": 0.0379266376063814, + "grad_norm": 0.16991618275642395, + "learning_rate": 0.0009980451677010325, + "loss": 3.2652, + "step": 1279 + }, + { + "epoch": 0.03795629095869288, + "grad_norm": 0.1783277988433838, + "learning_rate": 0.000998041008990556, + "loss": 3.3108, + "step": 1280 + }, + { + "epoch": 0.03798594431100436, + "grad_norm": 0.15106965601444244, + "learning_rate": 0.000998036845869841, + "loss": 3.2427, + "step": 1281 + }, + { + "epoch": 0.03801559766331584, + "grad_norm": 0.14329375326633453, + "learning_rate": 0.0009980326783389241, + "loss": 3.2676, + "step": 1282 + }, + { + "epoch": 0.038045251015627314, + "grad_norm": 0.14505243301391602, + "learning_rate": 0.0009980285063978427, + "loss": 3.2681, + "step": 1283 + }, + { + "epoch": 0.0380749043679388, + "grad_norm": 0.15989568829536438, + "learning_rate": 0.0009980243300466332, + "loss": 3.2714, + "step": 1284 + }, + { + "epoch": 0.03810455772025027, + "grad_norm": 0.17468056082725525, + "learning_rate": 0.0009980201492853326, + "loss": 3.2265, + "step": 1285 + }, + { + "epoch": 0.038134211072561756, + "grad_norm": 0.18703921139240265, + "learning_rate": 0.0009980159641139782, + "loss": 3.2282, + "step": 1286 + }, + { + "epoch": 0.03816386442487323, + "grad_norm": 0.14016416668891907, + "learning_rate": 0.000998011774532607, + "loss": 3.2482, + "step": 1287 + }, + { + "epoch": 0.03819351777718471, + "grad_norm": 0.1324215531349182, + "learning_rate": 0.000998007580541256, + "loss": 3.2391, + "step": 1288 + }, + { + "epoch": 0.03822317112949619, + "grad_norm": 0.14077231287956238, + "learning_rate": 0.0009980033821399624, + "loss": 3.273, + "step": 1289 + }, + { + "epoch": 0.03825282448180767, + "grad_norm": 0.16318736970424652, + "learning_rate": 0.0009979991793287635, + "loss": 3.282, + "step": 1290 + }, + { + "epoch": 0.038282477834119144, + "grad_norm": 0.15775229036808014, + "learning_rate": 0.000997994972107696, + "loss": 3.1962, + "step": 1291 + }, + { + "epoch": 0.03831213118643063, + "grad_norm": 0.13785965740680695, + "learning_rate": 0.000997990760476798, + "loss": 3.204, + "step": 1292 + }, + { + "epoch": 0.0383417845387421, + "grad_norm": 0.13929100334644318, + "learning_rate": 0.0009979865444361062, + "loss": 3.2445, + "step": 1293 + }, + { + "epoch": 0.038371437891053586, + "grad_norm": 0.1563628613948822, + "learning_rate": 0.000997982323985658, + "loss": 3.2521, + "step": 1294 + }, + { + "epoch": 0.03840109124336506, + "grad_norm": 0.12388569861650467, + "learning_rate": 0.0009979780991254909, + "loss": 3.2487, + "step": 1295 + }, + { + "epoch": 0.03843074459567654, + "grad_norm": 0.1261054426431656, + "learning_rate": 0.0009979738698556422, + "loss": 3.2412, + "step": 1296 + }, + { + "epoch": 0.03846039794798802, + "grad_norm": 0.13396011292934418, + "learning_rate": 0.0009979696361761495, + "loss": 3.2729, + "step": 1297 + }, + { + "epoch": 0.0384900513002995, + "grad_norm": 0.13765949010849, + "learning_rate": 0.00099796539808705, + "loss": 3.2412, + "step": 1298 + }, + { + "epoch": 0.03851970465261098, + "grad_norm": 0.1583358347415924, + "learning_rate": 0.0009979611555883817, + "loss": 3.259, + "step": 1299 + }, + { + "epoch": 0.03854935800492246, + "grad_norm": 0.16352160274982452, + "learning_rate": 0.0009979569086801816, + "loss": 3.2652, + "step": 1300 + }, + { + "epoch": 0.03857901135723393, + "grad_norm": 0.17326420545578003, + "learning_rate": 0.0009979526573624877, + "loss": 3.2621, + "step": 1301 + }, + { + "epoch": 0.038608664709545416, + "grad_norm": 0.20741502940654755, + "learning_rate": 0.0009979484016353376, + "loss": 3.2381, + "step": 1302 + }, + { + "epoch": 0.03863831806185689, + "grad_norm": 0.19674061238765717, + "learning_rate": 0.000997944141498769, + "loss": 3.2748, + "step": 1303 + }, + { + "epoch": 0.03866797141416837, + "grad_norm": 0.14944988489151, + "learning_rate": 0.0009979398769528196, + "loss": 3.2369, + "step": 1304 + }, + { + "epoch": 0.03869762476647985, + "grad_norm": 0.15216153860092163, + "learning_rate": 0.0009979356079975268, + "loss": 3.2545, + "step": 1305 + }, + { + "epoch": 0.03872727811879133, + "grad_norm": 0.15987923741340637, + "learning_rate": 0.000997931334632929, + "loss": 3.2581, + "step": 1306 + }, + { + "epoch": 0.03875693147110281, + "grad_norm": 0.18709635734558105, + "learning_rate": 0.0009979270568590637, + "loss": 3.2611, + "step": 1307 + }, + { + "epoch": 0.03878658482341429, + "grad_norm": 0.2151561826467514, + "learning_rate": 0.0009979227746759688, + "loss": 3.2581, + "step": 1308 + }, + { + "epoch": 0.03881623817572576, + "grad_norm": 0.15651457011699677, + "learning_rate": 0.0009979184880836824, + "loss": 3.2316, + "step": 1309 + }, + { + "epoch": 0.038845891528037246, + "grad_norm": 0.1899145096540451, + "learning_rate": 0.0009979141970822422, + "loss": 3.2591, + "step": 1310 + }, + { + "epoch": 0.03887554488034872, + "grad_norm": 0.2086762934923172, + "learning_rate": 0.0009979099016716865, + "loss": 3.295, + "step": 1311 + }, + { + "epoch": 0.038905198232660206, + "grad_norm": 0.18184995651245117, + "learning_rate": 0.0009979056018520529, + "loss": 3.2605, + "step": 1312 + }, + { + "epoch": 0.03893485158497168, + "grad_norm": 0.1613602340221405, + "learning_rate": 0.0009979012976233798, + "loss": 3.2603, + "step": 1313 + }, + { + "epoch": 0.03896450493728316, + "grad_norm": 0.14983469247817993, + "learning_rate": 0.0009978969889857052, + "loss": 3.2711, + "step": 1314 + }, + { + "epoch": 0.03899415828959464, + "grad_norm": 0.1337909698486328, + "learning_rate": 0.0009978926759390673, + "loss": 3.2465, + "step": 1315 + }, + { + "epoch": 0.03902381164190612, + "grad_norm": 0.12786221504211426, + "learning_rate": 0.0009978883584835043, + "loss": 3.2492, + "step": 1316 + }, + { + "epoch": 0.03905346499421759, + "grad_norm": 0.11513863503932953, + "learning_rate": 0.0009978840366190547, + "loss": 3.2434, + "step": 1317 + }, + { + "epoch": 0.039083118346529076, + "grad_norm": 0.12749448418617249, + "learning_rate": 0.0009978797103457563, + "loss": 3.2854, + "step": 1318 + }, + { + "epoch": 0.03911277169884055, + "grad_norm": 0.11403565853834152, + "learning_rate": 0.0009978753796636476, + "loss": 3.2363, + "step": 1319 + }, + { + "epoch": 0.039142425051152036, + "grad_norm": 0.1473824381828308, + "learning_rate": 0.0009978710445727667, + "loss": 3.2473, + "step": 1320 + }, + { + "epoch": 0.03917207840346351, + "grad_norm": 0.148517444729805, + "learning_rate": 0.0009978667050731527, + "loss": 3.232, + "step": 1321 + }, + { + "epoch": 0.03920173175577499, + "grad_norm": 0.1434476673603058, + "learning_rate": 0.0009978623611648432, + "loss": 3.1861, + "step": 1322 + }, + { + "epoch": 0.03923138510808647, + "grad_norm": 0.13040420413017273, + "learning_rate": 0.0009978580128478772, + "loss": 3.2258, + "step": 1323 + }, + { + "epoch": 0.03926103846039795, + "grad_norm": 0.13179907202720642, + "learning_rate": 0.000997853660122293, + "loss": 3.2449, + "step": 1324 + }, + { + "epoch": 0.03929069181270943, + "grad_norm": 0.1499057412147522, + "learning_rate": 0.0009978493029881292, + "loss": 3.2397, + "step": 1325 + }, + { + "epoch": 0.039320345165020906, + "grad_norm": 0.16329452395439148, + "learning_rate": 0.0009978449414454243, + "loss": 3.2378, + "step": 1326 + }, + { + "epoch": 0.03934999851733238, + "grad_norm": 0.1368931233882904, + "learning_rate": 0.0009978405754942172, + "loss": 3.2218, + "step": 1327 + }, + { + "epoch": 0.039379651869643865, + "grad_norm": 0.135603129863739, + "learning_rate": 0.0009978362051345463, + "loss": 3.2115, + "step": 1328 + }, + { + "epoch": 0.03940930522195534, + "grad_norm": 0.1551092118024826, + "learning_rate": 0.00099783183036645, + "loss": 3.2396, + "step": 1329 + }, + { + "epoch": 0.03943895857426682, + "grad_norm": 0.17476171255111694, + "learning_rate": 0.0009978274511899677, + "loss": 3.2346, + "step": 1330 + }, + { + "epoch": 0.0394686119265783, + "grad_norm": 0.1595994532108307, + "learning_rate": 0.000997823067605138, + "loss": 3.2325, + "step": 1331 + }, + { + "epoch": 0.03949826527888978, + "grad_norm": 0.1995125710964203, + "learning_rate": 0.0009978186796119992, + "loss": 3.2797, + "step": 1332 + }, + { + "epoch": 0.03952791863120126, + "grad_norm": 0.21886511147022247, + "learning_rate": 0.000997814287210591, + "loss": 3.2299, + "step": 1333 + }, + { + "epoch": 0.039557571983512736, + "grad_norm": 0.1497652679681778, + "learning_rate": 0.0009978098904009514, + "loss": 3.2369, + "step": 1334 + }, + { + "epoch": 0.03958722533582421, + "grad_norm": 0.1477714478969574, + "learning_rate": 0.00099780548918312, + "loss": 3.2642, + "step": 1335 + }, + { + "epoch": 0.039616878688135695, + "grad_norm": 0.17373575270175934, + "learning_rate": 0.0009978010835571356, + "loss": 3.2539, + "step": 1336 + }, + { + "epoch": 0.03964653204044717, + "grad_norm": 0.18533386290073395, + "learning_rate": 0.000997796673523037, + "loss": 3.257, + "step": 1337 + }, + { + "epoch": 0.039676185392758655, + "grad_norm": 0.1664152443408966, + "learning_rate": 0.0009977922590808635, + "loss": 3.2576, + "step": 1338 + }, + { + "epoch": 0.03970583874507013, + "grad_norm": 0.15508729219436646, + "learning_rate": 0.0009977878402306541, + "loss": 3.2854, + "step": 1339 + }, + { + "epoch": 0.03973549209738161, + "grad_norm": 0.16315753757953644, + "learning_rate": 0.0009977834169724478, + "loss": 3.2315, + "step": 1340 + }, + { + "epoch": 0.03976514544969309, + "grad_norm": 0.16739881038665771, + "learning_rate": 0.000997778989306284, + "loss": 3.2281, + "step": 1341 + }, + { + "epoch": 0.039794798802004566, + "grad_norm": 0.21020478010177612, + "learning_rate": 0.0009977745572322019, + "loss": 3.2278, + "step": 1342 + }, + { + "epoch": 0.03982445215431604, + "grad_norm": 0.1781032383441925, + "learning_rate": 0.0009977701207502406, + "loss": 3.2339, + "step": 1343 + }, + { + "epoch": 0.039854105506627525, + "grad_norm": 0.14443011581897736, + "learning_rate": 0.0009977656798604393, + "loss": 3.214, + "step": 1344 + }, + { + "epoch": 0.039883758858939, + "grad_norm": 0.16122205555438995, + "learning_rate": 0.0009977612345628377, + "loss": 3.2345, + "step": 1345 + }, + { + "epoch": 0.039913412211250485, + "grad_norm": 0.1561478227376938, + "learning_rate": 0.0009977567848574746, + "loss": 3.1992, + "step": 1346 + }, + { + "epoch": 0.03994306556356196, + "grad_norm": 0.1746213138103485, + "learning_rate": 0.0009977523307443902, + "loss": 3.2355, + "step": 1347 + }, + { + "epoch": 0.03997271891587344, + "grad_norm": 0.11591945588588715, + "learning_rate": 0.000997747872223623, + "loss": 3.2257, + "step": 1348 + }, + { + "epoch": 0.04000237226818492, + "grad_norm": 0.13074465095996857, + "learning_rate": 0.0009977434092952133, + "loss": 3.2285, + "step": 1349 + }, + { + "epoch": 0.040032025620496396, + "grad_norm": 0.14363522827625275, + "learning_rate": 0.0009977389419592, + "loss": 3.2047, + "step": 1350 + }, + { + "epoch": 0.04006167897280788, + "grad_norm": 0.14703014492988586, + "learning_rate": 0.000997734470215623, + "loss": 3.2439, + "step": 1351 + }, + { + "epoch": 0.040091332325119355, + "grad_norm": 0.14900118112564087, + "learning_rate": 0.000997729994064522, + "loss": 3.2405, + "step": 1352 + }, + { + "epoch": 0.04012098567743083, + "grad_norm": 0.16265639662742615, + "learning_rate": 0.0009977255135059364, + "loss": 3.2447, + "step": 1353 + }, + { + "epoch": 0.040150639029742315, + "grad_norm": 0.15985915064811707, + "learning_rate": 0.000997721028539906, + "loss": 3.2347, + "step": 1354 + }, + { + "epoch": 0.04018029238205379, + "grad_norm": 0.15885314345359802, + "learning_rate": 0.0009977165391664704, + "loss": 3.2159, + "step": 1355 + }, + { + "epoch": 0.04020994573436527, + "grad_norm": 0.16120758652687073, + "learning_rate": 0.0009977120453856694, + "loss": 3.2539, + "step": 1356 + }, + { + "epoch": 0.04023959908667675, + "grad_norm": 0.13098131120204926, + "learning_rate": 0.0009977075471975427, + "loss": 3.2195, + "step": 1357 + }, + { + "epoch": 0.040269252438988226, + "grad_norm": 0.1265735924243927, + "learning_rate": 0.0009977030446021303, + "loss": 3.2163, + "step": 1358 + }, + { + "epoch": 0.04029890579129971, + "grad_norm": 0.149482861161232, + "learning_rate": 0.000997698537599472, + "loss": 3.2183, + "step": 1359 + }, + { + "epoch": 0.040328559143611185, + "grad_norm": 0.14838778972625732, + "learning_rate": 0.0009976940261896077, + "loss": 3.2085, + "step": 1360 + }, + { + "epoch": 0.04035821249592266, + "grad_norm": 0.16116264462471008, + "learning_rate": 0.0009976895103725777, + "loss": 3.2222, + "step": 1361 + }, + { + "epoch": 0.040387865848234145, + "grad_norm": 0.19230914115905762, + "learning_rate": 0.0009976849901484214, + "loss": 3.2418, + "step": 1362 + }, + { + "epoch": 0.04041751920054562, + "grad_norm": 0.1869610697031021, + "learning_rate": 0.000997680465517179, + "loss": 3.1772, + "step": 1363 + }, + { + "epoch": 0.040447172552857104, + "grad_norm": 0.16128931939601898, + "learning_rate": 0.0009976759364788907, + "loss": 3.2363, + "step": 1364 + }, + { + "epoch": 0.04047682590516858, + "grad_norm": 0.15309351682662964, + "learning_rate": 0.0009976714030335964, + "loss": 3.2291, + "step": 1365 + }, + { + "epoch": 0.040506479257480056, + "grad_norm": 0.16723978519439697, + "learning_rate": 0.0009976668651813369, + "loss": 3.2429, + "step": 1366 + }, + { + "epoch": 0.04053613260979154, + "grad_norm": 0.20548413693904877, + "learning_rate": 0.0009976623229221513, + "loss": 3.2463, + "step": 1367 + }, + { + "epoch": 0.040565785962103015, + "grad_norm": 0.19403481483459473, + "learning_rate": 0.0009976577762560808, + "loss": 3.2126, + "step": 1368 + }, + { + "epoch": 0.04059543931441449, + "grad_norm": 0.15234243869781494, + "learning_rate": 0.0009976532251831651, + "loss": 3.2496, + "step": 1369 + }, + { + "epoch": 0.040625092666725975, + "grad_norm": 0.16458022594451904, + "learning_rate": 0.000997648669703445, + "loss": 3.2342, + "step": 1370 + }, + { + "epoch": 0.04065474601903745, + "grad_norm": 0.18399566411972046, + "learning_rate": 0.00099764410981696, + "loss": 3.235, + "step": 1371 + }, + { + "epoch": 0.040684399371348934, + "grad_norm": 0.14505958557128906, + "learning_rate": 0.0009976395455237512, + "loss": 3.2445, + "step": 1372 + }, + { + "epoch": 0.04071405272366041, + "grad_norm": 0.14383502304553986, + "learning_rate": 0.0009976349768238588, + "loss": 3.2106, + "step": 1373 + }, + { + "epoch": 0.040743706075971886, + "grad_norm": 0.1798080950975418, + "learning_rate": 0.0009976304037173232, + "loss": 3.2132, + "step": 1374 + }, + { + "epoch": 0.04077335942828337, + "grad_norm": 0.20174269378185272, + "learning_rate": 0.0009976258262041852, + "loss": 3.1743, + "step": 1375 + }, + { + "epoch": 0.040803012780594845, + "grad_norm": 0.16423170268535614, + "learning_rate": 0.000997621244284485, + "loss": 3.2343, + "step": 1376 + }, + { + "epoch": 0.04083266613290633, + "grad_norm": 0.16631634533405304, + "learning_rate": 0.000997616657958263, + "loss": 3.1897, + "step": 1377 + }, + { + "epoch": 0.040862319485217805, + "grad_norm": 0.15693524479866028, + "learning_rate": 0.0009976120672255603, + "loss": 3.2151, + "step": 1378 + }, + { + "epoch": 0.04089197283752928, + "grad_norm": 0.14157290756702423, + "learning_rate": 0.0009976074720864174, + "loss": 3.2433, + "step": 1379 + }, + { + "epoch": 0.040921626189840764, + "grad_norm": 0.1377018243074417, + "learning_rate": 0.0009976028725408748, + "loss": 3.2309, + "step": 1380 + }, + { + "epoch": 0.04095127954215224, + "grad_norm": 0.1520671546459198, + "learning_rate": 0.0009975982685889735, + "loss": 3.2007, + "step": 1381 + }, + { + "epoch": 0.040980932894463716, + "grad_norm": 0.154196634888649, + "learning_rate": 0.000997593660230754, + "loss": 3.2111, + "step": 1382 + }, + { + "epoch": 0.0410105862467752, + "grad_norm": 0.1299341320991516, + "learning_rate": 0.0009975890474662572, + "loss": 3.2134, + "step": 1383 + }, + { + "epoch": 0.041040239599086675, + "grad_norm": 0.13287225365638733, + "learning_rate": 0.000997584430295524, + "loss": 3.2025, + "step": 1384 + }, + { + "epoch": 0.04106989295139816, + "grad_norm": 0.14498308300971985, + "learning_rate": 0.0009975798087185953, + "loss": 3.2149, + "step": 1385 + }, + { + "epoch": 0.041099546303709635, + "grad_norm": 0.14939042925834656, + "learning_rate": 0.000997575182735512, + "loss": 3.2134, + "step": 1386 + }, + { + "epoch": 0.04112919965602111, + "grad_norm": 0.15461696684360504, + "learning_rate": 0.0009975705523463149, + "loss": 3.2313, + "step": 1387 + }, + { + "epoch": 0.041158853008332594, + "grad_norm": 0.20174911618232727, + "learning_rate": 0.0009975659175510453, + "loss": 3.2283, + "step": 1388 + }, + { + "epoch": 0.04118850636064407, + "grad_norm": 0.20307481288909912, + "learning_rate": 0.000997561278349744, + "loss": 3.2006, + "step": 1389 + }, + { + "epoch": 0.04121815971295555, + "grad_norm": 0.15141503512859344, + "learning_rate": 0.0009975566347424523, + "loss": 3.2276, + "step": 1390 + }, + { + "epoch": 0.04124781306526703, + "grad_norm": 0.15885844826698303, + "learning_rate": 0.000997551986729211, + "loss": 3.2315, + "step": 1391 + }, + { + "epoch": 0.041277466417578505, + "grad_norm": 0.15128855407238007, + "learning_rate": 0.0009975473343100615, + "loss": 3.2196, + "step": 1392 + }, + { + "epoch": 0.04130711976988999, + "grad_norm": 0.14701910316944122, + "learning_rate": 0.0009975426774850452, + "loss": 3.1733, + "step": 1393 + }, + { + "epoch": 0.041336773122201464, + "grad_norm": 0.15375177562236786, + "learning_rate": 0.000997538016254203, + "loss": 3.249, + "step": 1394 + }, + { + "epoch": 0.04136642647451294, + "grad_norm": 0.17322830855846405, + "learning_rate": 0.000997533350617576, + "loss": 3.258, + "step": 1395 + }, + { + "epoch": 0.041396079826824424, + "grad_norm": 0.1609209179878235, + "learning_rate": 0.000997528680575206, + "loss": 3.2526, + "step": 1396 + }, + { + "epoch": 0.0414257331791359, + "grad_norm": 0.19134896993637085, + "learning_rate": 0.000997524006127134, + "loss": 3.2284, + "step": 1397 + }, + { + "epoch": 0.04145538653144738, + "grad_norm": 0.25309211015701294, + "learning_rate": 0.0009975193272734016, + "loss": 3.242, + "step": 1398 + }, + { + "epoch": 0.04148503988375886, + "grad_norm": 0.22211793065071106, + "learning_rate": 0.0009975146440140503, + "loss": 3.2177, + "step": 1399 + }, + { + "epoch": 0.041514693236070335, + "grad_norm": 0.18826112151145935, + "learning_rate": 0.0009975099563491211, + "loss": 3.2235, + "step": 1400 + }, + { + "epoch": 0.04154434658838182, + "grad_norm": 0.16333650052547455, + "learning_rate": 0.0009975052642786561, + "loss": 3.2581, + "step": 1401 + }, + { + "epoch": 0.041573999940693294, + "grad_norm": 0.151687890291214, + "learning_rate": 0.0009975005678026967, + "loss": 3.2044, + "step": 1402 + }, + { + "epoch": 0.04160365329300478, + "grad_norm": 0.13441501557826996, + "learning_rate": 0.000997495866921284, + "loss": 3.2376, + "step": 1403 + }, + { + "epoch": 0.041633306645316254, + "grad_norm": 0.12556974589824677, + "learning_rate": 0.0009974911616344605, + "loss": 3.2233, + "step": 1404 + }, + { + "epoch": 0.04166295999762773, + "grad_norm": 0.1328982263803482, + "learning_rate": 0.000997486451942267, + "loss": 3.203, + "step": 1405 + }, + { + "epoch": 0.04169261334993921, + "grad_norm": 0.13721023499965668, + "learning_rate": 0.0009974817378447455, + "loss": 3.2278, + "step": 1406 + }, + { + "epoch": 0.04172226670225069, + "grad_norm": 0.12177716940641403, + "learning_rate": 0.000997477019341938, + "loss": 3.2434, + "step": 1407 + }, + { + "epoch": 0.041751920054562165, + "grad_norm": 0.16166914999485016, + "learning_rate": 0.0009974722964338862, + "loss": 3.216, + "step": 1408 + }, + { + "epoch": 0.04178157340687365, + "grad_norm": 0.19744619727134705, + "learning_rate": 0.0009974675691206318, + "loss": 3.231, + "step": 1409 + }, + { + "epoch": 0.041811226759185124, + "grad_norm": 0.19574855268001556, + "learning_rate": 0.0009974628374022165, + "loss": 3.2668, + "step": 1410 + }, + { + "epoch": 0.04184088011149661, + "grad_norm": 0.17518474161624908, + "learning_rate": 0.0009974581012786826, + "loss": 3.2254, + "step": 1411 + }, + { + "epoch": 0.041870533463808084, + "grad_norm": 0.1917324662208557, + "learning_rate": 0.0009974533607500715, + "loss": 3.2053, + "step": 1412 + }, + { + "epoch": 0.04190018681611956, + "grad_norm": 0.17542777955532074, + "learning_rate": 0.0009974486158164258, + "loss": 3.1937, + "step": 1413 + }, + { + "epoch": 0.04192984016843104, + "grad_norm": 0.1575460284948349, + "learning_rate": 0.000997443866477787, + "loss": 3.1959, + "step": 1414 + }, + { + "epoch": 0.04195949352074252, + "grad_norm": 0.14341112971305847, + "learning_rate": 0.0009974391127341978, + "loss": 3.2169, + "step": 1415 + }, + { + "epoch": 0.041989146873054, + "grad_norm": 0.17159809172153473, + "learning_rate": 0.0009974343545856995, + "loss": 3.2556, + "step": 1416 + }, + { + "epoch": 0.04201880022536548, + "grad_norm": 0.15328463912010193, + "learning_rate": 0.0009974295920323346, + "loss": 3.18, + "step": 1417 + }, + { + "epoch": 0.042048453577676954, + "grad_norm": 0.1372525691986084, + "learning_rate": 0.0009974248250741455, + "loss": 3.1926, + "step": 1418 + }, + { + "epoch": 0.04207810692998844, + "grad_norm": 0.13289591670036316, + "learning_rate": 0.000997420053711174, + "loss": 3.1894, + "step": 1419 + }, + { + "epoch": 0.042107760282299914, + "grad_norm": 0.1281103938817978, + "learning_rate": 0.0009974152779434627, + "loss": 3.2143, + "step": 1420 + }, + { + "epoch": 0.04213741363461139, + "grad_norm": 0.13050012290477753, + "learning_rate": 0.0009974104977710535, + "loss": 3.2286, + "step": 1421 + }, + { + "epoch": 0.04216706698692287, + "grad_norm": 0.1748843789100647, + "learning_rate": 0.0009974057131939891, + "loss": 3.2456, + "step": 1422 + }, + { + "epoch": 0.04219672033923435, + "grad_norm": 0.21160157024860382, + "learning_rate": 0.0009974009242123118, + "loss": 3.2365, + "step": 1423 + }, + { + "epoch": 0.04222637369154583, + "grad_norm": 0.19777938723564148, + "learning_rate": 0.0009973961308260637, + "loss": 3.1959, + "step": 1424 + }, + { + "epoch": 0.04225602704385731, + "grad_norm": 0.15844379365444183, + "learning_rate": 0.0009973913330352877, + "loss": 3.1773, + "step": 1425 + }, + { + "epoch": 0.042285680396168784, + "grad_norm": 0.1409563571214676, + "learning_rate": 0.000997386530840026, + "loss": 3.2051, + "step": 1426 + }, + { + "epoch": 0.04231533374848027, + "grad_norm": 0.15303823351860046, + "learning_rate": 0.0009973817242403215, + "loss": 3.1903, + "step": 1427 + }, + { + "epoch": 0.042344987100791744, + "grad_norm": 0.17546163499355316, + "learning_rate": 0.000997376913236216, + "loss": 3.179, + "step": 1428 + }, + { + "epoch": 0.04237464045310323, + "grad_norm": 0.19726824760437012, + "learning_rate": 0.0009973720978277527, + "loss": 3.2155, + "step": 1429 + }, + { + "epoch": 0.0424042938054147, + "grad_norm": 0.1710353046655655, + "learning_rate": 0.0009973672780149742, + "loss": 3.2072, + "step": 1430 + }, + { + "epoch": 0.04243394715772618, + "grad_norm": 0.1447640061378479, + "learning_rate": 0.000997362453797923, + "loss": 3.2072, + "step": 1431 + }, + { + "epoch": 0.04246360051003766, + "grad_norm": 0.14231577515602112, + "learning_rate": 0.000997357625176642, + "loss": 3.2293, + "step": 1432 + }, + { + "epoch": 0.04249325386234914, + "grad_norm": 0.1669834852218628, + "learning_rate": 0.0009973527921511738, + "loss": 3.2042, + "step": 1433 + }, + { + "epoch": 0.042522907214660614, + "grad_norm": 0.15472088754177094, + "learning_rate": 0.0009973479547215611, + "loss": 3.1942, + "step": 1434 + }, + { + "epoch": 0.0425525605669721, + "grad_norm": 0.15818437933921814, + "learning_rate": 0.000997343112887847, + "loss": 3.2557, + "step": 1435 + }, + { + "epoch": 0.042582213919283574, + "grad_norm": 0.21027866005897522, + "learning_rate": 0.0009973382666500744, + "loss": 3.2474, + "step": 1436 + }, + { + "epoch": 0.04261186727159506, + "grad_norm": 0.21736900508403778, + "learning_rate": 0.000997333416008286, + "loss": 3.2419, + "step": 1437 + }, + { + "epoch": 0.04264152062390653, + "grad_norm": 0.2207692265510559, + "learning_rate": 0.0009973285609625247, + "loss": 3.2327, + "step": 1438 + }, + { + "epoch": 0.04267117397621801, + "grad_norm": 0.25078946352005005, + "learning_rate": 0.0009973237015128338, + "loss": 3.222, + "step": 1439 + }, + { + "epoch": 0.04270082732852949, + "grad_norm": 0.17488908767700195, + "learning_rate": 0.000997318837659256, + "loss": 3.2025, + "step": 1440 + }, + { + "epoch": 0.04273048068084097, + "grad_norm": 0.15480104088783264, + "learning_rate": 0.0009973139694018347, + "loss": 3.2236, + "step": 1441 + }, + { + "epoch": 0.04276013403315245, + "grad_norm": 0.1769389659166336, + "learning_rate": 0.000997309096740613, + "loss": 3.2039, + "step": 1442 + }, + { + "epoch": 0.04278978738546393, + "grad_norm": 0.1570817232131958, + "learning_rate": 0.0009973042196756334, + "loss": 3.2167, + "step": 1443 + }, + { + "epoch": 0.042819440737775404, + "grad_norm": 0.16007007658481598, + "learning_rate": 0.00099729933820694, + "loss": 3.205, + "step": 1444 + }, + { + "epoch": 0.04284909409008689, + "grad_norm": 0.1653987616300583, + "learning_rate": 0.0009972944523345753, + "loss": 3.2115, + "step": 1445 + }, + { + "epoch": 0.04287874744239836, + "grad_norm": 0.14741134643554688, + "learning_rate": 0.000997289562058583, + "loss": 3.2634, + "step": 1446 + }, + { + "epoch": 0.04290840079470984, + "grad_norm": 0.12908752262592316, + "learning_rate": 0.0009972846673790062, + "loss": 3.2395, + "step": 1447 + }, + { + "epoch": 0.04293805414702132, + "grad_norm": 0.12361299991607666, + "learning_rate": 0.0009972797682958885, + "loss": 3.2066, + "step": 1448 + }, + { + "epoch": 0.0429677074993328, + "grad_norm": 0.154439315199852, + "learning_rate": 0.0009972748648092728, + "loss": 3.2124, + "step": 1449 + }, + { + "epoch": 0.04299736085164428, + "grad_norm": 0.1457812637090683, + "learning_rate": 0.000997269956919203, + "loss": 3.2357, + "step": 1450 + }, + { + "epoch": 0.04302701420395576, + "grad_norm": 0.13215969502925873, + "learning_rate": 0.0009972650446257224, + "loss": 3.2183, + "step": 1451 + }, + { + "epoch": 0.043056667556267234, + "grad_norm": 0.1357736736536026, + "learning_rate": 0.0009972601279288743, + "loss": 3.2133, + "step": 1452 + }, + { + "epoch": 0.04308632090857872, + "grad_norm": 0.13783477246761322, + "learning_rate": 0.0009972552068287027, + "loss": 3.1977, + "step": 1453 + }, + { + "epoch": 0.04311597426089019, + "grad_norm": 0.1710147261619568, + "learning_rate": 0.0009972502813252507, + "loss": 3.2449, + "step": 1454 + }, + { + "epoch": 0.043145627613201676, + "grad_norm": 0.18320469558238983, + "learning_rate": 0.0009972453514185621, + "loss": 3.1855, + "step": 1455 + }, + { + "epoch": 0.04317528096551315, + "grad_norm": 0.1528881937265396, + "learning_rate": 0.0009972404171086806, + "loss": 3.2038, + "step": 1456 + }, + { + "epoch": 0.04320493431782463, + "grad_norm": 0.1394737958908081, + "learning_rate": 0.0009972354783956499, + "loss": 3.1952, + "step": 1457 + }, + { + "epoch": 0.04323458767013611, + "grad_norm": 0.14594481885433197, + "learning_rate": 0.0009972305352795136, + "loss": 3.1858, + "step": 1458 + }, + { + "epoch": 0.04326424102244759, + "grad_norm": 0.14396123588085175, + "learning_rate": 0.0009972255877603157, + "loss": 3.1834, + "step": 1459 + }, + { + "epoch": 0.043293894374759063, + "grad_norm": 0.16258907318115234, + "learning_rate": 0.0009972206358380999, + "loss": 3.1955, + "step": 1460 + }, + { + "epoch": 0.04332354772707055, + "grad_norm": 0.17607097327709198, + "learning_rate": 0.00099721567951291, + "loss": 3.2304, + "step": 1461 + }, + { + "epoch": 0.04335320107938202, + "grad_norm": 0.1642773449420929, + "learning_rate": 0.0009972107187847896, + "loss": 3.1791, + "step": 1462 + }, + { + "epoch": 0.043382854431693506, + "grad_norm": 0.18116731941699982, + "learning_rate": 0.0009972057536537834, + "loss": 3.2277, + "step": 1463 + }, + { + "epoch": 0.04341250778400498, + "grad_norm": 0.14067457616329193, + "learning_rate": 0.0009972007841199345, + "loss": 3.1671, + "step": 1464 + }, + { + "epoch": 0.04344216113631646, + "grad_norm": 0.1645730435848236, + "learning_rate": 0.0009971958101832874, + "loss": 3.1988, + "step": 1465 + }, + { + "epoch": 0.04347181448862794, + "grad_norm": 0.19291391968727112, + "learning_rate": 0.0009971908318438863, + "loss": 3.2389, + "step": 1466 + }, + { + "epoch": 0.04350146784093942, + "grad_norm": 0.1506495177745819, + "learning_rate": 0.0009971858491017748, + "loss": 3.1856, + "step": 1467 + }, + { + "epoch": 0.0435311211932509, + "grad_norm": 0.15495575964450836, + "learning_rate": 0.0009971808619569974, + "loss": 3.1766, + "step": 1468 + }, + { + "epoch": 0.04356077454556238, + "grad_norm": 0.12650077044963837, + "learning_rate": 0.000997175870409598, + "loss": 3.1866, + "step": 1469 + }, + { + "epoch": 0.04359042789787385, + "grad_norm": 0.14040561020374298, + "learning_rate": 0.0009971708744596212, + "loss": 3.2372, + "step": 1470 + }, + { + "epoch": 0.043620081250185336, + "grad_norm": 0.15817800164222717, + "learning_rate": 0.0009971658741071106, + "loss": 3.1602, + "step": 1471 + }, + { + "epoch": 0.04364973460249681, + "grad_norm": 0.17352978885173798, + "learning_rate": 0.000997160869352111, + "loss": 3.1983, + "step": 1472 + }, + { + "epoch": 0.04367938795480829, + "grad_norm": 0.18833889067173004, + "learning_rate": 0.0009971558601946666, + "loss": 3.1995, + "step": 1473 + }, + { + "epoch": 0.04370904130711977, + "grad_norm": 0.17220906913280487, + "learning_rate": 0.0009971508466348217, + "loss": 3.1642, + "step": 1474 + }, + { + "epoch": 0.04373869465943125, + "grad_norm": 0.15689468383789062, + "learning_rate": 0.0009971458286726208, + "loss": 3.2328, + "step": 1475 + }, + { + "epoch": 0.04376834801174273, + "grad_norm": 0.1756535917520523, + "learning_rate": 0.0009971408063081083, + "loss": 3.2014, + "step": 1476 + }, + { + "epoch": 0.043798001364054207, + "grad_norm": 0.16160428524017334, + "learning_rate": 0.0009971357795413283, + "loss": 3.1646, + "step": 1477 + }, + { + "epoch": 0.04382765471636568, + "grad_norm": 0.19566623866558075, + "learning_rate": 0.0009971307483723258, + "loss": 3.2058, + "step": 1478 + }, + { + "epoch": 0.043857308068677166, + "grad_norm": 0.18781480193138123, + "learning_rate": 0.0009971257128011453, + "loss": 3.199, + "step": 1479 + }, + { + "epoch": 0.04388696142098864, + "grad_norm": 0.15670202672481537, + "learning_rate": 0.0009971206728278312, + "loss": 3.1794, + "step": 1480 + }, + { + "epoch": 0.043916614773300125, + "grad_norm": 0.1387728452682495, + "learning_rate": 0.0009971156284524284, + "loss": 3.2123, + "step": 1481 + }, + { + "epoch": 0.0439462681256116, + "grad_norm": 0.1578994244337082, + "learning_rate": 0.000997110579674981, + "loss": 3.1869, + "step": 1482 + }, + { + "epoch": 0.04397592147792308, + "grad_norm": 0.16452737152576447, + "learning_rate": 0.0009971055264955345, + "loss": 3.1707, + "step": 1483 + }, + { + "epoch": 0.04400557483023456, + "grad_norm": 0.1496034860610962, + "learning_rate": 0.000997100468914133, + "loss": 3.1411, + "step": 1484 + }, + { + "epoch": 0.044035228182546036, + "grad_norm": 0.12582992017269135, + "learning_rate": 0.0009970954069308216, + "loss": 3.202, + "step": 1485 + }, + { + "epoch": 0.04406488153485751, + "grad_norm": 0.12854424118995667, + "learning_rate": 0.0009970903405456448, + "loss": 3.195, + "step": 1486 + }, + { + "epoch": 0.044094534887168996, + "grad_norm": 0.14303217828273773, + "learning_rate": 0.0009970852697586481, + "loss": 3.2079, + "step": 1487 + }, + { + "epoch": 0.04412418823948047, + "grad_norm": 0.12269775569438934, + "learning_rate": 0.0009970801945698759, + "loss": 3.1887, + "step": 1488 + }, + { + "epoch": 0.044153841591791955, + "grad_norm": 0.12158212065696716, + "learning_rate": 0.000997075114979373, + "loss": 3.2127, + "step": 1489 + }, + { + "epoch": 0.04418349494410343, + "grad_norm": 0.142191544175148, + "learning_rate": 0.000997070030987185, + "loss": 3.2005, + "step": 1490 + }, + { + "epoch": 0.04421314829641491, + "grad_norm": 0.1513468623161316, + "learning_rate": 0.0009970649425933562, + "loss": 3.2234, + "step": 1491 + }, + { + "epoch": 0.04424280164872639, + "grad_norm": 0.17087078094482422, + "learning_rate": 0.0009970598497979321, + "loss": 3.2109, + "step": 1492 + }, + { + "epoch": 0.044272455001037866, + "grad_norm": 0.18332251906394958, + "learning_rate": 0.000997054752600958, + "loss": 3.2295, + "step": 1493 + }, + { + "epoch": 0.04430210835334935, + "grad_norm": 0.17821776866912842, + "learning_rate": 0.0009970496510024786, + "loss": 3.1697, + "step": 1494 + }, + { + "epoch": 0.044331761705660826, + "grad_norm": 0.15674060583114624, + "learning_rate": 0.000997044545002539, + "loss": 3.1817, + "step": 1495 + }, + { + "epoch": 0.0443614150579723, + "grad_norm": 0.1593378186225891, + "learning_rate": 0.0009970394346011848, + "loss": 3.183, + "step": 1496 + }, + { + "epoch": 0.044391068410283785, + "grad_norm": 0.19284683465957642, + "learning_rate": 0.000997034319798461, + "loss": 3.1845, + "step": 1497 + }, + { + "epoch": 0.04442072176259526, + "grad_norm": 0.20348860323429108, + "learning_rate": 0.0009970292005944132, + "loss": 3.2088, + "step": 1498 + }, + { + "epoch": 0.04445037511490674, + "grad_norm": 0.23543845117092133, + "learning_rate": 0.0009970240769890863, + "loss": 3.1915, + "step": 1499 + }, + { + "epoch": 0.04448002846721822, + "grad_norm": 0.22921469807624817, + "learning_rate": 0.0009970189489825261, + "loss": 3.1906, + "step": 1500 + }, + { + "epoch": 0.044509681819529696, + "grad_norm": 0.20710843801498413, + "learning_rate": 0.0009970138165747778, + "loss": 3.1942, + "step": 1501 + }, + { + "epoch": 0.04453933517184118, + "grad_norm": 0.1876649260520935, + "learning_rate": 0.0009970086797658866, + "loss": 3.2345, + "step": 1502 + }, + { + "epoch": 0.044568988524152656, + "grad_norm": 0.22407172620296478, + "learning_rate": 0.0009970035385558982, + "loss": 3.1683, + "step": 1503 + }, + { + "epoch": 0.04459864187646413, + "grad_norm": 0.18624499440193176, + "learning_rate": 0.0009969983929448585, + "loss": 3.1499, + "step": 1504 + }, + { + "epoch": 0.044628295228775615, + "grad_norm": 0.2067796140909195, + "learning_rate": 0.0009969932429328124, + "loss": 3.2457, + "step": 1505 + }, + { + "epoch": 0.04465794858108709, + "grad_norm": 0.18601927161216736, + "learning_rate": 0.0009969880885198062, + "loss": 3.2073, + "step": 1506 + }, + { + "epoch": 0.044687601933398574, + "grad_norm": 0.16297361254692078, + "learning_rate": 0.0009969829297058848, + "loss": 3.2046, + "step": 1507 + }, + { + "epoch": 0.04471725528571005, + "grad_norm": 0.1504916250705719, + "learning_rate": 0.0009969777664910944, + "loss": 3.2003, + "step": 1508 + }, + { + "epoch": 0.044746908638021526, + "grad_norm": 0.1352687031030655, + "learning_rate": 0.0009969725988754805, + "loss": 3.193, + "step": 1509 + }, + { + "epoch": 0.04477656199033301, + "grad_norm": 0.12231147289276123, + "learning_rate": 0.000996967426859089, + "loss": 3.1766, + "step": 1510 + }, + { + "epoch": 0.044806215342644486, + "grad_norm": 0.13140574097633362, + "learning_rate": 0.0009969622504419655, + "loss": 3.1911, + "step": 1511 + }, + { + "epoch": 0.04483586869495596, + "grad_norm": 0.12270963191986084, + "learning_rate": 0.0009969570696241562, + "loss": 3.1841, + "step": 1512 + }, + { + "epoch": 0.044865522047267445, + "grad_norm": 0.12355536222457886, + "learning_rate": 0.0009969518844057067, + "loss": 3.2171, + "step": 1513 + }, + { + "epoch": 0.04489517539957892, + "grad_norm": 0.14259228110313416, + "learning_rate": 0.0009969466947866627, + "loss": 3.1754, + "step": 1514 + }, + { + "epoch": 0.044924828751890404, + "grad_norm": 0.14672254025936127, + "learning_rate": 0.0009969415007670707, + "loss": 3.1938, + "step": 1515 + }, + { + "epoch": 0.04495448210420188, + "grad_norm": 0.14808635413646698, + "learning_rate": 0.0009969363023469764, + "loss": 3.1859, + "step": 1516 + }, + { + "epoch": 0.044984135456513356, + "grad_norm": 0.18619988858699799, + "learning_rate": 0.0009969310995264257, + "loss": 3.1785, + "step": 1517 + }, + { + "epoch": 0.04501378880882484, + "grad_norm": 0.2081875503063202, + "learning_rate": 0.0009969258923054648, + "loss": 3.1677, + "step": 1518 + }, + { + "epoch": 0.045043442161136316, + "grad_norm": 0.20022568106651306, + "learning_rate": 0.00099692068068414, + "loss": 3.2254, + "step": 1519 + }, + { + "epoch": 0.0450730955134478, + "grad_norm": 0.16186517477035522, + "learning_rate": 0.0009969154646624972, + "loss": 3.2459, + "step": 1520 + }, + { + "epoch": 0.045102748865759275, + "grad_norm": 0.16302476823329926, + "learning_rate": 0.0009969102442405826, + "loss": 3.2036, + "step": 1521 + }, + { + "epoch": 0.04513240221807075, + "grad_norm": 0.15559126436710358, + "learning_rate": 0.0009969050194184425, + "loss": 3.1935, + "step": 1522 + }, + { + "epoch": 0.045162055570382234, + "grad_norm": 0.14174950122833252, + "learning_rate": 0.0009968997901961233, + "loss": 3.1462, + "step": 1523 + }, + { + "epoch": 0.04519170892269371, + "grad_norm": 0.1220371276140213, + "learning_rate": 0.000996894556573671, + "loss": 3.1625, + "step": 1524 + }, + { + "epoch": 0.045221362275005186, + "grad_norm": 0.12115288525819778, + "learning_rate": 0.0009968893185511322, + "loss": 3.1909, + "step": 1525 + }, + { + "epoch": 0.04525101562731667, + "grad_norm": 0.11092729866504669, + "learning_rate": 0.000996884076128553, + "loss": 3.182, + "step": 1526 + }, + { + "epoch": 0.045280668979628146, + "grad_norm": 0.1176074668765068, + "learning_rate": 0.0009968788293059803, + "loss": 3.1516, + "step": 1527 + }, + { + "epoch": 0.04531032233193963, + "grad_norm": 0.1304144561290741, + "learning_rate": 0.0009968735780834603, + "loss": 3.1722, + "step": 1528 + }, + { + "epoch": 0.045339975684251105, + "grad_norm": 0.13852503895759583, + "learning_rate": 0.0009968683224610394, + "loss": 3.172, + "step": 1529 + }, + { + "epoch": 0.04536962903656258, + "grad_norm": 0.14756610989570618, + "learning_rate": 0.000996863062438764, + "loss": 3.1579, + "step": 1530 + }, + { + "epoch": 0.045399282388874064, + "grad_norm": 0.14955000579357147, + "learning_rate": 0.000996857798016681, + "loss": 3.1241, + "step": 1531 + }, + { + "epoch": 0.04542893574118554, + "grad_norm": 0.1634589582681656, + "learning_rate": 0.0009968525291948372, + "loss": 3.2281, + "step": 1532 + }, + { + "epoch": 0.04545858909349702, + "grad_norm": 0.2077220380306244, + "learning_rate": 0.0009968472559732787, + "loss": 3.185, + "step": 1533 + }, + { + "epoch": 0.0454882424458085, + "grad_norm": 0.20334672927856445, + "learning_rate": 0.0009968419783520524, + "loss": 3.2101, + "step": 1534 + }, + { + "epoch": 0.045517895798119976, + "grad_norm": 0.17740599811077118, + "learning_rate": 0.0009968366963312052, + "loss": 3.1636, + "step": 1535 + }, + { + "epoch": 0.04554754915043146, + "grad_norm": 0.15518805384635925, + "learning_rate": 0.0009968314099107838, + "loss": 3.1727, + "step": 1536 + }, + { + "epoch": 0.045577202502742935, + "grad_norm": 0.17502427101135254, + "learning_rate": 0.000996826119090835, + "loss": 3.1917, + "step": 1537 + }, + { + "epoch": 0.04560685585505441, + "grad_norm": 0.1894330233335495, + "learning_rate": 0.0009968208238714056, + "loss": 3.1758, + "step": 1538 + }, + { + "epoch": 0.045636509207365894, + "grad_norm": 0.15970078110694885, + "learning_rate": 0.0009968155242525425, + "loss": 3.2296, + "step": 1539 + }, + { + "epoch": 0.04566616255967737, + "grad_norm": 0.1596508026123047, + "learning_rate": 0.0009968102202342927, + "loss": 3.1752, + "step": 1540 + }, + { + "epoch": 0.04569581591198885, + "grad_norm": 0.16031008958816528, + "learning_rate": 0.000996804911816703, + "loss": 3.158, + "step": 1541 + }, + { + "epoch": 0.04572546926430033, + "grad_norm": 0.15984508395195007, + "learning_rate": 0.000996799598999821, + "loss": 3.1818, + "step": 1542 + }, + { + "epoch": 0.045755122616611806, + "grad_norm": 0.1720057725906372, + "learning_rate": 0.0009967942817836928, + "loss": 3.1675, + "step": 1543 + }, + { + "epoch": 0.04578477596892329, + "grad_norm": 0.1628018617630005, + "learning_rate": 0.000996788960168366, + "loss": 3.1811, + "step": 1544 + }, + { + "epoch": 0.045814429321234765, + "grad_norm": 0.16023200750350952, + "learning_rate": 0.0009967836341538878, + "loss": 3.1965, + "step": 1545 + }, + { + "epoch": 0.04584408267354625, + "grad_norm": 0.1806061863899231, + "learning_rate": 0.0009967783037403053, + "loss": 3.198, + "step": 1546 + }, + { + "epoch": 0.045873736025857724, + "grad_norm": 0.1953926831483841, + "learning_rate": 0.0009967729689276655, + "loss": 3.2095, + "step": 1547 + }, + { + "epoch": 0.0459033893781692, + "grad_norm": 0.19415943324565887, + "learning_rate": 0.0009967676297160158, + "loss": 3.1499, + "step": 1548 + }, + { + "epoch": 0.04593304273048068, + "grad_norm": 0.20064815878868103, + "learning_rate": 0.0009967622861054035, + "loss": 3.1816, + "step": 1549 + }, + { + "epoch": 0.04596269608279216, + "grad_norm": 0.19666296243667603, + "learning_rate": 0.000996756938095876, + "loss": 3.1793, + "step": 1550 + }, + { + "epoch": 0.045992349435103635, + "grad_norm": 0.19220072031021118, + "learning_rate": 0.0009967515856874804, + "loss": 3.1607, + "step": 1551 + }, + { + "epoch": 0.04602200278741512, + "grad_norm": 0.20447592437267303, + "learning_rate": 0.0009967462288802643, + "loss": 3.1988, + "step": 1552 + }, + { + "epoch": 0.046051656139726595, + "grad_norm": 0.1558152139186859, + "learning_rate": 0.0009967408676742752, + "loss": 3.1993, + "step": 1553 + }, + { + "epoch": 0.04608130949203808, + "grad_norm": 0.15485714375972748, + "learning_rate": 0.0009967355020695603, + "loss": 3.175, + "step": 1554 + }, + { + "epoch": 0.046110962844349554, + "grad_norm": 0.14356942474842072, + "learning_rate": 0.0009967301320661672, + "loss": 3.1441, + "step": 1555 + }, + { + "epoch": 0.04614061619666103, + "grad_norm": 0.17432674765586853, + "learning_rate": 0.0009967247576641437, + "loss": 3.1815, + "step": 1556 + }, + { + "epoch": 0.04617026954897251, + "grad_norm": 0.16643020510673523, + "learning_rate": 0.0009967193788635372, + "loss": 3.1569, + "step": 1557 + }, + { + "epoch": 0.04619992290128399, + "grad_norm": 0.15869277715682983, + "learning_rate": 0.000996713995664395, + "loss": 3.2133, + "step": 1558 + }, + { + "epoch": 0.04622957625359547, + "grad_norm": 0.14219924807548523, + "learning_rate": 0.0009967086080667656, + "loss": 3.1644, + "step": 1559 + }, + { + "epoch": 0.04625922960590695, + "grad_norm": 0.14483623206615448, + "learning_rate": 0.0009967032160706959, + "loss": 3.1882, + "step": 1560 + }, + { + "epoch": 0.046288882958218425, + "grad_norm": 0.13770592212677002, + "learning_rate": 0.0009966978196762342, + "loss": 3.1823, + "step": 1561 + }, + { + "epoch": 0.04631853631052991, + "grad_norm": 0.14131711423397064, + "learning_rate": 0.0009966924188834277, + "loss": 3.2081, + "step": 1562 + }, + { + "epoch": 0.046348189662841384, + "grad_norm": 0.16849827766418457, + "learning_rate": 0.0009966870136923248, + "loss": 3.1857, + "step": 1563 + }, + { + "epoch": 0.04637784301515286, + "grad_norm": 0.17567801475524902, + "learning_rate": 0.000996681604102973, + "loss": 3.1732, + "step": 1564 + }, + { + "epoch": 0.04640749636746434, + "grad_norm": 0.18932205438613892, + "learning_rate": 0.0009966761901154207, + "loss": 3.2113, + "step": 1565 + }, + { + "epoch": 0.04643714971977582, + "grad_norm": 0.18117894232273102, + "learning_rate": 0.0009966707717297151, + "loss": 3.1945, + "step": 1566 + }, + { + "epoch": 0.0464668030720873, + "grad_norm": 0.1611478328704834, + "learning_rate": 0.0009966653489459048, + "loss": 3.2147, + "step": 1567 + }, + { + "epoch": 0.04649645642439878, + "grad_norm": 0.1721586436033249, + "learning_rate": 0.0009966599217640375, + "loss": 3.1663, + "step": 1568 + }, + { + "epoch": 0.046526109776710255, + "grad_norm": 0.16043025255203247, + "learning_rate": 0.0009966544901841613, + "loss": 3.1431, + "step": 1569 + }, + { + "epoch": 0.04655576312902174, + "grad_norm": 0.1600242406129837, + "learning_rate": 0.0009966490542063244, + "loss": 3.1813, + "step": 1570 + }, + { + "epoch": 0.046585416481333214, + "grad_norm": 0.1534273326396942, + "learning_rate": 0.0009966436138305749, + "loss": 3.1817, + "step": 1571 + }, + { + "epoch": 0.0466150698336447, + "grad_norm": 0.18066051602363586, + "learning_rate": 0.000996638169056961, + "loss": 3.2172, + "step": 1572 + }, + { + "epoch": 0.04664472318595617, + "grad_norm": 0.1809774786233902, + "learning_rate": 0.0009966327198855304, + "loss": 3.1915, + "step": 1573 + }, + { + "epoch": 0.04667437653826765, + "grad_norm": 0.15137788653373718, + "learning_rate": 0.0009966272663163324, + "loss": 3.2052, + "step": 1574 + }, + { + "epoch": 0.04670402989057913, + "grad_norm": 0.1746668815612793, + "learning_rate": 0.0009966218083494146, + "loss": 3.156, + "step": 1575 + }, + { + "epoch": 0.04673368324289061, + "grad_norm": 0.15097258985042572, + "learning_rate": 0.0009966163459848253, + "loss": 3.17, + "step": 1576 + }, + { + "epoch": 0.046763336595202085, + "grad_norm": 0.14298182725906372, + "learning_rate": 0.0009966108792226129, + "loss": 3.1521, + "step": 1577 + }, + { + "epoch": 0.04679298994751357, + "grad_norm": 0.1811993569135666, + "learning_rate": 0.0009966054080628262, + "loss": 3.1682, + "step": 1578 + }, + { + "epoch": 0.046822643299825044, + "grad_norm": 0.13808071613311768, + "learning_rate": 0.0009965999325055133, + "loss": 3.1811, + "step": 1579 + }, + { + "epoch": 0.04685229665213653, + "grad_norm": 0.14653487503528595, + "learning_rate": 0.0009965944525507225, + "loss": 3.1926, + "step": 1580 + }, + { + "epoch": 0.046881950004448, + "grad_norm": 0.16048982739448547, + "learning_rate": 0.0009965889681985028, + "loss": 3.1837, + "step": 1581 + }, + { + "epoch": 0.04691160335675948, + "grad_norm": 0.15767627954483032, + "learning_rate": 0.0009965834794489026, + "loss": 3.1689, + "step": 1582 + }, + { + "epoch": 0.04694125670907096, + "grad_norm": 0.16122186183929443, + "learning_rate": 0.0009965779863019704, + "loss": 3.1687, + "step": 1583 + }, + { + "epoch": 0.04697091006138244, + "grad_norm": 0.13416975736618042, + "learning_rate": 0.0009965724887577548, + "loss": 3.1888, + "step": 1584 + }, + { + "epoch": 0.04700056341369392, + "grad_norm": 0.1563858985900879, + "learning_rate": 0.0009965669868163048, + "loss": 3.1997, + "step": 1585 + }, + { + "epoch": 0.0470302167660054, + "grad_norm": 0.16046376526355743, + "learning_rate": 0.0009965614804776687, + "loss": 3.1749, + "step": 1586 + }, + { + "epoch": 0.047059870118316874, + "grad_norm": 0.14426372945308685, + "learning_rate": 0.0009965559697418956, + "loss": 3.1884, + "step": 1587 + }, + { + "epoch": 0.04708952347062836, + "grad_norm": 0.1554444283246994, + "learning_rate": 0.000996550454609034, + "loss": 3.1585, + "step": 1588 + }, + { + "epoch": 0.04711917682293983, + "grad_norm": 0.15588673949241638, + "learning_rate": 0.0009965449350791329, + "loss": 3.1776, + "step": 1589 + }, + { + "epoch": 0.04714883017525131, + "grad_norm": 0.1743895262479782, + "learning_rate": 0.0009965394111522412, + "loss": 3.175, + "step": 1590 + }, + { + "epoch": 0.04717848352756279, + "grad_norm": 0.1781124323606491, + "learning_rate": 0.0009965338828284078, + "loss": 3.1673, + "step": 1591 + }, + { + "epoch": 0.04720813687987427, + "grad_norm": 0.15286381542682648, + "learning_rate": 0.0009965283501076818, + "loss": 3.1174, + "step": 1592 + }, + { + "epoch": 0.04723779023218575, + "grad_norm": 0.13032777607440948, + "learning_rate": 0.0009965228129901118, + "loss": 3.1523, + "step": 1593 + }, + { + "epoch": 0.04726744358449723, + "grad_norm": 0.1401076316833496, + "learning_rate": 0.0009965172714757472, + "loss": 3.1691, + "step": 1594 + }, + { + "epoch": 0.047297096936808704, + "grad_norm": 0.14049266278743744, + "learning_rate": 0.0009965117255646369, + "loss": 3.1719, + "step": 1595 + }, + { + "epoch": 0.04732675028912019, + "grad_norm": 0.15737734735012054, + "learning_rate": 0.00099650617525683, + "loss": 3.1615, + "step": 1596 + }, + { + "epoch": 0.04735640364143166, + "grad_norm": 0.17384354770183563, + "learning_rate": 0.0009965006205523758, + "loss": 3.1543, + "step": 1597 + }, + { + "epoch": 0.047386056993743146, + "grad_norm": 0.17231452465057373, + "learning_rate": 0.0009964950614513232, + "loss": 3.1345, + "step": 1598 + }, + { + "epoch": 0.04741571034605462, + "grad_norm": 0.15380223095417023, + "learning_rate": 0.0009964894979537216, + "loss": 3.1767, + "step": 1599 + }, + { + "epoch": 0.0474453636983661, + "grad_norm": 0.1522851288318634, + "learning_rate": 0.0009964839300596205, + "loss": 3.139, + "step": 1600 + }, + { + "epoch": 0.04747501705067758, + "grad_norm": 0.14666661620140076, + "learning_rate": 0.0009964783577690688, + "loss": 3.1608, + "step": 1601 + }, + { + "epoch": 0.04750467040298906, + "grad_norm": 0.1517452448606491, + "learning_rate": 0.000996472781082116, + "loss": 3.1961, + "step": 1602 + }, + { + "epoch": 0.047534323755300534, + "grad_norm": 0.15507088601589203, + "learning_rate": 0.0009964671999988118, + "loss": 3.1531, + "step": 1603 + }, + { + "epoch": 0.04756397710761202, + "grad_norm": 0.18835672736167908, + "learning_rate": 0.000996461614519205, + "loss": 3.174, + "step": 1604 + }, + { + "epoch": 0.04759363045992349, + "grad_norm": 0.1920921504497528, + "learning_rate": 0.0009964560246433457, + "loss": 3.1641, + "step": 1605 + }, + { + "epoch": 0.047623283812234976, + "grad_norm": 0.17479604482650757, + "learning_rate": 0.0009964504303712827, + "loss": 3.1859, + "step": 1606 + }, + { + "epoch": 0.04765293716454645, + "grad_norm": 0.1660638302564621, + "learning_rate": 0.0009964448317030662, + "loss": 3.1702, + "step": 1607 + }, + { + "epoch": 0.04768259051685793, + "grad_norm": 0.18197117745876312, + "learning_rate": 0.0009964392286387453, + "loss": 3.1846, + "step": 1608 + }, + { + "epoch": 0.04771224386916941, + "grad_norm": 0.17962317168712616, + "learning_rate": 0.00099643362117837, + "loss": 3.1364, + "step": 1609 + }, + { + "epoch": 0.04774189722148089, + "grad_norm": 0.1938111037015915, + "learning_rate": 0.0009964280093219897, + "loss": 3.1587, + "step": 1610 + }, + { + "epoch": 0.04777155057379237, + "grad_norm": 0.20874682068824768, + "learning_rate": 0.0009964223930696541, + "loss": 3.1971, + "step": 1611 + }, + { + "epoch": 0.04780120392610385, + "grad_norm": 0.22782467305660248, + "learning_rate": 0.0009964167724214133, + "loss": 3.1511, + "step": 1612 + }, + { + "epoch": 0.04783085727841532, + "grad_norm": 0.20297861099243164, + "learning_rate": 0.0009964111473773165, + "loss": 3.1649, + "step": 1613 + }, + { + "epoch": 0.047860510630726806, + "grad_norm": 0.19409380853176117, + "learning_rate": 0.0009964055179374138, + "loss": 3.1523, + "step": 1614 + }, + { + "epoch": 0.04789016398303828, + "grad_norm": 0.20282971858978271, + "learning_rate": 0.000996399884101755, + "loss": 3.1461, + "step": 1615 + }, + { + "epoch": 0.04791981733534976, + "grad_norm": 0.18021351099014282, + "learning_rate": 0.00099639424587039, + "loss": 3.1677, + "step": 1616 + }, + { + "epoch": 0.04794947068766124, + "grad_norm": 0.19420664012432098, + "learning_rate": 0.0009963886032433688, + "loss": 3.1714, + "step": 1617 + }, + { + "epoch": 0.04797912403997272, + "grad_norm": 0.2124893069267273, + "learning_rate": 0.0009963829562207413, + "loss": 3.1751, + "step": 1618 + }, + { + "epoch": 0.0480087773922842, + "grad_norm": 0.17360728979110718, + "learning_rate": 0.0009963773048025577, + "loss": 3.1768, + "step": 1619 + }, + { + "epoch": 0.04803843074459568, + "grad_norm": 0.1888532042503357, + "learning_rate": 0.0009963716489888677, + "loss": 3.1794, + "step": 1620 + }, + { + "epoch": 0.04806808409690715, + "grad_norm": 0.1585383266210556, + "learning_rate": 0.0009963659887797217, + "loss": 3.198, + "step": 1621 + }, + { + "epoch": 0.048097737449218636, + "grad_norm": 0.19600167870521545, + "learning_rate": 0.0009963603241751695, + "loss": 3.1899, + "step": 1622 + }, + { + "epoch": 0.04812739080153011, + "grad_norm": 0.11210490763187408, + "learning_rate": 0.0009963546551752613, + "loss": 3.1655, + "step": 1623 + }, + { + "epoch": 0.048157044153841595, + "grad_norm": 0.1247895285487175, + "learning_rate": 0.0009963489817800476, + "loss": 3.192, + "step": 1624 + }, + { + "epoch": 0.04818669750615307, + "grad_norm": 0.14794568717479706, + "learning_rate": 0.0009963433039895785, + "loss": 3.19, + "step": 1625 + }, + { + "epoch": 0.04821635085846455, + "grad_norm": 0.11477237194776535, + "learning_rate": 0.0009963376218039043, + "loss": 3.1141, + "step": 1626 + }, + { + "epoch": 0.04824600421077603, + "grad_norm": 0.1412738561630249, + "learning_rate": 0.0009963319352230752, + "loss": 3.1623, + "step": 1627 + }, + { + "epoch": 0.04827565756308751, + "grad_norm": 0.14601512253284454, + "learning_rate": 0.0009963262442471417, + "loss": 3.1841, + "step": 1628 + }, + { + "epoch": 0.04830531091539898, + "grad_norm": 0.13648682832717896, + "learning_rate": 0.000996320548876154, + "loss": 3.1358, + "step": 1629 + }, + { + "epoch": 0.048334964267710466, + "grad_norm": 0.13048428297042847, + "learning_rate": 0.0009963148491101625, + "loss": 3.145, + "step": 1630 + }, + { + "epoch": 0.04836461762002194, + "grad_norm": 0.16633184254169464, + "learning_rate": 0.0009963091449492181, + "loss": 3.1702, + "step": 1631 + }, + { + "epoch": 0.048394270972333425, + "grad_norm": 0.17320100963115692, + "learning_rate": 0.000996303436393371, + "loss": 3.1637, + "step": 1632 + }, + { + "epoch": 0.0484239243246449, + "grad_norm": 0.17310874164104462, + "learning_rate": 0.0009962977234426716, + "loss": 3.1907, + "step": 1633 + }, + { + "epoch": 0.04845357767695638, + "grad_norm": 0.1653977632522583, + "learning_rate": 0.0009962920060971707, + "loss": 3.1406, + "step": 1634 + }, + { + "epoch": 0.04848323102926786, + "grad_norm": 0.15627259016036987, + "learning_rate": 0.000996286284356919, + "loss": 3.1755, + "step": 1635 + }, + { + "epoch": 0.04851288438157934, + "grad_norm": 0.1659286767244339, + "learning_rate": 0.000996280558221967, + "loss": 3.1572, + "step": 1636 + }, + { + "epoch": 0.04854253773389081, + "grad_norm": 0.1901272088289261, + "learning_rate": 0.0009962748276923655, + "loss": 3.1509, + "step": 1637 + }, + { + "epoch": 0.048572191086202296, + "grad_norm": 0.17279964685440063, + "learning_rate": 0.0009962690927681653, + "loss": 3.158, + "step": 1638 + }, + { + "epoch": 0.04860184443851377, + "grad_norm": 0.16860656440258026, + "learning_rate": 0.000996263353449417, + "loss": 3.0937, + "step": 1639 + }, + { + "epoch": 0.048631497790825255, + "grad_norm": 0.16616274416446686, + "learning_rate": 0.0009962576097361715, + "loss": 3.1435, + "step": 1640 + }, + { + "epoch": 0.04866115114313673, + "grad_norm": 0.17831121385097504, + "learning_rate": 0.0009962518616284798, + "loss": 3.1505, + "step": 1641 + }, + { + "epoch": 0.04869080449544821, + "grad_norm": 0.1564028412103653, + "learning_rate": 0.0009962461091263925, + "loss": 3.153, + "step": 1642 + }, + { + "epoch": 0.04872045784775969, + "grad_norm": 0.16095088422298431, + "learning_rate": 0.0009962403522299607, + "loss": 3.1551, + "step": 1643 + }, + { + "epoch": 0.04875011120007117, + "grad_norm": 0.15048366785049438, + "learning_rate": 0.0009962345909392356, + "loss": 3.1699, + "step": 1644 + }, + { + "epoch": 0.04877976455238265, + "grad_norm": 0.1391669511795044, + "learning_rate": 0.000996228825254268, + "loss": 3.1856, + "step": 1645 + }, + { + "epoch": 0.048809417904694126, + "grad_norm": 0.1643586903810501, + "learning_rate": 0.0009962230551751091, + "loss": 3.1662, + "step": 1646 + }, + { + "epoch": 0.0488390712570056, + "grad_norm": 0.13637904822826385, + "learning_rate": 0.0009962172807018096, + "loss": 3.1696, + "step": 1647 + }, + { + "epoch": 0.048868724609317085, + "grad_norm": 0.16011463105678558, + "learning_rate": 0.000996211501834421, + "loss": 3.1616, + "step": 1648 + }, + { + "epoch": 0.04889837796162856, + "grad_norm": 0.19544711709022522, + "learning_rate": 0.0009962057185729945, + "loss": 3.1937, + "step": 1649 + }, + { + "epoch": 0.04892803131394004, + "grad_norm": 0.1933554857969284, + "learning_rate": 0.000996199930917581, + "loss": 3.1287, + "step": 1650 + }, + { + "epoch": 0.04895768466625152, + "grad_norm": 0.1863107979297638, + "learning_rate": 0.000996194138868232, + "loss": 3.165, + "step": 1651 + }, + { + "epoch": 0.048987338018563, + "grad_norm": 0.18985003232955933, + "learning_rate": 0.0009961883424249986, + "loss": 3.1625, + "step": 1652 + }, + { + "epoch": 0.04901699137087448, + "grad_norm": 0.1567056030035019, + "learning_rate": 0.0009961825415879325, + "loss": 3.1417, + "step": 1653 + }, + { + "epoch": 0.049046644723185956, + "grad_norm": 0.14515186846256256, + "learning_rate": 0.0009961767363570848, + "loss": 3.1524, + "step": 1654 + }, + { + "epoch": 0.04907629807549743, + "grad_norm": 0.16232119500637054, + "learning_rate": 0.0009961709267325067, + "loss": 3.1456, + "step": 1655 + }, + { + "epoch": 0.049105951427808915, + "grad_norm": 0.16044345498085022, + "learning_rate": 0.00099616511271425, + "loss": 3.1701, + "step": 1656 + }, + { + "epoch": 0.04913560478012039, + "grad_norm": 0.14801466464996338, + "learning_rate": 0.0009961592943023663, + "loss": 3.1681, + "step": 1657 + }, + { + "epoch": 0.049165258132431874, + "grad_norm": 0.15657329559326172, + "learning_rate": 0.0009961534714969067, + "loss": 3.1416, + "step": 1658 + }, + { + "epoch": 0.04919491148474335, + "grad_norm": 0.17818069458007812, + "learning_rate": 0.000996147644297923, + "loss": 3.1671, + "step": 1659 + }, + { + "epoch": 0.04922456483705483, + "grad_norm": 0.17309406399726868, + "learning_rate": 0.0009961418127054666, + "loss": 3.1471, + "step": 1660 + }, + { + "epoch": 0.04925421818936631, + "grad_norm": 0.1805344671010971, + "learning_rate": 0.0009961359767195893, + "loss": 3.1981, + "step": 1661 + }, + { + "epoch": 0.049283871541677786, + "grad_norm": 0.17988047003746033, + "learning_rate": 0.0009961301363403427, + "loss": 3.1546, + "step": 1662 + }, + { + "epoch": 0.04931352489398926, + "grad_norm": 0.18702332675457, + "learning_rate": 0.0009961242915677787, + "loss": 3.1387, + "step": 1663 + }, + { + "epoch": 0.049343178246300745, + "grad_norm": 0.1745089590549469, + "learning_rate": 0.000996118442401949, + "loss": 3.1503, + "step": 1664 + }, + { + "epoch": 0.04937283159861222, + "grad_norm": 0.1366676688194275, + "learning_rate": 0.0009961125888429054, + "loss": 3.1785, + "step": 1665 + }, + { + "epoch": 0.049402484950923704, + "grad_norm": 0.16553911566734314, + "learning_rate": 0.0009961067308906994, + "loss": 3.1475, + "step": 1666 + }, + { + "epoch": 0.04943213830323518, + "grad_norm": 0.16006770730018616, + "learning_rate": 0.0009961008685453834, + "loss": 3.1577, + "step": 1667 + }, + { + "epoch": 0.04946179165554666, + "grad_norm": 0.16348518431186676, + "learning_rate": 0.000996095001807009, + "loss": 3.1743, + "step": 1668 + }, + { + "epoch": 0.04949144500785814, + "grad_norm": 0.1544448435306549, + "learning_rate": 0.0009960891306756282, + "loss": 3.1642, + "step": 1669 + }, + { + "epoch": 0.049521098360169616, + "grad_norm": 0.16561239957809448, + "learning_rate": 0.000996083255151293, + "loss": 3.1415, + "step": 1670 + }, + { + "epoch": 0.0495507517124811, + "grad_norm": 0.17550653219223022, + "learning_rate": 0.0009960773752340554, + "loss": 3.1491, + "step": 1671 + }, + { + "epoch": 0.049580405064792575, + "grad_norm": 0.15142576396465302, + "learning_rate": 0.0009960714909239673, + "loss": 3.1674, + "step": 1672 + }, + { + "epoch": 0.04961005841710405, + "grad_norm": 0.1575300693511963, + "learning_rate": 0.0009960656022210811, + "loss": 3.1838, + "step": 1673 + }, + { + "epoch": 0.049639711769415534, + "grad_norm": 0.18607032299041748, + "learning_rate": 0.000996059709125449, + "loss": 3.1702, + "step": 1674 + }, + { + "epoch": 0.04966936512172701, + "grad_norm": 0.18077732622623444, + "learning_rate": 0.000996053811637123, + "loss": 3.1296, + "step": 1675 + }, + { + "epoch": 0.04969901847403849, + "grad_norm": 0.18478167057037354, + "learning_rate": 0.0009960479097561553, + "loss": 3.1295, + "step": 1676 + }, + { + "epoch": 0.04972867182634997, + "grad_norm": 0.19129857420921326, + "learning_rate": 0.000996042003482598, + "loss": 3.1621, + "step": 1677 + }, + { + "epoch": 0.049758325178661446, + "grad_norm": 0.170602485537529, + "learning_rate": 0.0009960360928165039, + "loss": 3.127, + "step": 1678 + }, + { + "epoch": 0.04978797853097293, + "grad_norm": 0.1363821029663086, + "learning_rate": 0.000996030177757925, + "loss": 3.1581, + "step": 1679 + }, + { + "epoch": 0.049817631883284405, + "grad_norm": 0.13050633668899536, + "learning_rate": 0.0009960242583069137, + "loss": 3.1276, + "step": 1680 + }, + { + "epoch": 0.04984728523559588, + "grad_norm": 0.1402127742767334, + "learning_rate": 0.0009960183344635226, + "loss": 3.1679, + "step": 1681 + }, + { + "epoch": 0.049876938587907364, + "grad_norm": 0.13234589993953705, + "learning_rate": 0.0009960124062278037, + "loss": 3.1551, + "step": 1682 + }, + { + "epoch": 0.04990659194021884, + "grad_norm": 0.1265239119529724, + "learning_rate": 0.00099600647359981, + "loss": 3.165, + "step": 1683 + }, + { + "epoch": 0.049936245292530324, + "grad_norm": 0.1661902815103531, + "learning_rate": 0.0009960005365795938, + "loss": 3.1048, + "step": 1684 + }, + { + "epoch": 0.0499658986448418, + "grad_norm": 0.20484760403633118, + "learning_rate": 0.0009959945951672079, + "loss": 3.1179, + "step": 1685 + }, + { + "epoch": 0.049995551997153276, + "grad_norm": 0.17668333649635315, + "learning_rate": 0.0009959886493627044, + "loss": 3.1671, + "step": 1686 + }, + { + "epoch": 0.05002520534946476, + "grad_norm": 0.19563740491867065, + "learning_rate": 0.0009959826991661365, + "loss": 3.1537, + "step": 1687 + }, + { + "epoch": 0.050054858701776235, + "grad_norm": 0.19460098445415497, + "learning_rate": 0.0009959767445775565, + "loss": 3.1966, + "step": 1688 + }, + { + "epoch": 0.05008451205408771, + "grad_norm": 0.18405914306640625, + "learning_rate": 0.0009959707855970174, + "loss": 3.1545, + "step": 1689 + }, + { + "epoch": 0.050114165406399194, + "grad_norm": 0.17227736115455627, + "learning_rate": 0.0009959648222245719, + "loss": 3.1721, + "step": 1690 + }, + { + "epoch": 0.05014381875871067, + "grad_norm": 0.16144122183322906, + "learning_rate": 0.0009959588544602726, + "loss": 3.1184, + "step": 1691 + }, + { + "epoch": 0.050173472111022153, + "grad_norm": 0.1618700921535492, + "learning_rate": 0.0009959528823041727, + "loss": 3.1712, + "step": 1692 + }, + { + "epoch": 0.05020312546333363, + "grad_norm": 0.17715413868427277, + "learning_rate": 0.0009959469057563247, + "loss": 3.1594, + "step": 1693 + }, + { + "epoch": 0.050232778815645106, + "grad_norm": 0.1829715222120285, + "learning_rate": 0.0009959409248167818, + "loss": 3.1556, + "step": 1694 + }, + { + "epoch": 0.05026243216795659, + "grad_norm": 0.16900968551635742, + "learning_rate": 0.000995934939485597, + "loss": 3.1403, + "step": 1695 + }, + { + "epoch": 0.050292085520268065, + "grad_norm": 0.17874297499656677, + "learning_rate": 0.0009959289497628232, + "loss": 3.1396, + "step": 1696 + }, + { + "epoch": 0.05032173887257955, + "grad_norm": 0.1686682403087616, + "learning_rate": 0.0009959229556485132, + "loss": 3.1409, + "step": 1697 + }, + { + "epoch": 0.050351392224891024, + "grad_norm": 0.14787741005420685, + "learning_rate": 0.0009959169571427205, + "loss": 3.1358, + "step": 1698 + }, + { + "epoch": 0.0503810455772025, + "grad_norm": 0.16440637409687042, + "learning_rate": 0.000995910954245498, + "loss": 3.1312, + "step": 1699 + }, + { + "epoch": 0.05041069892951398, + "grad_norm": 0.16251756250858307, + "learning_rate": 0.000995904946956899, + "loss": 3.1554, + "step": 1700 + }, + { + "epoch": 0.05044035228182546, + "grad_norm": 0.13482867181301117, + "learning_rate": 0.0009958989352769761, + "loss": 3.1525, + "step": 1701 + }, + { + "epoch": 0.050470005634136936, + "grad_norm": 0.1293991506099701, + "learning_rate": 0.0009958929192057835, + "loss": 3.1327, + "step": 1702 + }, + { + "epoch": 0.05049965898644842, + "grad_norm": 0.15265122056007385, + "learning_rate": 0.0009958868987433736, + "loss": 3.1495, + "step": 1703 + }, + { + "epoch": 0.050529312338759895, + "grad_norm": 0.18871912360191345, + "learning_rate": 0.0009958808738898004, + "loss": 3.1498, + "step": 1704 + }, + { + "epoch": 0.05055896569107138, + "grad_norm": 0.16564558446407318, + "learning_rate": 0.0009958748446451168, + "loss": 3.1507, + "step": 1705 + }, + { + "epoch": 0.050588619043382854, + "grad_norm": 0.16121481359004974, + "learning_rate": 0.0009958688110093764, + "loss": 3.1308, + "step": 1706 + }, + { + "epoch": 0.05061827239569433, + "grad_norm": 0.13778869807720184, + "learning_rate": 0.0009958627729826325, + "loss": 3.1502, + "step": 1707 + }, + { + "epoch": 0.05064792574800581, + "grad_norm": 0.14582081139087677, + "learning_rate": 0.0009958567305649387, + "loss": 3.1451, + "step": 1708 + }, + { + "epoch": 0.05067757910031729, + "grad_norm": 0.17525580525398254, + "learning_rate": 0.0009958506837563484, + "loss": 3.1534, + "step": 1709 + }, + { + "epoch": 0.05070723245262877, + "grad_norm": 0.1791628748178482, + "learning_rate": 0.0009958446325569151, + "loss": 3.164, + "step": 1710 + }, + { + "epoch": 0.05073688580494025, + "grad_norm": 0.17741134762763977, + "learning_rate": 0.0009958385769666927, + "loss": 3.1515, + "step": 1711 + }, + { + "epoch": 0.050766539157251725, + "grad_norm": 0.16404485702514648, + "learning_rate": 0.0009958325169857343, + "loss": 3.1392, + "step": 1712 + }, + { + "epoch": 0.05079619250956321, + "grad_norm": 0.19957496225833893, + "learning_rate": 0.0009958264526140942, + "loss": 3.1737, + "step": 1713 + }, + { + "epoch": 0.050825845861874684, + "grad_norm": 0.1991364061832428, + "learning_rate": 0.0009958203838518255, + "loss": 3.2015, + "step": 1714 + }, + { + "epoch": 0.05085549921418616, + "grad_norm": 0.1578870713710785, + "learning_rate": 0.0009958143106989822, + "loss": 3.1414, + "step": 1715 + }, + { + "epoch": 0.05088515256649764, + "grad_norm": 0.1689874678850174, + "learning_rate": 0.000995808233155618, + "loss": 3.159, + "step": 1716 + }, + { + "epoch": 0.05091480591880912, + "grad_norm": 0.1742061972618103, + "learning_rate": 0.0009958021512217869, + "loss": 3.1542, + "step": 1717 + }, + { + "epoch": 0.0509444592711206, + "grad_norm": 0.20970673859119415, + "learning_rate": 0.0009957960648975428, + "loss": 3.1584, + "step": 1718 + }, + { + "epoch": 0.05097411262343208, + "grad_norm": 0.18217217922210693, + "learning_rate": 0.0009957899741829394, + "loss": 3.1858, + "step": 1719 + }, + { + "epoch": 0.051003765975743555, + "grad_norm": 0.1486641764640808, + "learning_rate": 0.0009957838790780305, + "loss": 3.1591, + "step": 1720 + }, + { + "epoch": 0.05103341932805504, + "grad_norm": 0.1416257619857788, + "learning_rate": 0.0009957777795828703, + "loss": 3.1473, + "step": 1721 + }, + { + "epoch": 0.051063072680366514, + "grad_norm": 0.14143550395965576, + "learning_rate": 0.0009957716756975128, + "loss": 3.0839, + "step": 1722 + }, + { + "epoch": 0.051092726032678, + "grad_norm": 0.17622457444667816, + "learning_rate": 0.000995765567422012, + "loss": 3.1573, + "step": 1723 + }, + { + "epoch": 0.05112237938498947, + "grad_norm": 0.14934232831001282, + "learning_rate": 0.000995759454756422, + "loss": 3.1339, + "step": 1724 + }, + { + "epoch": 0.05115203273730095, + "grad_norm": 0.15859349071979523, + "learning_rate": 0.000995753337700797, + "loss": 3.1559, + "step": 1725 + }, + { + "epoch": 0.05118168608961243, + "grad_norm": 0.17094196379184723, + "learning_rate": 0.000995747216255191, + "loss": 3.1242, + "step": 1726 + }, + { + "epoch": 0.05121133944192391, + "grad_norm": 0.17465606331825256, + "learning_rate": 0.0009957410904196584, + "loss": 3.177, + "step": 1727 + }, + { + "epoch": 0.051240992794235385, + "grad_norm": 0.1558445245027542, + "learning_rate": 0.0009957349601942532, + "loss": 3.1477, + "step": 1728 + }, + { + "epoch": 0.05127064614654687, + "grad_norm": 0.16189788281917572, + "learning_rate": 0.00099572882557903, + "loss": 3.1384, + "step": 1729 + }, + { + "epoch": 0.051300299498858344, + "grad_norm": 0.19792887568473816, + "learning_rate": 0.000995722686574043, + "loss": 3.1447, + "step": 1730 + }, + { + "epoch": 0.05132995285116983, + "grad_norm": 0.20802189409732819, + "learning_rate": 0.0009957165431793463, + "loss": 3.1581, + "step": 1731 + }, + { + "epoch": 0.0513596062034813, + "grad_norm": 0.19560277462005615, + "learning_rate": 0.0009957103953949947, + "loss": 3.2095, + "step": 1732 + }, + { + "epoch": 0.05138925955579278, + "grad_norm": 0.1697721630334854, + "learning_rate": 0.0009957042432210423, + "loss": 3.1671, + "step": 1733 + }, + { + "epoch": 0.05141891290810426, + "grad_norm": 0.16889013350009918, + "learning_rate": 0.0009956980866575437, + "loss": 3.1509, + "step": 1734 + }, + { + "epoch": 0.05144856626041574, + "grad_norm": 0.15944179892539978, + "learning_rate": 0.0009956919257045537, + "loss": 3.1236, + "step": 1735 + }, + { + "epoch": 0.05147821961272722, + "grad_norm": 0.1528543084859848, + "learning_rate": 0.0009956857603621266, + "loss": 3.1705, + "step": 1736 + }, + { + "epoch": 0.0515078729650387, + "grad_norm": 0.1617509424686432, + "learning_rate": 0.000995679590630317, + "loss": 3.1516, + "step": 1737 + }, + { + "epoch": 0.051537526317350174, + "grad_norm": 0.18086081743240356, + "learning_rate": 0.0009956734165091792, + "loss": 3.1595, + "step": 1738 + }, + { + "epoch": 0.05156717966966166, + "grad_norm": 0.1659056544303894, + "learning_rate": 0.0009956672379987685, + "loss": 3.1409, + "step": 1739 + }, + { + "epoch": 0.05159683302197313, + "grad_norm": 0.1484328657388687, + "learning_rate": 0.0009956610550991393, + "loss": 3.1139, + "step": 1740 + }, + { + "epoch": 0.05162648637428461, + "grad_norm": 0.15620620548725128, + "learning_rate": 0.0009956548678103465, + "loss": 3.1719, + "step": 1741 + }, + { + "epoch": 0.05165613972659609, + "grad_norm": 0.16199877858161926, + "learning_rate": 0.0009956486761324445, + "loss": 3.1755, + "step": 1742 + }, + { + "epoch": 0.05168579307890757, + "grad_norm": 0.1389780342578888, + "learning_rate": 0.0009956424800654886, + "loss": 3.1565, + "step": 1743 + }, + { + "epoch": 0.05171544643121905, + "grad_norm": 0.15736082196235657, + "learning_rate": 0.0009956362796095335, + "loss": 3.1547, + "step": 1744 + }, + { + "epoch": 0.05174509978353053, + "grad_norm": 0.1441192477941513, + "learning_rate": 0.0009956300747646339, + "loss": 3.1457, + "step": 1745 + }, + { + "epoch": 0.051774753135842004, + "grad_norm": 0.15667304396629333, + "learning_rate": 0.000995623865530845, + "loss": 3.1429, + "step": 1746 + }, + { + "epoch": 0.05180440648815349, + "grad_norm": 0.19083663821220398, + "learning_rate": 0.0009956176519082217, + "loss": 3.1548, + "step": 1747 + }, + { + "epoch": 0.05183405984046496, + "grad_norm": 0.16475000977516174, + "learning_rate": 0.000995611433896819, + "loss": 3.1487, + "step": 1748 + }, + { + "epoch": 0.051863713192776446, + "grad_norm": 0.12803533673286438, + "learning_rate": 0.000995605211496692, + "loss": 3.1297, + "step": 1749 + }, + { + "epoch": 0.05189336654508792, + "grad_norm": 0.15195195376873016, + "learning_rate": 0.0009955989847078958, + "loss": 3.1116, + "step": 1750 + }, + { + "epoch": 0.0519230198973994, + "grad_norm": 0.15747974812984467, + "learning_rate": 0.0009955927535304854, + "loss": 3.1569, + "step": 1751 + }, + { + "epoch": 0.05195267324971088, + "grad_norm": 0.14747905731201172, + "learning_rate": 0.0009955865179645162, + "loss": 3.1354, + "step": 1752 + }, + { + "epoch": 0.05198232660202236, + "grad_norm": 0.1503918468952179, + "learning_rate": 0.0009955802780100434, + "loss": 3.1217, + "step": 1753 + }, + { + "epoch": 0.052011979954333834, + "grad_norm": 0.14971989393234253, + "learning_rate": 0.0009955740336671222, + "loss": 3.105, + "step": 1754 + }, + { + "epoch": 0.05204163330664532, + "grad_norm": 0.15212392807006836, + "learning_rate": 0.0009955677849358077, + "loss": 3.1223, + "step": 1755 + }, + { + "epoch": 0.05207128665895679, + "grad_norm": 0.1546335518360138, + "learning_rate": 0.0009955615318161554, + "loss": 3.1437, + "step": 1756 + }, + { + "epoch": 0.052100940011268276, + "grad_norm": 0.15852467715740204, + "learning_rate": 0.0009955552743082209, + "loss": 3.1261, + "step": 1757 + }, + { + "epoch": 0.05213059336357975, + "grad_norm": 0.17211604118347168, + "learning_rate": 0.000995549012412059, + "loss": 3.1023, + "step": 1758 + }, + { + "epoch": 0.05216024671589123, + "grad_norm": 0.1985173225402832, + "learning_rate": 0.0009955427461277259, + "loss": 3.1556, + "step": 1759 + }, + { + "epoch": 0.05218990006820271, + "grad_norm": 0.2073604315519333, + "learning_rate": 0.0009955364754552765, + "loss": 3.1237, + "step": 1760 + }, + { + "epoch": 0.05221955342051419, + "grad_norm": 0.2369944155216217, + "learning_rate": 0.0009955302003947666, + "loss": 3.1494, + "step": 1761 + }, + { + "epoch": 0.05224920677282567, + "grad_norm": 0.21948029100894928, + "learning_rate": 0.0009955239209462519, + "loss": 3.1446, + "step": 1762 + }, + { + "epoch": 0.05227886012513715, + "grad_norm": 0.18879923224449158, + "learning_rate": 0.0009955176371097877, + "loss": 3.1467, + "step": 1763 + }, + { + "epoch": 0.05230851347744862, + "grad_norm": 0.19748468697071075, + "learning_rate": 0.0009955113488854296, + "loss": 3.1634, + "step": 1764 + }, + { + "epoch": 0.052338166829760106, + "grad_norm": 0.19093650579452515, + "learning_rate": 0.0009955050562732335, + "loss": 3.1001, + "step": 1765 + }, + { + "epoch": 0.05236782018207158, + "grad_norm": 0.19175007939338684, + "learning_rate": 0.0009954987592732552, + "loss": 3.1219, + "step": 1766 + }, + { + "epoch": 0.05239747353438306, + "grad_norm": 0.21448463201522827, + "learning_rate": 0.0009954924578855504, + "loss": 3.1808, + "step": 1767 + }, + { + "epoch": 0.05242712688669454, + "grad_norm": 0.17156259715557098, + "learning_rate": 0.0009954861521101748, + "loss": 3.1333, + "step": 1768 + }, + { + "epoch": 0.05245678023900602, + "grad_norm": 0.1757633239030838, + "learning_rate": 0.000995479841947184, + "loss": 3.1151, + "step": 1769 + }, + { + "epoch": 0.0524864335913175, + "grad_norm": 0.16650258004665375, + "learning_rate": 0.0009954735273966344, + "loss": 3.1188, + "step": 1770 + }, + { + "epoch": 0.05251608694362898, + "grad_norm": 0.14785106480121613, + "learning_rate": 0.0009954672084585817, + "loss": 3.1119, + "step": 1771 + }, + { + "epoch": 0.05254574029594045, + "grad_norm": 0.13176459074020386, + "learning_rate": 0.0009954608851330817, + "loss": 3.1239, + "step": 1772 + }, + { + "epoch": 0.052575393648251936, + "grad_norm": 0.1501159965991974, + "learning_rate": 0.0009954545574201905, + "loss": 3.1449, + "step": 1773 + }, + { + "epoch": 0.05260504700056341, + "grad_norm": 0.14709195494651794, + "learning_rate": 0.0009954482253199644, + "loss": 3.1647, + "step": 1774 + }, + { + "epoch": 0.052634700352874896, + "grad_norm": 0.14099110662937164, + "learning_rate": 0.000995441888832459, + "loss": 3.1296, + "step": 1775 + }, + { + "epoch": 0.05266435370518637, + "grad_norm": 0.12300805747509003, + "learning_rate": 0.0009954355479577306, + "loss": 3.1288, + "step": 1776 + }, + { + "epoch": 0.05269400705749785, + "grad_norm": 0.12295231968164444, + "learning_rate": 0.0009954292026958355, + "loss": 3.1104, + "step": 1777 + }, + { + "epoch": 0.05272366040980933, + "grad_norm": 0.12879975140094757, + "learning_rate": 0.0009954228530468297, + "loss": 3.1284, + "step": 1778 + }, + { + "epoch": 0.05275331376212081, + "grad_norm": 0.12339261174201965, + "learning_rate": 0.0009954164990107694, + "loss": 3.1205, + "step": 1779 + }, + { + "epoch": 0.05278296711443228, + "grad_norm": 0.1209510788321495, + "learning_rate": 0.0009954101405877111, + "loss": 3.1238, + "step": 1780 + }, + { + "epoch": 0.052812620466743766, + "grad_norm": 0.14571043848991394, + "learning_rate": 0.0009954037777777111, + "loss": 3.0951, + "step": 1781 + }, + { + "epoch": 0.05284227381905524, + "grad_norm": 0.16386014223098755, + "learning_rate": 0.0009953974105808255, + "loss": 3.1452, + "step": 1782 + }, + { + "epoch": 0.052871927171366725, + "grad_norm": 0.21412444114685059, + "learning_rate": 0.0009953910389971109, + "loss": 3.1621, + "step": 1783 + }, + { + "epoch": 0.0529015805236782, + "grad_norm": 0.2102348506450653, + "learning_rate": 0.0009953846630266234, + "loss": 3.1419, + "step": 1784 + }, + { + "epoch": 0.05293123387598968, + "grad_norm": 0.19540438055992126, + "learning_rate": 0.0009953782826694197, + "loss": 3.1669, + "step": 1785 + }, + { + "epoch": 0.05296088722830116, + "grad_norm": 0.18225890398025513, + "learning_rate": 0.0009953718979255563, + "loss": 3.1106, + "step": 1786 + }, + { + "epoch": 0.05299054058061264, + "grad_norm": 0.17170126736164093, + "learning_rate": 0.0009953655087950896, + "loss": 3.1188, + "step": 1787 + }, + { + "epoch": 0.05302019393292412, + "grad_norm": 0.16529208421707153, + "learning_rate": 0.0009953591152780765, + "loss": 3.1009, + "step": 1788 + }, + { + "epoch": 0.053049847285235596, + "grad_norm": 0.17111149430274963, + "learning_rate": 0.0009953527173745735, + "loss": 3.1254, + "step": 1789 + }, + { + "epoch": 0.05307950063754707, + "grad_norm": 0.16362956166267395, + "learning_rate": 0.000995346315084637, + "loss": 3.1017, + "step": 1790 + }, + { + "epoch": 0.053109153989858555, + "grad_norm": 0.17184926569461823, + "learning_rate": 0.0009953399084083239, + "loss": 3.1205, + "step": 1791 + }, + { + "epoch": 0.05313880734217003, + "grad_norm": 0.17222459614276886, + "learning_rate": 0.0009953334973456908, + "loss": 3.1453, + "step": 1792 + }, + { + "epoch": 0.05316846069448151, + "grad_norm": 0.1437840610742569, + "learning_rate": 0.0009953270818967945, + "loss": 3.1347, + "step": 1793 + }, + { + "epoch": 0.05319811404679299, + "grad_norm": 0.17168152332305908, + "learning_rate": 0.0009953206620616922, + "loss": 3.112, + "step": 1794 + }, + { + "epoch": 0.05322776739910447, + "grad_norm": 0.19084542989730835, + "learning_rate": 0.00099531423784044, + "loss": 3.1525, + "step": 1795 + }, + { + "epoch": 0.05325742075141595, + "grad_norm": 0.21168695390224457, + "learning_rate": 0.0009953078092330954, + "loss": 3.1401, + "step": 1796 + }, + { + "epoch": 0.053287074103727426, + "grad_norm": 0.1984683722257614, + "learning_rate": 0.0009953013762397152, + "loss": 3.1567, + "step": 1797 + }, + { + "epoch": 0.0533167274560389, + "grad_norm": 0.1696804016828537, + "learning_rate": 0.0009952949388603563, + "loss": 3.1275, + "step": 1798 + }, + { + "epoch": 0.053346380808350385, + "grad_norm": 0.16823208332061768, + "learning_rate": 0.0009952884970950756, + "loss": 3.1067, + "step": 1799 + }, + { + "epoch": 0.05337603416066186, + "grad_norm": 0.1622621715068817, + "learning_rate": 0.0009952820509439302, + "loss": 3.1361, + "step": 1800 + }, + { + "epoch": 0.053405687512973345, + "grad_norm": 0.1671653836965561, + "learning_rate": 0.0009952756004069775, + "loss": 3.1408, + "step": 1801 + }, + { + "epoch": 0.05343534086528482, + "grad_norm": 0.16666610538959503, + "learning_rate": 0.000995269145484274, + "loss": 3.1188, + "step": 1802 + }, + { + "epoch": 0.0534649942175963, + "grad_norm": 0.1579926460981369, + "learning_rate": 0.0009952626861758774, + "loss": 3.1618, + "step": 1803 + }, + { + "epoch": 0.05349464756990778, + "grad_norm": 0.15322642028331757, + "learning_rate": 0.0009952562224818447, + "loss": 3.1136, + "step": 1804 + }, + { + "epoch": 0.053524300922219256, + "grad_norm": 0.1612243503332138, + "learning_rate": 0.0009952497544022329, + "loss": 3.1515, + "step": 1805 + }, + { + "epoch": 0.05355395427453073, + "grad_norm": 0.15637637674808502, + "learning_rate": 0.0009952432819370998, + "loss": 3.1419, + "step": 1806 + }, + { + "epoch": 0.053583607626842215, + "grad_norm": 0.1428048461675644, + "learning_rate": 0.0009952368050865023, + "loss": 3.1628, + "step": 1807 + }, + { + "epoch": 0.05361326097915369, + "grad_norm": 0.16324622929096222, + "learning_rate": 0.000995230323850498, + "loss": 3.1377, + "step": 1808 + }, + { + "epoch": 0.053642914331465175, + "grad_norm": 0.14183054864406586, + "learning_rate": 0.000995223838229144, + "loss": 3.1282, + "step": 1809 + }, + { + "epoch": 0.05367256768377665, + "grad_norm": 0.16275514662265778, + "learning_rate": 0.000995217348222498, + "loss": 3.1045, + "step": 1810 + }, + { + "epoch": 0.05370222103608813, + "grad_norm": 0.14490559697151184, + "learning_rate": 0.0009952108538306176, + "loss": 3.1103, + "step": 1811 + }, + { + "epoch": 0.05373187438839961, + "grad_norm": 0.12230914831161499, + "learning_rate": 0.0009952043550535597, + "loss": 3.1429, + "step": 1812 + }, + { + "epoch": 0.053761527740711086, + "grad_norm": 0.11764136701822281, + "learning_rate": 0.0009951978518913825, + "loss": 3.0973, + "step": 1813 + }, + { + "epoch": 0.05379118109302257, + "grad_norm": 0.14331097900867462, + "learning_rate": 0.0009951913443441431, + "loss": 3.1144, + "step": 1814 + }, + { + "epoch": 0.053820834445334045, + "grad_norm": 0.156180277466774, + "learning_rate": 0.0009951848324118995, + "loss": 3.1141, + "step": 1815 + }, + { + "epoch": 0.05385048779764552, + "grad_norm": 0.16896530985832214, + "learning_rate": 0.0009951783160947092, + "loss": 3.1028, + "step": 1816 + }, + { + "epoch": 0.053880141149957005, + "grad_norm": 0.18178465962409973, + "learning_rate": 0.00099517179539263, + "loss": 3.0651, + "step": 1817 + }, + { + "epoch": 0.05390979450226848, + "grad_norm": 0.19785098731517792, + "learning_rate": 0.0009951652703057195, + "loss": 3.118, + "step": 1818 + }, + { + "epoch": 0.05393944785457996, + "grad_norm": 0.24093502759933472, + "learning_rate": 0.0009951587408340355, + "loss": 3.1607, + "step": 1819 + }, + { + "epoch": 0.05396910120689144, + "grad_norm": 0.2037716507911682, + "learning_rate": 0.0009951522069776358, + "loss": 3.1084, + "step": 1820 + }, + { + "epoch": 0.053998754559202916, + "grad_norm": 0.19441190361976624, + "learning_rate": 0.0009951456687365783, + "loss": 3.1519, + "step": 1821 + }, + { + "epoch": 0.0540284079115144, + "grad_norm": 0.17325802147388458, + "learning_rate": 0.000995139126110921, + "loss": 3.1212, + "step": 1822 + }, + { + "epoch": 0.054058061263825875, + "grad_norm": 0.1531010866165161, + "learning_rate": 0.0009951325791007217, + "loss": 3.0973, + "step": 1823 + }, + { + "epoch": 0.05408771461613735, + "grad_norm": 0.1453300267457962, + "learning_rate": 0.0009951260277060385, + "loss": 3.1255, + "step": 1824 + }, + { + "epoch": 0.054117367968448835, + "grad_norm": 0.1643105149269104, + "learning_rate": 0.0009951194719269292, + "loss": 3.1183, + "step": 1825 + }, + { + "epoch": 0.05414702132076031, + "grad_norm": 0.16649304330348969, + "learning_rate": 0.000995112911763452, + "loss": 3.1121, + "step": 1826 + }, + { + "epoch": 0.054176674673071794, + "grad_norm": 0.1615777462720871, + "learning_rate": 0.0009951063472156652, + "loss": 3.1226, + "step": 1827 + }, + { + "epoch": 0.05420632802538327, + "grad_norm": 0.14454534649848938, + "learning_rate": 0.0009950997782836267, + "loss": 3.1303, + "step": 1828 + }, + { + "epoch": 0.054235981377694746, + "grad_norm": 0.12891048192977905, + "learning_rate": 0.0009950932049673945, + "loss": 3.1673, + "step": 1829 + }, + { + "epoch": 0.05426563473000623, + "grad_norm": 0.13621163368225098, + "learning_rate": 0.000995086627267027, + "loss": 3.199, + "step": 1830 + }, + { + "epoch": 0.054295288082317705, + "grad_norm": 0.16365551948547363, + "learning_rate": 0.0009950800451825825, + "loss": 3.1437, + "step": 1831 + }, + { + "epoch": 0.05432494143462918, + "grad_norm": 0.2049248069524765, + "learning_rate": 0.0009950734587141192, + "loss": 3.1266, + "step": 1832 + }, + { + "epoch": 0.054354594786940665, + "grad_norm": 0.20344169437885284, + "learning_rate": 0.0009950668678616954, + "loss": 3.1512, + "step": 1833 + }, + { + "epoch": 0.05438424813925214, + "grad_norm": 0.2088024616241455, + "learning_rate": 0.0009950602726253696, + "loss": 3.1427, + "step": 1834 + }, + { + "epoch": 0.054413901491563624, + "grad_norm": 0.15708917379379272, + "learning_rate": 0.0009950536730052, + "loss": 3.112, + "step": 1835 + }, + { + "epoch": 0.0544435548438751, + "grad_norm": 0.14405570924282074, + "learning_rate": 0.0009950470690012452, + "loss": 3.1044, + "step": 1836 + }, + { + "epoch": 0.054473208196186576, + "grad_norm": 0.15644440054893494, + "learning_rate": 0.0009950404606135638, + "loss": 3.1392, + "step": 1837 + }, + { + "epoch": 0.05450286154849806, + "grad_norm": 0.14508774876594543, + "learning_rate": 0.0009950338478422137, + "loss": 3.1365, + "step": 1838 + }, + { + "epoch": 0.054532514900809535, + "grad_norm": 0.14215120673179626, + "learning_rate": 0.0009950272306872543, + "loss": 3.0903, + "step": 1839 + }, + { + "epoch": 0.05456216825312102, + "grad_norm": 0.14577119052410126, + "learning_rate": 0.000995020609148744, + "loss": 3.1082, + "step": 1840 + }, + { + "epoch": 0.054591821605432495, + "grad_norm": 0.1488174945116043, + "learning_rate": 0.0009950139832267408, + "loss": 3.1079, + "step": 1841 + }, + { + "epoch": 0.05462147495774397, + "grad_norm": 0.15683677792549133, + "learning_rate": 0.000995007352921304, + "loss": 3.1165, + "step": 1842 + }, + { + "epoch": 0.054651128310055454, + "grad_norm": 0.1391807198524475, + "learning_rate": 0.000995000718232492, + "loss": 3.1005, + "step": 1843 + }, + { + "epoch": 0.05468078166236693, + "grad_norm": 0.14337046444416046, + "learning_rate": 0.0009949940791603637, + "loss": 3.151, + "step": 1844 + }, + { + "epoch": 0.054710435014678406, + "grad_norm": 0.13950394093990326, + "learning_rate": 0.0009949874357049779, + "loss": 3.1178, + "step": 1845 + }, + { + "epoch": 0.05474008836698989, + "grad_norm": 0.1369420737028122, + "learning_rate": 0.0009949807878663936, + "loss": 3.1104, + "step": 1846 + }, + { + "epoch": 0.054769741719301365, + "grad_norm": 0.14293219149112701, + "learning_rate": 0.0009949741356446691, + "loss": 3.1105, + "step": 1847 + }, + { + "epoch": 0.05479939507161285, + "grad_norm": 0.15976610779762268, + "learning_rate": 0.0009949674790398638, + "loss": 3.1429, + "step": 1848 + }, + { + "epoch": 0.054829048423924324, + "grad_norm": 0.1753707379102707, + "learning_rate": 0.0009949608180520365, + "loss": 3.1002, + "step": 1849 + }, + { + "epoch": 0.0548587017762358, + "grad_norm": 0.16530197858810425, + "learning_rate": 0.0009949541526812463, + "loss": 3.1001, + "step": 1850 + }, + { + "epoch": 0.054888355128547284, + "grad_norm": 0.17241324484348297, + "learning_rate": 0.000994947482927552, + "loss": 3.0945, + "step": 1851 + }, + { + "epoch": 0.05491800848085876, + "grad_norm": 0.16707314550876617, + "learning_rate": 0.0009949408087910128, + "loss": 3.1152, + "step": 1852 + }, + { + "epoch": 0.05494766183317024, + "grad_norm": 0.16133993864059448, + "learning_rate": 0.0009949341302716878, + "loss": 3.154, + "step": 1853 + }, + { + "epoch": 0.05497731518548172, + "grad_norm": 0.16790835559368134, + "learning_rate": 0.0009949274473696362, + "loss": 3.1002, + "step": 1854 + }, + { + "epoch": 0.055006968537793195, + "grad_norm": 0.16087956726551056, + "learning_rate": 0.0009949207600849171, + "loss": 3.1439, + "step": 1855 + }, + { + "epoch": 0.05503662189010468, + "grad_norm": 0.2091078907251358, + "learning_rate": 0.0009949140684175897, + "loss": 3.1144, + "step": 1856 + }, + { + "epoch": 0.055066275242416154, + "grad_norm": 0.22255004942417145, + "learning_rate": 0.0009949073723677132, + "loss": 3.1206, + "step": 1857 + }, + { + "epoch": 0.05509592859472763, + "grad_norm": 0.1990332454442978, + "learning_rate": 0.0009949006719353472, + "loss": 3.1139, + "step": 1858 + }, + { + "epoch": 0.055125581947039114, + "grad_norm": 0.17988665401935577, + "learning_rate": 0.0009948939671205505, + "loss": 3.1118, + "step": 1859 + }, + { + "epoch": 0.05515523529935059, + "grad_norm": 0.17880135774612427, + "learning_rate": 0.0009948872579233828, + "loss": 3.1514, + "step": 1860 + }, + { + "epoch": 0.05518488865166207, + "grad_norm": 0.18074102699756622, + "learning_rate": 0.0009948805443439036, + "loss": 3.082, + "step": 1861 + }, + { + "epoch": 0.05521454200397355, + "grad_norm": 0.16740421950817108, + "learning_rate": 0.0009948738263821722, + "loss": 3.1066, + "step": 1862 + }, + { + "epoch": 0.055244195356285025, + "grad_norm": 0.1713441014289856, + "learning_rate": 0.0009948671040382483, + "loss": 3.1103, + "step": 1863 + }, + { + "epoch": 0.05527384870859651, + "grad_norm": 0.19510500133037567, + "learning_rate": 0.000994860377312191, + "loss": 3.1402, + "step": 1864 + }, + { + "epoch": 0.055303502060907984, + "grad_norm": 0.2106478214263916, + "learning_rate": 0.00099485364620406, + "loss": 3.1394, + "step": 1865 + }, + { + "epoch": 0.05533315541321947, + "grad_norm": 0.1909620463848114, + "learning_rate": 0.0009948469107139153, + "loss": 3.0929, + "step": 1866 + }, + { + "epoch": 0.055362808765530944, + "grad_norm": 0.17605902254581451, + "learning_rate": 0.0009948401708418163, + "loss": 3.1058, + "step": 1867 + }, + { + "epoch": 0.05539246211784242, + "grad_norm": 0.15627679228782654, + "learning_rate": 0.0009948334265878224, + "loss": 3.144, + "step": 1868 + }, + { + "epoch": 0.0554221154701539, + "grad_norm": 0.17265313863754272, + "learning_rate": 0.0009948266779519937, + "loss": 3.1176, + "step": 1869 + }, + { + "epoch": 0.05545176882246538, + "grad_norm": 0.17847858369350433, + "learning_rate": 0.0009948199249343898, + "loss": 3.1169, + "step": 1870 + }, + { + "epoch": 0.055481422174776855, + "grad_norm": 0.14710073173046112, + "learning_rate": 0.0009948131675350707, + "loss": 3.1416, + "step": 1871 + }, + { + "epoch": 0.05551107552708834, + "grad_norm": 0.14508011937141418, + "learning_rate": 0.000994806405754096, + "loss": 3.1406, + "step": 1872 + }, + { + "epoch": 0.055540728879399814, + "grad_norm": 0.15583771467208862, + "learning_rate": 0.0009947996395915253, + "loss": 3.1304, + "step": 1873 + }, + { + "epoch": 0.0555703822317113, + "grad_norm": 0.15295331180095673, + "learning_rate": 0.0009947928690474193, + "loss": 3.1463, + "step": 1874 + }, + { + "epoch": 0.055600035584022774, + "grad_norm": 0.15520413219928741, + "learning_rate": 0.0009947860941218374, + "loss": 3.0981, + "step": 1875 + }, + { + "epoch": 0.05562968893633425, + "grad_norm": 0.14259016513824463, + "learning_rate": 0.0009947793148148397, + "loss": 3.1413, + "step": 1876 + }, + { + "epoch": 0.05565934228864573, + "grad_norm": 0.12932612001895905, + "learning_rate": 0.0009947725311264862, + "loss": 3.1521, + "step": 1877 + }, + { + "epoch": 0.05568899564095721, + "grad_norm": 0.1717842072248459, + "learning_rate": 0.0009947657430568369, + "loss": 3.1404, + "step": 1878 + }, + { + "epoch": 0.05571864899326869, + "grad_norm": 0.1199052482843399, + "learning_rate": 0.0009947589506059521, + "loss": 3.149, + "step": 1879 + }, + { + "epoch": 0.05574830234558017, + "grad_norm": 0.12362087517976761, + "learning_rate": 0.0009947521537738918, + "loss": 3.132, + "step": 1880 + }, + { + "epoch": 0.055777955697891644, + "grad_norm": 0.14521943032741547, + "learning_rate": 0.0009947453525607163, + "loss": 3.1045, + "step": 1881 + }, + { + "epoch": 0.05580760905020313, + "grad_norm": 0.1427953541278839, + "learning_rate": 0.000994738546966486, + "loss": 3.1197, + "step": 1882 + }, + { + "epoch": 0.055837262402514604, + "grad_norm": 0.15237122774124146, + "learning_rate": 0.0009947317369912608, + "loss": 3.1112, + "step": 1883 + }, + { + "epoch": 0.05586691575482608, + "grad_norm": 0.17318381369113922, + "learning_rate": 0.0009947249226351011, + "loss": 3.1244, + "step": 1884 + }, + { + "epoch": 0.05589656910713756, + "grad_norm": 0.16627350449562073, + "learning_rate": 0.0009947181038980674, + "loss": 3.1263, + "step": 1885 + }, + { + "epoch": 0.05592622245944904, + "grad_norm": 0.17096708714962006, + "learning_rate": 0.00099471128078022, + "loss": 3.0855, + "step": 1886 + }, + { + "epoch": 0.05595587581176052, + "grad_norm": 0.17609743773937225, + "learning_rate": 0.000994704453281619, + "loss": 3.136, + "step": 1887 + }, + { + "epoch": 0.055985529164072, + "grad_norm": 0.18185976147651672, + "learning_rate": 0.0009946976214023253, + "loss": 3.0766, + "step": 1888 + }, + { + "epoch": 0.056015182516383474, + "grad_norm": 0.16613028943538666, + "learning_rate": 0.0009946907851423993, + "loss": 3.1339, + "step": 1889 + }, + { + "epoch": 0.05604483586869496, + "grad_norm": 0.18553870916366577, + "learning_rate": 0.0009946839445019015, + "loss": 3.139, + "step": 1890 + }, + { + "epoch": 0.056074489221006434, + "grad_norm": 0.1891546994447708, + "learning_rate": 0.0009946770994808925, + "loss": 3.1596, + "step": 1891 + }, + { + "epoch": 0.05610414257331792, + "grad_norm": 0.20134082436561584, + "learning_rate": 0.0009946702500794327, + "loss": 3.1078, + "step": 1892 + }, + { + "epoch": 0.05613379592562939, + "grad_norm": 0.19970904290676117, + "learning_rate": 0.000994663396297583, + "loss": 3.1297, + "step": 1893 + }, + { + "epoch": 0.05616344927794087, + "grad_norm": 0.19845564663410187, + "learning_rate": 0.000994656538135404, + "loss": 3.104, + "step": 1894 + }, + { + "epoch": 0.05619310263025235, + "grad_norm": 0.19954083859920502, + "learning_rate": 0.0009946496755929566, + "loss": 3.1522, + "step": 1895 + }, + { + "epoch": 0.05622275598256383, + "grad_norm": 0.17040038108825684, + "learning_rate": 0.0009946428086703013, + "loss": 3.1044, + "step": 1896 + }, + { + "epoch": 0.056252409334875304, + "grad_norm": 0.15712960064411163, + "learning_rate": 0.000994635937367499, + "loss": 3.1085, + "step": 1897 + }, + { + "epoch": 0.05628206268718679, + "grad_norm": 0.1394195556640625, + "learning_rate": 0.0009946290616846107, + "loss": 3.1356, + "step": 1898 + }, + { + "epoch": 0.056311716039498264, + "grad_norm": 0.16438521444797516, + "learning_rate": 0.000994622181621697, + "loss": 3.1238, + "step": 1899 + }, + { + "epoch": 0.05634136939180975, + "grad_norm": 0.1748441457748413, + "learning_rate": 0.000994615297178819, + "loss": 3.1147, + "step": 1900 + }, + { + "epoch": 0.05637102274412122, + "grad_norm": 0.15845583379268646, + "learning_rate": 0.000994608408356038, + "loss": 3.0907, + "step": 1901 + }, + { + "epoch": 0.0564006760964327, + "grad_norm": 0.1611388921737671, + "learning_rate": 0.0009946015151534142, + "loss": 3.1124, + "step": 1902 + }, + { + "epoch": 0.05643032944874418, + "grad_norm": 0.14254498481750488, + "learning_rate": 0.0009945946175710092, + "loss": 3.1053, + "step": 1903 + }, + { + "epoch": 0.05645998280105566, + "grad_norm": 0.13034196197986603, + "learning_rate": 0.000994587715608884, + "loss": 3.1649, + "step": 1904 + }, + { + "epoch": 0.05648963615336714, + "grad_norm": 0.16552548110485077, + "learning_rate": 0.0009945808092670996, + "loss": 3.074, + "step": 1905 + }, + { + "epoch": 0.05651928950567862, + "grad_norm": 0.16100598871707916, + "learning_rate": 0.0009945738985457173, + "loss": 3.1121, + "step": 1906 + }, + { + "epoch": 0.056548942857990094, + "grad_norm": 0.14411579072475433, + "learning_rate": 0.0009945669834447981, + "loss": 3.0934, + "step": 1907 + }, + { + "epoch": 0.05657859621030158, + "grad_norm": 0.1642703413963318, + "learning_rate": 0.0009945600639644037, + "loss": 3.1028, + "step": 1908 + }, + { + "epoch": 0.05660824956261305, + "grad_norm": 0.18304617702960968, + "learning_rate": 0.0009945531401045948, + "loss": 3.0923, + "step": 1909 + }, + { + "epoch": 0.05663790291492453, + "grad_norm": 0.20340679585933685, + "learning_rate": 0.000994546211865433, + "loss": 3.138, + "step": 1910 + }, + { + "epoch": 0.05666755626723601, + "grad_norm": 0.18914370238780975, + "learning_rate": 0.0009945392792469797, + "loss": 3.0793, + "step": 1911 + }, + { + "epoch": 0.05669720961954749, + "grad_norm": 0.18292932212352753, + "learning_rate": 0.000994532342249296, + "loss": 3.1105, + "step": 1912 + }, + { + "epoch": 0.05672686297185897, + "grad_norm": 0.17221719026565552, + "learning_rate": 0.0009945254008724438, + "loss": 3.123, + "step": 1913 + }, + { + "epoch": 0.05675651632417045, + "grad_norm": 0.14426691830158234, + "learning_rate": 0.000994518455116484, + "loss": 3.0747, + "step": 1914 + }, + { + "epoch": 0.056786169676481923, + "grad_norm": 0.13842999935150146, + "learning_rate": 0.0009945115049814785, + "loss": 3.0753, + "step": 1915 + }, + { + "epoch": 0.05681582302879341, + "grad_norm": 0.12922503054141998, + "learning_rate": 0.0009945045504674889, + "loss": 3.1601, + "step": 1916 + }, + { + "epoch": 0.05684547638110488, + "grad_norm": 0.13721716403961182, + "learning_rate": 0.0009944975915745764, + "loss": 3.1449, + "step": 1917 + }, + { + "epoch": 0.056875129733416366, + "grad_norm": 0.15122485160827637, + "learning_rate": 0.000994490628302803, + "loss": 3.0957, + "step": 1918 + }, + { + "epoch": 0.05690478308572784, + "grad_norm": 0.15485215187072754, + "learning_rate": 0.0009944836606522302, + "loss": 3.1289, + "step": 1919 + }, + { + "epoch": 0.05693443643803932, + "grad_norm": 0.1834070235490799, + "learning_rate": 0.0009944766886229195, + "loss": 3.112, + "step": 1920 + }, + { + "epoch": 0.0569640897903508, + "grad_norm": 0.2009546011686325, + "learning_rate": 0.000994469712214933, + "loss": 3.1145, + "step": 1921 + }, + { + "epoch": 0.05699374314266228, + "grad_norm": 0.19008979201316833, + "learning_rate": 0.0009944627314283324, + "loss": 3.1335, + "step": 1922 + }, + { + "epoch": 0.05702339649497375, + "grad_norm": 0.17231690883636475, + "learning_rate": 0.0009944557462631793, + "loss": 3.1159, + "step": 1923 + }, + { + "epoch": 0.05705304984728524, + "grad_norm": 0.1729309856891632, + "learning_rate": 0.000994448756719536, + "loss": 3.0977, + "step": 1924 + }, + { + "epoch": 0.05708270319959671, + "grad_norm": 0.14279957115650177, + "learning_rate": 0.0009944417627974639, + "loss": 3.1252, + "step": 1925 + }, + { + "epoch": 0.057112356551908196, + "grad_norm": 0.14165666699409485, + "learning_rate": 0.000994434764497025, + "loss": 3.0832, + "step": 1926 + }, + { + "epoch": 0.05714200990421967, + "grad_norm": 0.1794901341199875, + "learning_rate": 0.0009944277618182814, + "loss": 3.1138, + "step": 1927 + }, + { + "epoch": 0.05717166325653115, + "grad_norm": 0.18329434096813202, + "learning_rate": 0.0009944207547612951, + "loss": 3.1179, + "step": 1928 + }, + { + "epoch": 0.05720131660884263, + "grad_norm": 0.16682645678520203, + "learning_rate": 0.0009944137433261283, + "loss": 3.107, + "step": 1929 + }, + { + "epoch": 0.05723096996115411, + "grad_norm": 0.17221657931804657, + "learning_rate": 0.0009944067275128427, + "loss": 3.0716, + "step": 1930 + }, + { + "epoch": 0.05726062331346559, + "grad_norm": 0.17593233287334442, + "learning_rate": 0.000994399707321501, + "loss": 3.1597, + "step": 1931 + }, + { + "epoch": 0.057290276665777067, + "grad_norm": 0.18136908113956451, + "learning_rate": 0.0009943926827521647, + "loss": 3.1082, + "step": 1932 + }, + { + "epoch": 0.05731993001808854, + "grad_norm": 0.1823781430721283, + "learning_rate": 0.0009943856538048965, + "loss": 3.1027, + "step": 1933 + }, + { + "epoch": 0.057349583370400026, + "grad_norm": 0.2049592137336731, + "learning_rate": 0.0009943786204797585, + "loss": 3.1328, + "step": 1934 + }, + { + "epoch": 0.0573792367227115, + "grad_norm": 0.1972852349281311, + "learning_rate": 0.0009943715827768129, + "loss": 3.1534, + "step": 1935 + }, + { + "epoch": 0.05740889007502298, + "grad_norm": 0.15736223757266998, + "learning_rate": 0.0009943645406961222, + "loss": 3.0999, + "step": 1936 + }, + { + "epoch": 0.05743854342733446, + "grad_norm": 0.1616155207157135, + "learning_rate": 0.0009943574942377486, + "loss": 3.0657, + "step": 1937 + }, + { + "epoch": 0.05746819677964594, + "grad_norm": 0.1848721206188202, + "learning_rate": 0.0009943504434017543, + "loss": 3.144, + "step": 1938 + }, + { + "epoch": 0.05749785013195742, + "grad_norm": 0.16343964636325836, + "learning_rate": 0.000994343388188202, + "loss": 3.0897, + "step": 1939 + }, + { + "epoch": 0.057527503484268896, + "grad_norm": 0.19154973328113556, + "learning_rate": 0.0009943363285971544, + "loss": 3.1038, + "step": 1940 + }, + { + "epoch": 0.05755715683658037, + "grad_norm": 0.21019834280014038, + "learning_rate": 0.0009943292646286738, + "loss": 3.1104, + "step": 1941 + }, + { + "epoch": 0.057586810188891856, + "grad_norm": 0.20070314407348633, + "learning_rate": 0.0009943221962828224, + "loss": 3.105, + "step": 1942 + }, + { + "epoch": 0.05761646354120333, + "grad_norm": 0.1838969886302948, + "learning_rate": 0.0009943151235596633, + "loss": 3.0887, + "step": 1943 + }, + { + "epoch": 0.057646116893514815, + "grad_norm": 0.16597020626068115, + "learning_rate": 0.000994308046459259, + "loss": 3.1199, + "step": 1944 + }, + { + "epoch": 0.05767577024582629, + "grad_norm": 0.14424176514148712, + "learning_rate": 0.0009943009649816719, + "loss": 3.107, + "step": 1945 + }, + { + "epoch": 0.05770542359813777, + "grad_norm": 0.17707104980945587, + "learning_rate": 0.0009942938791269648, + "loss": 3.0739, + "step": 1946 + }, + { + "epoch": 0.05773507695044925, + "grad_norm": 0.1510569453239441, + "learning_rate": 0.0009942867888952007, + "loss": 3.0742, + "step": 1947 + }, + { + "epoch": 0.057764730302760726, + "grad_norm": 0.1313198357820511, + "learning_rate": 0.0009942796942864424, + "loss": 3.0858, + "step": 1948 + }, + { + "epoch": 0.0577943836550722, + "grad_norm": 0.15037451684474945, + "learning_rate": 0.0009942725953007525, + "loss": 3.1071, + "step": 1949 + }, + { + "epoch": 0.057824037007383686, + "grad_norm": 0.17017540335655212, + "learning_rate": 0.0009942654919381938, + "loss": 3.0945, + "step": 1950 + }, + { + "epoch": 0.05785369035969516, + "grad_norm": 0.22303612530231476, + "learning_rate": 0.0009942583841988295, + "loss": 3.0897, + "step": 1951 + }, + { + "epoch": 0.057883343712006645, + "grad_norm": 0.2085268348455429, + "learning_rate": 0.000994251272082722, + "loss": 3.0729, + "step": 1952 + }, + { + "epoch": 0.05791299706431812, + "grad_norm": 0.22399137914180756, + "learning_rate": 0.000994244155589935, + "loss": 3.1237, + "step": 1953 + }, + { + "epoch": 0.0579426504166296, + "grad_norm": 0.15757079422473907, + "learning_rate": 0.0009942370347205312, + "loss": 3.1081, + "step": 1954 + }, + { + "epoch": 0.05797230376894108, + "grad_norm": 0.1576818823814392, + "learning_rate": 0.0009942299094745737, + "loss": 3.1122, + "step": 1955 + }, + { + "epoch": 0.058001957121252556, + "grad_norm": 0.1564008891582489, + "learning_rate": 0.0009942227798521253, + "loss": 3.0747, + "step": 1956 + }, + { + "epoch": 0.05803161047356404, + "grad_norm": 0.13335572183132172, + "learning_rate": 0.0009942156458532493, + "loss": 3.0659, + "step": 1957 + }, + { + "epoch": 0.058061263825875516, + "grad_norm": 0.12716850638389587, + "learning_rate": 0.000994208507478009, + "loss": 3.0719, + "step": 1958 + }, + { + "epoch": 0.05809091717818699, + "grad_norm": 0.1430489718914032, + "learning_rate": 0.0009942013647264677, + "loss": 3.106, + "step": 1959 + }, + { + "epoch": 0.058120570530498475, + "grad_norm": 0.13057096302509308, + "learning_rate": 0.0009941942175986883, + "loss": 3.1041, + "step": 1960 + }, + { + "epoch": 0.05815022388280995, + "grad_norm": 0.15096749365329742, + "learning_rate": 0.0009941870660947342, + "loss": 3.1373, + "step": 1961 + }, + { + "epoch": 0.05817987723512143, + "grad_norm": 0.13341908156871796, + "learning_rate": 0.0009941799102146688, + "loss": 3.1079, + "step": 1962 + }, + { + "epoch": 0.05820953058743291, + "grad_norm": 0.12982769310474396, + "learning_rate": 0.0009941727499585557, + "loss": 3.103, + "step": 1963 + }, + { + "epoch": 0.058239183939744386, + "grad_norm": 0.1164279356598854, + "learning_rate": 0.0009941655853264579, + "loss": 3.0955, + "step": 1964 + }, + { + "epoch": 0.05826883729205587, + "grad_norm": 0.1235099732875824, + "learning_rate": 0.0009941584163184391, + "loss": 3.099, + "step": 1965 + }, + { + "epoch": 0.058298490644367346, + "grad_norm": 0.13973885774612427, + "learning_rate": 0.0009941512429345626, + "loss": 3.0944, + "step": 1966 + }, + { + "epoch": 0.05832814399667882, + "grad_norm": 0.15009306371212006, + "learning_rate": 0.0009941440651748921, + "loss": 3.1318, + "step": 1967 + }, + { + "epoch": 0.058357797348990305, + "grad_norm": 0.15035425126552582, + "learning_rate": 0.0009941368830394912, + "loss": 3.113, + "step": 1968 + }, + { + "epoch": 0.05838745070130178, + "grad_norm": 0.1388549953699112, + "learning_rate": 0.0009941296965284233, + "loss": 3.1212, + "step": 1969 + }, + { + "epoch": 0.058417104053613264, + "grad_norm": 0.16316959261894226, + "learning_rate": 0.000994122505641752, + "loss": 3.0925, + "step": 1970 + }, + { + "epoch": 0.05844675740592474, + "grad_norm": 0.2024182230234146, + "learning_rate": 0.0009941153103795414, + "loss": 3.1431, + "step": 1971 + }, + { + "epoch": 0.058476410758236216, + "grad_norm": 0.23509380221366882, + "learning_rate": 0.0009941081107418545, + "loss": 3.0995, + "step": 1972 + }, + { + "epoch": 0.0585060641105477, + "grad_norm": 0.23908694088459015, + "learning_rate": 0.0009941009067287558, + "loss": 3.1178, + "step": 1973 + }, + { + "epoch": 0.058535717462859176, + "grad_norm": 0.18930654227733612, + "learning_rate": 0.0009940936983403087, + "loss": 3.0694, + "step": 1974 + }, + { + "epoch": 0.05856537081517065, + "grad_norm": 0.17105263471603394, + "learning_rate": 0.000994086485576577, + "loss": 3.1302, + "step": 1975 + }, + { + "epoch": 0.058595024167482135, + "grad_norm": 0.15545529127120972, + "learning_rate": 0.0009940792684376245, + "loss": 3.139, + "step": 1976 + }, + { + "epoch": 0.05862467751979361, + "grad_norm": 0.17355167865753174, + "learning_rate": 0.0009940720469235156, + "loss": 3.1232, + "step": 1977 + }, + { + "epoch": 0.058654330872105094, + "grad_norm": 0.15801064670085907, + "learning_rate": 0.0009940648210343137, + "loss": 3.1019, + "step": 1978 + }, + { + "epoch": 0.05868398422441657, + "grad_norm": 0.15269796550273895, + "learning_rate": 0.000994057590770083, + "loss": 3.1126, + "step": 1979 + }, + { + "epoch": 0.058713637576728046, + "grad_norm": 0.1697423905134201, + "learning_rate": 0.0009940503561308876, + "loss": 3.0956, + "step": 1980 + }, + { + "epoch": 0.05874329092903953, + "grad_norm": 0.20782847702503204, + "learning_rate": 0.0009940431171167915, + "loss": 3.1009, + "step": 1981 + }, + { + "epoch": 0.058772944281351006, + "grad_norm": 0.18902915716171265, + "learning_rate": 0.0009940358737278588, + "loss": 3.0978, + "step": 1982 + }, + { + "epoch": 0.05880259763366249, + "grad_norm": 0.1982400119304657, + "learning_rate": 0.0009940286259641539, + "loss": 3.1095, + "step": 1983 + }, + { + "epoch": 0.058832250985973965, + "grad_norm": 0.16122227907180786, + "learning_rate": 0.0009940213738257402, + "loss": 3.1467, + "step": 1984 + }, + { + "epoch": 0.05886190433828544, + "grad_norm": 0.15110108256340027, + "learning_rate": 0.0009940141173126827, + "loss": 3.0973, + "step": 1985 + }, + { + "epoch": 0.058891557690596924, + "grad_norm": 0.17356076836585999, + "learning_rate": 0.0009940068564250454, + "loss": 3.0947, + "step": 1986 + }, + { + "epoch": 0.0589212110429084, + "grad_norm": 0.18561793863773346, + "learning_rate": 0.0009939995911628927, + "loss": 3.1207, + "step": 1987 + }, + { + "epoch": 0.058950864395219876, + "grad_norm": 0.19366998970508575, + "learning_rate": 0.0009939923215262886, + "loss": 3.1067, + "step": 1988 + }, + { + "epoch": 0.05898051774753136, + "grad_norm": 0.19150377810001373, + "learning_rate": 0.0009939850475152979, + "loss": 3.0943, + "step": 1989 + }, + { + "epoch": 0.059010171099842836, + "grad_norm": 0.21944142878055573, + "learning_rate": 0.0009939777691299846, + "loss": 3.1181, + "step": 1990 + }, + { + "epoch": 0.05903982445215432, + "grad_norm": 0.21467991173267365, + "learning_rate": 0.0009939704863704136, + "loss": 3.1065, + "step": 1991 + }, + { + "epoch": 0.059069477804465795, + "grad_norm": 0.20320408046245575, + "learning_rate": 0.000993963199236649, + "loss": 3.1228, + "step": 1992 + }, + { + "epoch": 0.05909913115677727, + "grad_norm": 0.1556350290775299, + "learning_rate": 0.0009939559077287554, + "loss": 3.1197, + "step": 1993 + }, + { + "epoch": 0.059128784509088754, + "grad_norm": 0.17391671240329742, + "learning_rate": 0.0009939486118467975, + "loss": 3.1064, + "step": 1994 + }, + { + "epoch": 0.05915843786140023, + "grad_norm": 0.17055316269397736, + "learning_rate": 0.00099394131159084, + "loss": 3.1301, + "step": 1995 + }, + { + "epoch": 0.05918809121371171, + "grad_norm": 0.17137108743190765, + "learning_rate": 0.0009939340069609474, + "loss": 3.0736, + "step": 1996 + }, + { + "epoch": 0.05921774456602319, + "grad_norm": 0.1790078729391098, + "learning_rate": 0.0009939266979571842, + "loss": 3.0892, + "step": 1997 + }, + { + "epoch": 0.059247397918334666, + "grad_norm": 0.1637566089630127, + "learning_rate": 0.0009939193845796156, + "loss": 3.115, + "step": 1998 + }, + { + "epoch": 0.05927705127064615, + "grad_norm": 0.15661661326885223, + "learning_rate": 0.000993912066828306, + "loss": 3.0766, + "step": 1999 + }, + { + "epoch": 0.059306704622957625, + "grad_norm": 0.1350967437028885, + "learning_rate": 0.00099390474470332, + "loss": 3.1119, + "step": 2000 + }, + { + "epoch": 0.0593363579752691, + "grad_norm": 0.146154522895813, + "learning_rate": 0.0009938974182047227, + "loss": 3.1103, + "step": 2001 + }, + { + "epoch": 0.059366011327580584, + "grad_norm": 0.17114712297916412, + "learning_rate": 0.0009938900873325794, + "loss": 3.1074, + "step": 2002 + }, + { + "epoch": 0.05939566467989206, + "grad_norm": 0.16524861752986908, + "learning_rate": 0.0009938827520869543, + "loss": 3.0987, + "step": 2003 + }, + { + "epoch": 0.05942531803220354, + "grad_norm": 0.1541460007429123, + "learning_rate": 0.0009938754124679127, + "loss": 3.0887, + "step": 2004 + }, + { + "epoch": 0.05945497138451502, + "grad_norm": 0.15945684909820557, + "learning_rate": 0.0009938680684755195, + "loss": 3.1077, + "step": 2005 + }, + { + "epoch": 0.059484624736826495, + "grad_norm": 0.175740048289299, + "learning_rate": 0.0009938607201098399, + "loss": 3.1016, + "step": 2006 + }, + { + "epoch": 0.05951427808913798, + "grad_norm": 0.19442473351955414, + "learning_rate": 0.0009938533673709386, + "loss": 3.1213, + "step": 2007 + }, + { + "epoch": 0.059543931441449455, + "grad_norm": 0.22046665847301483, + "learning_rate": 0.0009938460102588813, + "loss": 3.046, + "step": 2008 + }, + { + "epoch": 0.05957358479376094, + "grad_norm": 0.1785135567188263, + "learning_rate": 0.0009938386487737326, + "loss": 3.0901, + "step": 2009 + }, + { + "epoch": 0.059603238146072414, + "grad_norm": 0.16880623996257782, + "learning_rate": 0.000993831282915558, + "loss": 3.095, + "step": 2010 + }, + { + "epoch": 0.05963289149838389, + "grad_norm": 0.16224956512451172, + "learning_rate": 0.0009938239126844226, + "loss": 3.1103, + "step": 2011 + }, + { + "epoch": 0.05966254485069537, + "grad_norm": 0.146871417760849, + "learning_rate": 0.0009938165380803917, + "loss": 3.1134, + "step": 2012 + }, + { + "epoch": 0.05969219820300685, + "grad_norm": 0.15596738457679749, + "learning_rate": 0.0009938091591035305, + "loss": 3.099, + "step": 2013 + }, + { + "epoch": 0.059721851555318325, + "grad_norm": 0.14139734208583832, + "learning_rate": 0.0009938017757539046, + "loss": 3.1021, + "step": 2014 + }, + { + "epoch": 0.05975150490762981, + "grad_norm": 0.13376769423484802, + "learning_rate": 0.0009937943880315792, + "loss": 3.1134, + "step": 2015 + }, + { + "epoch": 0.059781158259941285, + "grad_norm": 0.13649475574493408, + "learning_rate": 0.0009937869959366196, + "loss": 3.1295, + "step": 2016 + }, + { + "epoch": 0.05981081161225277, + "grad_norm": 0.14847807586193085, + "learning_rate": 0.0009937795994690915, + "loss": 3.0978, + "step": 2017 + }, + { + "epoch": 0.059840464964564244, + "grad_norm": 0.16870902478694916, + "learning_rate": 0.0009937721986290602, + "loss": 3.0707, + "step": 2018 + }, + { + "epoch": 0.05987011831687572, + "grad_norm": 0.1798715740442276, + "learning_rate": 0.0009937647934165914, + "loss": 3.123, + "step": 2019 + }, + { + "epoch": 0.0598997716691872, + "grad_norm": 0.17850156128406525, + "learning_rate": 0.0009937573838317505, + "loss": 3.1172, + "step": 2020 + }, + { + "epoch": 0.05992942502149868, + "grad_norm": 0.19758476316928864, + "learning_rate": 0.0009937499698746033, + "loss": 3.1601, + "step": 2021 + }, + { + "epoch": 0.05995907837381016, + "grad_norm": 0.16462600231170654, + "learning_rate": 0.0009937425515452155, + "loss": 3.1093, + "step": 2022 + }, + { + "epoch": 0.05998873172612164, + "grad_norm": 0.17881104350090027, + "learning_rate": 0.0009937351288436523, + "loss": 3.1188, + "step": 2023 + }, + { + "epoch": 0.060018385078433115, + "grad_norm": 0.18261897563934326, + "learning_rate": 0.00099372770176998, + "loss": 3.1294, + "step": 2024 + }, + { + "epoch": 0.0600480384307446, + "grad_norm": 0.16543659567832947, + "learning_rate": 0.0009937202703242643, + "loss": 3.0783, + "step": 2025 + }, + { + "epoch": 0.060077691783056074, + "grad_norm": 0.1726488471031189, + "learning_rate": 0.0009937128345065707, + "loss": 3.0639, + "step": 2026 + }, + { + "epoch": 0.06010734513536755, + "grad_norm": 0.17452777922153473, + "learning_rate": 0.0009937053943169653, + "loss": 3.1126, + "step": 2027 + }, + { + "epoch": 0.06013699848767903, + "grad_norm": 0.18501216173171997, + "learning_rate": 0.0009936979497555136, + "loss": 3.0985, + "step": 2028 + }, + { + "epoch": 0.06016665183999051, + "grad_norm": 0.1827004849910736, + "learning_rate": 0.000993690500822282, + "loss": 3.057, + "step": 2029 + }, + { + "epoch": 0.06019630519230199, + "grad_norm": 0.18076173961162567, + "learning_rate": 0.0009936830475173364, + "loss": 3.1328, + "step": 2030 + }, + { + "epoch": 0.06022595854461347, + "grad_norm": 0.19567817449569702, + "learning_rate": 0.0009936755898407425, + "loss": 3.0717, + "step": 2031 + }, + { + "epoch": 0.060255611896924945, + "grad_norm": 0.16863888502120972, + "learning_rate": 0.0009936681277925665, + "loss": 3.0866, + "step": 2032 + }, + { + "epoch": 0.06028526524923643, + "grad_norm": 0.1784806251525879, + "learning_rate": 0.0009936606613728746, + "loss": 3.0679, + "step": 2033 + }, + { + "epoch": 0.060314918601547904, + "grad_norm": 0.17419250309467316, + "learning_rate": 0.0009936531905817328, + "loss": 3.0758, + "step": 2034 + }, + { + "epoch": 0.06034457195385939, + "grad_norm": 0.14182201027870178, + "learning_rate": 0.0009936457154192074, + "loss": 3.0559, + "step": 2035 + }, + { + "epoch": 0.06037422530617086, + "grad_norm": 0.12921294569969177, + "learning_rate": 0.0009936382358853642, + "loss": 3.125, + "step": 2036 + }, + { + "epoch": 0.06040387865848234, + "grad_norm": 0.12960827350616455, + "learning_rate": 0.0009936307519802698, + "loss": 3.0908, + "step": 2037 + }, + { + "epoch": 0.06043353201079382, + "grad_norm": 0.1335584968328476, + "learning_rate": 0.0009936232637039904, + "loss": 3.067, + "step": 2038 + }, + { + "epoch": 0.0604631853631053, + "grad_norm": 0.15264105796813965, + "learning_rate": 0.000993615771056592, + "loss": 3.0855, + "step": 2039 + }, + { + "epoch": 0.060492838715416775, + "grad_norm": 0.16818362474441528, + "learning_rate": 0.0009936082740381416, + "loss": 3.1322, + "step": 2040 + }, + { + "epoch": 0.06052249206772826, + "grad_norm": 0.16504649817943573, + "learning_rate": 0.000993600772648705, + "loss": 3.0991, + "step": 2041 + }, + { + "epoch": 0.060552145420039734, + "grad_norm": 0.17267842590808868, + "learning_rate": 0.000993593266888349, + "loss": 3.1019, + "step": 2042 + }, + { + "epoch": 0.06058179877235122, + "grad_norm": 0.17948053777217865, + "learning_rate": 0.0009935857567571395, + "loss": 3.1048, + "step": 2043 + }, + { + "epoch": 0.06061145212466269, + "grad_norm": 0.18024267256259918, + "learning_rate": 0.0009935782422551438, + "loss": 3.0921, + "step": 2044 + }, + { + "epoch": 0.06064110547697417, + "grad_norm": 0.19814851880073547, + "learning_rate": 0.000993570723382428, + "loss": 3.0911, + "step": 2045 + }, + { + "epoch": 0.06067075882928565, + "grad_norm": 0.17662085592746735, + "learning_rate": 0.0009935632001390586, + "loss": 3.064, + "step": 2046 + }, + { + "epoch": 0.06070041218159713, + "grad_norm": 0.14539635181427002, + "learning_rate": 0.0009935556725251024, + "loss": 3.1061, + "step": 2047 + }, + { + "epoch": 0.06073006553390861, + "grad_norm": 0.175316721200943, + "learning_rate": 0.000993548140540626, + "loss": 3.0914, + "step": 2048 + }, + { + "epoch": 0.06075971888622009, + "grad_norm": 0.18685734272003174, + "learning_rate": 0.000993540604185696, + "loss": 3.1032, + "step": 2049 + }, + { + "epoch": 0.060789372238531564, + "grad_norm": 0.1633213609457016, + "learning_rate": 0.0009935330634603793, + "loss": 3.0922, + "step": 2050 + }, + { + "epoch": 0.06081902559084305, + "grad_norm": 0.14487320184707642, + "learning_rate": 0.0009935255183647427, + "loss": 3.1053, + "step": 2051 + }, + { + "epoch": 0.06084867894315452, + "grad_norm": 0.16073374450206757, + "learning_rate": 0.0009935179688988528, + "loss": 3.0935, + "step": 2052 + }, + { + "epoch": 0.060878332295466, + "grad_norm": 0.1415397822856903, + "learning_rate": 0.0009935104150627766, + "loss": 3.0921, + "step": 2053 + }, + { + "epoch": 0.06090798564777748, + "grad_norm": 0.1253649890422821, + "learning_rate": 0.0009935028568565812, + "loss": 3.0875, + "step": 2054 + }, + { + "epoch": 0.06093763900008896, + "grad_norm": 0.13925595581531525, + "learning_rate": 0.000993495294280333, + "loss": 3.0855, + "step": 2055 + }, + { + "epoch": 0.06096729235240044, + "grad_norm": 0.139118030667305, + "learning_rate": 0.0009934877273340993, + "loss": 3.096, + "step": 2056 + }, + { + "epoch": 0.06099694570471192, + "grad_norm": 0.15635323524475098, + "learning_rate": 0.0009934801560179472, + "loss": 3.1195, + "step": 2057 + }, + { + "epoch": 0.061026599057023394, + "grad_norm": 0.18224334716796875, + "learning_rate": 0.0009934725803319435, + "loss": 3.0905, + "step": 2058 + }, + { + "epoch": 0.06105625240933488, + "grad_norm": 0.17914849519729614, + "learning_rate": 0.0009934650002761554, + "loss": 3.0622, + "step": 2059 + }, + { + "epoch": 0.06108590576164635, + "grad_norm": 0.1780582219362259, + "learning_rate": 0.00099345741585065, + "loss": 3.1106, + "step": 2060 + }, + { + "epoch": 0.061115559113957836, + "grad_norm": 0.19713127613067627, + "learning_rate": 0.0009934498270554946, + "loss": 3.0836, + "step": 2061 + }, + { + "epoch": 0.06114521246626931, + "grad_norm": 0.16630269587039948, + "learning_rate": 0.0009934422338907564, + "loss": 3.0791, + "step": 2062 + }, + { + "epoch": 0.06117486581858079, + "grad_norm": 0.17879332602024078, + "learning_rate": 0.000993434636356502, + "loss": 3.0798, + "step": 2063 + }, + { + "epoch": 0.06120451917089227, + "grad_norm": 0.20987601578235626, + "learning_rate": 0.0009934270344527996, + "loss": 3.1012, + "step": 2064 + }, + { + "epoch": 0.06123417252320375, + "grad_norm": 0.2162037044763565, + "learning_rate": 0.000993419428179716, + "loss": 3.0745, + "step": 2065 + }, + { + "epoch": 0.061263825875515224, + "grad_norm": 0.18230250477790833, + "learning_rate": 0.0009934118175373187, + "loss": 3.0813, + "step": 2066 + }, + { + "epoch": 0.06129347922782671, + "grad_norm": 0.17781904339790344, + "learning_rate": 0.0009934042025256753, + "loss": 3.057, + "step": 2067 + }, + { + "epoch": 0.06132313258013818, + "grad_norm": 0.17600704729557037, + "learning_rate": 0.0009933965831448526, + "loss": 3.0948, + "step": 2068 + }, + { + "epoch": 0.061352785932449666, + "grad_norm": 0.14514857530593872, + "learning_rate": 0.0009933889593949188, + "loss": 3.0917, + "step": 2069 + }, + { + "epoch": 0.06138243928476114, + "grad_norm": 0.14631690084934235, + "learning_rate": 0.0009933813312759407, + "loss": 3.0917, + "step": 2070 + }, + { + "epoch": 0.06141209263707262, + "grad_norm": 0.15443401038646698, + "learning_rate": 0.0009933736987879865, + "loss": 3.1267, + "step": 2071 + }, + { + "epoch": 0.0614417459893841, + "grad_norm": 0.14220604300498962, + "learning_rate": 0.0009933660619311235, + "loss": 3.0869, + "step": 2072 + }, + { + "epoch": 0.06147139934169558, + "grad_norm": 0.16132178902626038, + "learning_rate": 0.0009933584207054192, + "loss": 3.0788, + "step": 2073 + }, + { + "epoch": 0.06150105269400706, + "grad_norm": 0.214135080575943, + "learning_rate": 0.0009933507751109416, + "loss": 3.0475, + "step": 2074 + }, + { + "epoch": 0.06153070604631854, + "grad_norm": 0.19921426475048065, + "learning_rate": 0.000993343125147758, + "loss": 3.0823, + "step": 2075 + }, + { + "epoch": 0.06156035939863001, + "grad_norm": 0.17184817790985107, + "learning_rate": 0.0009933354708159365, + "loss": 3.105, + "step": 2076 + }, + { + "epoch": 0.061590012750941496, + "grad_norm": 0.19387377798557281, + "learning_rate": 0.0009933278121155447, + "loss": 3.1003, + "step": 2077 + }, + { + "epoch": 0.06161966610325297, + "grad_norm": 0.1957625150680542, + "learning_rate": 0.0009933201490466502, + "loss": 3.0686, + "step": 2078 + }, + { + "epoch": 0.06164931945556445, + "grad_norm": 0.16778606176376343, + "learning_rate": 0.0009933124816093215, + "loss": 3.1038, + "step": 2079 + }, + { + "epoch": 0.06167897280787593, + "grad_norm": 0.15898875892162323, + "learning_rate": 0.0009933048098036258, + "loss": 3.1046, + "step": 2080 + }, + { + "epoch": 0.06170862616018741, + "grad_norm": 0.1520952731370926, + "learning_rate": 0.0009932971336296314, + "loss": 3.0976, + "step": 2081 + }, + { + "epoch": 0.06173827951249889, + "grad_norm": 0.1478729248046875, + "learning_rate": 0.0009932894530874064, + "loss": 3.0948, + "step": 2082 + }, + { + "epoch": 0.06176793286481037, + "grad_norm": 0.15398643910884857, + "learning_rate": 0.0009932817681770185, + "loss": 3.0815, + "step": 2083 + }, + { + "epoch": 0.06179758621712184, + "grad_norm": 0.15198306739330292, + "learning_rate": 0.0009932740788985356, + "loss": 3.0822, + "step": 2084 + }, + { + "epoch": 0.061827239569433326, + "grad_norm": 0.1444011777639389, + "learning_rate": 0.0009932663852520265, + "loss": 3.0863, + "step": 2085 + }, + { + "epoch": 0.0618568929217448, + "grad_norm": 0.13881760835647583, + "learning_rate": 0.0009932586872375586, + "loss": 3.1337, + "step": 2086 + }, + { + "epoch": 0.061886546274056285, + "grad_norm": 0.12824034690856934, + "learning_rate": 0.0009932509848552004, + "loss": 3.0741, + "step": 2087 + }, + { + "epoch": 0.06191619962636776, + "grad_norm": 0.15204478800296783, + "learning_rate": 0.0009932432781050203, + "loss": 3.1226, + "step": 2088 + }, + { + "epoch": 0.06194585297867924, + "grad_norm": 0.184153750538826, + "learning_rate": 0.000993235566987086, + "loss": 3.0868, + "step": 2089 + }, + { + "epoch": 0.06197550633099072, + "grad_norm": 0.17315009236335754, + "learning_rate": 0.0009932278515014663, + "loss": 3.1039, + "step": 2090 + }, + { + "epoch": 0.0620051596833022, + "grad_norm": 0.1608259677886963, + "learning_rate": 0.0009932201316482292, + "loss": 3.0401, + "step": 2091 + }, + { + "epoch": 0.06203481303561367, + "grad_norm": 0.15058468282222748, + "learning_rate": 0.0009932124074274432, + "loss": 3.0405, + "step": 2092 + }, + { + "epoch": 0.062064466387925156, + "grad_norm": 0.16113963723182678, + "learning_rate": 0.0009932046788391766, + "loss": 3.0833, + "step": 2093 + }, + { + "epoch": 0.06209411974023663, + "grad_norm": 0.16068622469902039, + "learning_rate": 0.0009931969458834983, + "loss": 3.0738, + "step": 2094 + }, + { + "epoch": 0.062123773092548115, + "grad_norm": 0.19578702747821808, + "learning_rate": 0.000993189208560476, + "loss": 3.0914, + "step": 2095 + }, + { + "epoch": 0.06215342644485959, + "grad_norm": 0.16664400696754456, + "learning_rate": 0.0009931814668701787, + "loss": 3.1059, + "step": 2096 + }, + { + "epoch": 0.06218307979717107, + "grad_norm": 0.16531161963939667, + "learning_rate": 0.0009931737208126747, + "loss": 3.0768, + "step": 2097 + }, + { + "epoch": 0.06221273314948255, + "grad_norm": 0.1773262619972229, + "learning_rate": 0.000993165970388033, + "loss": 3.1054, + "step": 2098 + }, + { + "epoch": 0.06224238650179403, + "grad_norm": 0.17380300164222717, + "learning_rate": 0.0009931582155963217, + "loss": 3.1068, + "step": 2099 + }, + { + "epoch": 0.06227203985410551, + "grad_norm": 0.20511500537395477, + "learning_rate": 0.0009931504564376099, + "loss": 3.0832, + "step": 2100 + }, + { + "epoch": 0.062301693206416986, + "grad_norm": 0.23038630187511444, + "learning_rate": 0.0009931426929119663, + "loss": 3.0855, + "step": 2101 + }, + { + "epoch": 0.06233134655872846, + "grad_norm": 0.15641778707504272, + "learning_rate": 0.0009931349250194594, + "loss": 3.0966, + "step": 2102 + }, + { + "epoch": 0.062360999911039945, + "grad_norm": 0.14747166633605957, + "learning_rate": 0.000993127152760158, + "loss": 3.0725, + "step": 2103 + }, + { + "epoch": 0.06239065326335142, + "grad_norm": 0.1457287073135376, + "learning_rate": 0.000993119376134131, + "loss": 3.0407, + "step": 2104 + }, + { + "epoch": 0.0624203066156629, + "grad_norm": 0.1532292664051056, + "learning_rate": 0.0009931115951414475, + "loss": 3.0599, + "step": 2105 + }, + { + "epoch": 0.06244995996797438, + "grad_norm": 0.15367119014263153, + "learning_rate": 0.0009931038097821762, + "loss": 3.0571, + "step": 2106 + }, + { + "epoch": 0.06247961332028586, + "grad_norm": 0.15834535658359528, + "learning_rate": 0.0009930960200563858, + "loss": 3.0779, + "step": 2107 + }, + { + "epoch": 0.06250926667259733, + "grad_norm": 0.15288014709949493, + "learning_rate": 0.0009930882259641457, + "loss": 3.0616, + "step": 2108 + }, + { + "epoch": 0.06253892002490882, + "grad_norm": 0.16770577430725098, + "learning_rate": 0.0009930804275055246, + "loss": 3.1014, + "step": 2109 + }, + { + "epoch": 0.0625685733772203, + "grad_norm": 0.20333674550056458, + "learning_rate": 0.0009930726246805916, + "loss": 3.0968, + "step": 2110 + }, + { + "epoch": 0.06259822672953178, + "grad_norm": 0.21599280834197998, + "learning_rate": 0.0009930648174894159, + "loss": 3.0919, + "step": 2111 + }, + { + "epoch": 0.06262788008184325, + "grad_norm": 0.208180770277977, + "learning_rate": 0.0009930570059320668, + "loss": 3.0777, + "step": 2112 + }, + { + "epoch": 0.06265753343415473, + "grad_norm": 0.17004969716072083, + "learning_rate": 0.000993049190008613, + "loss": 3.082, + "step": 2113 + }, + { + "epoch": 0.0626871867864662, + "grad_norm": 0.17140896618366241, + "learning_rate": 0.0009930413697191243, + "loss": 3.0743, + "step": 2114 + }, + { + "epoch": 0.0627168401387777, + "grad_norm": 0.19436229765415192, + "learning_rate": 0.0009930335450636695, + "loss": 3.0894, + "step": 2115 + }, + { + "epoch": 0.06274649349108917, + "grad_norm": 0.2145598828792572, + "learning_rate": 0.000993025716042318, + "loss": 3.0956, + "step": 2116 + }, + { + "epoch": 0.06277614684340065, + "grad_norm": 0.23281750082969666, + "learning_rate": 0.000993017882655139, + "loss": 3.073, + "step": 2117 + }, + { + "epoch": 0.06280580019571212, + "grad_norm": 0.17094650864601135, + "learning_rate": 0.0009930100449022023, + "loss": 3.0636, + "step": 2118 + }, + { + "epoch": 0.0628354535480236, + "grad_norm": 0.15242315828800201, + "learning_rate": 0.000993002202783577, + "loss": 3.1359, + "step": 2119 + }, + { + "epoch": 0.06286510690033509, + "grad_norm": 0.17969569563865662, + "learning_rate": 0.0009929943562993324, + "loss": 3.1024, + "step": 2120 + }, + { + "epoch": 0.06289476025264656, + "grad_norm": 0.15601877868175507, + "learning_rate": 0.0009929865054495383, + "loss": 3.0577, + "step": 2121 + }, + { + "epoch": 0.06292441360495804, + "grad_norm": 0.13959021866321564, + "learning_rate": 0.0009929786502342638, + "loss": 3.0574, + "step": 2122 + }, + { + "epoch": 0.06295406695726952, + "grad_norm": 0.1488111913204193, + "learning_rate": 0.0009929707906535792, + "loss": 3.069, + "step": 2123 + }, + { + "epoch": 0.06298372030958099, + "grad_norm": 0.18113882839679718, + "learning_rate": 0.0009929629267075534, + "loss": 3.0997, + "step": 2124 + }, + { + "epoch": 0.06301337366189248, + "grad_norm": 0.17501363158226013, + "learning_rate": 0.0009929550583962562, + "loss": 3.0455, + "step": 2125 + }, + { + "epoch": 0.06304302701420396, + "grad_norm": 0.18579654395580292, + "learning_rate": 0.0009929471857197574, + "loss": 3.1088, + "step": 2126 + }, + { + "epoch": 0.06307268036651544, + "grad_norm": 0.18747560679912567, + "learning_rate": 0.0009929393086781267, + "loss": 3.0507, + "step": 2127 + }, + { + "epoch": 0.06310233371882691, + "grad_norm": 0.18186995387077332, + "learning_rate": 0.0009929314272714338, + "loss": 3.0608, + "step": 2128 + }, + { + "epoch": 0.06313198707113839, + "grad_norm": 0.2055380940437317, + "learning_rate": 0.0009929235414997484, + "loss": 3.0786, + "step": 2129 + }, + { + "epoch": 0.06316164042344988, + "grad_norm": 0.2091158777475357, + "learning_rate": 0.0009929156513631405, + "loss": 3.0843, + "step": 2130 + }, + { + "epoch": 0.06319129377576135, + "grad_norm": 0.1795136034488678, + "learning_rate": 0.00099290775686168, + "loss": 3.0701, + "step": 2131 + }, + { + "epoch": 0.06322094712807283, + "grad_norm": 0.20910105109214783, + "learning_rate": 0.0009928998579954364, + "loss": 3.08, + "step": 2132 + }, + { + "epoch": 0.0632506004803843, + "grad_norm": 0.19249095022678375, + "learning_rate": 0.0009928919547644805, + "loss": 3.06, + "step": 2133 + }, + { + "epoch": 0.06328025383269578, + "grad_norm": 0.13794919848442078, + "learning_rate": 0.000992884047168881, + "loss": 3.0508, + "step": 2134 + }, + { + "epoch": 0.06330990718500727, + "grad_norm": 0.17024624347686768, + "learning_rate": 0.0009928761352087092, + "loss": 3.0833, + "step": 2135 + }, + { + "epoch": 0.06333956053731875, + "grad_norm": 0.1529189944267273, + "learning_rate": 0.0009928682188840346, + "loss": 3.0758, + "step": 2136 + }, + { + "epoch": 0.06336921388963022, + "grad_norm": 0.1590924859046936, + "learning_rate": 0.000992860298194927, + "loss": 3.0837, + "step": 2137 + }, + { + "epoch": 0.0633988672419417, + "grad_norm": 0.15103574097156525, + "learning_rate": 0.0009928523731414572, + "loss": 3.0555, + "step": 2138 + }, + { + "epoch": 0.06342852059425318, + "grad_norm": 0.16147753596305847, + "learning_rate": 0.0009928444437236948, + "loss": 3.0591, + "step": 2139 + }, + { + "epoch": 0.06345817394656465, + "grad_norm": 0.14596626162528992, + "learning_rate": 0.0009928365099417106, + "loss": 3.0836, + "step": 2140 + }, + { + "epoch": 0.06348782729887614, + "grad_norm": 0.1723179817199707, + "learning_rate": 0.000992828571795574, + "loss": 3.0759, + "step": 2141 + }, + { + "epoch": 0.06351748065118762, + "grad_norm": 0.18574924767017365, + "learning_rate": 0.0009928206292853562, + "loss": 3.1066, + "step": 2142 + }, + { + "epoch": 0.0635471340034991, + "grad_norm": 0.1667831689119339, + "learning_rate": 0.000992812682411127, + "loss": 3.0816, + "step": 2143 + }, + { + "epoch": 0.06357678735581057, + "grad_norm": 0.13541771471500397, + "learning_rate": 0.000992804731172957, + "loss": 3.0722, + "step": 2144 + }, + { + "epoch": 0.06360644070812205, + "grad_norm": 0.1227579116821289, + "learning_rate": 0.0009927967755709165, + "loss": 3.0772, + "step": 2145 + }, + { + "epoch": 0.06363609406043354, + "grad_norm": 0.14157111942768097, + "learning_rate": 0.0009927888156050758, + "loss": 3.096, + "step": 2146 + }, + { + "epoch": 0.06366574741274501, + "grad_norm": 0.14049889147281647, + "learning_rate": 0.0009927808512755056, + "loss": 3.0664, + "step": 2147 + }, + { + "epoch": 0.06369540076505649, + "grad_norm": 0.17661313712596893, + "learning_rate": 0.0009927728825822764, + "loss": 3.0629, + "step": 2148 + }, + { + "epoch": 0.06372505411736797, + "grad_norm": 0.17739230394363403, + "learning_rate": 0.0009927649095254588, + "loss": 3.0744, + "step": 2149 + }, + { + "epoch": 0.06375470746967944, + "grad_norm": 0.14507494866847992, + "learning_rate": 0.0009927569321051234, + "loss": 3.0708, + "step": 2150 + }, + { + "epoch": 0.06378436082199093, + "grad_norm": 0.14335106313228607, + "learning_rate": 0.0009927489503213404, + "loss": 3.086, + "step": 2151 + }, + { + "epoch": 0.06381401417430241, + "grad_norm": 0.13458846509456635, + "learning_rate": 0.0009927409641741815, + "loss": 3.0521, + "step": 2152 + }, + { + "epoch": 0.06384366752661388, + "grad_norm": 0.14252710342407227, + "learning_rate": 0.0009927329736637164, + "loss": 3.0676, + "step": 2153 + }, + { + "epoch": 0.06387332087892536, + "grad_norm": 0.14856936037540436, + "learning_rate": 0.000992724978790016, + "loss": 3.0659, + "step": 2154 + }, + { + "epoch": 0.06390297423123684, + "grad_norm": 0.1781037300825119, + "learning_rate": 0.0009927169795531517, + "loss": 3.0457, + "step": 2155 + }, + { + "epoch": 0.06393262758354833, + "grad_norm": 0.18032948672771454, + "learning_rate": 0.000992708975953194, + "loss": 3.1414, + "step": 2156 + }, + { + "epoch": 0.0639622809358598, + "grad_norm": 0.17820759117603302, + "learning_rate": 0.0009927009679902136, + "loss": 3.0841, + "step": 2157 + }, + { + "epoch": 0.06399193428817128, + "grad_norm": 0.16959629952907562, + "learning_rate": 0.0009926929556642815, + "loss": 3.0329, + "step": 2158 + }, + { + "epoch": 0.06402158764048275, + "grad_norm": 0.17318183183670044, + "learning_rate": 0.0009926849389754687, + "loss": 3.0741, + "step": 2159 + }, + { + "epoch": 0.06405124099279423, + "grad_norm": 0.17064304649829865, + "learning_rate": 0.0009926769179238466, + "loss": 3.0992, + "step": 2160 + }, + { + "epoch": 0.06408089434510572, + "grad_norm": 0.17153216898441315, + "learning_rate": 0.0009926688925094855, + "loss": 3.0967, + "step": 2161 + }, + { + "epoch": 0.0641105476974172, + "grad_norm": 0.15380409359931946, + "learning_rate": 0.0009926608627324567, + "loss": 3.0559, + "step": 2162 + }, + { + "epoch": 0.06414020104972867, + "grad_norm": 0.14934217929840088, + "learning_rate": 0.0009926528285928316, + "loss": 3.053, + "step": 2163 + }, + { + "epoch": 0.06416985440204015, + "grad_norm": 0.15825510025024414, + "learning_rate": 0.000992644790090681, + "loss": 3.0763, + "step": 2164 + }, + { + "epoch": 0.06419950775435163, + "grad_norm": 0.17947524785995483, + "learning_rate": 0.0009926367472260762, + "loss": 3.0826, + "step": 2165 + }, + { + "epoch": 0.0642291611066631, + "grad_norm": 0.1594153791666031, + "learning_rate": 0.0009926286999990886, + "loss": 3.0833, + "step": 2166 + }, + { + "epoch": 0.06425881445897459, + "grad_norm": 0.1784011274576187, + "learning_rate": 0.0009926206484097892, + "loss": 3.0558, + "step": 2167 + }, + { + "epoch": 0.06428846781128607, + "grad_norm": 0.15402622520923615, + "learning_rate": 0.0009926125924582495, + "loss": 3.1205, + "step": 2168 + }, + { + "epoch": 0.06431812116359754, + "grad_norm": 0.16604363918304443, + "learning_rate": 0.0009926045321445407, + "loss": 3.1228, + "step": 2169 + }, + { + "epoch": 0.06434777451590902, + "grad_norm": 0.17796343564987183, + "learning_rate": 0.0009925964674687342, + "loss": 3.03, + "step": 2170 + }, + { + "epoch": 0.0643774278682205, + "grad_norm": 0.13439446687698364, + "learning_rate": 0.0009925883984309015, + "loss": 3.0448, + "step": 2171 + }, + { + "epoch": 0.06440708122053199, + "grad_norm": 0.1378248631954193, + "learning_rate": 0.0009925803250311136, + "loss": 3.0596, + "step": 2172 + }, + { + "epoch": 0.06443673457284346, + "grad_norm": 0.15484607219696045, + "learning_rate": 0.0009925722472694427, + "loss": 3.0681, + "step": 2173 + }, + { + "epoch": 0.06446638792515494, + "grad_norm": 0.16759386658668518, + "learning_rate": 0.00099256416514596, + "loss": 3.0264, + "step": 2174 + }, + { + "epoch": 0.06449604127746641, + "grad_norm": 0.1877717524766922, + "learning_rate": 0.0009925560786607371, + "loss": 3.09, + "step": 2175 + }, + { + "epoch": 0.06452569462977789, + "grad_norm": 0.15957607328891754, + "learning_rate": 0.0009925479878138456, + "loss": 3.0174, + "step": 2176 + }, + { + "epoch": 0.06455534798208938, + "grad_norm": 0.1580880880355835, + "learning_rate": 0.000992539892605357, + "loss": 3.0815, + "step": 2177 + }, + { + "epoch": 0.06458500133440086, + "grad_norm": 0.16340292990207672, + "learning_rate": 0.000992531793035343, + "loss": 3.0498, + "step": 2178 + }, + { + "epoch": 0.06461465468671233, + "grad_norm": 0.16932201385498047, + "learning_rate": 0.0009925236891038757, + "loss": 3.0538, + "step": 2179 + }, + { + "epoch": 0.06464430803902381, + "grad_norm": 0.1695159673690796, + "learning_rate": 0.0009925155808110265, + "loss": 3.0739, + "step": 2180 + }, + { + "epoch": 0.06467396139133529, + "grad_norm": 0.15621091425418854, + "learning_rate": 0.0009925074681568671, + "loss": 3.0697, + "step": 2181 + }, + { + "epoch": 0.06470361474364678, + "grad_norm": 0.15978245437145233, + "learning_rate": 0.0009924993511414696, + "loss": 3.0655, + "step": 2182 + }, + { + "epoch": 0.06473326809595825, + "grad_norm": 0.18860913813114166, + "learning_rate": 0.000992491229764906, + "loss": 3.0923, + "step": 2183 + }, + { + "epoch": 0.06476292144826973, + "grad_norm": 0.21527138352394104, + "learning_rate": 0.000992483104027248, + "loss": 3.0977, + "step": 2184 + }, + { + "epoch": 0.0647925748005812, + "grad_norm": 0.1670016646385193, + "learning_rate": 0.0009924749739285675, + "loss": 3.0546, + "step": 2185 + }, + { + "epoch": 0.06482222815289268, + "grad_norm": 0.14456380903720856, + "learning_rate": 0.0009924668394689364, + "loss": 3.0723, + "step": 2186 + }, + { + "epoch": 0.06485188150520416, + "grad_norm": 0.15704599022865295, + "learning_rate": 0.0009924587006484272, + "loss": 3.1211, + "step": 2187 + }, + { + "epoch": 0.06488153485751565, + "grad_norm": 0.13620083034038544, + "learning_rate": 0.0009924505574671115, + "loss": 3.0621, + "step": 2188 + }, + { + "epoch": 0.06491118820982712, + "grad_norm": 0.13940203189849854, + "learning_rate": 0.0009924424099250618, + "loss": 3.0913, + "step": 2189 + }, + { + "epoch": 0.0649408415621386, + "grad_norm": 0.18742384016513824, + "learning_rate": 0.0009924342580223497, + "loss": 3.0842, + "step": 2190 + }, + { + "epoch": 0.06497049491445007, + "grad_norm": 0.2546399235725403, + "learning_rate": 0.0009924261017590478, + "loss": 3.0768, + "step": 2191 + }, + { + "epoch": 0.06500014826676155, + "grad_norm": 0.2314475029706955, + "learning_rate": 0.0009924179411352286, + "loss": 3.0749, + "step": 2192 + }, + { + "epoch": 0.06502980161907304, + "grad_norm": 0.16486646234989166, + "learning_rate": 0.0009924097761509637, + "loss": 3.0929, + "step": 2193 + }, + { + "epoch": 0.06505945497138452, + "grad_norm": 0.18244735896587372, + "learning_rate": 0.0009924016068063256, + "loss": 3.0534, + "step": 2194 + }, + { + "epoch": 0.065089108323696, + "grad_norm": 0.19813233613967896, + "learning_rate": 0.000992393433101387, + "loss": 3.041, + "step": 2195 + }, + { + "epoch": 0.06511876167600747, + "grad_norm": 0.17042598128318787, + "learning_rate": 0.00099238525503622, + "loss": 3.0558, + "step": 2196 + }, + { + "epoch": 0.06514841502831895, + "grad_norm": 0.16045212745666504, + "learning_rate": 0.000992377072610897, + "loss": 3.0773, + "step": 2197 + }, + { + "epoch": 0.06517806838063044, + "grad_norm": 0.1403326839208603, + "learning_rate": 0.0009923688858254904, + "loss": 3.0792, + "step": 2198 + }, + { + "epoch": 0.06520772173294191, + "grad_norm": 0.14426523447036743, + "learning_rate": 0.0009923606946800729, + "loss": 3.105, + "step": 2199 + }, + { + "epoch": 0.06523737508525339, + "grad_norm": 0.1538630872964859, + "learning_rate": 0.0009923524991747171, + "loss": 3.0865, + "step": 2200 + }, + { + "epoch": 0.06526702843756486, + "grad_norm": 0.17295435070991516, + "learning_rate": 0.0009923442993094952, + "loss": 3.0582, + "step": 2201 + }, + { + "epoch": 0.06529668178987634, + "grad_norm": 0.15420952439308167, + "learning_rate": 0.00099233609508448, + "loss": 3.0423, + "step": 2202 + }, + { + "epoch": 0.06532633514218783, + "grad_norm": 0.15604418516159058, + "learning_rate": 0.0009923278864997442, + "loss": 3.095, + "step": 2203 + }, + { + "epoch": 0.0653559884944993, + "grad_norm": 0.1736731231212616, + "learning_rate": 0.0009923196735553605, + "loss": 3.1015, + "step": 2204 + }, + { + "epoch": 0.06538564184681078, + "grad_norm": 0.17924118041992188, + "learning_rate": 0.0009923114562514015, + "loss": 3.0474, + "step": 2205 + }, + { + "epoch": 0.06541529519912226, + "grad_norm": 0.17299354076385498, + "learning_rate": 0.0009923032345879402, + "loss": 3.0642, + "step": 2206 + }, + { + "epoch": 0.06544494855143373, + "grad_norm": 0.1548534631729126, + "learning_rate": 0.000992295008565049, + "loss": 3.0916, + "step": 2207 + }, + { + "epoch": 0.06547460190374522, + "grad_norm": 0.1671396940946579, + "learning_rate": 0.0009922867781828014, + "loss": 3.031, + "step": 2208 + }, + { + "epoch": 0.0655042552560567, + "grad_norm": 0.18347211182117462, + "learning_rate": 0.0009922785434412695, + "loss": 3.0283, + "step": 2209 + }, + { + "epoch": 0.06553390860836818, + "grad_norm": 0.17207586765289307, + "learning_rate": 0.000992270304340527, + "loss": 3.0485, + "step": 2210 + }, + { + "epoch": 0.06556356196067965, + "grad_norm": 0.1933683604001999, + "learning_rate": 0.000992262060880646, + "loss": 3.0526, + "step": 2211 + }, + { + "epoch": 0.06559321531299113, + "grad_norm": 0.16845299303531647, + "learning_rate": 0.0009922538130617002, + "loss": 3.1008, + "step": 2212 + }, + { + "epoch": 0.0656228686653026, + "grad_norm": 0.15045605599880219, + "learning_rate": 0.0009922455608837623, + "loss": 3.0768, + "step": 2213 + }, + { + "epoch": 0.0656525220176141, + "grad_norm": 0.15835560858249664, + "learning_rate": 0.0009922373043469057, + "loss": 3.0638, + "step": 2214 + }, + { + "epoch": 0.06568217536992557, + "grad_norm": 0.20094947516918182, + "learning_rate": 0.0009922290434512032, + "loss": 3.0894, + "step": 2215 + }, + { + "epoch": 0.06571182872223705, + "grad_norm": 0.18823747336864471, + "learning_rate": 0.0009922207781967278, + "loss": 3.0786, + "step": 2216 + }, + { + "epoch": 0.06574148207454852, + "grad_norm": 0.16968576610088348, + "learning_rate": 0.0009922125085835532, + "loss": 3.08, + "step": 2217 + }, + { + "epoch": 0.06577113542686, + "grad_norm": 0.17057274281978607, + "learning_rate": 0.0009922042346117521, + "loss": 3.0793, + "step": 2218 + }, + { + "epoch": 0.06580078877917149, + "grad_norm": 0.17439042031764984, + "learning_rate": 0.0009921959562813982, + "loss": 3.0187, + "step": 2219 + }, + { + "epoch": 0.06583044213148297, + "grad_norm": 0.16875435411930084, + "learning_rate": 0.0009921876735925644, + "loss": 3.057, + "step": 2220 + }, + { + "epoch": 0.06586009548379444, + "grad_norm": 0.17420624196529388, + "learning_rate": 0.0009921793865453245, + "loss": 3.069, + "step": 2221 + }, + { + "epoch": 0.06588974883610592, + "grad_norm": 0.15019254386425018, + "learning_rate": 0.0009921710951397516, + "loss": 3.0501, + "step": 2222 + }, + { + "epoch": 0.0659194021884174, + "grad_norm": 0.1772003173828125, + "learning_rate": 0.000992162799375919, + "loss": 3.0378, + "step": 2223 + }, + { + "epoch": 0.06594905554072888, + "grad_norm": 0.1947108507156372, + "learning_rate": 0.0009921544992539005, + "loss": 3.0774, + "step": 2224 + }, + { + "epoch": 0.06597870889304036, + "grad_norm": 0.18838249146938324, + "learning_rate": 0.0009921461947737696, + "loss": 3.0567, + "step": 2225 + }, + { + "epoch": 0.06600836224535184, + "grad_norm": 0.210444375872612, + "learning_rate": 0.0009921378859355993, + "loss": 3.0624, + "step": 2226 + }, + { + "epoch": 0.06603801559766331, + "grad_norm": 0.1723151057958603, + "learning_rate": 0.0009921295727394637, + "loss": 3.0696, + "step": 2227 + }, + { + "epoch": 0.06606766894997479, + "grad_norm": 0.15878431499004364, + "learning_rate": 0.0009921212551854365, + "loss": 3.0803, + "step": 2228 + }, + { + "epoch": 0.06609732230228628, + "grad_norm": 0.1690525859594345, + "learning_rate": 0.0009921129332735909, + "loss": 3.0359, + "step": 2229 + }, + { + "epoch": 0.06612697565459776, + "grad_norm": 0.1714230477809906, + "learning_rate": 0.0009921046070040006, + "loss": 3.0706, + "step": 2230 + }, + { + "epoch": 0.06615662900690923, + "grad_norm": 0.16684451699256897, + "learning_rate": 0.0009920962763767399, + "loss": 3.0531, + "step": 2231 + }, + { + "epoch": 0.06618628235922071, + "grad_norm": 0.17379458248615265, + "learning_rate": 0.000992087941391882, + "loss": 3.051, + "step": 2232 + }, + { + "epoch": 0.06621593571153218, + "grad_norm": 0.1763063669204712, + "learning_rate": 0.0009920796020495008, + "loss": 3.0481, + "step": 2233 + }, + { + "epoch": 0.06624558906384367, + "grad_norm": 0.15595358610153198, + "learning_rate": 0.0009920712583496704, + "loss": 3.0514, + "step": 2234 + }, + { + "epoch": 0.06627524241615515, + "grad_norm": 0.1623973697423935, + "learning_rate": 0.0009920629102924646, + "loss": 3.0677, + "step": 2235 + }, + { + "epoch": 0.06630489576846663, + "grad_norm": 0.16280968487262726, + "learning_rate": 0.0009920545578779572, + "loss": 3.0896, + "step": 2236 + }, + { + "epoch": 0.0663345491207781, + "grad_norm": 0.16378739476203918, + "learning_rate": 0.0009920462011062223, + "loss": 3.0298, + "step": 2237 + }, + { + "epoch": 0.06636420247308958, + "grad_norm": 0.15205638110637665, + "learning_rate": 0.0009920378399773339, + "loss": 3.0803, + "step": 2238 + }, + { + "epoch": 0.06639385582540105, + "grad_norm": 0.15712697803974152, + "learning_rate": 0.0009920294744913659, + "loss": 3.0366, + "step": 2239 + }, + { + "epoch": 0.06642350917771254, + "grad_norm": 0.1496170610189438, + "learning_rate": 0.0009920211046483922, + "loss": 3.0916, + "step": 2240 + }, + { + "epoch": 0.06645316253002402, + "grad_norm": 0.1260298490524292, + "learning_rate": 0.0009920127304484873, + "loss": 3.0281, + "step": 2241 + }, + { + "epoch": 0.0664828158823355, + "grad_norm": 0.14310938119888306, + "learning_rate": 0.0009920043518917255, + "loss": 3.0542, + "step": 2242 + }, + { + "epoch": 0.06651246923464697, + "grad_norm": 0.13579310476779938, + "learning_rate": 0.0009919959689781803, + "loss": 3.0541, + "step": 2243 + }, + { + "epoch": 0.06654212258695845, + "grad_norm": 0.14054428040981293, + "learning_rate": 0.0009919875817079268, + "loss": 3.0549, + "step": 2244 + }, + { + "epoch": 0.06657177593926994, + "grad_norm": 0.13571076095104218, + "learning_rate": 0.0009919791900810384, + "loss": 3.075, + "step": 2245 + }, + { + "epoch": 0.06660142929158142, + "grad_norm": 0.13811516761779785, + "learning_rate": 0.00099197079409759, + "loss": 3.0991, + "step": 2246 + }, + { + "epoch": 0.06663108264389289, + "grad_norm": 0.1478816270828247, + "learning_rate": 0.0009919623937576557, + "loss": 3.0539, + "step": 2247 + }, + { + "epoch": 0.06666073599620437, + "grad_norm": 0.17104999721050262, + "learning_rate": 0.0009919539890613101, + "loss": 3.0381, + "step": 2248 + }, + { + "epoch": 0.06669038934851584, + "grad_norm": 0.1851874440908432, + "learning_rate": 0.0009919455800086274, + "loss": 3.0904, + "step": 2249 + }, + { + "epoch": 0.06672004270082733, + "grad_norm": 0.17753402888774872, + "learning_rate": 0.0009919371665996822, + "loss": 3.0256, + "step": 2250 + }, + { + "epoch": 0.06674969605313881, + "grad_norm": 0.1601143777370453, + "learning_rate": 0.0009919287488345488, + "loss": 3.0716, + "step": 2251 + }, + { + "epoch": 0.06677934940545029, + "grad_norm": 0.16469046473503113, + "learning_rate": 0.0009919203267133021, + "loss": 3.0424, + "step": 2252 + }, + { + "epoch": 0.06680900275776176, + "grad_norm": 0.1711011677980423, + "learning_rate": 0.0009919119002360162, + "loss": 3.0539, + "step": 2253 + }, + { + "epoch": 0.06683865611007324, + "grad_norm": 0.1551426202058792, + "learning_rate": 0.0009919034694027661, + "loss": 3.0729, + "step": 2254 + }, + { + "epoch": 0.06686830946238473, + "grad_norm": 0.18601416051387787, + "learning_rate": 0.0009918950342136265, + "loss": 3.0731, + "step": 2255 + }, + { + "epoch": 0.0668979628146962, + "grad_norm": 0.17170704901218414, + "learning_rate": 0.0009918865946686717, + "loss": 3.0953, + "step": 2256 + }, + { + "epoch": 0.06692761616700768, + "grad_norm": 0.1457299143075943, + "learning_rate": 0.000991878150767977, + "loss": 3.0846, + "step": 2257 + }, + { + "epoch": 0.06695726951931916, + "grad_norm": 0.14949700236320496, + "learning_rate": 0.0009918697025116166, + "loss": 3.0586, + "step": 2258 + }, + { + "epoch": 0.06698692287163063, + "grad_norm": 0.1532086431980133, + "learning_rate": 0.0009918612498996655, + "loss": 3.0375, + "step": 2259 + }, + { + "epoch": 0.06701657622394212, + "grad_norm": 0.16279448568820953, + "learning_rate": 0.0009918527929321987, + "loss": 3.0364, + "step": 2260 + }, + { + "epoch": 0.0670462295762536, + "grad_norm": 0.18078835308551788, + "learning_rate": 0.0009918443316092912, + "loss": 3.0649, + "step": 2261 + }, + { + "epoch": 0.06707588292856508, + "grad_norm": 0.16974367201328278, + "learning_rate": 0.0009918358659310174, + "loss": 3.0517, + "step": 2262 + }, + { + "epoch": 0.06710553628087655, + "grad_norm": 0.16895665228366852, + "learning_rate": 0.0009918273958974527, + "loss": 3.0815, + "step": 2263 + }, + { + "epoch": 0.06713518963318803, + "grad_norm": 0.1631120890378952, + "learning_rate": 0.0009918189215086719, + "loss": 3.0557, + "step": 2264 + }, + { + "epoch": 0.0671648429854995, + "grad_norm": 0.1654173731803894, + "learning_rate": 0.0009918104427647503, + "loss": 3.0759, + "step": 2265 + }, + { + "epoch": 0.067194496337811, + "grad_norm": 0.17230357229709625, + "learning_rate": 0.0009918019596657627, + "loss": 3.051, + "step": 2266 + }, + { + "epoch": 0.06722414969012247, + "grad_norm": 0.1977580338716507, + "learning_rate": 0.0009917934722117844, + "loss": 3.0551, + "step": 2267 + }, + { + "epoch": 0.06725380304243395, + "grad_norm": 0.20109428465366364, + "learning_rate": 0.0009917849804028905, + "loss": 3.0709, + "step": 2268 + }, + { + "epoch": 0.06728345639474542, + "grad_norm": 0.1948021948337555, + "learning_rate": 0.0009917764842391561, + "loss": 3.0161, + "step": 2269 + }, + { + "epoch": 0.0673131097470569, + "grad_norm": 0.1734899878501892, + "learning_rate": 0.0009917679837206565, + "loss": 3.0494, + "step": 2270 + }, + { + "epoch": 0.06734276309936839, + "grad_norm": 0.15161098539829254, + "learning_rate": 0.0009917594788474671, + "loss": 3.0576, + "step": 2271 + }, + { + "epoch": 0.06737241645167986, + "grad_norm": 0.17779316008090973, + "learning_rate": 0.0009917509696196632, + "loss": 3.0341, + "step": 2272 + }, + { + "epoch": 0.06740206980399134, + "grad_norm": 0.18783089518547058, + "learning_rate": 0.0009917424560373198, + "loss": 3.1014, + "step": 2273 + }, + { + "epoch": 0.06743172315630282, + "grad_norm": 0.15819185972213745, + "learning_rate": 0.0009917339381005127, + "loss": 3.0736, + "step": 2274 + }, + { + "epoch": 0.06746137650861429, + "grad_norm": 0.17774739861488342, + "learning_rate": 0.000991725415809317, + "loss": 3.078, + "step": 2275 + }, + { + "epoch": 0.06749102986092578, + "grad_norm": 0.20144081115722656, + "learning_rate": 0.0009917168891638085, + "loss": 3.0455, + "step": 2276 + }, + { + "epoch": 0.06752068321323726, + "grad_norm": 0.1941981017589569, + "learning_rate": 0.0009917083581640626, + "loss": 3.063, + "step": 2277 + }, + { + "epoch": 0.06755033656554874, + "grad_norm": 0.16416974365711212, + "learning_rate": 0.0009916998228101546, + "loss": 3.0783, + "step": 2278 + }, + { + "epoch": 0.06757998991786021, + "grad_norm": 0.14508956670761108, + "learning_rate": 0.0009916912831021605, + "loss": 3.0509, + "step": 2279 + }, + { + "epoch": 0.06760964327017169, + "grad_norm": 0.17038659751415253, + "learning_rate": 0.0009916827390401555, + "loss": 3.1007, + "step": 2280 + }, + { + "epoch": 0.06763929662248318, + "grad_norm": 0.17195938527584076, + "learning_rate": 0.0009916741906242155, + "loss": 3.0367, + "step": 2281 + }, + { + "epoch": 0.06766894997479465, + "grad_norm": 0.1929522007703781, + "learning_rate": 0.0009916656378544163, + "loss": 3.0687, + "step": 2282 + }, + { + "epoch": 0.06769860332710613, + "grad_norm": 0.1885850578546524, + "learning_rate": 0.0009916570807308332, + "loss": 3.0265, + "step": 2283 + }, + { + "epoch": 0.0677282566794176, + "grad_norm": 0.1821499913930893, + "learning_rate": 0.0009916485192535424, + "loss": 3.0302, + "step": 2284 + }, + { + "epoch": 0.06775791003172908, + "grad_norm": 0.17920111119747162, + "learning_rate": 0.0009916399534226196, + "loss": 3.0932, + "step": 2285 + }, + { + "epoch": 0.06778756338404057, + "grad_norm": 0.18004801869392395, + "learning_rate": 0.0009916313832381406, + "loss": 3.0693, + "step": 2286 + }, + { + "epoch": 0.06781721673635205, + "grad_norm": 0.19392569363117218, + "learning_rate": 0.0009916228087001812, + "loss": 3.0683, + "step": 2287 + }, + { + "epoch": 0.06784687008866352, + "grad_norm": 0.20220065116882324, + "learning_rate": 0.0009916142298088176, + "loss": 3.0751, + "step": 2288 + }, + { + "epoch": 0.067876523440975, + "grad_norm": 0.17559342086315155, + "learning_rate": 0.0009916056465641256, + "loss": 3.0681, + "step": 2289 + }, + { + "epoch": 0.06790617679328648, + "grad_norm": 0.1815510392189026, + "learning_rate": 0.0009915970589661812, + "loss": 3.0486, + "step": 2290 + }, + { + "epoch": 0.06793583014559795, + "grad_norm": 0.1718764752149582, + "learning_rate": 0.0009915884670150604, + "loss": 3.0547, + "step": 2291 + }, + { + "epoch": 0.06796548349790944, + "grad_norm": 0.1534785032272339, + "learning_rate": 0.0009915798707108394, + "loss": 3.0852, + "step": 2292 + }, + { + "epoch": 0.06799513685022092, + "grad_norm": 0.18003825843334198, + "learning_rate": 0.0009915712700535942, + "loss": 3.0556, + "step": 2293 + }, + { + "epoch": 0.0680247902025324, + "grad_norm": 0.17338939011096954, + "learning_rate": 0.000991562665043401, + "loss": 3.0328, + "step": 2294 + }, + { + "epoch": 0.06805444355484387, + "grad_norm": 0.16695483028888702, + "learning_rate": 0.0009915540556803364, + "loss": 3.0605, + "step": 2295 + }, + { + "epoch": 0.06808409690715535, + "grad_norm": 0.16816766560077667, + "learning_rate": 0.0009915454419644758, + "loss": 3.0764, + "step": 2296 + }, + { + "epoch": 0.06811375025946684, + "grad_norm": 0.13632380962371826, + "learning_rate": 0.000991536823895896, + "loss": 3.0179, + "step": 2297 + }, + { + "epoch": 0.06814340361177831, + "grad_norm": 0.15624652802944183, + "learning_rate": 0.0009915282014746735, + "loss": 3.105, + "step": 2298 + }, + { + "epoch": 0.06817305696408979, + "grad_norm": 0.15441232919692993, + "learning_rate": 0.0009915195747008843, + "loss": 3.0473, + "step": 2299 + }, + { + "epoch": 0.06820271031640127, + "grad_norm": 0.17220206558704376, + "learning_rate": 0.0009915109435746049, + "loss": 3.0634, + "step": 2300 + }, + { + "epoch": 0.06823236366871274, + "grad_norm": 0.16428139805793762, + "learning_rate": 0.0009915023080959116, + "loss": 3.0563, + "step": 2301 + }, + { + "epoch": 0.06826201702102423, + "grad_norm": 0.15617063641548157, + "learning_rate": 0.0009914936682648811, + "loss": 3.0342, + "step": 2302 + }, + { + "epoch": 0.06829167037333571, + "grad_norm": 0.1400563269853592, + "learning_rate": 0.0009914850240815899, + "loss": 3.0667, + "step": 2303 + }, + { + "epoch": 0.06832132372564718, + "grad_norm": 0.1432494819164276, + "learning_rate": 0.0009914763755461142, + "loss": 3.0531, + "step": 2304 + }, + { + "epoch": 0.06835097707795866, + "grad_norm": 0.14326098561286926, + "learning_rate": 0.000991467722658531, + "loss": 3.0651, + "step": 2305 + }, + { + "epoch": 0.06838063043027014, + "grad_norm": 0.13328677415847778, + "learning_rate": 0.000991459065418917, + "loss": 3.0103, + "step": 2306 + }, + { + "epoch": 0.06841028378258163, + "grad_norm": 0.12769784033298492, + "learning_rate": 0.0009914504038273481, + "loss": 3.0616, + "step": 2307 + }, + { + "epoch": 0.0684399371348931, + "grad_norm": 0.15196239948272705, + "learning_rate": 0.0009914417378839018, + "loss": 3.0612, + "step": 2308 + }, + { + "epoch": 0.06846959048720458, + "grad_norm": 0.14431050419807434, + "learning_rate": 0.0009914330675886544, + "loss": 3.0412, + "step": 2309 + }, + { + "epoch": 0.06849924383951606, + "grad_norm": 0.14237982034683228, + "learning_rate": 0.0009914243929416832, + "loss": 3.0307, + "step": 2310 + }, + { + "epoch": 0.06852889719182753, + "grad_norm": 0.16503861546516418, + "learning_rate": 0.0009914157139430642, + "loss": 3.0464, + "step": 2311 + }, + { + "epoch": 0.06855855054413902, + "grad_norm": 0.21642249822616577, + "learning_rate": 0.000991407030592875, + "loss": 3.1177, + "step": 2312 + }, + { + "epoch": 0.0685882038964505, + "grad_norm": 0.21495994925498962, + "learning_rate": 0.000991398342891192, + "loss": 3.0752, + "step": 2313 + }, + { + "epoch": 0.06861785724876197, + "grad_norm": 0.2150513231754303, + "learning_rate": 0.0009913896508380925, + "loss": 3.0859, + "step": 2314 + }, + { + "epoch": 0.06864751060107345, + "grad_norm": 0.253201961517334, + "learning_rate": 0.0009913809544336532, + "loss": 3.0625, + "step": 2315 + }, + { + "epoch": 0.06867716395338493, + "grad_norm": 0.23353400826454163, + "learning_rate": 0.0009913722536779512, + "loss": 3.1007, + "step": 2316 + }, + { + "epoch": 0.0687068173056964, + "grad_norm": 0.20715847611427307, + "learning_rate": 0.0009913635485710637, + "loss": 3.0431, + "step": 2317 + }, + { + "epoch": 0.06873647065800789, + "grad_norm": 0.18392622470855713, + "learning_rate": 0.0009913548391130675, + "loss": 3.0375, + "step": 2318 + }, + { + "epoch": 0.06876612401031937, + "grad_norm": 0.16104011237621307, + "learning_rate": 0.0009913461253040399, + "loss": 3.0473, + "step": 2319 + }, + { + "epoch": 0.06879577736263084, + "grad_norm": 0.16901813447475433, + "learning_rate": 0.000991337407144058, + "loss": 3.0337, + "step": 2320 + }, + { + "epoch": 0.06882543071494232, + "grad_norm": 0.14425215125083923, + "learning_rate": 0.000991328684633199, + "loss": 3.0369, + "step": 2321 + }, + { + "epoch": 0.0688550840672538, + "grad_norm": 0.14055784046649933, + "learning_rate": 0.00099131995777154, + "loss": 3.0634, + "step": 2322 + }, + { + "epoch": 0.06888473741956529, + "grad_norm": 0.13722161948680878, + "learning_rate": 0.000991311226559159, + "loss": 3.0366, + "step": 2323 + }, + { + "epoch": 0.06891439077187676, + "grad_norm": 0.1293288767337799, + "learning_rate": 0.000991302490996132, + "loss": 3.0511, + "step": 2324 + }, + { + "epoch": 0.06894404412418824, + "grad_norm": 0.1226244792342186, + "learning_rate": 0.0009912937510825376, + "loss": 3.0701, + "step": 2325 + }, + { + "epoch": 0.06897369747649972, + "grad_norm": 0.12337706983089447, + "learning_rate": 0.0009912850068184527, + "loss": 3.0586, + "step": 2326 + }, + { + "epoch": 0.06900335082881119, + "grad_norm": 0.142462357878685, + "learning_rate": 0.0009912762582039544, + "loss": 3.0756, + "step": 2327 + }, + { + "epoch": 0.06903300418112268, + "grad_norm": 0.15497374534606934, + "learning_rate": 0.0009912675052391208, + "loss": 3.0414, + "step": 2328 + }, + { + "epoch": 0.06906265753343416, + "grad_norm": 0.13805393874645233, + "learning_rate": 0.0009912587479240292, + "loss": 3.0457, + "step": 2329 + }, + { + "epoch": 0.06909231088574563, + "grad_norm": 0.1429641991853714, + "learning_rate": 0.000991249986258757, + "loss": 3.0802, + "step": 2330 + }, + { + "epoch": 0.06912196423805711, + "grad_norm": 0.15202659368515015, + "learning_rate": 0.0009912412202433816, + "loss": 3.0235, + "step": 2331 + }, + { + "epoch": 0.06915161759036859, + "grad_norm": 0.16541656851768494, + "learning_rate": 0.000991232449877981, + "loss": 3.0228, + "step": 2332 + }, + { + "epoch": 0.06918127094268008, + "grad_norm": 0.16695734858512878, + "learning_rate": 0.000991223675162633, + "loss": 3.0875, + "step": 2333 + }, + { + "epoch": 0.06921092429499155, + "grad_norm": 0.20015478134155273, + "learning_rate": 0.0009912148960974146, + "loss": 3.0489, + "step": 2334 + }, + { + "epoch": 0.06924057764730303, + "grad_norm": 0.24225077033042908, + "learning_rate": 0.0009912061126824043, + "loss": 3.0446, + "step": 2335 + }, + { + "epoch": 0.0692702309996145, + "grad_norm": 0.24474650621414185, + "learning_rate": 0.0009911973249176794, + "loss": 3.0834, + "step": 2336 + }, + { + "epoch": 0.06929988435192598, + "grad_norm": 0.27283546328544617, + "learning_rate": 0.0009911885328033178, + "loss": 3.0531, + "step": 2337 + }, + { + "epoch": 0.06932953770423747, + "grad_norm": 0.24618476629257202, + "learning_rate": 0.0009911797363393977, + "loss": 3.0394, + "step": 2338 + }, + { + "epoch": 0.06935919105654895, + "grad_norm": 0.21044482290744781, + "learning_rate": 0.0009911709355259965, + "loss": 3.069, + "step": 2339 + }, + { + "epoch": 0.06938884440886042, + "grad_norm": 0.16336867213249207, + "learning_rate": 0.0009911621303631925, + "loss": 3.0331, + "step": 2340 + }, + { + "epoch": 0.0694184977611719, + "grad_norm": 0.16178515553474426, + "learning_rate": 0.0009911533208510634, + "loss": 3.0397, + "step": 2341 + }, + { + "epoch": 0.06944815111348338, + "grad_norm": 0.16805359721183777, + "learning_rate": 0.0009911445069896877, + "loss": 3.0638, + "step": 2342 + }, + { + "epoch": 0.06947780446579485, + "grad_norm": 0.14785990118980408, + "learning_rate": 0.0009911356887791426, + "loss": 3.0587, + "step": 2343 + }, + { + "epoch": 0.06950745781810634, + "grad_norm": 0.1383371204137802, + "learning_rate": 0.000991126866219507, + "loss": 3.0691, + "step": 2344 + }, + { + "epoch": 0.06953711117041782, + "grad_norm": 0.12466645985841751, + "learning_rate": 0.0009911180393108586, + "loss": 3.0012, + "step": 2345 + }, + { + "epoch": 0.0695667645227293, + "grad_norm": 0.13621051609516144, + "learning_rate": 0.0009911092080532756, + "loss": 3.0675, + "step": 2346 + }, + { + "epoch": 0.06959641787504077, + "grad_norm": 0.14069993793964386, + "learning_rate": 0.0009911003724468361, + "loss": 3.047, + "step": 2347 + }, + { + "epoch": 0.06962607122735225, + "grad_norm": 0.13846446573734283, + "learning_rate": 0.0009910915324916187, + "loss": 3.0506, + "step": 2348 + }, + { + "epoch": 0.06965572457966374, + "grad_norm": 0.11421026289463043, + "learning_rate": 0.0009910826881877016, + "loss": 3.0816, + "step": 2349 + }, + { + "epoch": 0.06968537793197521, + "grad_norm": 0.12282147258520126, + "learning_rate": 0.0009910738395351628, + "loss": 3.0325, + "step": 2350 + }, + { + "epoch": 0.06971503128428669, + "grad_norm": 0.12843655049800873, + "learning_rate": 0.000991064986534081, + "loss": 3.0708, + "step": 2351 + }, + { + "epoch": 0.06974468463659816, + "grad_norm": 0.16966105997562408, + "learning_rate": 0.0009910561291845345, + "loss": 3.054, + "step": 2352 + }, + { + "epoch": 0.06977433798890964, + "grad_norm": 0.2105402648448944, + "learning_rate": 0.0009910472674866016, + "loss": 3.0539, + "step": 2353 + }, + { + "epoch": 0.06980399134122113, + "grad_norm": 0.20280393958091736, + "learning_rate": 0.0009910384014403608, + "loss": 3.0127, + "step": 2354 + }, + { + "epoch": 0.0698336446935326, + "grad_norm": 0.13896247744560242, + "learning_rate": 0.0009910295310458907, + "loss": 2.983, + "step": 2355 + }, + { + "epoch": 0.06986329804584408, + "grad_norm": 0.15429076552391052, + "learning_rate": 0.0009910206563032698, + "loss": 3.0613, + "step": 2356 + }, + { + "epoch": 0.06989295139815556, + "grad_norm": 0.14808224141597748, + "learning_rate": 0.0009910117772125768, + "loss": 3.0425, + "step": 2357 + }, + { + "epoch": 0.06992260475046704, + "grad_norm": 0.1404808610677719, + "learning_rate": 0.0009910028937738901, + "loss": 3.0389, + "step": 2358 + }, + { + "epoch": 0.06995225810277853, + "grad_norm": 0.13700291514396667, + "learning_rate": 0.0009909940059872886, + "loss": 3.0552, + "step": 2359 + }, + { + "epoch": 0.06998191145509, + "grad_norm": 0.15737910568714142, + "learning_rate": 0.000990985113852851, + "loss": 3.0294, + "step": 2360 + }, + { + "epoch": 0.07001156480740148, + "grad_norm": 0.1461695432662964, + "learning_rate": 0.0009909762173706557, + "loss": 3.0489, + "step": 2361 + }, + { + "epoch": 0.07004121815971295, + "grad_norm": 0.1306583434343338, + "learning_rate": 0.0009909673165407818, + "loss": 3.0365, + "step": 2362 + }, + { + "epoch": 0.07007087151202443, + "grad_norm": 0.14024874567985535, + "learning_rate": 0.0009909584113633081, + "loss": 3.0565, + "step": 2363 + }, + { + "epoch": 0.07010052486433592, + "grad_norm": 0.14863789081573486, + "learning_rate": 0.0009909495018383134, + "loss": 3.0623, + "step": 2364 + }, + { + "epoch": 0.0701301782166474, + "grad_norm": 0.13993829488754272, + "learning_rate": 0.0009909405879658766, + "loss": 3.0514, + "step": 2365 + }, + { + "epoch": 0.07015983156895887, + "grad_norm": 0.16825363039970398, + "learning_rate": 0.0009909316697460765, + "loss": 3.0465, + "step": 2366 + }, + { + "epoch": 0.07018948492127035, + "grad_norm": 0.20929484069347382, + "learning_rate": 0.0009909227471789923, + "loss": 3.0593, + "step": 2367 + }, + { + "epoch": 0.07021913827358182, + "grad_norm": 0.21316030621528625, + "learning_rate": 0.000990913820264703, + "loss": 3.034, + "step": 2368 + }, + { + "epoch": 0.0702487916258933, + "grad_norm": 0.19779789447784424, + "learning_rate": 0.0009909048890032874, + "loss": 3.0165, + "step": 2369 + }, + { + "epoch": 0.07027844497820479, + "grad_norm": 0.18751567602157593, + "learning_rate": 0.0009908959533948248, + "loss": 3.0236, + "step": 2370 + }, + { + "epoch": 0.07030809833051627, + "grad_norm": 0.17040999233722687, + "learning_rate": 0.000990887013439394, + "loss": 2.9947, + "step": 2371 + }, + { + "epoch": 0.07033775168282774, + "grad_norm": 0.17797726392745972, + "learning_rate": 0.000990878069137075, + "loss": 3.0493, + "step": 2372 + }, + { + "epoch": 0.07036740503513922, + "grad_norm": 0.1621602177619934, + "learning_rate": 0.0009908691204879461, + "loss": 3.0792, + "step": 2373 + }, + { + "epoch": 0.0703970583874507, + "grad_norm": 0.1688588261604309, + "learning_rate": 0.000990860167492087, + "loss": 3.0555, + "step": 2374 + }, + { + "epoch": 0.07042671173976219, + "grad_norm": 0.1699257493019104, + "learning_rate": 0.0009908512101495766, + "loss": 3.0213, + "step": 2375 + }, + { + "epoch": 0.07045636509207366, + "grad_norm": 0.16854381561279297, + "learning_rate": 0.0009908422484604946, + "loss": 3.0395, + "step": 2376 + }, + { + "epoch": 0.07048601844438514, + "grad_norm": 0.15241725742816925, + "learning_rate": 0.0009908332824249205, + "loss": 3.0269, + "step": 2377 + }, + { + "epoch": 0.07051567179669661, + "grad_norm": 0.15124036371707916, + "learning_rate": 0.0009908243120429331, + "loss": 3.0436, + "step": 2378 + }, + { + "epoch": 0.07054532514900809, + "grad_norm": 0.15772031247615814, + "learning_rate": 0.0009908153373146124, + "loss": 3.0628, + "step": 2379 + }, + { + "epoch": 0.07057497850131958, + "grad_norm": 0.15696193277835846, + "learning_rate": 0.0009908063582400376, + "loss": 3.0177, + "step": 2380 + }, + { + "epoch": 0.07060463185363106, + "grad_norm": 0.17232291400432587, + "learning_rate": 0.000990797374819288, + "loss": 3.0205, + "step": 2381 + }, + { + "epoch": 0.07063428520594253, + "grad_norm": 0.14863908290863037, + "learning_rate": 0.0009907883870524437, + "loss": 3.0714, + "step": 2382 + }, + { + "epoch": 0.07066393855825401, + "grad_norm": 0.14148752391338348, + "learning_rate": 0.0009907793949395839, + "loss": 3.0353, + "step": 2383 + }, + { + "epoch": 0.07069359191056548, + "grad_norm": 0.15019197762012482, + "learning_rate": 0.0009907703984807883, + "loss": 3.0665, + "step": 2384 + }, + { + "epoch": 0.07072324526287697, + "grad_norm": 0.17457416653633118, + "learning_rate": 0.0009907613976761365, + "loss": 3.0243, + "step": 2385 + }, + { + "epoch": 0.07075289861518845, + "grad_norm": 0.184925839304924, + "learning_rate": 0.0009907523925257085, + "loss": 3.0336, + "step": 2386 + }, + { + "epoch": 0.07078255196749993, + "grad_norm": 0.19716157019138336, + "learning_rate": 0.0009907433830295836, + "loss": 3.019, + "step": 2387 + }, + { + "epoch": 0.0708122053198114, + "grad_norm": 0.17946098744869232, + "learning_rate": 0.0009907343691878418, + "loss": 3.0193, + "step": 2388 + }, + { + "epoch": 0.07084185867212288, + "grad_norm": 0.16619522869586945, + "learning_rate": 0.000990725351000563, + "loss": 3.0475, + "step": 2389 + }, + { + "epoch": 0.07087151202443437, + "grad_norm": 0.1721688061952591, + "learning_rate": 0.0009907163284678271, + "loss": 3.0451, + "step": 2390 + }, + { + "epoch": 0.07090116537674584, + "grad_norm": 0.1719646006822586, + "learning_rate": 0.0009907073015897139, + "loss": 3.0528, + "step": 2391 + }, + { + "epoch": 0.07093081872905732, + "grad_norm": 0.17701832950115204, + "learning_rate": 0.0009906982703663033, + "loss": 3.0802, + "step": 2392 + }, + { + "epoch": 0.0709604720813688, + "grad_norm": 0.1897512823343277, + "learning_rate": 0.0009906892347976751, + "loss": 3.0164, + "step": 2393 + }, + { + "epoch": 0.07099012543368027, + "grad_norm": 0.17217205464839935, + "learning_rate": 0.0009906801948839094, + "loss": 3.0645, + "step": 2394 + }, + { + "epoch": 0.07101977878599175, + "grad_norm": 0.15826302766799927, + "learning_rate": 0.0009906711506250867, + "loss": 3.0171, + "step": 2395 + }, + { + "epoch": 0.07104943213830324, + "grad_norm": 0.14905327558517456, + "learning_rate": 0.0009906621020212866, + "loss": 3.0627, + "step": 2396 + }, + { + "epoch": 0.07107908549061472, + "grad_norm": 0.15345114469528198, + "learning_rate": 0.0009906530490725893, + "loss": 3.0171, + "step": 2397 + }, + { + "epoch": 0.07110873884292619, + "grad_norm": 0.1439930498600006, + "learning_rate": 0.0009906439917790751, + "loss": 3.0153, + "step": 2398 + }, + { + "epoch": 0.07113839219523767, + "grad_norm": 0.16812117397785187, + "learning_rate": 0.0009906349301408242, + "loss": 3.0657, + "step": 2399 + }, + { + "epoch": 0.07116804554754914, + "grad_norm": 0.1803150773048401, + "learning_rate": 0.0009906258641579166, + "loss": 3.0355, + "step": 2400 + }, + { + "epoch": 0.07119769889986063, + "grad_norm": 0.1940024495124817, + "learning_rate": 0.000990616793830433, + "loss": 3.042, + "step": 2401 + }, + { + "epoch": 0.07122735225217211, + "grad_norm": 0.16224586963653564, + "learning_rate": 0.0009906077191584532, + "loss": 3.0566, + "step": 2402 + }, + { + "epoch": 0.07125700560448359, + "grad_norm": 0.16366861760616302, + "learning_rate": 0.000990598640142058, + "loss": 3.0087, + "step": 2403 + }, + { + "epoch": 0.07128665895679506, + "grad_norm": 0.17143139243125916, + "learning_rate": 0.0009905895567813277, + "loss": 3.0698, + "step": 2404 + }, + { + "epoch": 0.07131631230910654, + "grad_norm": 0.16960138082504272, + "learning_rate": 0.0009905804690763425, + "loss": 3.0387, + "step": 2405 + }, + { + "epoch": 0.07134596566141803, + "grad_norm": 0.1496911346912384, + "learning_rate": 0.0009905713770271831, + "loss": 3.015, + "step": 2406 + }, + { + "epoch": 0.0713756190137295, + "grad_norm": 0.14890053868293762, + "learning_rate": 0.00099056228063393, + "loss": 3.0583, + "step": 2407 + }, + { + "epoch": 0.07140527236604098, + "grad_norm": 0.13959889113903046, + "learning_rate": 0.0009905531798966637, + "loss": 3.0269, + "step": 2408 + }, + { + "epoch": 0.07143492571835246, + "grad_norm": 0.1368602216243744, + "learning_rate": 0.0009905440748154647, + "loss": 3.0636, + "step": 2409 + }, + { + "epoch": 0.07146457907066393, + "grad_norm": 0.16659097373485565, + "learning_rate": 0.0009905349653904136, + "loss": 3.0483, + "step": 2410 + }, + { + "epoch": 0.07149423242297542, + "grad_norm": 0.15841874480247498, + "learning_rate": 0.0009905258516215915, + "loss": 3.0615, + "step": 2411 + }, + { + "epoch": 0.0715238857752869, + "grad_norm": 0.188920259475708, + "learning_rate": 0.0009905167335090787, + "loss": 3.0564, + "step": 2412 + }, + { + "epoch": 0.07155353912759838, + "grad_norm": 0.1892063170671463, + "learning_rate": 0.000990507611052956, + "loss": 3.0155, + "step": 2413 + }, + { + "epoch": 0.07158319247990985, + "grad_norm": 0.17033033072948456, + "learning_rate": 0.000990498484253304, + "loss": 3.0407, + "step": 2414 + }, + { + "epoch": 0.07161284583222133, + "grad_norm": 0.1554175317287445, + "learning_rate": 0.0009904893531102038, + "loss": 3.0328, + "step": 2415 + }, + { + "epoch": 0.07164249918453282, + "grad_norm": 0.1656954437494278, + "learning_rate": 0.0009904802176237365, + "loss": 3.0164, + "step": 2416 + }, + { + "epoch": 0.0716721525368443, + "grad_norm": 0.1610756516456604, + "learning_rate": 0.0009904710777939823, + "loss": 3.0427, + "step": 2417 + }, + { + "epoch": 0.07170180588915577, + "grad_norm": 0.19330406188964844, + "learning_rate": 0.0009904619336210227, + "loss": 3.0677, + "step": 2418 + }, + { + "epoch": 0.07173145924146725, + "grad_norm": 0.19964586198329926, + "learning_rate": 0.0009904527851049385, + "loss": 3.0592, + "step": 2419 + }, + { + "epoch": 0.07176111259377872, + "grad_norm": 0.156934455037117, + "learning_rate": 0.0009904436322458107, + "loss": 2.9804, + "step": 2420 + }, + { + "epoch": 0.0717907659460902, + "grad_norm": 0.21484951674938202, + "learning_rate": 0.0009904344750437204, + "loss": 3.0528, + "step": 2421 + }, + { + "epoch": 0.07182041929840169, + "grad_norm": 0.2125215083360672, + "learning_rate": 0.0009904253134987485, + "loss": 3.0008, + "step": 2422 + }, + { + "epoch": 0.07185007265071316, + "grad_norm": 0.1581474393606186, + "learning_rate": 0.0009904161476109764, + "loss": 3.0686, + "step": 2423 + }, + { + "epoch": 0.07187972600302464, + "grad_norm": 0.1644710898399353, + "learning_rate": 0.0009904069773804852, + "loss": 3.0369, + "step": 2424 + }, + { + "epoch": 0.07190937935533612, + "grad_norm": 0.14335468411445618, + "learning_rate": 0.0009903978028073558, + "loss": 3.057, + "step": 2425 + }, + { + "epoch": 0.0719390327076476, + "grad_norm": 0.12931117415428162, + "learning_rate": 0.0009903886238916697, + "loss": 3.0134, + "step": 2426 + }, + { + "epoch": 0.07196868605995908, + "grad_norm": 0.13928727805614471, + "learning_rate": 0.0009903794406335084, + "loss": 3.0611, + "step": 2427 + }, + { + "epoch": 0.07199833941227056, + "grad_norm": 0.1352480947971344, + "learning_rate": 0.0009903702530329528, + "loss": 3.011, + "step": 2428 + }, + { + "epoch": 0.07202799276458204, + "grad_norm": 0.12818777561187744, + "learning_rate": 0.0009903610610900843, + "loss": 3.0111, + "step": 2429 + }, + { + "epoch": 0.07205764611689351, + "grad_norm": 0.14162267744541168, + "learning_rate": 0.0009903518648049848, + "loss": 3.0565, + "step": 2430 + }, + { + "epoch": 0.07208729946920499, + "grad_norm": 0.14130736887454987, + "learning_rate": 0.0009903426641777351, + "loss": 3.0093, + "step": 2431 + }, + { + "epoch": 0.07211695282151648, + "grad_norm": 0.13703227043151855, + "learning_rate": 0.000990333459208417, + "loss": 3.0466, + "step": 2432 + }, + { + "epoch": 0.07214660617382795, + "grad_norm": 0.17181110382080078, + "learning_rate": 0.000990324249897112, + "loss": 3.0115, + "step": 2433 + }, + { + "epoch": 0.07217625952613943, + "grad_norm": 0.165774405002594, + "learning_rate": 0.0009903150362439018, + "loss": 3.0317, + "step": 2434 + }, + { + "epoch": 0.0722059128784509, + "grad_norm": 0.18124324083328247, + "learning_rate": 0.0009903058182488675, + "loss": 3.0409, + "step": 2435 + }, + { + "epoch": 0.07223556623076238, + "grad_norm": 0.20558154582977295, + "learning_rate": 0.000990296595912091, + "loss": 3.0432, + "step": 2436 + }, + { + "epoch": 0.07226521958307387, + "grad_norm": 0.2458517849445343, + "learning_rate": 0.0009902873692336541, + "loss": 3.0406, + "step": 2437 + }, + { + "epoch": 0.07229487293538535, + "grad_norm": 0.26448503136634827, + "learning_rate": 0.0009902781382136383, + "loss": 3.0516, + "step": 2438 + }, + { + "epoch": 0.07232452628769682, + "grad_norm": 0.22286033630371094, + "learning_rate": 0.0009902689028521256, + "loss": 3.0794, + "step": 2439 + }, + { + "epoch": 0.0723541796400083, + "grad_norm": 0.18882331252098083, + "learning_rate": 0.0009902596631491975, + "loss": 3.0617, + "step": 2440 + }, + { + "epoch": 0.07238383299231978, + "grad_norm": 0.18364235758781433, + "learning_rate": 0.000990250419104936, + "loss": 3.0302, + "step": 2441 + }, + { + "epoch": 0.07241348634463127, + "grad_norm": 0.1680847704410553, + "learning_rate": 0.0009902411707194228, + "loss": 3.0098, + "step": 2442 + }, + { + "epoch": 0.07244313969694274, + "grad_norm": 0.175267294049263, + "learning_rate": 0.0009902319179927398, + "loss": 3.0555, + "step": 2443 + }, + { + "epoch": 0.07247279304925422, + "grad_norm": 0.17276068031787872, + "learning_rate": 0.000990222660924969, + "loss": 2.993, + "step": 2444 + }, + { + "epoch": 0.0725024464015657, + "grad_norm": 0.16013318300247192, + "learning_rate": 0.0009902133995161927, + "loss": 3.0402, + "step": 2445 + }, + { + "epoch": 0.07253209975387717, + "grad_norm": 0.13774314522743225, + "learning_rate": 0.0009902041337664924, + "loss": 3.039, + "step": 2446 + }, + { + "epoch": 0.07256175310618865, + "grad_norm": 0.13180133700370789, + "learning_rate": 0.0009901948636759504, + "loss": 3.0454, + "step": 2447 + }, + { + "epoch": 0.07259140645850014, + "grad_norm": 0.12058401852846146, + "learning_rate": 0.0009901855892446487, + "loss": 3.0445, + "step": 2448 + }, + { + "epoch": 0.07262105981081161, + "grad_norm": 0.13604262471199036, + "learning_rate": 0.0009901763104726694, + "loss": 3.0155, + "step": 2449 + }, + { + "epoch": 0.07265071316312309, + "grad_norm": 0.1472819745540619, + "learning_rate": 0.000990167027360095, + "loss": 3.0555, + "step": 2450 + }, + { + "epoch": 0.07268036651543457, + "grad_norm": 0.12747520208358765, + "learning_rate": 0.0009901577399070072, + "loss": 3.0314, + "step": 2451 + }, + { + "epoch": 0.07271001986774604, + "grad_norm": 0.1373155415058136, + "learning_rate": 0.0009901484481134885, + "loss": 3.05, + "step": 2452 + }, + { + "epoch": 0.07273967322005753, + "grad_norm": 0.15641649067401886, + "learning_rate": 0.0009901391519796213, + "loss": 3.0429, + "step": 2453 + }, + { + "epoch": 0.07276932657236901, + "grad_norm": 0.1403154879808426, + "learning_rate": 0.0009901298515054878, + "loss": 3.0043, + "step": 2454 + }, + { + "epoch": 0.07279897992468048, + "grad_norm": 0.13583318889141083, + "learning_rate": 0.00099012054669117, + "loss": 3.0156, + "step": 2455 + }, + { + "epoch": 0.07282863327699196, + "grad_norm": 0.14934909343719482, + "learning_rate": 0.000990111237536751, + "loss": 3.0342, + "step": 2456 + }, + { + "epoch": 0.07285828662930344, + "grad_norm": 0.15426617860794067, + "learning_rate": 0.0009901019240423127, + "loss": 3.0231, + "step": 2457 + }, + { + "epoch": 0.07288793998161493, + "grad_norm": 0.1762295365333557, + "learning_rate": 0.0009900926062079377, + "loss": 3.0388, + "step": 2458 + }, + { + "epoch": 0.0729175933339264, + "grad_norm": 0.2072557508945465, + "learning_rate": 0.0009900832840337086, + "loss": 3.0254, + "step": 2459 + }, + { + "epoch": 0.07294724668623788, + "grad_norm": 0.2268260270357132, + "learning_rate": 0.0009900739575197078, + "loss": 3.053, + "step": 2460 + }, + { + "epoch": 0.07297690003854936, + "grad_norm": 0.2094334363937378, + "learning_rate": 0.0009900646266660183, + "loss": 3.0407, + "step": 2461 + }, + { + "epoch": 0.07300655339086083, + "grad_norm": 0.20162315666675568, + "learning_rate": 0.000990055291472722, + "loss": 2.9949, + "step": 2462 + }, + { + "epoch": 0.07303620674317232, + "grad_norm": 0.15497168898582458, + "learning_rate": 0.0009900459519399023, + "loss": 3.0371, + "step": 2463 + }, + { + "epoch": 0.0730658600954838, + "grad_norm": 0.16613782942295074, + "learning_rate": 0.0009900366080676413, + "loss": 3.0567, + "step": 2464 + }, + { + "epoch": 0.07309551344779527, + "grad_norm": 0.14581656455993652, + "learning_rate": 0.0009900272598560222, + "loss": 3.0453, + "step": 2465 + }, + { + "epoch": 0.07312516680010675, + "grad_norm": 0.14290273189544678, + "learning_rate": 0.0009900179073051277, + "loss": 3.0292, + "step": 2466 + }, + { + "epoch": 0.07315482015241823, + "grad_norm": 0.1602434515953064, + "learning_rate": 0.0009900085504150403, + "loss": 3.0161, + "step": 2467 + }, + { + "epoch": 0.07318447350472972, + "grad_norm": 0.1868446171283722, + "learning_rate": 0.000989999189185843, + "loss": 3.004, + "step": 2468 + }, + { + "epoch": 0.07321412685704119, + "grad_norm": 0.19470396637916565, + "learning_rate": 0.0009899898236176191, + "loss": 3.0469, + "step": 2469 + }, + { + "epoch": 0.07324378020935267, + "grad_norm": 0.19548213481903076, + "learning_rate": 0.000989980453710451, + "loss": 3.033, + "step": 2470 + }, + { + "epoch": 0.07327343356166414, + "grad_norm": 0.1987846940755844, + "learning_rate": 0.0009899710794644219, + "loss": 3.0525, + "step": 2471 + }, + { + "epoch": 0.07330308691397562, + "grad_norm": 0.17971928417682648, + "learning_rate": 0.000989961700879615, + "loss": 3.0395, + "step": 2472 + }, + { + "epoch": 0.0733327402662871, + "grad_norm": 0.14394427835941315, + "learning_rate": 0.000989952317956113, + "loss": 3.0228, + "step": 2473 + }, + { + "epoch": 0.07336239361859859, + "grad_norm": 0.1587861180305481, + "learning_rate": 0.0009899429306939988, + "loss": 3.0683, + "step": 2474 + }, + { + "epoch": 0.07339204697091006, + "grad_norm": 0.17676258087158203, + "learning_rate": 0.0009899335390933562, + "loss": 3.0241, + "step": 2475 + }, + { + "epoch": 0.07342170032322154, + "grad_norm": 0.20462675392627716, + "learning_rate": 0.000989924143154268, + "loss": 3.061, + "step": 2476 + }, + { + "epoch": 0.07345135367553302, + "grad_norm": 0.1995423287153244, + "learning_rate": 0.0009899147428768173, + "loss": 3.0227, + "step": 2477 + }, + { + "epoch": 0.07348100702784449, + "grad_norm": 0.18190763890743256, + "learning_rate": 0.0009899053382610877, + "loss": 3.0369, + "step": 2478 + }, + { + "epoch": 0.07351066038015598, + "grad_norm": 0.15933352708816528, + "learning_rate": 0.0009898959293071618, + "loss": 2.9804, + "step": 2479 + }, + { + "epoch": 0.07354031373246746, + "grad_norm": 0.15115344524383545, + "learning_rate": 0.0009898865160151238, + "loss": 3.0373, + "step": 2480 + }, + { + "epoch": 0.07356996708477893, + "grad_norm": 0.15901680290699005, + "learning_rate": 0.0009898770983850565, + "loss": 3.014, + "step": 2481 + }, + { + "epoch": 0.07359962043709041, + "grad_norm": 0.13252228498458862, + "learning_rate": 0.0009898676764170432, + "loss": 3.0278, + "step": 2482 + }, + { + "epoch": 0.07362927378940189, + "grad_norm": 0.1493133306503296, + "learning_rate": 0.0009898582501111676, + "loss": 3.0143, + "step": 2483 + }, + { + "epoch": 0.07365892714171338, + "grad_norm": 0.1422765851020813, + "learning_rate": 0.0009898488194675134, + "loss": 3.0307, + "step": 2484 + }, + { + "epoch": 0.07368858049402485, + "grad_norm": 0.13301850855350494, + "learning_rate": 0.0009898393844861636, + "loss": 3.0502, + "step": 2485 + }, + { + "epoch": 0.07371823384633633, + "grad_norm": 0.12564785778522491, + "learning_rate": 0.000989829945167202, + "loss": 3.0109, + "step": 2486 + }, + { + "epoch": 0.0737478871986478, + "grad_norm": 0.12464495003223419, + "learning_rate": 0.000989820501510712, + "loss": 3.0193, + "step": 2487 + }, + { + "epoch": 0.07377754055095928, + "grad_norm": 0.15088312327861786, + "learning_rate": 0.0009898110535167775, + "loss": 3.0278, + "step": 2488 + }, + { + "epoch": 0.07380719390327077, + "grad_norm": 0.15340857207775116, + "learning_rate": 0.000989801601185482, + "loss": 3.004, + "step": 2489 + }, + { + "epoch": 0.07383684725558225, + "grad_norm": 0.1256677508354187, + "learning_rate": 0.0009897921445169095, + "loss": 3.0564, + "step": 2490 + }, + { + "epoch": 0.07386650060789372, + "grad_norm": 0.1399754136800766, + "learning_rate": 0.0009897826835111431, + "loss": 3.0313, + "step": 2491 + }, + { + "epoch": 0.0738961539602052, + "grad_norm": 0.1579759418964386, + "learning_rate": 0.0009897732181682673, + "loss": 3.0192, + "step": 2492 + }, + { + "epoch": 0.07392580731251668, + "grad_norm": 0.16140241920948029, + "learning_rate": 0.0009897637484883655, + "loss": 3.0561, + "step": 2493 + }, + { + "epoch": 0.07395546066482817, + "grad_norm": 0.15495896339416504, + "learning_rate": 0.0009897542744715215, + "loss": 3.002, + "step": 2494 + }, + { + "epoch": 0.07398511401713964, + "grad_norm": 0.16073556244373322, + "learning_rate": 0.0009897447961178193, + "loss": 3.0706, + "step": 2495 + }, + { + "epoch": 0.07401476736945112, + "grad_norm": 0.2143372744321823, + "learning_rate": 0.0009897353134273432, + "loss": 3.0163, + "step": 2496 + }, + { + "epoch": 0.0740444207217626, + "grad_norm": 0.22905407845973969, + "learning_rate": 0.0009897258264001767, + "loss": 3.0364, + "step": 2497 + }, + { + "epoch": 0.07407407407407407, + "grad_norm": 0.19550231099128723, + "learning_rate": 0.0009897163350364039, + "loss": 3.0392, + "step": 2498 + }, + { + "epoch": 0.07410372742638555, + "grad_norm": 0.19346697628498077, + "learning_rate": 0.0009897068393361088, + "loss": 3.03, + "step": 2499 + }, + { + "epoch": 0.07413338077869704, + "grad_norm": 0.17187051475048065, + "learning_rate": 0.0009896973392993756, + "loss": 3.0032, + "step": 2500 + }, + { + "epoch": 0.07416303413100851, + "grad_norm": 0.17248451709747314, + "learning_rate": 0.0009896878349262886, + "loss": 3.048, + "step": 2501 + }, + { + "epoch": 0.07419268748331999, + "grad_norm": 0.171360582113266, + "learning_rate": 0.0009896783262169316, + "loss": 3.0359, + "step": 2502 + }, + { + "epoch": 0.07422234083563146, + "grad_norm": 0.16559413075447083, + "learning_rate": 0.000989668813171389, + "loss": 3.0014, + "step": 2503 + }, + { + "epoch": 0.07425199418794294, + "grad_norm": 0.15432213246822357, + "learning_rate": 0.000989659295789745, + "loss": 2.999, + "step": 2504 + }, + { + "epoch": 0.07428164754025443, + "grad_norm": 0.15384535491466522, + "learning_rate": 0.0009896497740720838, + "loss": 3.0133, + "step": 2505 + }, + { + "epoch": 0.0743113008925659, + "grad_norm": 0.1618591696023941, + "learning_rate": 0.00098964024801849, + "loss": 3.0289, + "step": 2506 + }, + { + "epoch": 0.07434095424487738, + "grad_norm": 0.18099261820316315, + "learning_rate": 0.0009896307176290476, + "loss": 3.0431, + "step": 2507 + }, + { + "epoch": 0.07437060759718886, + "grad_norm": 0.20253358781337738, + "learning_rate": 0.0009896211829038414, + "loss": 3.0339, + "step": 2508 + }, + { + "epoch": 0.07440026094950034, + "grad_norm": 0.19008703529834747, + "learning_rate": 0.0009896116438429551, + "loss": 3.0322, + "step": 2509 + }, + { + "epoch": 0.07442991430181183, + "grad_norm": 0.17740921676158905, + "learning_rate": 0.000989602100446474, + "loss": 3.0111, + "step": 2510 + }, + { + "epoch": 0.0744595676541233, + "grad_norm": 0.17680242657661438, + "learning_rate": 0.0009895925527144823, + "loss": 3.0265, + "step": 2511 + }, + { + "epoch": 0.07448922100643478, + "grad_norm": 0.14756806194782257, + "learning_rate": 0.0009895830006470645, + "loss": 3.021, + "step": 2512 + }, + { + "epoch": 0.07451887435874625, + "grad_norm": 0.15012012422084808, + "learning_rate": 0.0009895734442443049, + "loss": 3.0463, + "step": 2513 + }, + { + "epoch": 0.07454852771105773, + "grad_norm": 0.15529322624206543, + "learning_rate": 0.0009895638835062887, + "loss": 3.0254, + "step": 2514 + }, + { + "epoch": 0.07457818106336922, + "grad_norm": 0.1639879047870636, + "learning_rate": 0.0009895543184331001, + "loss": 3.0489, + "step": 2515 + }, + { + "epoch": 0.0746078344156807, + "grad_norm": 0.13808129727840424, + "learning_rate": 0.0009895447490248241, + "loss": 3.0402, + "step": 2516 + }, + { + "epoch": 0.07463748776799217, + "grad_norm": 0.15441395342350006, + "learning_rate": 0.0009895351752815453, + "loss": 3.0223, + "step": 2517 + }, + { + "epoch": 0.07466714112030365, + "grad_norm": 0.16800954937934875, + "learning_rate": 0.0009895255972033486, + "loss": 3.0321, + "step": 2518 + }, + { + "epoch": 0.07469679447261512, + "grad_norm": 0.15773138403892517, + "learning_rate": 0.0009895160147903187, + "loss": 2.9703, + "step": 2519 + }, + { + "epoch": 0.07472644782492661, + "grad_norm": 0.1562170386314392, + "learning_rate": 0.0009895064280425402, + "loss": 3.032, + "step": 2520 + }, + { + "epoch": 0.07475610117723809, + "grad_norm": 0.15026940405368805, + "learning_rate": 0.0009894968369600985, + "loss": 3.0778, + "step": 2521 + }, + { + "epoch": 0.07478575452954957, + "grad_norm": 0.16600392758846283, + "learning_rate": 0.0009894872415430783, + "loss": 3.0177, + "step": 2522 + }, + { + "epoch": 0.07481540788186104, + "grad_norm": 0.19009310007095337, + "learning_rate": 0.0009894776417915644, + "loss": 3.0619, + "step": 2523 + }, + { + "epoch": 0.07484506123417252, + "grad_norm": 0.19594714045524597, + "learning_rate": 0.000989468037705642, + "loss": 3.0318, + "step": 2524 + }, + { + "epoch": 0.074874714586484, + "grad_norm": 0.16404980421066284, + "learning_rate": 0.0009894584292853962, + "loss": 2.9799, + "step": 2525 + }, + { + "epoch": 0.07490436793879549, + "grad_norm": 0.14470109343528748, + "learning_rate": 0.0009894488165309119, + "loss": 3.0463, + "step": 2526 + }, + { + "epoch": 0.07493402129110696, + "grad_norm": 0.17016233503818512, + "learning_rate": 0.0009894391994422745, + "loss": 3.0444, + "step": 2527 + }, + { + "epoch": 0.07496367464341844, + "grad_norm": 0.15792101621627808, + "learning_rate": 0.0009894295780195688, + "loss": 3.0334, + "step": 2528 + }, + { + "epoch": 0.07499332799572991, + "grad_norm": 0.1630411297082901, + "learning_rate": 0.0009894199522628802, + "loss": 3.0218, + "step": 2529 + }, + { + "epoch": 0.07502298134804139, + "grad_norm": 0.1495228260755539, + "learning_rate": 0.0009894103221722939, + "loss": 2.9967, + "step": 2530 + }, + { + "epoch": 0.07505263470035288, + "grad_norm": 0.15611276030540466, + "learning_rate": 0.0009894006877478952, + "loss": 3.0301, + "step": 2531 + }, + { + "epoch": 0.07508228805266436, + "grad_norm": 0.14637617766857147, + "learning_rate": 0.0009893910489897695, + "loss": 3.0406, + "step": 2532 + }, + { + "epoch": 0.07511194140497583, + "grad_norm": 0.18193916976451874, + "learning_rate": 0.000989381405898002, + "loss": 2.9929, + "step": 2533 + }, + { + "epoch": 0.07514159475728731, + "grad_norm": 0.19800880551338196, + "learning_rate": 0.000989371758472678, + "loss": 3.0399, + "step": 2534 + }, + { + "epoch": 0.07517124810959878, + "grad_norm": 0.1985711008310318, + "learning_rate": 0.0009893621067138833, + "loss": 3.0099, + "step": 2535 + }, + { + "epoch": 0.07520090146191027, + "grad_norm": 0.19096042215824127, + "learning_rate": 0.0009893524506217031, + "loss": 3.0094, + "step": 2536 + }, + { + "epoch": 0.07523055481422175, + "grad_norm": 0.15572330355644226, + "learning_rate": 0.000989342790196223, + "loss": 2.9675, + "step": 2537 + }, + { + "epoch": 0.07526020816653323, + "grad_norm": 0.15334877371788025, + "learning_rate": 0.0009893331254375282, + "loss": 3.0239, + "step": 2538 + }, + { + "epoch": 0.0752898615188447, + "grad_norm": 0.16555863618850708, + "learning_rate": 0.0009893234563457048, + "loss": 3.0412, + "step": 2539 + }, + { + "epoch": 0.07531951487115618, + "grad_norm": 0.15791651606559753, + "learning_rate": 0.0009893137829208383, + "loss": 3.0154, + "step": 2540 + }, + { + "epoch": 0.07534916822346767, + "grad_norm": 0.1438690721988678, + "learning_rate": 0.000989304105163014, + "loss": 3.026, + "step": 2541 + }, + { + "epoch": 0.07537882157577915, + "grad_norm": 0.12007793039083481, + "learning_rate": 0.000989294423072318, + "loss": 3.0477, + "step": 2542 + }, + { + "epoch": 0.07540847492809062, + "grad_norm": 0.1298205405473709, + "learning_rate": 0.0009892847366488361, + "loss": 3.0107, + "step": 2543 + }, + { + "epoch": 0.0754381282804021, + "grad_norm": 0.13868583738803864, + "learning_rate": 0.0009892750458926536, + "loss": 3.0171, + "step": 2544 + }, + { + "epoch": 0.07546778163271357, + "grad_norm": 0.13230465352535248, + "learning_rate": 0.0009892653508038567, + "loss": 2.9763, + "step": 2545 + }, + { + "epoch": 0.07549743498502506, + "grad_norm": 0.1450706571340561, + "learning_rate": 0.0009892556513825311, + "loss": 3.06, + "step": 2546 + }, + { + "epoch": 0.07552708833733654, + "grad_norm": 0.14212335646152496, + "learning_rate": 0.0009892459476287627, + "loss": 2.987, + "step": 2547 + }, + { + "epoch": 0.07555674168964802, + "grad_norm": 0.13584211468696594, + "learning_rate": 0.0009892362395426376, + "loss": 3.0506, + "step": 2548 + }, + { + "epoch": 0.07558639504195949, + "grad_norm": 0.15710340440273285, + "learning_rate": 0.0009892265271242416, + "loss": 3.0197, + "step": 2549 + }, + { + "epoch": 0.07561604839427097, + "grad_norm": 0.17182692885398865, + "learning_rate": 0.0009892168103736605, + "loss": 3.0039, + "step": 2550 + }, + { + "epoch": 0.07564570174658244, + "grad_norm": 0.1853766292333603, + "learning_rate": 0.0009892070892909808, + "loss": 3.0104, + "step": 2551 + }, + { + "epoch": 0.07567535509889393, + "grad_norm": 0.1642751544713974, + "learning_rate": 0.0009891973638762883, + "loss": 2.9901, + "step": 2552 + }, + { + "epoch": 0.07570500845120541, + "grad_norm": 0.1817287802696228, + "learning_rate": 0.000989187634129669, + "loss": 3.0703, + "step": 2553 + }, + { + "epoch": 0.07573466180351689, + "grad_norm": 0.1846064180135727, + "learning_rate": 0.0009891779000512093, + "loss": 2.9994, + "step": 2554 + }, + { + "epoch": 0.07576431515582836, + "grad_norm": 0.15010850131511688, + "learning_rate": 0.0009891681616409956, + "loss": 3.012, + "step": 2555 + }, + { + "epoch": 0.07579396850813984, + "grad_norm": 0.14439764618873596, + "learning_rate": 0.0009891584188991137, + "loss": 3.0171, + "step": 2556 + }, + { + "epoch": 0.07582362186045133, + "grad_norm": 0.1758783459663391, + "learning_rate": 0.00098914867182565, + "loss": 3.0493, + "step": 2557 + }, + { + "epoch": 0.0758532752127628, + "grad_norm": 0.1559227705001831, + "learning_rate": 0.0009891389204206907, + "loss": 2.985, + "step": 2558 + }, + { + "epoch": 0.07588292856507428, + "grad_norm": 0.16260826587677002, + "learning_rate": 0.0009891291646843227, + "loss": 2.9913, + "step": 2559 + }, + { + "epoch": 0.07591258191738576, + "grad_norm": 0.16675002872943878, + "learning_rate": 0.0009891194046166315, + "loss": 3.0356, + "step": 2560 + }, + { + "epoch": 0.07594223526969723, + "grad_norm": 0.1792200654745102, + "learning_rate": 0.0009891096402177043, + "loss": 3.068, + "step": 2561 + }, + { + "epoch": 0.07597188862200872, + "grad_norm": 0.19230541586875916, + "learning_rate": 0.0009890998714876273, + "loss": 3.0328, + "step": 2562 + }, + { + "epoch": 0.0760015419743202, + "grad_norm": 0.18159708380699158, + "learning_rate": 0.000989090098426487, + "loss": 3.0264, + "step": 2563 + }, + { + "epoch": 0.07603119532663168, + "grad_norm": 0.18845057487487793, + "learning_rate": 0.0009890803210343697, + "loss": 3.0396, + "step": 2564 + }, + { + "epoch": 0.07606084867894315, + "grad_norm": 0.18080061674118042, + "learning_rate": 0.0009890705393113624, + "loss": 3.016, + "step": 2565 + }, + { + "epoch": 0.07609050203125463, + "grad_norm": 0.1666337251663208, + "learning_rate": 0.0009890607532575515, + "loss": 3.0113, + "step": 2566 + }, + { + "epoch": 0.07612015538356612, + "grad_norm": 0.17459481954574585, + "learning_rate": 0.0009890509628730237, + "loss": 2.9976, + "step": 2567 + }, + { + "epoch": 0.0761498087358776, + "grad_norm": 0.16006910800933838, + "learning_rate": 0.0009890411681578656, + "loss": 3.0108, + "step": 2568 + }, + { + "epoch": 0.07617946208818907, + "grad_norm": 0.17139291763305664, + "learning_rate": 0.000989031369112164, + "loss": 3.0416, + "step": 2569 + }, + { + "epoch": 0.07620911544050055, + "grad_norm": 0.18316523730754852, + "learning_rate": 0.0009890215657360059, + "loss": 2.9952, + "step": 2570 + }, + { + "epoch": 0.07623876879281202, + "grad_norm": 0.18347682058811188, + "learning_rate": 0.0009890117580294777, + "loss": 3.0553, + "step": 2571 + }, + { + "epoch": 0.07626842214512351, + "grad_norm": 0.18121962249279022, + "learning_rate": 0.0009890019459926666, + "loss": 3.0318, + "step": 2572 + }, + { + "epoch": 0.07629807549743499, + "grad_norm": 0.17120635509490967, + "learning_rate": 0.000988992129625659, + "loss": 3.0401, + "step": 2573 + }, + { + "epoch": 0.07632772884974647, + "grad_norm": 0.19818313419818878, + "learning_rate": 0.0009889823089285425, + "loss": 3.022, + "step": 2574 + }, + { + "epoch": 0.07635738220205794, + "grad_norm": 0.19767986238002777, + "learning_rate": 0.0009889724839014036, + "loss": 3.0133, + "step": 2575 + }, + { + "epoch": 0.07638703555436942, + "grad_norm": 0.1606670469045639, + "learning_rate": 0.0009889626545443292, + "loss": 3.0352, + "step": 2576 + }, + { + "epoch": 0.0764166889066809, + "grad_norm": 0.16273967921733856, + "learning_rate": 0.000988952820857407, + "loss": 3.0528, + "step": 2577 + }, + { + "epoch": 0.07644634225899238, + "grad_norm": 0.15717676281929016, + "learning_rate": 0.0009889429828407233, + "loss": 3.008, + "step": 2578 + }, + { + "epoch": 0.07647599561130386, + "grad_norm": 0.13354161381721497, + "learning_rate": 0.0009889331404943657, + "loss": 3.0554, + "step": 2579 + }, + { + "epoch": 0.07650564896361534, + "grad_norm": 0.12027651071548462, + "learning_rate": 0.0009889232938184213, + "loss": 3.0278, + "step": 2580 + }, + { + "epoch": 0.07653530231592681, + "grad_norm": 0.1282476782798767, + "learning_rate": 0.0009889134428129772, + "loss": 3.0484, + "step": 2581 + }, + { + "epoch": 0.07656495566823829, + "grad_norm": 0.13198529183864594, + "learning_rate": 0.0009889035874781206, + "loss": 3.0142, + "step": 2582 + }, + { + "epoch": 0.07659460902054978, + "grad_norm": 0.1191353052854538, + "learning_rate": 0.0009888937278139387, + "loss": 3.031, + "step": 2583 + }, + { + "epoch": 0.07662426237286125, + "grad_norm": 0.12607400119304657, + "learning_rate": 0.0009888838638205189, + "loss": 3.0459, + "step": 2584 + }, + { + "epoch": 0.07665391572517273, + "grad_norm": 0.14631181955337524, + "learning_rate": 0.0009888739954979488, + "loss": 3.0187, + "step": 2585 + }, + { + "epoch": 0.0766835690774842, + "grad_norm": 0.16425834596157074, + "learning_rate": 0.0009888641228463153, + "loss": 3.002, + "step": 2586 + }, + { + "epoch": 0.07671322242979568, + "grad_norm": 0.1910981982946396, + "learning_rate": 0.0009888542458657062, + "loss": 3.0274, + "step": 2587 + }, + { + "epoch": 0.07674287578210717, + "grad_norm": 0.21152599155902863, + "learning_rate": 0.0009888443645562088, + "loss": 3.0432, + "step": 2588 + }, + { + "epoch": 0.07677252913441865, + "grad_norm": 0.19641618430614471, + "learning_rate": 0.0009888344789179108, + "loss": 2.9915, + "step": 2589 + }, + { + "epoch": 0.07680218248673013, + "grad_norm": 0.17912684381008148, + "learning_rate": 0.0009888245889508995, + "loss": 3.0238, + "step": 2590 + }, + { + "epoch": 0.0768318358390416, + "grad_norm": 0.18831458687782288, + "learning_rate": 0.0009888146946552625, + "loss": 2.9994, + "step": 2591 + }, + { + "epoch": 0.07686148919135308, + "grad_norm": 0.18806833028793335, + "learning_rate": 0.0009888047960310874, + "loss": 3.024, + "step": 2592 + }, + { + "epoch": 0.07689114254366457, + "grad_norm": 0.1489180326461792, + "learning_rate": 0.0009887948930784622, + "loss": 3.0433, + "step": 2593 + }, + { + "epoch": 0.07692079589597604, + "grad_norm": 0.15691129863262177, + "learning_rate": 0.0009887849857974742, + "loss": 3.0286, + "step": 2594 + }, + { + "epoch": 0.07695044924828752, + "grad_norm": 0.16310089826583862, + "learning_rate": 0.000988775074188211, + "loss": 3.0175, + "step": 2595 + }, + { + "epoch": 0.076980102600599, + "grad_norm": 0.15572333335876465, + "learning_rate": 0.000988765158250761, + "loss": 3.026, + "step": 2596 + }, + { + "epoch": 0.07700975595291047, + "grad_norm": 0.15599383413791656, + "learning_rate": 0.0009887552379852114, + "loss": 3.0211, + "step": 2597 + }, + { + "epoch": 0.07703940930522196, + "grad_norm": 0.17583361268043518, + "learning_rate": 0.0009887453133916504, + "loss": 3.0409, + "step": 2598 + }, + { + "epoch": 0.07706906265753344, + "grad_norm": 0.18271557986736298, + "learning_rate": 0.0009887353844701657, + "loss": 3.0413, + "step": 2599 + }, + { + "epoch": 0.07709871600984491, + "grad_norm": 0.18899446725845337, + "learning_rate": 0.0009887254512208452, + "loss": 2.9925, + "step": 2600 + }, + { + "epoch": 0.07712836936215639, + "grad_norm": 0.2131587564945221, + "learning_rate": 0.000988715513643777, + "loss": 3.0386, + "step": 2601 + }, + { + "epoch": 0.07715802271446787, + "grad_norm": 0.22320301830768585, + "learning_rate": 0.000988705571739049, + "loss": 3.025, + "step": 2602 + }, + { + "epoch": 0.07718767606677934, + "grad_norm": 0.20761002600193024, + "learning_rate": 0.0009886956255067494, + "loss": 3.0156, + "step": 2603 + }, + { + "epoch": 0.07721732941909083, + "grad_norm": 0.20862610638141632, + "learning_rate": 0.0009886856749469662, + "loss": 2.999, + "step": 2604 + }, + { + "epoch": 0.07724698277140231, + "grad_norm": 0.18215486407279968, + "learning_rate": 0.0009886757200597872, + "loss": 3.0265, + "step": 2605 + }, + { + "epoch": 0.07727663612371378, + "grad_norm": 0.16011637449264526, + "learning_rate": 0.000988665760845301, + "loss": 3.0086, + "step": 2606 + }, + { + "epoch": 0.07730628947602526, + "grad_norm": 0.16597019135951996, + "learning_rate": 0.0009886557973035955, + "loss": 3.0243, + "step": 2607 + }, + { + "epoch": 0.07733594282833674, + "grad_norm": 0.1766090840101242, + "learning_rate": 0.0009886458294347592, + "loss": 3.0002, + "step": 2608 + }, + { + "epoch": 0.07736559618064823, + "grad_norm": 0.15898136794567108, + "learning_rate": 0.00098863585723888, + "loss": 3.0082, + "step": 2609 + }, + { + "epoch": 0.0773952495329597, + "grad_norm": 0.20509357750415802, + "learning_rate": 0.0009886258807160467, + "loss": 3.0323, + "step": 2610 + }, + { + "epoch": 0.07742490288527118, + "grad_norm": 0.19095678627490997, + "learning_rate": 0.0009886158998663472, + "loss": 3.0004, + "step": 2611 + }, + { + "epoch": 0.07745455623758266, + "grad_norm": 0.13096477091312408, + "learning_rate": 0.00098860591468987, + "loss": 3.017, + "step": 2612 + }, + { + "epoch": 0.07748420958989413, + "grad_norm": 0.16458643972873688, + "learning_rate": 0.0009885959251867036, + "loss": 2.9875, + "step": 2613 + }, + { + "epoch": 0.07751386294220562, + "grad_norm": 0.1461668610572815, + "learning_rate": 0.0009885859313569364, + "loss": 3.0367, + "step": 2614 + }, + { + "epoch": 0.0775435162945171, + "grad_norm": 0.13737665116786957, + "learning_rate": 0.0009885759332006567, + "loss": 3.0394, + "step": 2615 + }, + { + "epoch": 0.07757316964682857, + "grad_norm": 0.14594748616218567, + "learning_rate": 0.0009885659307179535, + "loss": 3.02, + "step": 2616 + }, + { + "epoch": 0.07760282299914005, + "grad_norm": 0.1430327445268631, + "learning_rate": 0.0009885559239089152, + "loss": 3.0018, + "step": 2617 + }, + { + "epoch": 0.07763247635145153, + "grad_norm": 0.14164337515830994, + "learning_rate": 0.00098854591277363, + "loss": 3.0137, + "step": 2618 + }, + { + "epoch": 0.07766212970376302, + "grad_norm": 0.17231138050556183, + "learning_rate": 0.0009885358973121872, + "loss": 3.0303, + "step": 2619 + }, + { + "epoch": 0.07769178305607449, + "grad_norm": 0.20467530190944672, + "learning_rate": 0.0009885258775246752, + "loss": 3.0275, + "step": 2620 + }, + { + "epoch": 0.07772143640838597, + "grad_norm": 0.17370478808879852, + "learning_rate": 0.0009885158534111824, + "loss": 3.0225, + "step": 2621 + }, + { + "epoch": 0.07775108976069744, + "grad_norm": 0.1542648822069168, + "learning_rate": 0.000988505824971798, + "loss": 3.0441, + "step": 2622 + }, + { + "epoch": 0.07778074311300892, + "grad_norm": 0.1529424786567688, + "learning_rate": 0.000988495792206611, + "loss": 3.0003, + "step": 2623 + }, + { + "epoch": 0.07781039646532041, + "grad_norm": 0.12897521257400513, + "learning_rate": 0.0009884857551157094, + "loss": 3.0021, + "step": 2624 + }, + { + "epoch": 0.07784004981763189, + "grad_norm": 0.11994115263223648, + "learning_rate": 0.0009884757136991828, + "loss": 3.0161, + "step": 2625 + }, + { + "epoch": 0.07786970316994336, + "grad_norm": 0.13469631969928741, + "learning_rate": 0.00098846566795712, + "loss": 2.9808, + "step": 2626 + }, + { + "epoch": 0.07789935652225484, + "grad_norm": 0.15876305103302002, + "learning_rate": 0.0009884556178896098, + "loss": 3.0226, + "step": 2627 + }, + { + "epoch": 0.07792900987456632, + "grad_norm": 0.16856122016906738, + "learning_rate": 0.0009884455634967414, + "loss": 3.0327, + "step": 2628 + }, + { + "epoch": 0.07795866322687779, + "grad_norm": 0.16812194883823395, + "learning_rate": 0.0009884355047786034, + "loss": 3.0483, + "step": 2629 + }, + { + "epoch": 0.07798831657918928, + "grad_norm": 0.16059334576129913, + "learning_rate": 0.0009884254417352853, + "loss": 3.0045, + "step": 2630 + }, + { + "epoch": 0.07801796993150076, + "grad_norm": 0.18140053749084473, + "learning_rate": 0.000988415374366876, + "loss": 3.0206, + "step": 2631 + }, + { + "epoch": 0.07804762328381223, + "grad_norm": 0.1916157752275467, + "learning_rate": 0.0009884053026734648, + "loss": 3.0045, + "step": 2632 + }, + { + "epoch": 0.07807727663612371, + "grad_norm": 0.17526988685131073, + "learning_rate": 0.0009883952266551408, + "loss": 3.0038, + "step": 2633 + }, + { + "epoch": 0.07810692998843519, + "grad_norm": 0.15370780229568481, + "learning_rate": 0.0009883851463119934, + "loss": 3.0418, + "step": 2634 + }, + { + "epoch": 0.07813658334074668, + "grad_norm": 0.14521890878677368, + "learning_rate": 0.0009883750616441114, + "loss": 3.0177, + "step": 2635 + }, + { + "epoch": 0.07816623669305815, + "grad_norm": 0.15499505400657654, + "learning_rate": 0.0009883649726515847, + "loss": 3.02, + "step": 2636 + }, + { + "epoch": 0.07819589004536963, + "grad_norm": 0.17413973808288574, + "learning_rate": 0.000988354879334502, + "loss": 3.026, + "step": 2637 + }, + { + "epoch": 0.0782255433976811, + "grad_norm": 0.17860181629657745, + "learning_rate": 0.0009883447816929534, + "loss": 3.0096, + "step": 2638 + }, + { + "epoch": 0.07825519674999258, + "grad_norm": 0.17622637748718262, + "learning_rate": 0.0009883346797270277, + "loss": 3.0093, + "step": 2639 + }, + { + "epoch": 0.07828485010230407, + "grad_norm": 0.1716211885213852, + "learning_rate": 0.0009883245734368146, + "loss": 3.0422, + "step": 2640 + }, + { + "epoch": 0.07831450345461555, + "grad_norm": 0.183789923787117, + "learning_rate": 0.0009883144628224036, + "loss": 3.0332, + "step": 2641 + }, + { + "epoch": 0.07834415680692702, + "grad_norm": 0.1779409497976303, + "learning_rate": 0.0009883043478838842, + "loss": 3.0235, + "step": 2642 + }, + { + "epoch": 0.0783738101592385, + "grad_norm": 0.1746232658624649, + "learning_rate": 0.000988294228621346, + "loss": 3.0447, + "step": 2643 + }, + { + "epoch": 0.07840346351154998, + "grad_norm": 0.16139985620975494, + "learning_rate": 0.0009882841050348787, + "loss": 3.0222, + "step": 2644 + }, + { + "epoch": 0.07843311686386147, + "grad_norm": 0.14560842514038086, + "learning_rate": 0.0009882739771245716, + "loss": 3.0395, + "step": 2645 + }, + { + "epoch": 0.07846277021617294, + "grad_norm": 0.17187541723251343, + "learning_rate": 0.0009882638448905148, + "loss": 3.0215, + "step": 2646 + }, + { + "epoch": 0.07849242356848442, + "grad_norm": 0.15700076520442963, + "learning_rate": 0.000988253708332798, + "loss": 3.0399, + "step": 2647 + }, + { + "epoch": 0.0785220769207959, + "grad_norm": 0.15421085059642792, + "learning_rate": 0.0009882435674515105, + "loss": 3.0326, + "step": 2648 + }, + { + "epoch": 0.07855173027310737, + "grad_norm": 0.15303124487400055, + "learning_rate": 0.0009882334222467426, + "loss": 3.003, + "step": 2649 + }, + { + "epoch": 0.07858138362541886, + "grad_norm": 0.1538442224264145, + "learning_rate": 0.0009882232727185837, + "loss": 3.011, + "step": 2650 + }, + { + "epoch": 0.07861103697773034, + "grad_norm": 0.15590700507164001, + "learning_rate": 0.0009882131188671242, + "loss": 3.0023, + "step": 2651 + }, + { + "epoch": 0.07864069033004181, + "grad_norm": 0.16693954169750214, + "learning_rate": 0.0009882029606924536, + "loss": 3.0074, + "step": 2652 + }, + { + "epoch": 0.07867034368235329, + "grad_norm": 0.19148200750350952, + "learning_rate": 0.000988192798194662, + "loss": 3.0331, + "step": 2653 + }, + { + "epoch": 0.07869999703466476, + "grad_norm": 0.20440605282783508, + "learning_rate": 0.0009881826313738393, + "loss": 3.0081, + "step": 2654 + }, + { + "epoch": 0.07872965038697624, + "grad_norm": 0.1831197887659073, + "learning_rate": 0.0009881724602300757, + "loss": 3.041, + "step": 2655 + }, + { + "epoch": 0.07875930373928773, + "grad_norm": 0.1952410340309143, + "learning_rate": 0.0009881622847634612, + "loss": 2.9957, + "step": 2656 + }, + { + "epoch": 0.07878895709159921, + "grad_norm": 0.197196364402771, + "learning_rate": 0.0009881521049740858, + "loss": 3.0352, + "step": 2657 + }, + { + "epoch": 0.07881861044391068, + "grad_norm": 0.18842236697673798, + "learning_rate": 0.0009881419208620397, + "loss": 3.0304, + "step": 2658 + }, + { + "epoch": 0.07884826379622216, + "grad_norm": 0.17177705466747284, + "learning_rate": 0.0009881317324274132, + "loss": 3.0314, + "step": 2659 + }, + { + "epoch": 0.07887791714853364, + "grad_norm": 0.15969549119472504, + "learning_rate": 0.0009881215396702963, + "loss": 3.0391, + "step": 2660 + }, + { + "epoch": 0.07890757050084513, + "grad_norm": 0.144222691655159, + "learning_rate": 0.0009881113425907796, + "loss": 3.0292, + "step": 2661 + }, + { + "epoch": 0.0789372238531566, + "grad_norm": 0.14899864792823792, + "learning_rate": 0.0009881011411889528, + "loss": 2.9953, + "step": 2662 + }, + { + "epoch": 0.07896687720546808, + "grad_norm": 0.12986448407173157, + "learning_rate": 0.000988090935464907, + "loss": 3.0178, + "step": 2663 + }, + { + "epoch": 0.07899653055777955, + "grad_norm": 0.12713484466075897, + "learning_rate": 0.000988080725418732, + "loss": 2.9687, + "step": 2664 + }, + { + "epoch": 0.07902618391009103, + "grad_norm": 0.14002400636672974, + "learning_rate": 0.0009880705110505183, + "loss": 3.0286, + "step": 2665 + }, + { + "epoch": 0.07905583726240252, + "grad_norm": 0.1606145054101944, + "learning_rate": 0.0009880602923603567, + "loss": 3.0404, + "step": 2666 + }, + { + "epoch": 0.079085490614714, + "grad_norm": 0.17149509489536285, + "learning_rate": 0.0009880500693483373, + "loss": 2.9907, + "step": 2667 + }, + { + "epoch": 0.07911514396702547, + "grad_norm": 0.16055479645729065, + "learning_rate": 0.0009880398420145508, + "loss": 3.0023, + "step": 2668 + }, + { + "epoch": 0.07914479731933695, + "grad_norm": 0.17827580869197845, + "learning_rate": 0.0009880296103590876, + "loss": 2.9771, + "step": 2669 + }, + { + "epoch": 0.07917445067164842, + "grad_norm": 0.1722470074892044, + "learning_rate": 0.0009880193743820385, + "loss": 3.025, + "step": 2670 + }, + { + "epoch": 0.07920410402395991, + "grad_norm": 0.16239944100379944, + "learning_rate": 0.000988009134083494, + "loss": 3.0374, + "step": 2671 + }, + { + "epoch": 0.07923375737627139, + "grad_norm": 0.16652680933475494, + "learning_rate": 0.000987998889463545, + "loss": 3.0038, + "step": 2672 + }, + { + "epoch": 0.07926341072858287, + "grad_norm": 0.1525040864944458, + "learning_rate": 0.000987988640522282, + "loss": 3.0014, + "step": 2673 + }, + { + "epoch": 0.07929306408089434, + "grad_norm": 0.1350058615207672, + "learning_rate": 0.000987978387259796, + "loss": 3.022, + "step": 2674 + }, + { + "epoch": 0.07932271743320582, + "grad_norm": 0.1565057337284088, + "learning_rate": 0.0009879681296761774, + "loss": 3.0014, + "step": 2675 + }, + { + "epoch": 0.07935237078551731, + "grad_norm": 0.13377662003040314, + "learning_rate": 0.0009879578677715172, + "loss": 3.0183, + "step": 2676 + }, + { + "epoch": 0.07938202413782879, + "grad_norm": 0.14905628561973572, + "learning_rate": 0.0009879476015459065, + "loss": 2.9756, + "step": 2677 + }, + { + "epoch": 0.07941167749014026, + "grad_norm": 0.1666770577430725, + "learning_rate": 0.0009879373309994361, + "loss": 3.0475, + "step": 2678 + }, + { + "epoch": 0.07944133084245174, + "grad_norm": 0.16217991709709167, + "learning_rate": 0.0009879270561321968, + "loss": 3.009, + "step": 2679 + }, + { + "epoch": 0.07947098419476321, + "grad_norm": 0.14343827962875366, + "learning_rate": 0.0009879167769442797, + "loss": 2.9691, + "step": 2680 + }, + { + "epoch": 0.07950063754707469, + "grad_norm": 0.13802465796470642, + "learning_rate": 0.0009879064934357755, + "loss": 2.9875, + "step": 2681 + }, + { + "epoch": 0.07953029089938618, + "grad_norm": 0.14133259654045105, + "learning_rate": 0.000987896205606776, + "loss": 3.0072, + "step": 2682 + }, + { + "epoch": 0.07955994425169766, + "grad_norm": 0.1509176343679428, + "learning_rate": 0.0009878859134573714, + "loss": 3.0378, + "step": 2683 + }, + { + "epoch": 0.07958959760400913, + "grad_norm": 0.14734166860580444, + "learning_rate": 0.0009878756169876535, + "loss": 3.0236, + "step": 2684 + }, + { + "epoch": 0.07961925095632061, + "grad_norm": 0.16247078776359558, + "learning_rate": 0.0009878653161977133, + "loss": 3.0516, + "step": 2685 + }, + { + "epoch": 0.07964890430863208, + "grad_norm": 0.17683351039886475, + "learning_rate": 0.000987855011087642, + "loss": 3.0727, + "step": 2686 + }, + { + "epoch": 0.07967855766094357, + "grad_norm": 0.15135201811790466, + "learning_rate": 0.000987844701657531, + "loss": 2.9973, + "step": 2687 + }, + { + "epoch": 0.07970821101325505, + "grad_norm": 0.15316763520240784, + "learning_rate": 0.000987834387907471, + "loss": 2.997, + "step": 2688 + }, + { + "epoch": 0.07973786436556653, + "grad_norm": 0.16679427027702332, + "learning_rate": 0.0009878240698375541, + "loss": 3.0276, + "step": 2689 + }, + { + "epoch": 0.079767517717878, + "grad_norm": 0.14126497507095337, + "learning_rate": 0.0009878137474478713, + "loss": 3.0029, + "step": 2690 + }, + { + "epoch": 0.07979717107018948, + "grad_norm": 0.1494022011756897, + "learning_rate": 0.000987803420738514, + "loss": 2.9834, + "step": 2691 + }, + { + "epoch": 0.07982682442250097, + "grad_norm": 0.17299295961856842, + "learning_rate": 0.0009877930897095736, + "loss": 2.9827, + "step": 2692 + }, + { + "epoch": 0.07985647777481245, + "grad_norm": 0.1667426973581314, + "learning_rate": 0.0009877827543611417, + "loss": 3.031, + "step": 2693 + }, + { + "epoch": 0.07988613112712392, + "grad_norm": 0.18644526600837708, + "learning_rate": 0.0009877724146933097, + "loss": 3.027, + "step": 2694 + }, + { + "epoch": 0.0799157844794354, + "grad_norm": 0.18774229288101196, + "learning_rate": 0.0009877620707061693, + "loss": 3.0171, + "step": 2695 + }, + { + "epoch": 0.07994543783174687, + "grad_norm": 0.17586825788021088, + "learning_rate": 0.000987751722399812, + "loss": 3.0227, + "step": 2696 + }, + { + "epoch": 0.07997509118405836, + "grad_norm": 0.19151461124420166, + "learning_rate": 0.0009877413697743295, + "loss": 2.9883, + "step": 2697 + }, + { + "epoch": 0.08000474453636984, + "grad_norm": 0.1766922026872635, + "learning_rate": 0.0009877310128298135, + "loss": 3.0374, + "step": 2698 + }, + { + "epoch": 0.08003439788868132, + "grad_norm": 0.21369236707687378, + "learning_rate": 0.0009877206515663556, + "loss": 3.0459, + "step": 2699 + }, + { + "epoch": 0.08006405124099279, + "grad_norm": 0.20374582707881927, + "learning_rate": 0.0009877102859840478, + "loss": 3.004, + "step": 2700 + }, + { + "epoch": 0.08009370459330427, + "grad_norm": 0.22696463763713837, + "learning_rate": 0.0009876999160829817, + "loss": 3.0719, + "step": 2701 + }, + { + "epoch": 0.08012335794561576, + "grad_norm": 0.23711705207824707, + "learning_rate": 0.000987689541863249, + "loss": 3.0301, + "step": 2702 + }, + { + "epoch": 0.08015301129792723, + "grad_norm": 0.18011847138404846, + "learning_rate": 0.0009876791633249417, + "loss": 3.0079, + "step": 2703 + }, + { + "epoch": 0.08018266465023871, + "grad_norm": 0.15503577888011932, + "learning_rate": 0.0009876687804681516, + "loss": 2.9692, + "step": 2704 + }, + { + "epoch": 0.08021231800255019, + "grad_norm": 0.16507507860660553, + "learning_rate": 0.0009876583932929709, + "loss": 3.0175, + "step": 2705 + }, + { + "epoch": 0.08024197135486166, + "grad_norm": 0.15064673125743866, + "learning_rate": 0.0009876480017994914, + "loss": 2.986, + "step": 2706 + }, + { + "epoch": 0.08027162470717314, + "grad_norm": 0.15840379893779755, + "learning_rate": 0.000987637605987805, + "loss": 3.0267, + "step": 2707 + }, + { + "epoch": 0.08030127805948463, + "grad_norm": 0.176132470369339, + "learning_rate": 0.000987627205858004, + "loss": 2.9796, + "step": 2708 + }, + { + "epoch": 0.0803309314117961, + "grad_norm": 0.16935598850250244, + "learning_rate": 0.0009876168014101806, + "loss": 3.0061, + "step": 2709 + }, + { + "epoch": 0.08036058476410758, + "grad_norm": 0.1634000837802887, + "learning_rate": 0.0009876063926444263, + "loss": 3.0379, + "step": 2710 + }, + { + "epoch": 0.08039023811641906, + "grad_norm": 0.12644296884536743, + "learning_rate": 0.000987595979560834, + "loss": 3.06, + "step": 2711 + }, + { + "epoch": 0.08041989146873053, + "grad_norm": 0.1416521966457367, + "learning_rate": 0.0009875855621594954, + "loss": 3.0201, + "step": 2712 + }, + { + "epoch": 0.08044954482104202, + "grad_norm": 0.14056484401226044, + "learning_rate": 0.000987575140440503, + "loss": 3.0104, + "step": 2713 + }, + { + "epoch": 0.0804791981733535, + "grad_norm": 0.12224879860877991, + "learning_rate": 0.0009875647144039492, + "loss": 2.9937, + "step": 2714 + }, + { + "epoch": 0.08050885152566498, + "grad_norm": 0.13723225891590118, + "learning_rate": 0.000987554284049926, + "loss": 3.0064, + "step": 2715 + }, + { + "epoch": 0.08053850487797645, + "grad_norm": 0.13603141903877258, + "learning_rate": 0.000987543849378526, + "loss": 3.0014, + "step": 2716 + }, + { + "epoch": 0.08056815823028793, + "grad_norm": 0.13709710538387299, + "learning_rate": 0.0009875334103898418, + "loss": 2.9823, + "step": 2717 + }, + { + "epoch": 0.08059781158259942, + "grad_norm": 0.15068387985229492, + "learning_rate": 0.0009875229670839652, + "loss": 3.0341, + "step": 2718 + }, + { + "epoch": 0.0806274649349109, + "grad_norm": 0.16711871325969696, + "learning_rate": 0.000987512519460989, + "loss": 3.0078, + "step": 2719 + }, + { + "epoch": 0.08065711828722237, + "grad_norm": 0.1723295897245407, + "learning_rate": 0.000987502067521006, + "loss": 2.9724, + "step": 2720 + }, + { + "epoch": 0.08068677163953385, + "grad_norm": 0.17976652085781097, + "learning_rate": 0.0009874916112641084, + "loss": 2.9762, + "step": 2721 + }, + { + "epoch": 0.08071642499184532, + "grad_norm": 0.18686167895793915, + "learning_rate": 0.0009874811506903891, + "loss": 2.9744, + "step": 2722 + }, + { + "epoch": 0.08074607834415681, + "grad_norm": 0.16924899816513062, + "learning_rate": 0.0009874706857999403, + "loss": 2.9954, + "step": 2723 + }, + { + "epoch": 0.08077573169646829, + "grad_norm": 0.2090452015399933, + "learning_rate": 0.0009874602165928549, + "loss": 2.9853, + "step": 2724 + }, + { + "epoch": 0.08080538504877977, + "grad_norm": 0.180345818400383, + "learning_rate": 0.0009874497430692258, + "loss": 2.98, + "step": 2725 + }, + { + "epoch": 0.08083503840109124, + "grad_norm": 0.1702488213777542, + "learning_rate": 0.0009874392652291452, + "loss": 2.9955, + "step": 2726 + }, + { + "epoch": 0.08086469175340272, + "grad_norm": 0.16283121705055237, + "learning_rate": 0.0009874287830727066, + "loss": 3.021, + "step": 2727 + }, + { + "epoch": 0.08089434510571421, + "grad_norm": 0.15972226858139038, + "learning_rate": 0.0009874182966000023, + "loss": 3.0324, + "step": 2728 + }, + { + "epoch": 0.08092399845802568, + "grad_norm": 0.17038589715957642, + "learning_rate": 0.0009874078058111253, + "loss": 3.0275, + "step": 2729 + }, + { + "epoch": 0.08095365181033716, + "grad_norm": 0.1818012297153473, + "learning_rate": 0.0009873973107061686, + "loss": 3.0279, + "step": 2730 + }, + { + "epoch": 0.08098330516264864, + "grad_norm": 0.16181905567646027, + "learning_rate": 0.000987386811285225, + "loss": 3.0704, + "step": 2731 + }, + { + "epoch": 0.08101295851496011, + "grad_norm": 0.14100587368011475, + "learning_rate": 0.0009873763075483877, + "loss": 3.0002, + "step": 2732 + }, + { + "epoch": 0.08104261186727159, + "grad_norm": 0.12318725883960724, + "learning_rate": 0.0009873657994957491, + "loss": 3.0102, + "step": 2733 + }, + { + "epoch": 0.08107226521958308, + "grad_norm": 0.11989420652389526, + "learning_rate": 0.000987355287127403, + "loss": 2.9886, + "step": 2734 + }, + { + "epoch": 0.08110191857189455, + "grad_norm": 0.12296340614557266, + "learning_rate": 0.0009873447704434424, + "loss": 2.9816, + "step": 2735 + }, + { + "epoch": 0.08113157192420603, + "grad_norm": 0.13224934041500092, + "learning_rate": 0.00098733424944396, + "loss": 3.0314, + "step": 2736 + }, + { + "epoch": 0.0811612252765175, + "grad_norm": 0.14823368191719055, + "learning_rate": 0.0009873237241290488, + "loss": 3.0223, + "step": 2737 + }, + { + "epoch": 0.08119087862882898, + "grad_norm": 0.1662498563528061, + "learning_rate": 0.000987313194498803, + "loss": 2.9706, + "step": 2738 + }, + { + "epoch": 0.08122053198114047, + "grad_norm": 0.18827274441719055, + "learning_rate": 0.0009873026605533148, + "loss": 2.9791, + "step": 2739 + }, + { + "epoch": 0.08125018533345195, + "grad_norm": 0.19668374955654144, + "learning_rate": 0.000987292122292678, + "loss": 3.0107, + "step": 2740 + }, + { + "epoch": 0.08127983868576343, + "grad_norm": 0.18286143243312836, + "learning_rate": 0.0009872815797169858, + "loss": 2.9899, + "step": 2741 + }, + { + "epoch": 0.0813094920380749, + "grad_norm": 0.15505672991275787, + "learning_rate": 0.0009872710328263318, + "loss": 3.0165, + "step": 2742 + }, + { + "epoch": 0.08133914539038638, + "grad_norm": 0.1658448576927185, + "learning_rate": 0.0009872604816208088, + "loss": 3.0135, + "step": 2743 + }, + { + "epoch": 0.08136879874269787, + "grad_norm": 0.14985309541225433, + "learning_rate": 0.0009872499261005109, + "loss": 2.9857, + "step": 2744 + }, + { + "epoch": 0.08139845209500934, + "grad_norm": 0.13178810477256775, + "learning_rate": 0.000987239366265531, + "loss": 2.9752, + "step": 2745 + }, + { + "epoch": 0.08142810544732082, + "grad_norm": 0.13117067515850067, + "learning_rate": 0.000987228802115963, + "loss": 3.0238, + "step": 2746 + }, + { + "epoch": 0.0814577587996323, + "grad_norm": 0.13588720560073853, + "learning_rate": 0.0009872182336519003, + "loss": 3.0355, + "step": 2747 + }, + { + "epoch": 0.08148741215194377, + "grad_norm": 0.14199182391166687, + "learning_rate": 0.0009872076608734366, + "loss": 2.9742, + "step": 2748 + }, + { + "epoch": 0.08151706550425526, + "grad_norm": 0.16059096157550812, + "learning_rate": 0.0009871970837806653, + "loss": 3.0245, + "step": 2749 + }, + { + "epoch": 0.08154671885656674, + "grad_norm": 0.17167095839977264, + "learning_rate": 0.0009871865023736803, + "loss": 2.9992, + "step": 2750 + }, + { + "epoch": 0.08157637220887821, + "grad_norm": 0.1584548056125641, + "learning_rate": 0.0009871759166525753, + "loss": 2.9868, + "step": 2751 + }, + { + "epoch": 0.08160602556118969, + "grad_norm": 0.15368030965328217, + "learning_rate": 0.000987165326617444, + "loss": 3.0254, + "step": 2752 + }, + { + "epoch": 0.08163567891350117, + "grad_norm": 0.17299747467041016, + "learning_rate": 0.00098715473226838, + "loss": 3.0149, + "step": 2753 + }, + { + "epoch": 0.08166533226581266, + "grad_norm": 0.18276791274547577, + "learning_rate": 0.0009871441336054769, + "loss": 3.013, + "step": 2754 + }, + { + "epoch": 0.08169498561812413, + "grad_norm": 0.18787503242492676, + "learning_rate": 0.0009871335306288291, + "loss": 3.0199, + "step": 2755 + }, + { + "epoch": 0.08172463897043561, + "grad_norm": 0.18005603551864624, + "learning_rate": 0.0009871229233385304, + "loss": 2.981, + "step": 2756 + }, + { + "epoch": 0.08175429232274709, + "grad_norm": 0.18730631470680237, + "learning_rate": 0.0009871123117346744, + "loss": 2.9929, + "step": 2757 + }, + { + "epoch": 0.08178394567505856, + "grad_norm": 0.18229518830776215, + "learning_rate": 0.0009871016958173553, + "loss": 3.009, + "step": 2758 + }, + { + "epoch": 0.08181359902737004, + "grad_norm": 0.2011905163526535, + "learning_rate": 0.0009870910755866674, + "loss": 2.9733, + "step": 2759 + }, + { + "epoch": 0.08184325237968153, + "grad_norm": 0.21319015324115753, + "learning_rate": 0.000987080451042704, + "loss": 3.0156, + "step": 2760 + }, + { + "epoch": 0.081872905731993, + "grad_norm": 0.18098562955856323, + "learning_rate": 0.0009870698221855598, + "loss": 3.0307, + "step": 2761 + }, + { + "epoch": 0.08190255908430448, + "grad_norm": 0.15840600430965424, + "learning_rate": 0.0009870591890153285, + "loss": 2.9639, + "step": 2762 + }, + { + "epoch": 0.08193221243661596, + "grad_norm": 0.16282883286476135, + "learning_rate": 0.0009870485515321048, + "loss": 3.0163, + "step": 2763 + }, + { + "epoch": 0.08196186578892743, + "grad_norm": 0.15129242837429047, + "learning_rate": 0.0009870379097359826, + "loss": 3.0025, + "step": 2764 + }, + { + "epoch": 0.08199151914123892, + "grad_norm": 0.1482764482498169, + "learning_rate": 0.0009870272636270559, + "loss": 3.01, + "step": 2765 + }, + { + "epoch": 0.0820211724935504, + "grad_norm": 0.13159780204296112, + "learning_rate": 0.0009870166132054192, + "loss": 3.0118, + "step": 2766 + }, + { + "epoch": 0.08205082584586187, + "grad_norm": 0.1387946754693985, + "learning_rate": 0.0009870059584711668, + "loss": 2.9997, + "step": 2767 + }, + { + "epoch": 0.08208047919817335, + "grad_norm": 0.13146497309207916, + "learning_rate": 0.0009869952994243931, + "loss": 3.0139, + "step": 2768 + }, + { + "epoch": 0.08211013255048483, + "grad_norm": 0.14545923471450806, + "learning_rate": 0.0009869846360651925, + "loss": 2.992, + "step": 2769 + }, + { + "epoch": 0.08213978590279632, + "grad_norm": 0.14811964333057404, + "learning_rate": 0.0009869739683936592, + "loss": 2.9985, + "step": 2770 + }, + { + "epoch": 0.08216943925510779, + "grad_norm": 0.17371998727321625, + "learning_rate": 0.0009869632964098877, + "loss": 2.9991, + "step": 2771 + }, + { + "epoch": 0.08219909260741927, + "grad_norm": 0.18734978139400482, + "learning_rate": 0.000986952620113973, + "loss": 2.9848, + "step": 2772 + }, + { + "epoch": 0.08222874595973075, + "grad_norm": 0.1787903606891632, + "learning_rate": 0.0009869419395060089, + "loss": 2.996, + "step": 2773 + }, + { + "epoch": 0.08225839931204222, + "grad_norm": 0.14761559665203094, + "learning_rate": 0.0009869312545860906, + "loss": 3.0215, + "step": 2774 + }, + { + "epoch": 0.08228805266435371, + "grad_norm": 0.15877562761306763, + "learning_rate": 0.0009869205653543123, + "loss": 3.0207, + "step": 2775 + }, + { + "epoch": 0.08231770601666519, + "grad_norm": 0.1611311137676239, + "learning_rate": 0.000986909871810769, + "loss": 2.9901, + "step": 2776 + }, + { + "epoch": 0.08234735936897666, + "grad_norm": 0.1644519567489624, + "learning_rate": 0.000986899173955555, + "loss": 3.0609, + "step": 2777 + }, + { + "epoch": 0.08237701272128814, + "grad_norm": 0.18290838599205017, + "learning_rate": 0.0009868884717887654, + "loss": 3.0035, + "step": 2778 + }, + { + "epoch": 0.08240666607359962, + "grad_norm": 0.17318475246429443, + "learning_rate": 0.0009868777653104948, + "loss": 3.0056, + "step": 2779 + }, + { + "epoch": 0.0824363194259111, + "grad_norm": 0.17488233745098114, + "learning_rate": 0.000986867054520838, + "loss": 3.0044, + "step": 2780 + }, + { + "epoch": 0.08246597277822258, + "grad_norm": 0.15814439952373505, + "learning_rate": 0.0009868563394198897, + "loss": 3.0244, + "step": 2781 + }, + { + "epoch": 0.08249562613053406, + "grad_norm": 0.15761208534240723, + "learning_rate": 0.000986845620007745, + "loss": 3.002, + "step": 2782 + }, + { + "epoch": 0.08252527948284553, + "grad_norm": 0.1677539199590683, + "learning_rate": 0.000986834896284499, + "loss": 3.0107, + "step": 2783 + }, + { + "epoch": 0.08255493283515701, + "grad_norm": 0.14069655537605286, + "learning_rate": 0.000986824168250246, + "loss": 2.9927, + "step": 2784 + }, + { + "epoch": 0.08258458618746849, + "grad_norm": 0.15238773822784424, + "learning_rate": 0.0009868134359050818, + "loss": 2.9985, + "step": 2785 + }, + { + "epoch": 0.08261423953977998, + "grad_norm": 0.1779009848833084, + "learning_rate": 0.000986802699249101, + "loss": 3.0061, + "step": 2786 + }, + { + "epoch": 0.08264389289209145, + "grad_norm": 0.18262173235416412, + "learning_rate": 0.0009867919582823988, + "loss": 2.9994, + "step": 2787 + }, + { + "epoch": 0.08267354624440293, + "grad_norm": 0.1935725063085556, + "learning_rate": 0.0009867812130050701, + "loss": 2.9724, + "step": 2788 + }, + { + "epoch": 0.0827031995967144, + "grad_norm": 0.19080251455307007, + "learning_rate": 0.0009867704634172103, + "loss": 3.0226, + "step": 2789 + }, + { + "epoch": 0.08273285294902588, + "grad_norm": 0.19451206922531128, + "learning_rate": 0.0009867597095189146, + "loss": 2.9875, + "step": 2790 + }, + { + "epoch": 0.08276250630133737, + "grad_norm": 0.18237906694412231, + "learning_rate": 0.000986748951310278, + "loss": 3.0012, + "step": 2791 + }, + { + "epoch": 0.08279215965364885, + "grad_norm": 0.1634618490934372, + "learning_rate": 0.0009867381887913957, + "loss": 3.036, + "step": 2792 + }, + { + "epoch": 0.08282181300596032, + "grad_norm": 0.15831845998764038, + "learning_rate": 0.0009867274219623632, + "loss": 3.0174, + "step": 2793 + }, + { + "epoch": 0.0828514663582718, + "grad_norm": 0.14808490872383118, + "learning_rate": 0.000986716650823276, + "loss": 3.0298, + "step": 2794 + }, + { + "epoch": 0.08288111971058328, + "grad_norm": 0.13664884865283966, + "learning_rate": 0.0009867058753742293, + "loss": 2.9744, + "step": 2795 + }, + { + "epoch": 0.08291077306289477, + "grad_norm": 0.16067564487457275, + "learning_rate": 0.0009866950956153187, + "loss": 2.9861, + "step": 2796 + }, + { + "epoch": 0.08294042641520624, + "grad_norm": 0.1795128881931305, + "learning_rate": 0.0009866843115466392, + "loss": 2.9828, + "step": 2797 + }, + { + "epoch": 0.08297007976751772, + "grad_norm": 0.18802589178085327, + "learning_rate": 0.0009866735231682867, + "loss": 2.9964, + "step": 2798 + }, + { + "epoch": 0.0829997331198292, + "grad_norm": 0.1852487176656723, + "learning_rate": 0.0009866627304803567, + "loss": 3.022, + "step": 2799 + }, + { + "epoch": 0.08302938647214067, + "grad_norm": 0.17415833473205566, + "learning_rate": 0.0009866519334829444, + "loss": 2.9964, + "step": 2800 + }, + { + "epoch": 0.08305903982445216, + "grad_norm": 0.15617544949054718, + "learning_rate": 0.000986641132176146, + "loss": 2.9782, + "step": 2801 + }, + { + "epoch": 0.08308869317676364, + "grad_norm": 0.19677050411701202, + "learning_rate": 0.0009866303265600568, + "loss": 2.9977, + "step": 2802 + }, + { + "epoch": 0.08311834652907511, + "grad_norm": 0.1783176213502884, + "learning_rate": 0.0009866195166347725, + "loss": 3.0185, + "step": 2803 + }, + { + "epoch": 0.08314799988138659, + "grad_norm": 0.15280644595623016, + "learning_rate": 0.0009866087024003889, + "loss": 2.9591, + "step": 2804 + }, + { + "epoch": 0.08317765323369807, + "grad_norm": 0.1425468921661377, + "learning_rate": 0.0009865978838570016, + "loss": 2.9649, + "step": 2805 + }, + { + "epoch": 0.08320730658600956, + "grad_norm": 0.1625629961490631, + "learning_rate": 0.0009865870610047064, + "loss": 2.9885, + "step": 2806 + }, + { + "epoch": 0.08323695993832103, + "grad_norm": 0.16238652169704437, + "learning_rate": 0.0009865762338435995, + "loss": 3.0224, + "step": 2807 + }, + { + "epoch": 0.08326661329063251, + "grad_norm": 0.18539640307426453, + "learning_rate": 0.0009865654023737765, + "loss": 2.988, + "step": 2808 + }, + { + "epoch": 0.08329626664294398, + "grad_norm": 0.19019995629787445, + "learning_rate": 0.0009865545665953333, + "loss": 3.0374, + "step": 2809 + }, + { + "epoch": 0.08332591999525546, + "grad_norm": 0.175260528922081, + "learning_rate": 0.0009865437265083657, + "loss": 3.0012, + "step": 2810 + }, + { + "epoch": 0.08335557334756694, + "grad_norm": 0.18322935700416565, + "learning_rate": 0.0009865328821129702, + "loss": 3.0316, + "step": 2811 + }, + { + "epoch": 0.08338522669987843, + "grad_norm": 0.21459850668907166, + "learning_rate": 0.0009865220334092425, + "loss": 2.9884, + "step": 2812 + }, + { + "epoch": 0.0834148800521899, + "grad_norm": 0.19397343695163727, + "learning_rate": 0.0009865111803972783, + "loss": 2.9936, + "step": 2813 + }, + { + "epoch": 0.08344453340450138, + "grad_norm": 0.15331648290157318, + "learning_rate": 0.0009865003230771746, + "loss": 3.0045, + "step": 2814 + }, + { + "epoch": 0.08347418675681285, + "grad_norm": 0.1511688083410263, + "learning_rate": 0.0009864894614490267, + "loss": 2.995, + "step": 2815 + }, + { + "epoch": 0.08350384010912433, + "grad_norm": 0.1346741020679474, + "learning_rate": 0.0009864785955129311, + "loss": 3.0185, + "step": 2816 + }, + { + "epoch": 0.08353349346143582, + "grad_norm": 0.1315806806087494, + "learning_rate": 0.0009864677252689842, + "loss": 3.0314, + "step": 2817 + }, + { + "epoch": 0.0835631468137473, + "grad_norm": 0.14852648973464966, + "learning_rate": 0.000986456850717282, + "loss": 2.9898, + "step": 2818 + }, + { + "epoch": 0.08359280016605877, + "grad_norm": 0.13910309970378876, + "learning_rate": 0.0009864459718579208, + "loss": 2.9861, + "step": 2819 + }, + { + "epoch": 0.08362245351837025, + "grad_norm": 0.12979932129383087, + "learning_rate": 0.0009864350886909972, + "loss": 2.9975, + "step": 2820 + }, + { + "epoch": 0.08365210687068173, + "grad_norm": 0.1455746442079544, + "learning_rate": 0.0009864242012166075, + "loss": 3.026, + "step": 2821 + }, + { + "epoch": 0.08368176022299322, + "grad_norm": 0.1576453000307083, + "learning_rate": 0.0009864133094348478, + "loss": 2.9845, + "step": 2822 + }, + { + "epoch": 0.08371141357530469, + "grad_norm": 0.18601009249687195, + "learning_rate": 0.0009864024133458148, + "loss": 3.0061, + "step": 2823 + }, + { + "epoch": 0.08374106692761617, + "grad_norm": 0.1976841390132904, + "learning_rate": 0.0009863915129496048, + "loss": 2.9806, + "step": 2824 + }, + { + "epoch": 0.08377072027992764, + "grad_norm": 0.1833120584487915, + "learning_rate": 0.0009863806082463147, + "loss": 3.011, + "step": 2825 + }, + { + "epoch": 0.08380037363223912, + "grad_norm": 0.17788776755332947, + "learning_rate": 0.0009863696992360408, + "loss": 3.006, + "step": 2826 + }, + { + "epoch": 0.08383002698455061, + "grad_norm": 0.1636112630367279, + "learning_rate": 0.0009863587859188796, + "loss": 3.0146, + "step": 2827 + }, + { + "epoch": 0.08385968033686209, + "grad_norm": 0.14544770121574402, + "learning_rate": 0.0009863478682949278, + "loss": 2.9989, + "step": 2828 + }, + { + "epoch": 0.08388933368917356, + "grad_norm": 0.13356567919254303, + "learning_rate": 0.0009863369463642823, + "loss": 3.0107, + "step": 2829 + }, + { + "epoch": 0.08391898704148504, + "grad_norm": 0.13393640518188477, + "learning_rate": 0.0009863260201270396, + "loss": 3.0041, + "step": 2830 + }, + { + "epoch": 0.08394864039379651, + "grad_norm": 0.13314196467399597, + "learning_rate": 0.0009863150895832965, + "loss": 2.998, + "step": 2831 + }, + { + "epoch": 0.083978293746108, + "grad_norm": 0.15954019129276276, + "learning_rate": 0.00098630415473315, + "loss": 2.9843, + "step": 2832 + }, + { + "epoch": 0.08400794709841948, + "grad_norm": 0.16550202667713165, + "learning_rate": 0.0009862932155766965, + "loss": 2.9853, + "step": 2833 + }, + { + "epoch": 0.08403760045073096, + "grad_norm": 0.18375732004642487, + "learning_rate": 0.0009862822721140331, + "loss": 3.0106, + "step": 2834 + }, + { + "epoch": 0.08406725380304243, + "grad_norm": 0.19306041300296783, + "learning_rate": 0.0009862713243452568, + "loss": 3.0065, + "step": 2835 + }, + { + "epoch": 0.08409690715535391, + "grad_norm": 0.21143697202205658, + "learning_rate": 0.0009862603722704645, + "loss": 2.9793, + "step": 2836 + }, + { + "epoch": 0.08412656050766538, + "grad_norm": 0.25702446699142456, + "learning_rate": 0.000986249415889753, + "loss": 2.9921, + "step": 2837 + }, + { + "epoch": 0.08415621385997687, + "grad_norm": 0.24227121472358704, + "learning_rate": 0.0009862384552032197, + "loss": 2.9774, + "step": 2838 + }, + { + "epoch": 0.08418586721228835, + "grad_norm": 0.20661094784736633, + "learning_rate": 0.0009862274902109611, + "loss": 2.9944, + "step": 2839 + }, + { + "epoch": 0.08421552056459983, + "grad_norm": 0.17657895386219025, + "learning_rate": 0.0009862165209130749, + "loss": 2.9744, + "step": 2840 + }, + { + "epoch": 0.0842451739169113, + "grad_norm": 0.18902906775474548, + "learning_rate": 0.0009862055473096579, + "loss": 3.005, + "step": 2841 + }, + { + "epoch": 0.08427482726922278, + "grad_norm": 0.17532648146152496, + "learning_rate": 0.0009861945694008072, + "loss": 3.0154, + "step": 2842 + }, + { + "epoch": 0.08430448062153427, + "grad_norm": 0.14371909201145172, + "learning_rate": 0.0009861835871866204, + "loss": 2.9975, + "step": 2843 + }, + { + "epoch": 0.08433413397384575, + "grad_norm": 0.1279897689819336, + "learning_rate": 0.0009861726006671942, + "loss": 2.9736, + "step": 2844 + }, + { + "epoch": 0.08436378732615722, + "grad_norm": 0.125764399766922, + "learning_rate": 0.000986161609842626, + "loss": 3.026, + "step": 2845 + }, + { + "epoch": 0.0843934406784687, + "grad_norm": 0.1269269585609436, + "learning_rate": 0.0009861506147130137, + "loss": 3.0444, + "step": 2846 + }, + { + "epoch": 0.08442309403078017, + "grad_norm": 0.12249619513750076, + "learning_rate": 0.000986139615278454, + "loss": 2.9771, + "step": 2847 + }, + { + "epoch": 0.08445274738309166, + "grad_norm": 0.1299694925546646, + "learning_rate": 0.0009861286115390446, + "loss": 3.028, + "step": 2848 + }, + { + "epoch": 0.08448240073540314, + "grad_norm": 0.1408783495426178, + "learning_rate": 0.000986117603494883, + "loss": 2.9608, + "step": 2849 + }, + { + "epoch": 0.08451205408771462, + "grad_norm": 0.1579631268978119, + "learning_rate": 0.0009861065911460666, + "loss": 2.9597, + "step": 2850 + }, + { + "epoch": 0.08454170744002609, + "grad_norm": 0.16001035273075104, + "learning_rate": 0.0009860955744926928, + "loss": 2.9903, + "step": 2851 + }, + { + "epoch": 0.08457136079233757, + "grad_norm": 0.15505942702293396, + "learning_rate": 0.0009860845535348594, + "loss": 3.0433, + "step": 2852 + }, + { + "epoch": 0.08460101414464906, + "grad_norm": 0.14708679914474487, + "learning_rate": 0.0009860735282726635, + "loss": 2.9977, + "step": 2853 + }, + { + "epoch": 0.08463066749696053, + "grad_norm": 0.14092884957790375, + "learning_rate": 0.0009860624987062035, + "loss": 2.9754, + "step": 2854 + }, + { + "epoch": 0.08466032084927201, + "grad_norm": 0.1463962346315384, + "learning_rate": 0.0009860514648355764, + "loss": 2.996, + "step": 2855 + }, + { + "epoch": 0.08468997420158349, + "grad_norm": 0.13679715991020203, + "learning_rate": 0.0009860404266608803, + "loss": 2.9872, + "step": 2856 + }, + { + "epoch": 0.08471962755389496, + "grad_norm": 0.13274787366390228, + "learning_rate": 0.0009860293841822125, + "loss": 2.9957, + "step": 2857 + }, + { + "epoch": 0.08474928090620645, + "grad_norm": 0.14756350219249725, + "learning_rate": 0.0009860183373996713, + "loss": 3.0195, + "step": 2858 + }, + { + "epoch": 0.08477893425851793, + "grad_norm": 0.16337931156158447, + "learning_rate": 0.0009860072863133544, + "loss": 2.9987, + "step": 2859 + }, + { + "epoch": 0.0848085876108294, + "grad_norm": 0.17123384773731232, + "learning_rate": 0.0009859962309233595, + "loss": 3.0333, + "step": 2860 + }, + { + "epoch": 0.08483824096314088, + "grad_norm": 0.20621226727962494, + "learning_rate": 0.0009859851712297845, + "loss": 3.0033, + "step": 2861 + }, + { + "epoch": 0.08486789431545236, + "grad_norm": 0.21592208743095398, + "learning_rate": 0.0009859741072327274, + "loss": 2.9896, + "step": 2862 + }, + { + "epoch": 0.08489754766776383, + "grad_norm": 0.26063936948776245, + "learning_rate": 0.000985963038932286, + "loss": 3.0091, + "step": 2863 + }, + { + "epoch": 0.08492720102007532, + "grad_norm": 0.22488939762115479, + "learning_rate": 0.0009859519663285585, + "loss": 3.0389, + "step": 2864 + }, + { + "epoch": 0.0849568543723868, + "grad_norm": 0.23177321255207062, + "learning_rate": 0.0009859408894216431, + "loss": 2.9924, + "step": 2865 + }, + { + "epoch": 0.08498650772469828, + "grad_norm": 0.2298245131969452, + "learning_rate": 0.0009859298082116378, + "loss": 3.0215, + "step": 2866 + }, + { + "epoch": 0.08501616107700975, + "grad_norm": 0.20045781135559082, + "learning_rate": 0.0009859187226986404, + "loss": 2.9798, + "step": 2867 + }, + { + "epoch": 0.08504581442932123, + "grad_norm": 0.21503987908363342, + "learning_rate": 0.0009859076328827495, + "loss": 3.0004, + "step": 2868 + }, + { + "epoch": 0.08507546778163272, + "grad_norm": 0.1720392256975174, + "learning_rate": 0.0009858965387640629, + "loss": 3.0038, + "step": 2869 + }, + { + "epoch": 0.0851051211339442, + "grad_norm": 0.15546979010105133, + "learning_rate": 0.0009858854403426791, + "loss": 2.9982, + "step": 2870 + }, + { + "epoch": 0.08513477448625567, + "grad_norm": 0.1761118471622467, + "learning_rate": 0.0009858743376186963, + "loss": 2.9816, + "step": 2871 + }, + { + "epoch": 0.08516442783856715, + "grad_norm": 0.17048393189907074, + "learning_rate": 0.000985863230592213, + "loss": 3.006, + "step": 2872 + }, + { + "epoch": 0.08519408119087862, + "grad_norm": 0.20018523931503296, + "learning_rate": 0.0009858521192633272, + "loss": 2.9831, + "step": 2873 + }, + { + "epoch": 0.08522373454319011, + "grad_norm": 0.16670410335063934, + "learning_rate": 0.0009858410036321377, + "loss": 3.0502, + "step": 2874 + }, + { + "epoch": 0.08525338789550159, + "grad_norm": 0.1685926765203476, + "learning_rate": 0.0009858298836987426, + "loss": 2.9747, + "step": 2875 + }, + { + "epoch": 0.08528304124781307, + "grad_norm": 0.16185617446899414, + "learning_rate": 0.0009858187594632404, + "loss": 3.0019, + "step": 2876 + }, + { + "epoch": 0.08531269460012454, + "grad_norm": 0.15986678004264832, + "learning_rate": 0.0009858076309257298, + "loss": 2.9869, + "step": 2877 + }, + { + "epoch": 0.08534234795243602, + "grad_norm": 0.14612050354480743, + "learning_rate": 0.0009857964980863093, + "loss": 2.9737, + "step": 2878 + }, + { + "epoch": 0.08537200130474751, + "grad_norm": 0.12877118587493896, + "learning_rate": 0.0009857853609450775, + "loss": 2.9812, + "step": 2879 + }, + { + "epoch": 0.08540165465705898, + "grad_norm": 0.13301466405391693, + "learning_rate": 0.0009857742195021326, + "loss": 3.0284, + "step": 2880 + }, + { + "epoch": 0.08543130800937046, + "grad_norm": 0.14150318503379822, + "learning_rate": 0.000985763073757574, + "loss": 2.9975, + "step": 2881 + }, + { + "epoch": 0.08546096136168194, + "grad_norm": 0.13607828319072723, + "learning_rate": 0.0009857519237114999, + "loss": 2.9674, + "step": 2882 + }, + { + "epoch": 0.08549061471399341, + "grad_norm": 0.1153135895729065, + "learning_rate": 0.0009857407693640088, + "loss": 3.0154, + "step": 2883 + }, + { + "epoch": 0.0855202680663049, + "grad_norm": 0.13743487000465393, + "learning_rate": 0.0009857296107152003, + "loss": 2.9876, + "step": 2884 + }, + { + "epoch": 0.08554992141861638, + "grad_norm": 0.1677764505147934, + "learning_rate": 0.0009857184477651724, + "loss": 3.0251, + "step": 2885 + }, + { + "epoch": 0.08557957477092785, + "grad_norm": 0.16700418293476105, + "learning_rate": 0.0009857072805140243, + "loss": 2.9911, + "step": 2886 + }, + { + "epoch": 0.08560922812323933, + "grad_norm": 0.16674824059009552, + "learning_rate": 0.0009856961089618548, + "loss": 2.997, + "step": 2887 + }, + { + "epoch": 0.08563888147555081, + "grad_norm": 0.14586545526981354, + "learning_rate": 0.0009856849331087631, + "loss": 3.0276, + "step": 2888 + }, + { + "epoch": 0.08566853482786228, + "grad_norm": 0.13987602293491364, + "learning_rate": 0.0009856737529548478, + "loss": 3.0035, + "step": 2889 + }, + { + "epoch": 0.08569818818017377, + "grad_norm": 0.15714026987552643, + "learning_rate": 0.0009856625685002083, + "loss": 3.0083, + "step": 2890 + }, + { + "epoch": 0.08572784153248525, + "grad_norm": 0.16448140144348145, + "learning_rate": 0.000985651379744943, + "loss": 2.9793, + "step": 2891 + }, + { + "epoch": 0.08575749488479673, + "grad_norm": 0.18160612881183624, + "learning_rate": 0.0009856401866891516, + "loss": 3.002, + "step": 2892 + }, + { + "epoch": 0.0857871482371082, + "grad_norm": 0.192109614610672, + "learning_rate": 0.000985628989332933, + "loss": 2.9588, + "step": 2893 + }, + { + "epoch": 0.08581680158941968, + "grad_norm": 0.16954663395881653, + "learning_rate": 0.0009856177876763864, + "loss": 2.9872, + "step": 2894 + }, + { + "epoch": 0.08584645494173117, + "grad_norm": 0.13344553112983704, + "learning_rate": 0.0009856065817196106, + "loss": 2.9948, + "step": 2895 + }, + { + "epoch": 0.08587610829404264, + "grad_norm": 0.16021205484867096, + "learning_rate": 0.0009855953714627056, + "loss": 3.0323, + "step": 2896 + }, + { + "epoch": 0.08590576164635412, + "grad_norm": 0.16512492299079895, + "learning_rate": 0.00098558415690577, + "loss": 3.0034, + "step": 2897 + }, + { + "epoch": 0.0859354149986656, + "grad_norm": 0.14170946180820465, + "learning_rate": 0.0009855729380489034, + "loss": 3.0297, + "step": 2898 + }, + { + "epoch": 0.08596506835097707, + "grad_norm": 0.13635390996932983, + "learning_rate": 0.000985561714892205, + "loss": 2.983, + "step": 2899 + }, + { + "epoch": 0.08599472170328856, + "grad_norm": 0.1673961579799652, + "learning_rate": 0.0009855504874357744, + "loss": 2.9641, + "step": 2900 + }, + { + "epoch": 0.08602437505560004, + "grad_norm": 0.1799658238887787, + "learning_rate": 0.0009855392556797108, + "loss": 2.9766, + "step": 2901 + }, + { + "epoch": 0.08605402840791151, + "grad_norm": 0.1731489598751068, + "learning_rate": 0.0009855280196241138, + "loss": 3.0214, + "step": 2902 + }, + { + "epoch": 0.08608368176022299, + "grad_norm": 0.16721364855766296, + "learning_rate": 0.0009855167792690827, + "loss": 3.0154, + "step": 2903 + }, + { + "epoch": 0.08611333511253447, + "grad_norm": 0.17761513590812683, + "learning_rate": 0.0009855055346147173, + "loss": 2.9717, + "step": 2904 + }, + { + "epoch": 0.08614298846484596, + "grad_norm": 0.1992490440607071, + "learning_rate": 0.0009854942856611171, + "loss": 2.9858, + "step": 2905 + }, + { + "epoch": 0.08617264181715743, + "grad_norm": 0.19994567334651947, + "learning_rate": 0.0009854830324083816, + "loss": 2.9807, + "step": 2906 + }, + { + "epoch": 0.08620229516946891, + "grad_norm": 0.16505923867225647, + "learning_rate": 0.0009854717748566106, + "loss": 3.0028, + "step": 2907 + }, + { + "epoch": 0.08623194852178039, + "grad_norm": 0.14988075196743011, + "learning_rate": 0.0009854605130059035, + "loss": 2.9649, + "step": 2908 + }, + { + "epoch": 0.08626160187409186, + "grad_norm": 0.1296570599079132, + "learning_rate": 0.0009854492468563604, + "loss": 3.0059, + "step": 2909 + }, + { + "epoch": 0.08629125522640335, + "grad_norm": 0.14153160154819489, + "learning_rate": 0.000985437976408081, + "loss": 3.003, + "step": 2910 + }, + { + "epoch": 0.08632090857871483, + "grad_norm": 0.1447461098432541, + "learning_rate": 0.000985426701661165, + "loss": 2.9955, + "step": 2911 + }, + { + "epoch": 0.0863505619310263, + "grad_norm": 0.13347984850406647, + "learning_rate": 0.0009854154226157119, + "loss": 2.9545, + "step": 2912 + }, + { + "epoch": 0.08638021528333778, + "grad_norm": 0.15491734445095062, + "learning_rate": 0.000985404139271822, + "loss": 3.0147, + "step": 2913 + }, + { + "epoch": 0.08640986863564926, + "grad_norm": 0.1472507268190384, + "learning_rate": 0.0009853928516295953, + "loss": 2.9561, + "step": 2914 + }, + { + "epoch": 0.08643952198796073, + "grad_norm": 0.14986513555049896, + "learning_rate": 0.0009853815596891316, + "loss": 2.984, + "step": 2915 + }, + { + "epoch": 0.08646917534027222, + "grad_norm": 0.16261102259159088, + "learning_rate": 0.0009853702634505309, + "loss": 2.9806, + "step": 2916 + }, + { + "epoch": 0.0864988286925837, + "grad_norm": 0.18624478578567505, + "learning_rate": 0.000985358962913893, + "loss": 3.0042, + "step": 2917 + }, + { + "epoch": 0.08652848204489517, + "grad_norm": 0.2041064202785492, + "learning_rate": 0.0009853476580793182, + "loss": 2.9702, + "step": 2918 + }, + { + "epoch": 0.08655813539720665, + "grad_norm": 0.17512108385562897, + "learning_rate": 0.0009853363489469068, + "loss": 3.0034, + "step": 2919 + }, + { + "epoch": 0.08658778874951813, + "grad_norm": 0.19560480117797852, + "learning_rate": 0.0009853250355167584, + "loss": 2.9933, + "step": 2920 + }, + { + "epoch": 0.08661744210182962, + "grad_norm": 0.19702638685703278, + "learning_rate": 0.0009853137177889737, + "loss": 2.9718, + "step": 2921 + }, + { + "epoch": 0.0866470954541411, + "grad_norm": 0.20454537868499756, + "learning_rate": 0.0009853023957636527, + "loss": 3.0176, + "step": 2922 + }, + { + "epoch": 0.08667674880645257, + "grad_norm": 0.19054152071475983, + "learning_rate": 0.0009852910694408956, + "loss": 2.9722, + "step": 2923 + }, + { + "epoch": 0.08670640215876405, + "grad_norm": 0.1715710461139679, + "learning_rate": 0.000985279738820803, + "loss": 2.9721, + "step": 2924 + }, + { + "epoch": 0.08673605551107552, + "grad_norm": 0.15277642011642456, + "learning_rate": 0.0009852684039034748, + "loss": 2.9731, + "step": 2925 + }, + { + "epoch": 0.08676570886338701, + "grad_norm": 0.149800643324852, + "learning_rate": 0.0009852570646890118, + "loss": 2.9819, + "step": 2926 + }, + { + "epoch": 0.08679536221569849, + "grad_norm": 0.15198713541030884, + "learning_rate": 0.000985245721177514, + "loss": 2.9922, + "step": 2927 + }, + { + "epoch": 0.08682501556800996, + "grad_norm": 0.16125953197479248, + "learning_rate": 0.0009852343733690822, + "loss": 2.9719, + "step": 2928 + }, + { + "epoch": 0.08685466892032144, + "grad_norm": 0.18109165132045746, + "learning_rate": 0.0009852230212638164, + "loss": 3.0274, + "step": 2929 + }, + { + "epoch": 0.08688432227263292, + "grad_norm": 0.1833881139755249, + "learning_rate": 0.0009852116648618179, + "loss": 3.0181, + "step": 2930 + }, + { + "epoch": 0.0869139756249444, + "grad_norm": 0.15841512382030487, + "learning_rate": 0.0009852003041631865, + "loss": 3.0017, + "step": 2931 + }, + { + "epoch": 0.08694362897725588, + "grad_norm": 0.14141537249088287, + "learning_rate": 0.0009851889391680234, + "loss": 2.9831, + "step": 2932 + }, + { + "epoch": 0.08697328232956736, + "grad_norm": 0.1438208669424057, + "learning_rate": 0.0009851775698764287, + "loss": 2.9788, + "step": 2933 + }, + { + "epoch": 0.08700293568187883, + "grad_norm": 0.14323805272579193, + "learning_rate": 0.0009851661962885035, + "loss": 2.978, + "step": 2934 + }, + { + "epoch": 0.08703258903419031, + "grad_norm": 0.14635604619979858, + "learning_rate": 0.0009851548184043484, + "loss": 2.9711, + "step": 2935 + }, + { + "epoch": 0.0870622423865018, + "grad_norm": 0.1395374983549118, + "learning_rate": 0.000985143436224064, + "loss": 2.9267, + "step": 2936 + }, + { + "epoch": 0.08709189573881328, + "grad_norm": 0.132288858294487, + "learning_rate": 0.0009851320497477512, + "loss": 3.0174, + "step": 2937 + }, + { + "epoch": 0.08712154909112475, + "grad_norm": 0.12706884741783142, + "learning_rate": 0.0009851206589755108, + "loss": 2.9762, + "step": 2938 + }, + { + "epoch": 0.08715120244343623, + "grad_norm": 0.13224975764751434, + "learning_rate": 0.0009851092639074437, + "loss": 2.9955, + "step": 2939 + }, + { + "epoch": 0.0871808557957477, + "grad_norm": 0.15913274884223938, + "learning_rate": 0.0009850978645436507, + "loss": 2.9973, + "step": 2940 + }, + { + "epoch": 0.08721050914805918, + "grad_norm": 0.18880382180213928, + "learning_rate": 0.0009850864608842332, + "loss": 2.9669, + "step": 2941 + }, + { + "epoch": 0.08724016250037067, + "grad_norm": 0.18241067230701447, + "learning_rate": 0.0009850750529292916, + "loss": 3.0049, + "step": 2942 + }, + { + "epoch": 0.08726981585268215, + "grad_norm": 0.15738676488399506, + "learning_rate": 0.000985063640678927, + "loss": 2.994, + "step": 2943 + }, + { + "epoch": 0.08729946920499362, + "grad_norm": 0.1719163954257965, + "learning_rate": 0.000985052224133241, + "loss": 3.0013, + "step": 2944 + }, + { + "epoch": 0.0873291225573051, + "grad_norm": 0.17220166325569153, + "learning_rate": 0.0009850408032923339, + "loss": 3.0437, + "step": 2945 + }, + { + "epoch": 0.08735877590961658, + "grad_norm": 0.16741354763507843, + "learning_rate": 0.0009850293781563074, + "loss": 2.9918, + "step": 2946 + }, + { + "epoch": 0.08738842926192807, + "grad_norm": 0.17893344163894653, + "learning_rate": 0.0009850179487252623, + "loss": 2.9984, + "step": 2947 + }, + { + "epoch": 0.08741808261423954, + "grad_norm": 0.19005237519741058, + "learning_rate": 0.0009850065149993004, + "loss": 3.008, + "step": 2948 + }, + { + "epoch": 0.08744773596655102, + "grad_norm": 0.17984344065189362, + "learning_rate": 0.0009849950769785223, + "loss": 3.0024, + "step": 2949 + }, + { + "epoch": 0.0874773893188625, + "grad_norm": 0.17843694984912872, + "learning_rate": 0.0009849836346630296, + "loss": 2.9943, + "step": 2950 + }, + { + "epoch": 0.08750704267117397, + "grad_norm": 0.16648049652576447, + "learning_rate": 0.0009849721880529237, + "loss": 2.9972, + "step": 2951 + }, + { + "epoch": 0.08753669602348546, + "grad_norm": 0.1689595729112625, + "learning_rate": 0.0009849607371483055, + "loss": 2.9642, + "step": 2952 + }, + { + "epoch": 0.08756634937579694, + "grad_norm": 0.22434677183628082, + "learning_rate": 0.000984949281949277, + "loss": 2.9832, + "step": 2953 + }, + { + "epoch": 0.08759600272810841, + "grad_norm": 0.2715516984462738, + "learning_rate": 0.0009849378224559393, + "loss": 2.9693, + "step": 2954 + }, + { + "epoch": 0.08762565608041989, + "grad_norm": 0.23152664303779602, + "learning_rate": 0.000984926358668394, + "loss": 3.0147, + "step": 2955 + }, + { + "epoch": 0.08765530943273137, + "grad_norm": 0.20055750012397766, + "learning_rate": 0.0009849148905867425, + "loss": 2.9654, + "step": 2956 + }, + { + "epoch": 0.08768496278504286, + "grad_norm": 0.19680118560791016, + "learning_rate": 0.0009849034182110863, + "loss": 3.0042, + "step": 2957 + }, + { + "epoch": 0.08771461613735433, + "grad_norm": 0.22085018455982208, + "learning_rate": 0.0009848919415415274, + "loss": 2.9418, + "step": 2958 + }, + { + "epoch": 0.08774426948966581, + "grad_norm": 0.1393890678882599, + "learning_rate": 0.0009848804605781668, + "loss": 3.0029, + "step": 2959 + }, + { + "epoch": 0.08777392284197728, + "grad_norm": 0.13703778386116028, + "learning_rate": 0.0009848689753211067, + "loss": 3.0085, + "step": 2960 + }, + { + "epoch": 0.08780357619428876, + "grad_norm": 0.12522606551647186, + "learning_rate": 0.0009848574857704484, + "loss": 2.9936, + "step": 2961 + }, + { + "epoch": 0.08783322954660025, + "grad_norm": 0.12804441154003143, + "learning_rate": 0.000984845991926294, + "loss": 2.976, + "step": 2962 + }, + { + "epoch": 0.08786288289891173, + "grad_norm": 0.1350444257259369, + "learning_rate": 0.000984834493788745, + "loss": 3.0012, + "step": 2963 + }, + { + "epoch": 0.0878925362512232, + "grad_norm": 0.1269538253545761, + "learning_rate": 0.0009848229913579035, + "loss": 2.9693, + "step": 2964 + }, + { + "epoch": 0.08792218960353468, + "grad_norm": 0.12191654741764069, + "learning_rate": 0.0009848114846338712, + "loss": 2.9914, + "step": 2965 + }, + { + "epoch": 0.08795184295584615, + "grad_norm": 0.11391613632440567, + "learning_rate": 0.0009847999736167497, + "loss": 3.0037, + "step": 2966 + }, + { + "epoch": 0.08798149630815763, + "grad_norm": 0.12039459496736526, + "learning_rate": 0.0009847884583066414, + "loss": 2.9867, + "step": 2967 + }, + { + "epoch": 0.08801114966046912, + "grad_norm": 0.12429261952638626, + "learning_rate": 0.0009847769387036482, + "loss": 3.0164, + "step": 2968 + }, + { + "epoch": 0.0880408030127806, + "grad_norm": 0.12238883227109909, + "learning_rate": 0.000984765414807872, + "loss": 2.9835, + "step": 2969 + }, + { + "epoch": 0.08807045636509207, + "grad_norm": 0.12472891807556152, + "learning_rate": 0.0009847538866194147, + "loss": 2.9665, + "step": 2970 + }, + { + "epoch": 0.08810010971740355, + "grad_norm": 0.14037056267261505, + "learning_rate": 0.0009847423541383786, + "loss": 2.9982, + "step": 2971 + }, + { + "epoch": 0.08812976306971503, + "grad_norm": 0.14205706119537354, + "learning_rate": 0.0009847308173648657, + "loss": 2.9433, + "step": 2972 + }, + { + "epoch": 0.08815941642202652, + "grad_norm": 0.14999403059482574, + "learning_rate": 0.0009847192762989783, + "loss": 2.9611, + "step": 2973 + }, + { + "epoch": 0.08818906977433799, + "grad_norm": 0.17395207285881042, + "learning_rate": 0.0009847077309408183, + "loss": 2.9604, + "step": 2974 + }, + { + "epoch": 0.08821872312664947, + "grad_norm": 0.2120603621006012, + "learning_rate": 0.0009846961812904882, + "loss": 2.9657, + "step": 2975 + }, + { + "epoch": 0.08824837647896094, + "grad_norm": 0.2169695645570755, + "learning_rate": 0.0009846846273480904, + "loss": 2.9959, + "step": 2976 + }, + { + "epoch": 0.08827802983127242, + "grad_norm": 0.23300118744373322, + "learning_rate": 0.0009846730691137268, + "loss": 2.9803, + "step": 2977 + }, + { + "epoch": 0.08830768318358391, + "grad_norm": 0.20338664948940277, + "learning_rate": 0.0009846615065875002, + "loss": 3.0043, + "step": 2978 + }, + { + "epoch": 0.08833733653589539, + "grad_norm": 0.17119920253753662, + "learning_rate": 0.0009846499397695128, + "loss": 3.0035, + "step": 2979 + }, + { + "epoch": 0.08836698988820686, + "grad_norm": 0.2115541696548462, + "learning_rate": 0.0009846383686598669, + "loss": 2.9544, + "step": 2980 + }, + { + "epoch": 0.08839664324051834, + "grad_norm": 0.16091665625572205, + "learning_rate": 0.0009846267932586649, + "loss": 2.9449, + "step": 2981 + }, + { + "epoch": 0.08842629659282981, + "grad_norm": 0.16739650070667267, + "learning_rate": 0.0009846152135660096, + "loss": 2.967, + "step": 2982 + }, + { + "epoch": 0.0884559499451413, + "grad_norm": 0.1457521617412567, + "learning_rate": 0.0009846036295820034, + "loss": 2.987, + "step": 2983 + }, + { + "epoch": 0.08848560329745278, + "grad_norm": 0.1803404837846756, + "learning_rate": 0.0009845920413067489, + "loss": 2.9762, + "step": 2984 + }, + { + "epoch": 0.08851525664976426, + "grad_norm": 0.17546173930168152, + "learning_rate": 0.0009845804487403488, + "loss": 2.9862, + "step": 2985 + }, + { + "epoch": 0.08854491000207573, + "grad_norm": 0.169763445854187, + "learning_rate": 0.0009845688518829053, + "loss": 3.0054, + "step": 2986 + }, + { + "epoch": 0.08857456335438721, + "grad_norm": 0.143106609582901, + "learning_rate": 0.0009845572507345217, + "loss": 2.9965, + "step": 2987 + }, + { + "epoch": 0.0886042167066987, + "grad_norm": 0.14383910596370697, + "learning_rate": 0.0009845456452953003, + "loss": 2.9842, + "step": 2988 + }, + { + "epoch": 0.08863387005901018, + "grad_norm": 0.14343959093093872, + "learning_rate": 0.0009845340355653443, + "loss": 2.9827, + "step": 2989 + }, + { + "epoch": 0.08866352341132165, + "grad_norm": 0.14006876945495605, + "learning_rate": 0.0009845224215447562, + "loss": 3.0008, + "step": 2990 + }, + { + "epoch": 0.08869317676363313, + "grad_norm": 0.13100501894950867, + "learning_rate": 0.0009845108032336387, + "loss": 2.985, + "step": 2991 + }, + { + "epoch": 0.0887228301159446, + "grad_norm": 0.1419696807861328, + "learning_rate": 0.000984499180632095, + "loss": 2.9763, + "step": 2992 + }, + { + "epoch": 0.08875248346825608, + "grad_norm": 0.14276118576526642, + "learning_rate": 0.0009844875537402278, + "loss": 2.9746, + "step": 2993 + }, + { + "epoch": 0.08878213682056757, + "grad_norm": 0.15224766731262207, + "learning_rate": 0.0009844759225581402, + "loss": 3.0053, + "step": 2994 + }, + { + "epoch": 0.08881179017287905, + "grad_norm": 0.19194930791854858, + "learning_rate": 0.0009844642870859353, + "loss": 3.0042, + "step": 2995 + }, + { + "epoch": 0.08884144352519052, + "grad_norm": 0.20844687521457672, + "learning_rate": 0.0009844526473237157, + "loss": 2.9774, + "step": 2996 + }, + { + "epoch": 0.088871096877502, + "grad_norm": 0.16640760004520416, + "learning_rate": 0.000984441003271585, + "loss": 2.9933, + "step": 2997 + }, + { + "epoch": 0.08890075022981347, + "grad_norm": 0.16474704444408417, + "learning_rate": 0.000984429354929646, + "loss": 2.9769, + "step": 2998 + }, + { + "epoch": 0.08893040358212496, + "grad_norm": 0.14223645627498627, + "learning_rate": 0.0009844177022980017, + "loss": 2.9818, + "step": 2999 + }, + { + "epoch": 0.08896005693443644, + "grad_norm": 0.1406608372926712, + "learning_rate": 0.0009844060453767557, + "loss": 2.9732, + "step": 3000 + }, + { + "epoch": 0.08898971028674792, + "grad_norm": 0.15082143247127533, + "learning_rate": 0.000984394384166011, + "loss": 2.982, + "step": 3001 + }, + { + "epoch": 0.08901936363905939, + "grad_norm": 0.16031861305236816, + "learning_rate": 0.000984382718665871, + "loss": 2.9958, + "step": 3002 + }, + { + "epoch": 0.08904901699137087, + "grad_norm": 0.17615893483161926, + "learning_rate": 0.0009843710488764386, + "loss": 2.989, + "step": 3003 + }, + { + "epoch": 0.08907867034368236, + "grad_norm": 0.17258693277835846, + "learning_rate": 0.0009843593747978178, + "loss": 2.9772, + "step": 3004 + }, + { + "epoch": 0.08910832369599384, + "grad_norm": 0.14494828879833221, + "learning_rate": 0.0009843476964301113, + "loss": 2.982, + "step": 3005 + }, + { + "epoch": 0.08913797704830531, + "grad_norm": 0.13541150093078613, + "learning_rate": 0.0009843360137734227, + "loss": 2.9612, + "step": 3006 + }, + { + "epoch": 0.08916763040061679, + "grad_norm": 0.15765613317489624, + "learning_rate": 0.000984324326827856, + "loss": 2.9481, + "step": 3007 + }, + { + "epoch": 0.08919728375292826, + "grad_norm": 0.1710234433412552, + "learning_rate": 0.0009843126355935138, + "loss": 3.0149, + "step": 3008 + }, + { + "epoch": 0.08922693710523975, + "grad_norm": 0.23191478848457336, + "learning_rate": 0.0009843009400705004, + "loss": 2.9851, + "step": 3009 + }, + { + "epoch": 0.08925659045755123, + "grad_norm": 0.2670642137527466, + "learning_rate": 0.0009842892402589188, + "loss": 3.01, + "step": 3010 + }, + { + "epoch": 0.0892862438098627, + "grad_norm": 0.22488735616207123, + "learning_rate": 0.000984277536158873, + "loss": 3.019, + "step": 3011 + }, + { + "epoch": 0.08931589716217418, + "grad_norm": 0.18367242813110352, + "learning_rate": 0.0009842658277704665, + "loss": 2.9875, + "step": 3012 + }, + { + "epoch": 0.08934555051448566, + "grad_norm": 0.16188013553619385, + "learning_rate": 0.000984254115093803, + "loss": 3.0018, + "step": 3013 + }, + { + "epoch": 0.08937520386679715, + "grad_norm": 0.16691657900810242, + "learning_rate": 0.0009842423981289861, + "loss": 2.9871, + "step": 3014 + }, + { + "epoch": 0.08940485721910862, + "grad_norm": 0.19060377776622772, + "learning_rate": 0.0009842306768761196, + "loss": 2.9875, + "step": 3015 + }, + { + "epoch": 0.0894345105714201, + "grad_norm": 0.17087896168231964, + "learning_rate": 0.0009842189513353074, + "loss": 2.9636, + "step": 3016 + }, + { + "epoch": 0.08946416392373158, + "grad_norm": 0.18253029882907867, + "learning_rate": 0.0009842072215066533, + "loss": 2.9763, + "step": 3017 + }, + { + "epoch": 0.08949381727604305, + "grad_norm": 0.18063639104366302, + "learning_rate": 0.0009841954873902612, + "loss": 2.9746, + "step": 3018 + }, + { + "epoch": 0.08952347062835453, + "grad_norm": 0.1649933010339737, + "learning_rate": 0.0009841837489862348, + "loss": 2.9568, + "step": 3019 + }, + { + "epoch": 0.08955312398066602, + "grad_norm": 0.14526093006134033, + "learning_rate": 0.0009841720062946783, + "loss": 3.0007, + "step": 3020 + }, + { + "epoch": 0.0895827773329775, + "grad_norm": 0.16192379593849182, + "learning_rate": 0.0009841602593156956, + "loss": 3.0211, + "step": 3021 + }, + { + "epoch": 0.08961243068528897, + "grad_norm": 0.16697613894939423, + "learning_rate": 0.0009841485080493905, + "loss": 3.0243, + "step": 3022 + }, + { + "epoch": 0.08964208403760045, + "grad_norm": 0.18470363318920135, + "learning_rate": 0.0009841367524958675, + "loss": 2.9897, + "step": 3023 + }, + { + "epoch": 0.08967173738991192, + "grad_norm": 0.15727749466896057, + "learning_rate": 0.0009841249926552302, + "loss": 2.9585, + "step": 3024 + }, + { + "epoch": 0.08970139074222341, + "grad_norm": 0.147257462143898, + "learning_rate": 0.0009841132285275834, + "loss": 2.9593, + "step": 3025 + }, + { + "epoch": 0.08973104409453489, + "grad_norm": 0.16991566121578217, + "learning_rate": 0.0009841014601130304, + "loss": 2.9868, + "step": 3026 + }, + { + "epoch": 0.08976069744684637, + "grad_norm": 0.1514899879693985, + "learning_rate": 0.0009840896874116762, + "loss": 2.9761, + "step": 3027 + }, + { + "epoch": 0.08979035079915784, + "grad_norm": 0.1332433819770813, + "learning_rate": 0.0009840779104236246, + "loss": 2.9771, + "step": 3028 + }, + { + "epoch": 0.08982000415146932, + "grad_norm": 0.1410679817199707, + "learning_rate": 0.0009840661291489802, + "loss": 2.9935, + "step": 3029 + }, + { + "epoch": 0.08984965750378081, + "grad_norm": 0.14901681244373322, + "learning_rate": 0.0009840543435878468, + "loss": 2.9661, + "step": 3030 + }, + { + "epoch": 0.08987931085609228, + "grad_norm": 0.14651691913604736, + "learning_rate": 0.0009840425537403293, + "loss": 2.9756, + "step": 3031 + }, + { + "epoch": 0.08990896420840376, + "grad_norm": 0.1598082333803177, + "learning_rate": 0.0009840307596065319, + "loss": 3.0025, + "step": 3032 + }, + { + "epoch": 0.08993861756071524, + "grad_norm": 0.15029677748680115, + "learning_rate": 0.0009840189611865589, + "loss": 2.9866, + "step": 3033 + }, + { + "epoch": 0.08996827091302671, + "grad_norm": 0.1781039535999298, + "learning_rate": 0.000984007158480515, + "loss": 2.9712, + "step": 3034 + }, + { + "epoch": 0.0899979242653382, + "grad_norm": 0.18068017065525055, + "learning_rate": 0.0009839953514885046, + "loss": 2.9588, + "step": 3035 + }, + { + "epoch": 0.09002757761764968, + "grad_norm": 0.18881039321422577, + "learning_rate": 0.0009839835402106324, + "loss": 2.9829, + "step": 3036 + }, + { + "epoch": 0.09005723096996116, + "grad_norm": 0.17425881326198578, + "learning_rate": 0.0009839717246470027, + "loss": 2.9931, + "step": 3037 + }, + { + "epoch": 0.09008688432227263, + "grad_norm": 0.18937359750270844, + "learning_rate": 0.0009839599047977206, + "loss": 2.9991, + "step": 3038 + }, + { + "epoch": 0.09011653767458411, + "grad_norm": 0.16601328551769257, + "learning_rate": 0.0009839480806628901, + "loss": 2.9674, + "step": 3039 + }, + { + "epoch": 0.0901461910268956, + "grad_norm": 0.15614697337150574, + "learning_rate": 0.0009839362522426167, + "loss": 2.9888, + "step": 3040 + }, + { + "epoch": 0.09017584437920707, + "grad_norm": 0.17166569828987122, + "learning_rate": 0.0009839244195370045, + "loss": 2.9711, + "step": 3041 + }, + { + "epoch": 0.09020549773151855, + "grad_norm": 0.17855432629585266, + "learning_rate": 0.0009839125825461584, + "loss": 2.9614, + "step": 3042 + }, + { + "epoch": 0.09023515108383003, + "grad_norm": 0.17232482135295868, + "learning_rate": 0.0009839007412701835, + "loss": 3.0058, + "step": 3043 + }, + { + "epoch": 0.0902648044361415, + "grad_norm": 0.16692784428596497, + "learning_rate": 0.0009838888957091844, + "loss": 3.0157, + "step": 3044 + }, + { + "epoch": 0.09029445778845298, + "grad_norm": 0.17052043974399567, + "learning_rate": 0.000983877045863266, + "loss": 2.9825, + "step": 3045 + }, + { + "epoch": 0.09032411114076447, + "grad_norm": 0.15902495384216309, + "learning_rate": 0.0009838651917325335, + "loss": 2.9716, + "step": 3046 + }, + { + "epoch": 0.09035376449307594, + "grad_norm": 0.15374496579170227, + "learning_rate": 0.0009838533333170914, + "loss": 2.9822, + "step": 3047 + }, + { + "epoch": 0.09038341784538742, + "grad_norm": 0.171005979180336, + "learning_rate": 0.0009838414706170452, + "loss": 2.9444, + "step": 3048 + }, + { + "epoch": 0.0904130711976989, + "grad_norm": 0.1716334968805313, + "learning_rate": 0.0009838296036324995, + "loss": 2.9469, + "step": 3049 + }, + { + "epoch": 0.09044272455001037, + "grad_norm": 0.17088331282138824, + "learning_rate": 0.0009838177323635597, + "loss": 2.9639, + "step": 3050 + }, + { + "epoch": 0.09047237790232186, + "grad_norm": 0.17283180356025696, + "learning_rate": 0.000983805856810331, + "loss": 3.0131, + "step": 3051 + }, + { + "epoch": 0.09050203125463334, + "grad_norm": 0.14438283443450928, + "learning_rate": 0.0009837939769729183, + "loss": 3.0014, + "step": 3052 + }, + { + "epoch": 0.09053168460694482, + "grad_norm": 0.13752740621566772, + "learning_rate": 0.0009837820928514267, + "loss": 2.9809, + "step": 3053 + }, + { + "epoch": 0.09056133795925629, + "grad_norm": 0.1608777940273285, + "learning_rate": 0.000983770204445962, + "loss": 2.9876, + "step": 3054 + }, + { + "epoch": 0.09059099131156777, + "grad_norm": 0.15679091215133667, + "learning_rate": 0.000983758311756629, + "loss": 2.9781, + "step": 3055 + }, + { + "epoch": 0.09062064466387926, + "grad_norm": 0.14556598663330078, + "learning_rate": 0.0009837464147835328, + "loss": 2.9782, + "step": 3056 + }, + { + "epoch": 0.09065029801619073, + "grad_norm": 0.17513589560985565, + "learning_rate": 0.0009837345135267793, + "loss": 2.986, + "step": 3057 + }, + { + "epoch": 0.09067995136850221, + "grad_norm": 0.20174074172973633, + "learning_rate": 0.0009837226079864737, + "loss": 2.9735, + "step": 3058 + }, + { + "epoch": 0.09070960472081369, + "grad_norm": 0.19188417494297028, + "learning_rate": 0.0009837106981627213, + "loss": 2.9631, + "step": 3059 + }, + { + "epoch": 0.09073925807312516, + "grad_norm": 0.17622967064380646, + "learning_rate": 0.0009836987840556276, + "loss": 3.0278, + "step": 3060 + }, + { + "epoch": 0.09076891142543665, + "grad_norm": 0.17150017619132996, + "learning_rate": 0.0009836868656652982, + "loss": 2.9754, + "step": 3061 + }, + { + "epoch": 0.09079856477774813, + "grad_norm": 0.1848207265138626, + "learning_rate": 0.0009836749429918386, + "loss": 2.965, + "step": 3062 + }, + { + "epoch": 0.0908282181300596, + "grad_norm": 0.19563601911067963, + "learning_rate": 0.0009836630160353543, + "loss": 3.0044, + "step": 3063 + }, + { + "epoch": 0.09085787148237108, + "grad_norm": 0.1621381789445877, + "learning_rate": 0.000983651084795951, + "loss": 3.0117, + "step": 3064 + }, + { + "epoch": 0.09088752483468256, + "grad_norm": 0.16987289488315582, + "learning_rate": 0.0009836391492737343, + "loss": 2.9719, + "step": 3065 + }, + { + "epoch": 0.09091717818699405, + "grad_norm": 0.1728353202342987, + "learning_rate": 0.00098362720946881, + "loss": 2.9793, + "step": 3066 + }, + { + "epoch": 0.09094683153930552, + "grad_norm": 0.1614205539226532, + "learning_rate": 0.0009836152653812838, + "loss": 2.9602, + "step": 3067 + }, + { + "epoch": 0.090976484891617, + "grad_norm": 0.1594504863023758, + "learning_rate": 0.0009836033170112612, + "loss": 2.977, + "step": 3068 + }, + { + "epoch": 0.09100613824392847, + "grad_norm": 0.1468740552663803, + "learning_rate": 0.0009835913643588482, + "loss": 2.9783, + "step": 3069 + }, + { + "epoch": 0.09103579159623995, + "grad_norm": 0.14794321358203888, + "learning_rate": 0.0009835794074241509, + "loss": 2.9933, + "step": 3070 + }, + { + "epoch": 0.09106544494855143, + "grad_norm": 0.15491370856761932, + "learning_rate": 0.0009835674462072748, + "loss": 2.9993, + "step": 3071 + }, + { + "epoch": 0.09109509830086292, + "grad_norm": 0.16619443893432617, + "learning_rate": 0.0009835554807083261, + "loss": 2.98, + "step": 3072 + }, + { + "epoch": 0.0911247516531744, + "grad_norm": 0.18659882247447968, + "learning_rate": 0.0009835435109274105, + "loss": 2.9799, + "step": 3073 + }, + { + "epoch": 0.09115440500548587, + "grad_norm": 0.15449632704257965, + "learning_rate": 0.000983531536864634, + "loss": 2.9707, + "step": 3074 + }, + { + "epoch": 0.09118405835779735, + "grad_norm": 0.15462003648281097, + "learning_rate": 0.000983519558520103, + "loss": 2.9747, + "step": 3075 + }, + { + "epoch": 0.09121371171010882, + "grad_norm": 0.15627318620681763, + "learning_rate": 0.000983507575893923, + "loss": 2.9497, + "step": 3076 + }, + { + "epoch": 0.09124336506242031, + "grad_norm": 0.14176076650619507, + "learning_rate": 0.0009834955889862008, + "loss": 2.9926, + "step": 3077 + }, + { + "epoch": 0.09127301841473179, + "grad_norm": 0.1697416454553604, + "learning_rate": 0.0009834835977970417, + "loss": 2.9511, + "step": 3078 + }, + { + "epoch": 0.09130267176704326, + "grad_norm": 0.1723608523607254, + "learning_rate": 0.0009834716023265527, + "loss": 2.9338, + "step": 3079 + }, + { + "epoch": 0.09133232511935474, + "grad_norm": 0.1580328643321991, + "learning_rate": 0.0009834596025748397, + "loss": 2.9587, + "step": 3080 + }, + { + "epoch": 0.09136197847166622, + "grad_norm": 0.11467459797859192, + "learning_rate": 0.0009834475985420088, + "loss": 2.9675, + "step": 3081 + }, + { + "epoch": 0.0913916318239777, + "grad_norm": 0.13859352469444275, + "learning_rate": 0.0009834355902281664, + "loss": 2.9426, + "step": 3082 + }, + { + "epoch": 0.09142128517628918, + "grad_norm": 0.16893787682056427, + "learning_rate": 0.000983423577633419, + "loss": 2.9802, + "step": 3083 + }, + { + "epoch": 0.09145093852860066, + "grad_norm": 0.19902022182941437, + "learning_rate": 0.0009834115607578727, + "loss": 2.9722, + "step": 3084 + }, + { + "epoch": 0.09148059188091213, + "grad_norm": 0.20978595316410065, + "learning_rate": 0.0009833995396016342, + "loss": 3.009, + "step": 3085 + }, + { + "epoch": 0.09151024523322361, + "grad_norm": 0.22259320318698883, + "learning_rate": 0.0009833875141648097, + "loss": 2.988, + "step": 3086 + }, + { + "epoch": 0.0915398985855351, + "grad_norm": 0.21154004335403442, + "learning_rate": 0.0009833754844475059, + "loss": 3.015, + "step": 3087 + }, + { + "epoch": 0.09156955193784658, + "grad_norm": 0.16822151839733124, + "learning_rate": 0.000983363450449829, + "loss": 3.0055, + "step": 3088 + }, + { + "epoch": 0.09159920529015805, + "grad_norm": 0.17832151055335999, + "learning_rate": 0.000983351412171886, + "loss": 3.0043, + "step": 3089 + }, + { + "epoch": 0.09162885864246953, + "grad_norm": 0.1547488570213318, + "learning_rate": 0.0009833393696137831, + "loss": 2.9559, + "step": 3090 + }, + { + "epoch": 0.091658511994781, + "grad_norm": 0.16359950602054596, + "learning_rate": 0.0009833273227756272, + "loss": 2.9233, + "step": 3091 + }, + { + "epoch": 0.0916881653470925, + "grad_norm": 0.1532617062330246, + "learning_rate": 0.0009833152716575248, + "loss": 3.004, + "step": 3092 + }, + { + "epoch": 0.09171781869940397, + "grad_norm": 0.1398439258337021, + "learning_rate": 0.000983303216259583, + "loss": 2.9675, + "step": 3093 + }, + { + "epoch": 0.09174747205171545, + "grad_norm": 0.1333441287279129, + "learning_rate": 0.000983291156581908, + "loss": 2.9644, + "step": 3094 + }, + { + "epoch": 0.09177712540402692, + "grad_norm": 0.13427414000034332, + "learning_rate": 0.0009832790926246069, + "loss": 2.9753, + "step": 3095 + }, + { + "epoch": 0.0918067787563384, + "grad_norm": 0.15809065103530884, + "learning_rate": 0.0009832670243877866, + "loss": 3.0011, + "step": 3096 + }, + { + "epoch": 0.09183643210864988, + "grad_norm": 0.13633988797664642, + "learning_rate": 0.0009832549518715536, + "loss": 2.979, + "step": 3097 + }, + { + "epoch": 0.09186608546096137, + "grad_norm": 0.127159982919693, + "learning_rate": 0.0009832428750760152, + "loss": 2.9622, + "step": 3098 + }, + { + "epoch": 0.09189573881327284, + "grad_norm": 0.12440481036901474, + "learning_rate": 0.0009832307940012782, + "loss": 3.0004, + "step": 3099 + }, + { + "epoch": 0.09192539216558432, + "grad_norm": 0.12807288765907288, + "learning_rate": 0.0009832187086474496, + "loss": 2.9733, + "step": 3100 + }, + { + "epoch": 0.0919550455178958, + "grad_norm": 0.14443093538284302, + "learning_rate": 0.0009832066190146363, + "loss": 2.9651, + "step": 3101 + }, + { + "epoch": 0.09198469887020727, + "grad_norm": 0.16905871033668518, + "learning_rate": 0.0009831945251029457, + "loss": 2.9506, + "step": 3102 + }, + { + "epoch": 0.09201435222251876, + "grad_norm": 0.18682532012462616, + "learning_rate": 0.0009831824269124843, + "loss": 2.9411, + "step": 3103 + }, + { + "epoch": 0.09204400557483024, + "grad_norm": 0.1646977663040161, + "learning_rate": 0.0009831703244433598, + "loss": 2.9958, + "step": 3104 + }, + { + "epoch": 0.09207365892714171, + "grad_norm": 0.1850842535495758, + "learning_rate": 0.000983158217695679, + "loss": 2.9746, + "step": 3105 + }, + { + "epoch": 0.09210331227945319, + "grad_norm": 0.19765041768550873, + "learning_rate": 0.0009831461066695493, + "loss": 2.9821, + "step": 3106 + }, + { + "epoch": 0.09213296563176467, + "grad_norm": 0.1945296972990036, + "learning_rate": 0.0009831339913650779, + "loss": 2.9554, + "step": 3107 + }, + { + "epoch": 0.09216261898407616, + "grad_norm": 0.20211485028266907, + "learning_rate": 0.0009831218717823722, + "loss": 2.9726, + "step": 3108 + }, + { + "epoch": 0.09219227233638763, + "grad_norm": 0.23531846702098846, + "learning_rate": 0.0009831097479215392, + "loss": 2.9961, + "step": 3109 + }, + { + "epoch": 0.09222192568869911, + "grad_norm": 0.19223468005657196, + "learning_rate": 0.0009830976197826866, + "loss": 2.9785, + "step": 3110 + }, + { + "epoch": 0.09225157904101058, + "grad_norm": 0.1920045167207718, + "learning_rate": 0.0009830854873659216, + "loss": 2.9808, + "step": 3111 + }, + { + "epoch": 0.09228123239332206, + "grad_norm": 0.16374856233596802, + "learning_rate": 0.0009830733506713517, + "loss": 3.0158, + "step": 3112 + }, + { + "epoch": 0.09231088574563355, + "grad_norm": 0.15308788418769836, + "learning_rate": 0.0009830612096990844, + "loss": 2.9699, + "step": 3113 + }, + { + "epoch": 0.09234053909794503, + "grad_norm": 0.17065340280532837, + "learning_rate": 0.000983049064449227, + "loss": 3.0196, + "step": 3114 + }, + { + "epoch": 0.0923701924502565, + "grad_norm": 0.16571220755577087, + "learning_rate": 0.0009830369149218874, + "loss": 2.976, + "step": 3115 + }, + { + "epoch": 0.09239984580256798, + "grad_norm": 0.13428592681884766, + "learning_rate": 0.0009830247611171729, + "loss": 2.9568, + "step": 3116 + }, + { + "epoch": 0.09242949915487945, + "grad_norm": 0.17331181466579437, + "learning_rate": 0.0009830126030351913, + "loss": 2.9927, + "step": 3117 + }, + { + "epoch": 0.09245915250719094, + "grad_norm": 0.16341952979564667, + "learning_rate": 0.0009830004406760503, + "loss": 2.9822, + "step": 3118 + }, + { + "epoch": 0.09248880585950242, + "grad_norm": 0.1353921890258789, + "learning_rate": 0.0009829882740398572, + "loss": 2.9775, + "step": 3119 + }, + { + "epoch": 0.0925184592118139, + "grad_norm": 0.1488192230463028, + "learning_rate": 0.00098297610312672, + "loss": 3.0086, + "step": 3120 + }, + { + "epoch": 0.09254811256412537, + "grad_norm": 0.1567336916923523, + "learning_rate": 0.0009829639279367469, + "loss": 2.9761, + "step": 3121 + }, + { + "epoch": 0.09257776591643685, + "grad_norm": 0.14986440539360046, + "learning_rate": 0.0009829517484700452, + "loss": 2.9773, + "step": 3122 + }, + { + "epoch": 0.09260741926874833, + "grad_norm": 0.13906481862068176, + "learning_rate": 0.0009829395647267226, + "loss": 2.9844, + "step": 3123 + }, + { + "epoch": 0.09263707262105982, + "grad_norm": 0.15914170444011688, + "learning_rate": 0.0009829273767068874, + "loss": 2.9927, + "step": 3124 + }, + { + "epoch": 0.09266672597337129, + "grad_norm": 0.1586276888847351, + "learning_rate": 0.0009829151844106476, + "loss": 2.9593, + "step": 3125 + }, + { + "epoch": 0.09269637932568277, + "grad_norm": 0.14313837885856628, + "learning_rate": 0.0009829029878381107, + "loss": 2.9919, + "step": 3126 + }, + { + "epoch": 0.09272603267799424, + "grad_norm": 0.17757289111614227, + "learning_rate": 0.000982890786989385, + "loss": 3.0063, + "step": 3127 + }, + { + "epoch": 0.09275568603030572, + "grad_norm": 0.19853168725967407, + "learning_rate": 0.0009828785818645786, + "loss": 2.9833, + "step": 3128 + }, + { + "epoch": 0.09278533938261721, + "grad_norm": 0.17280596494674683, + "learning_rate": 0.0009828663724637994, + "loss": 2.9795, + "step": 3129 + }, + { + "epoch": 0.09281499273492869, + "grad_norm": 0.19961920380592346, + "learning_rate": 0.0009828541587871555, + "loss": 2.9878, + "step": 3130 + }, + { + "epoch": 0.09284464608724016, + "grad_norm": 0.20505152642726898, + "learning_rate": 0.0009828419408347553, + "loss": 3.009, + "step": 3131 + }, + { + "epoch": 0.09287429943955164, + "grad_norm": 0.21471025049686432, + "learning_rate": 0.0009828297186067069, + "loss": 3.0043, + "step": 3132 + }, + { + "epoch": 0.09290395279186311, + "grad_norm": 0.2255164533853531, + "learning_rate": 0.000982817492103118, + "loss": 2.9626, + "step": 3133 + }, + { + "epoch": 0.0929336061441746, + "grad_norm": 0.17829445004463196, + "learning_rate": 0.0009828052613240978, + "loss": 2.9945, + "step": 3134 + }, + { + "epoch": 0.09296325949648608, + "grad_norm": 0.18704116344451904, + "learning_rate": 0.000982793026269754, + "loss": 3.0066, + "step": 3135 + }, + { + "epoch": 0.09299291284879756, + "grad_norm": 0.1733470857143402, + "learning_rate": 0.000982780786940195, + "loss": 2.9938, + "step": 3136 + }, + { + "epoch": 0.09302256620110903, + "grad_norm": 0.1573854237794876, + "learning_rate": 0.0009827685433355295, + "loss": 3.0246, + "step": 3137 + }, + { + "epoch": 0.09305221955342051, + "grad_norm": 0.1760392189025879, + "learning_rate": 0.0009827562954558655, + "loss": 2.9686, + "step": 3138 + }, + { + "epoch": 0.093081872905732, + "grad_norm": 0.16231340169906616, + "learning_rate": 0.0009827440433013116, + "loss": 2.9984, + "step": 3139 + }, + { + "epoch": 0.09311152625804348, + "grad_norm": 0.17047016322612762, + "learning_rate": 0.0009827317868719766, + "loss": 2.9042, + "step": 3140 + }, + { + "epoch": 0.09314117961035495, + "grad_norm": 0.15123432874679565, + "learning_rate": 0.0009827195261679685, + "loss": 2.965, + "step": 3141 + }, + { + "epoch": 0.09317083296266643, + "grad_norm": 0.14064940810203552, + "learning_rate": 0.000982707261189396, + "loss": 2.9327, + "step": 3142 + }, + { + "epoch": 0.0932004863149779, + "grad_norm": 0.14542916417121887, + "learning_rate": 0.0009826949919363682, + "loss": 2.9959, + "step": 3143 + }, + { + "epoch": 0.0932301396672894, + "grad_norm": 0.12812449038028717, + "learning_rate": 0.0009826827184089932, + "loss": 3.0026, + "step": 3144 + }, + { + "epoch": 0.09325979301960087, + "grad_norm": 0.13677528500556946, + "learning_rate": 0.00098267044060738, + "loss": 2.9641, + "step": 3145 + }, + { + "epoch": 0.09328944637191235, + "grad_norm": 0.14300985634326935, + "learning_rate": 0.000982658158531637, + "loss": 2.9774, + "step": 3146 + }, + { + "epoch": 0.09331909972422382, + "grad_norm": 0.14781856536865234, + "learning_rate": 0.0009826458721818735, + "loss": 2.9855, + "step": 3147 + }, + { + "epoch": 0.0933487530765353, + "grad_norm": 0.14135713875293732, + "learning_rate": 0.0009826335815581975, + "loss": 2.9572, + "step": 3148 + }, + { + "epoch": 0.09337840642884677, + "grad_norm": 0.12966826558113098, + "learning_rate": 0.0009826212866607185, + "loss": 2.9616, + "step": 3149 + }, + { + "epoch": 0.09340805978115826, + "grad_norm": 0.11842654645442963, + "learning_rate": 0.0009826089874895453, + "loss": 2.9623, + "step": 3150 + }, + { + "epoch": 0.09343771313346974, + "grad_norm": 0.1205068901181221, + "learning_rate": 0.0009825966840447866, + "loss": 2.9671, + "step": 3151 + }, + { + "epoch": 0.09346736648578122, + "grad_norm": 0.1323694884777069, + "learning_rate": 0.0009825843763265514, + "loss": 2.9768, + "step": 3152 + }, + { + "epoch": 0.0934970198380927, + "grad_norm": 0.12651735544204712, + "learning_rate": 0.0009825720643349487, + "loss": 2.9505, + "step": 3153 + }, + { + "epoch": 0.09352667319040417, + "grad_norm": 0.11459507793188095, + "learning_rate": 0.0009825597480700875, + "loss": 3.0051, + "step": 3154 + }, + { + "epoch": 0.09355632654271566, + "grad_norm": 0.12723775207996368, + "learning_rate": 0.0009825474275320769, + "loss": 2.9585, + "step": 3155 + }, + { + "epoch": 0.09358597989502714, + "grad_norm": 0.13389378786087036, + "learning_rate": 0.000982535102721026, + "loss": 2.9641, + "step": 3156 + }, + { + "epoch": 0.09361563324733861, + "grad_norm": 0.14025457203388214, + "learning_rate": 0.0009825227736370442, + "loss": 2.9827, + "step": 3157 + }, + { + "epoch": 0.09364528659965009, + "grad_norm": 0.16252516210079193, + "learning_rate": 0.00098251044028024, + "loss": 3.0035, + "step": 3158 + }, + { + "epoch": 0.09367493995196156, + "grad_norm": 0.18672889471054077, + "learning_rate": 0.0009824981026507235, + "loss": 2.9785, + "step": 3159 + }, + { + "epoch": 0.09370459330427305, + "grad_norm": 0.19122464954853058, + "learning_rate": 0.0009824857607486032, + "loss": 3.023, + "step": 3160 + }, + { + "epoch": 0.09373424665658453, + "grad_norm": 0.18312731385231018, + "learning_rate": 0.0009824734145739886, + "loss": 3.0031, + "step": 3161 + }, + { + "epoch": 0.093763900008896, + "grad_norm": 0.20445917546749115, + "learning_rate": 0.000982461064126989, + "loss": 2.9737, + "step": 3162 + }, + { + "epoch": 0.09379355336120748, + "grad_norm": 0.2336108386516571, + "learning_rate": 0.0009824487094077143, + "loss": 2.9651, + "step": 3163 + }, + { + "epoch": 0.09382320671351896, + "grad_norm": 0.22759678959846497, + "learning_rate": 0.0009824363504162732, + "loss": 2.9749, + "step": 3164 + }, + { + "epoch": 0.09385286006583045, + "grad_norm": 0.15145814418792725, + "learning_rate": 0.0009824239871527754, + "loss": 2.9655, + "step": 3165 + }, + { + "epoch": 0.09388251341814192, + "grad_norm": 0.17214523255825043, + "learning_rate": 0.0009824116196173304, + "loss": 2.9661, + "step": 3166 + }, + { + "epoch": 0.0939121667704534, + "grad_norm": 0.16506926715373993, + "learning_rate": 0.0009823992478100476, + "loss": 3.0303, + "step": 3167 + }, + { + "epoch": 0.09394182012276488, + "grad_norm": 0.16277126967906952, + "learning_rate": 0.000982386871731037, + "loss": 2.9566, + "step": 3168 + }, + { + "epoch": 0.09397147347507635, + "grad_norm": 0.175755113363266, + "learning_rate": 0.0009823744913804076, + "loss": 2.9566, + "step": 3169 + }, + { + "epoch": 0.09400112682738784, + "grad_norm": 0.14189155399799347, + "learning_rate": 0.0009823621067582692, + "loss": 2.9201, + "step": 3170 + }, + { + "epoch": 0.09403078017969932, + "grad_norm": 0.2509623169898987, + "learning_rate": 0.0009823497178647316, + "loss": 2.9868, + "step": 3171 + }, + { + "epoch": 0.0940604335320108, + "grad_norm": 0.13830161094665527, + "learning_rate": 0.0009823373246999046, + "loss": 2.9703, + "step": 3172 + }, + { + "epoch": 0.09409008688432227, + "grad_norm": 0.13395336270332336, + "learning_rate": 0.0009823249272638977, + "loss": 2.9668, + "step": 3173 + }, + { + "epoch": 0.09411974023663375, + "grad_norm": 0.12961560487747192, + "learning_rate": 0.0009823125255568209, + "loss": 2.9763, + "step": 3174 + }, + { + "epoch": 0.09414939358894522, + "grad_norm": 0.12189830839633942, + "learning_rate": 0.0009823001195787837, + "loss": 2.9662, + "step": 3175 + }, + { + "epoch": 0.09417904694125671, + "grad_norm": 0.12663370370864868, + "learning_rate": 0.0009822877093298961, + "loss": 2.969, + "step": 3176 + }, + { + "epoch": 0.09420870029356819, + "grad_norm": 0.13491225242614746, + "learning_rate": 0.0009822752948102683, + "loss": 2.9611, + "step": 3177 + }, + { + "epoch": 0.09423835364587967, + "grad_norm": 0.1720840334892273, + "learning_rate": 0.0009822628760200098, + "loss": 3.0081, + "step": 3178 + }, + { + "epoch": 0.09426800699819114, + "grad_norm": 0.18974585831165314, + "learning_rate": 0.000982250452959231, + "loss": 2.9552, + "step": 3179 + }, + { + "epoch": 0.09429766035050262, + "grad_norm": 0.1874057650566101, + "learning_rate": 0.0009822380256280412, + "loss": 2.9829, + "step": 3180 + }, + { + "epoch": 0.09432731370281411, + "grad_norm": 0.16900300979614258, + "learning_rate": 0.0009822255940265512, + "loss": 2.9669, + "step": 3181 + }, + { + "epoch": 0.09435696705512558, + "grad_norm": 0.15524974465370178, + "learning_rate": 0.000982213158154871, + "loss": 2.9781, + "step": 3182 + }, + { + "epoch": 0.09438662040743706, + "grad_norm": 0.1797247678041458, + "learning_rate": 0.0009822007180131103, + "loss": 3.0041, + "step": 3183 + }, + { + "epoch": 0.09441627375974854, + "grad_norm": 0.19572274386882782, + "learning_rate": 0.0009821882736013793, + "loss": 2.9673, + "step": 3184 + }, + { + "epoch": 0.09444592711206001, + "grad_norm": 0.18921607732772827, + "learning_rate": 0.0009821758249197886, + "loss": 2.975, + "step": 3185 + }, + { + "epoch": 0.0944755804643715, + "grad_norm": 0.17412163317203522, + "learning_rate": 0.000982163371968448, + "loss": 2.9616, + "step": 3186 + }, + { + "epoch": 0.09450523381668298, + "grad_norm": 0.1624414473772049, + "learning_rate": 0.0009821509147474683, + "loss": 2.9723, + "step": 3187 + }, + { + "epoch": 0.09453488716899446, + "grad_norm": 0.1452871710062027, + "learning_rate": 0.0009821384532569593, + "loss": 2.9835, + "step": 3188 + }, + { + "epoch": 0.09456454052130593, + "grad_norm": 0.14391346275806427, + "learning_rate": 0.0009821259874970316, + "loss": 2.9887, + "step": 3189 + }, + { + "epoch": 0.09459419387361741, + "grad_norm": 0.14845263957977295, + "learning_rate": 0.0009821135174677956, + "loss": 2.9653, + "step": 3190 + }, + { + "epoch": 0.0946238472259289, + "grad_norm": 0.1724640429019928, + "learning_rate": 0.0009821010431693613, + "loss": 2.9958, + "step": 3191 + }, + { + "epoch": 0.09465350057824037, + "grad_norm": 0.19398413598537445, + "learning_rate": 0.0009820885646018398, + "loss": 2.9612, + "step": 3192 + }, + { + "epoch": 0.09468315393055185, + "grad_norm": 0.1599506288766861, + "learning_rate": 0.0009820760817653414, + "loss": 2.9572, + "step": 3193 + }, + { + "epoch": 0.09471280728286333, + "grad_norm": 0.1459907591342926, + "learning_rate": 0.0009820635946599762, + "loss": 2.9662, + "step": 3194 + }, + { + "epoch": 0.0947424606351748, + "grad_norm": 0.14724750816822052, + "learning_rate": 0.0009820511032858554, + "loss": 2.962, + "step": 3195 + }, + { + "epoch": 0.09477211398748629, + "grad_norm": 0.14211462438106537, + "learning_rate": 0.0009820386076430892, + "loss": 2.9341, + "step": 3196 + }, + { + "epoch": 0.09480176733979777, + "grad_norm": 0.1594228297472, + "learning_rate": 0.0009820261077317885, + "loss": 2.9673, + "step": 3197 + }, + { + "epoch": 0.09483142069210924, + "grad_norm": 0.17523755133152008, + "learning_rate": 0.000982013603552064, + "loss": 2.9994, + "step": 3198 + }, + { + "epoch": 0.09486107404442072, + "grad_norm": 0.16880756616592407, + "learning_rate": 0.000982001095104026, + "loss": 2.9558, + "step": 3199 + }, + { + "epoch": 0.0948907273967322, + "grad_norm": 0.1806461364030838, + "learning_rate": 0.0009819885823877856, + "loss": 2.9549, + "step": 3200 + }, + { + "epoch": 0.09492038074904367, + "grad_norm": 0.20701567828655243, + "learning_rate": 0.0009819760654034538, + "loss": 2.9547, + "step": 3201 + }, + { + "epoch": 0.09495003410135516, + "grad_norm": 0.1730148196220398, + "learning_rate": 0.000981963544151141, + "loss": 2.9495, + "step": 3202 + }, + { + "epoch": 0.09497968745366664, + "grad_norm": 0.1641717255115509, + "learning_rate": 0.0009819510186309583, + "loss": 2.9635, + "step": 3203 + }, + { + "epoch": 0.09500934080597812, + "grad_norm": 0.16493810713291168, + "learning_rate": 0.0009819384888430168, + "loss": 2.9442, + "step": 3204 + }, + { + "epoch": 0.09503899415828959, + "grad_norm": 0.1644190102815628, + "learning_rate": 0.000981925954787427, + "loss": 2.9621, + "step": 3205 + }, + { + "epoch": 0.09506864751060107, + "grad_norm": 0.16808444261550903, + "learning_rate": 0.0009819134164643004, + "loss": 2.9729, + "step": 3206 + }, + { + "epoch": 0.09509830086291256, + "grad_norm": 0.15058870613574982, + "learning_rate": 0.0009819008738737476, + "loss": 2.9769, + "step": 3207 + }, + { + "epoch": 0.09512795421522403, + "grad_norm": 0.1487637460231781, + "learning_rate": 0.00098188832701588, + "loss": 2.9848, + "step": 3208 + }, + { + "epoch": 0.09515760756753551, + "grad_norm": 0.15887297689914703, + "learning_rate": 0.0009818757758908085, + "loss": 2.9579, + "step": 3209 + }, + { + "epoch": 0.09518726091984699, + "grad_norm": 0.1575237363576889, + "learning_rate": 0.0009818632204986442, + "loss": 2.9694, + "step": 3210 + }, + { + "epoch": 0.09521691427215846, + "grad_norm": 0.1409996747970581, + "learning_rate": 0.0009818506608394984, + "loss": 2.9522, + "step": 3211 + }, + { + "epoch": 0.09524656762446995, + "grad_norm": 0.18042278289794922, + "learning_rate": 0.0009818380969134823, + "loss": 2.9627, + "step": 3212 + }, + { + "epoch": 0.09527622097678143, + "grad_norm": 0.18848931789398193, + "learning_rate": 0.0009818255287207072, + "loss": 2.9892, + "step": 3213 + }, + { + "epoch": 0.0953058743290929, + "grad_norm": 0.1557978391647339, + "learning_rate": 0.0009818129562612842, + "loss": 2.9889, + "step": 3214 + }, + { + "epoch": 0.09533552768140438, + "grad_norm": 0.16853943467140198, + "learning_rate": 0.000981800379535325, + "loss": 2.936, + "step": 3215 + }, + { + "epoch": 0.09536518103371586, + "grad_norm": 0.15147440135478973, + "learning_rate": 0.0009817877985429406, + "loss": 2.9591, + "step": 3216 + }, + { + "epoch": 0.09539483438602735, + "grad_norm": 0.14191128313541412, + "learning_rate": 0.0009817752132842425, + "loss": 2.9234, + "step": 3217 + }, + { + "epoch": 0.09542448773833882, + "grad_norm": 0.13930073380470276, + "learning_rate": 0.0009817626237593423, + "loss": 2.9709, + "step": 3218 + }, + { + "epoch": 0.0954541410906503, + "grad_norm": 0.15872064232826233, + "learning_rate": 0.0009817500299683514, + "loss": 2.9851, + "step": 3219 + }, + { + "epoch": 0.09548379444296178, + "grad_norm": 0.176509827375412, + "learning_rate": 0.000981737431911381, + "loss": 2.9729, + "step": 3220 + }, + { + "epoch": 0.09551344779527325, + "grad_norm": 0.19070233404636383, + "learning_rate": 0.000981724829588543, + "loss": 3.0168, + "step": 3221 + }, + { + "epoch": 0.09554310114758474, + "grad_norm": 0.16183814406394958, + "learning_rate": 0.0009817122229999493, + "loss": 2.9423, + "step": 3222 + }, + { + "epoch": 0.09557275449989622, + "grad_norm": 0.1407933086156845, + "learning_rate": 0.000981699612145711, + "loss": 2.9674, + "step": 3223 + }, + { + "epoch": 0.0956024078522077, + "grad_norm": 0.1382405012845993, + "learning_rate": 0.00098168699702594, + "loss": 2.9727, + "step": 3224 + }, + { + "epoch": 0.09563206120451917, + "grad_norm": 0.16012653708457947, + "learning_rate": 0.0009816743776407478, + "loss": 3.0008, + "step": 3225 + }, + { + "epoch": 0.09566171455683065, + "grad_norm": 0.17937909066677094, + "learning_rate": 0.0009816617539902463, + "loss": 2.972, + "step": 3226 + }, + { + "epoch": 0.09569136790914212, + "grad_norm": 0.16725103557109833, + "learning_rate": 0.0009816491260745475, + "loss": 2.9763, + "step": 3227 + }, + { + "epoch": 0.09572102126145361, + "grad_norm": 0.14670255780220032, + "learning_rate": 0.000981636493893763, + "loss": 2.9814, + "step": 3228 + }, + { + "epoch": 0.09575067461376509, + "grad_norm": 0.14402474462985992, + "learning_rate": 0.0009816238574480046, + "loss": 2.9783, + "step": 3229 + }, + { + "epoch": 0.09578032796607656, + "grad_norm": 0.14970169961452484, + "learning_rate": 0.000981611216737384, + "loss": 2.9362, + "step": 3230 + }, + { + "epoch": 0.09580998131838804, + "grad_norm": 0.16650806367397308, + "learning_rate": 0.0009815985717620138, + "loss": 2.9595, + "step": 3231 + }, + { + "epoch": 0.09583963467069952, + "grad_norm": 0.2033814787864685, + "learning_rate": 0.0009815859225220055, + "loss": 2.9542, + "step": 3232 + }, + { + "epoch": 0.095869288023011, + "grad_norm": 0.22329749166965485, + "learning_rate": 0.000981573269017471, + "loss": 3.0057, + "step": 3233 + }, + { + "epoch": 0.09589894137532248, + "grad_norm": 0.18544180691242218, + "learning_rate": 0.0009815606112485227, + "loss": 3.0008, + "step": 3234 + }, + { + "epoch": 0.09592859472763396, + "grad_norm": 0.1602955460548401, + "learning_rate": 0.0009815479492152725, + "loss": 2.9483, + "step": 3235 + }, + { + "epoch": 0.09595824807994544, + "grad_norm": 0.16356350481510162, + "learning_rate": 0.0009815352829178326, + "loss": 2.9582, + "step": 3236 + }, + { + "epoch": 0.09598790143225691, + "grad_norm": 0.1530417650938034, + "learning_rate": 0.000981522612356315, + "loss": 2.9333, + "step": 3237 + }, + { + "epoch": 0.0960175547845684, + "grad_norm": 0.14978186786174774, + "learning_rate": 0.0009815099375308322, + "loss": 2.953, + "step": 3238 + }, + { + "epoch": 0.09604720813687988, + "grad_norm": 0.1606997847557068, + "learning_rate": 0.000981497258441496, + "loss": 2.9279, + "step": 3239 + }, + { + "epoch": 0.09607686148919135, + "grad_norm": 0.1322992444038391, + "learning_rate": 0.000981484575088419, + "loss": 2.9591, + "step": 3240 + }, + { + "epoch": 0.09610651484150283, + "grad_norm": 0.1405646950006485, + "learning_rate": 0.0009814718874717138, + "loss": 2.9589, + "step": 3241 + }, + { + "epoch": 0.0961361681938143, + "grad_norm": 0.13780680298805237, + "learning_rate": 0.000981459195591492, + "loss": 2.9559, + "step": 3242 + }, + { + "epoch": 0.0961658215461258, + "grad_norm": 0.18002942204475403, + "learning_rate": 0.0009814464994478665, + "loss": 2.9476, + "step": 3243 + }, + { + "epoch": 0.09619547489843727, + "grad_norm": 0.1840604990720749, + "learning_rate": 0.0009814337990409496, + "loss": 2.9771, + "step": 3244 + }, + { + "epoch": 0.09622512825074875, + "grad_norm": 0.20726291835308075, + "learning_rate": 0.000981421094370854, + "loss": 2.98, + "step": 3245 + }, + { + "epoch": 0.09625478160306022, + "grad_norm": 0.2143896520137787, + "learning_rate": 0.0009814083854376915, + "loss": 2.98, + "step": 3246 + }, + { + "epoch": 0.0962844349553717, + "grad_norm": 0.2156897336244583, + "learning_rate": 0.0009813956722415755, + "loss": 2.9743, + "step": 3247 + }, + { + "epoch": 0.09631408830768319, + "grad_norm": 0.21872571110725403, + "learning_rate": 0.000981382954782618, + "loss": 2.9895, + "step": 3248 + }, + { + "epoch": 0.09634374165999467, + "grad_norm": 0.1813579648733139, + "learning_rate": 0.000981370233060932, + "loss": 2.9577, + "step": 3249 + }, + { + "epoch": 0.09637339501230614, + "grad_norm": 0.19509799778461456, + "learning_rate": 0.0009813575070766296, + "loss": 2.9648, + "step": 3250 + }, + { + "epoch": 0.09640304836461762, + "grad_norm": 0.1845463067293167, + "learning_rate": 0.000981344776829824, + "loss": 2.989, + "step": 3251 + }, + { + "epoch": 0.0964327017169291, + "grad_norm": 0.17341046035289764, + "learning_rate": 0.000981332042320628, + "loss": 2.9757, + "step": 3252 + }, + { + "epoch": 0.09646235506924057, + "grad_norm": 0.16818708181381226, + "learning_rate": 0.000981319303549154, + "loss": 2.9633, + "step": 3253 + }, + { + "epoch": 0.09649200842155206, + "grad_norm": 0.17776784300804138, + "learning_rate": 0.000981306560515515, + "loss": 2.9862, + "step": 3254 + }, + { + "epoch": 0.09652166177386354, + "grad_norm": 0.16722826659679413, + "learning_rate": 0.0009812938132198238, + "loss": 2.9752, + "step": 3255 + }, + { + "epoch": 0.09655131512617501, + "grad_norm": 0.18932537734508514, + "learning_rate": 0.000981281061662193, + "loss": 2.9891, + "step": 3256 + }, + { + "epoch": 0.09658096847848649, + "grad_norm": 0.17110653221607208, + "learning_rate": 0.000981268305842736, + "loss": 2.9377, + "step": 3257 + }, + { + "epoch": 0.09661062183079797, + "grad_norm": 0.1378072053194046, + "learning_rate": 0.0009812555457615656, + "loss": 2.9587, + "step": 3258 + }, + { + "epoch": 0.09664027518310946, + "grad_norm": 0.12743507325649261, + "learning_rate": 0.0009812427814187947, + "loss": 2.9918, + "step": 3259 + }, + { + "epoch": 0.09666992853542093, + "grad_norm": 0.12005425989627838, + "learning_rate": 0.0009812300128145364, + "loss": 2.9676, + "step": 3260 + }, + { + "epoch": 0.09669958188773241, + "grad_norm": 0.12983161211013794, + "learning_rate": 0.0009812172399489036, + "loss": 2.958, + "step": 3261 + }, + { + "epoch": 0.09672923524004388, + "grad_norm": 0.12533341348171234, + "learning_rate": 0.0009812044628220095, + "loss": 2.9397, + "step": 3262 + }, + { + "epoch": 0.09675888859235536, + "grad_norm": 0.1424763798713684, + "learning_rate": 0.0009811916814339676, + "loss": 2.929, + "step": 3263 + }, + { + "epoch": 0.09678854194466685, + "grad_norm": 0.14435404539108276, + "learning_rate": 0.0009811788957848906, + "loss": 2.9788, + "step": 3264 + }, + { + "epoch": 0.09681819529697833, + "grad_norm": 0.14698632061481476, + "learning_rate": 0.0009811661058748916, + "loss": 2.9754, + "step": 3265 + }, + { + "epoch": 0.0968478486492898, + "grad_norm": 0.16136574745178223, + "learning_rate": 0.0009811533117040844, + "loss": 2.9602, + "step": 3266 + }, + { + "epoch": 0.09687750200160128, + "grad_norm": 0.15340252220630646, + "learning_rate": 0.0009811405132725821, + "loss": 2.9612, + "step": 3267 + }, + { + "epoch": 0.09690715535391276, + "grad_norm": 0.15349088609218597, + "learning_rate": 0.000981127710580498, + "loss": 2.9824, + "step": 3268 + }, + { + "epoch": 0.09693680870622425, + "grad_norm": 0.14958100020885468, + "learning_rate": 0.0009811149036279454, + "loss": 2.967, + "step": 3269 + }, + { + "epoch": 0.09696646205853572, + "grad_norm": 0.14418324828147888, + "learning_rate": 0.0009811020924150376, + "loss": 2.9778, + "step": 3270 + }, + { + "epoch": 0.0969961154108472, + "grad_norm": 0.1446886658668518, + "learning_rate": 0.0009810892769418881, + "loss": 2.9514, + "step": 3271 + }, + { + "epoch": 0.09702576876315867, + "grad_norm": 0.172568216919899, + "learning_rate": 0.0009810764572086108, + "loss": 2.9786, + "step": 3272 + }, + { + "epoch": 0.09705542211547015, + "grad_norm": 0.1681160181760788, + "learning_rate": 0.0009810636332153188, + "loss": 2.9789, + "step": 3273 + }, + { + "epoch": 0.09708507546778163, + "grad_norm": 0.14066503942012787, + "learning_rate": 0.0009810508049621256, + "loss": 2.9577, + "step": 3274 + }, + { + "epoch": 0.09711472882009312, + "grad_norm": 0.1750916838645935, + "learning_rate": 0.0009810379724491452, + "loss": 3.0011, + "step": 3275 + }, + { + "epoch": 0.09714438217240459, + "grad_norm": 0.20135261118412018, + "learning_rate": 0.0009810251356764908, + "loss": 2.973, + "step": 3276 + }, + { + "epoch": 0.09717403552471607, + "grad_norm": 0.20557989180088043, + "learning_rate": 0.0009810122946442763, + "loss": 2.9772, + "step": 3277 + }, + { + "epoch": 0.09720368887702754, + "grad_norm": 0.19539834558963776, + "learning_rate": 0.0009809994493526152, + "loss": 2.9524, + "step": 3278 + }, + { + "epoch": 0.09723334222933902, + "grad_norm": 0.23010143637657166, + "learning_rate": 0.0009809865998016217, + "loss": 2.9514, + "step": 3279 + }, + { + "epoch": 0.09726299558165051, + "grad_norm": 0.2515283226966858, + "learning_rate": 0.000980973745991409, + "loss": 2.9837, + "step": 3280 + }, + { + "epoch": 0.09729264893396199, + "grad_norm": 0.19859789311885834, + "learning_rate": 0.0009809608879220914, + "loss": 2.9636, + "step": 3281 + }, + { + "epoch": 0.09732230228627346, + "grad_norm": 0.17831583321094513, + "learning_rate": 0.0009809480255937827, + "loss": 2.9464, + "step": 3282 + }, + { + "epoch": 0.09735195563858494, + "grad_norm": 0.18196985125541687, + "learning_rate": 0.0009809351590065966, + "loss": 3.0138, + "step": 3283 + }, + { + "epoch": 0.09738160899089641, + "grad_norm": 0.1791566163301468, + "learning_rate": 0.000980922288160647, + "loss": 2.9621, + "step": 3284 + }, + { + "epoch": 0.0974112623432079, + "grad_norm": 0.1443985551595688, + "learning_rate": 0.000980909413056048, + "loss": 2.9649, + "step": 3285 + }, + { + "epoch": 0.09744091569551938, + "grad_norm": 0.15944503247737885, + "learning_rate": 0.0009808965336929136, + "loss": 3.004, + "step": 3286 + }, + { + "epoch": 0.09747056904783086, + "grad_norm": 0.14119675755500793, + "learning_rate": 0.000980883650071358, + "loss": 2.971, + "step": 3287 + }, + { + "epoch": 0.09750022240014233, + "grad_norm": 0.1300056129693985, + "learning_rate": 0.000980870762191495, + "loss": 2.9506, + "step": 3288 + }, + { + "epoch": 0.09752987575245381, + "grad_norm": 0.124681755900383, + "learning_rate": 0.000980857870053439, + "loss": 2.9543, + "step": 3289 + }, + { + "epoch": 0.0975595291047653, + "grad_norm": 0.130082905292511, + "learning_rate": 0.000980844973657304, + "loss": 2.987, + "step": 3290 + }, + { + "epoch": 0.09758918245707678, + "grad_norm": 0.14995035529136658, + "learning_rate": 0.000980832073003204, + "loss": 2.9716, + "step": 3291 + }, + { + "epoch": 0.09761883580938825, + "grad_norm": 0.137568399310112, + "learning_rate": 0.0009808191680912536, + "loss": 2.9777, + "step": 3292 + }, + { + "epoch": 0.09764848916169973, + "grad_norm": 0.14079487323760986, + "learning_rate": 0.000980806258921567, + "loss": 2.9698, + "step": 3293 + }, + { + "epoch": 0.0976781425140112, + "grad_norm": 0.13637259602546692, + "learning_rate": 0.0009807933454942584, + "loss": 2.9625, + "step": 3294 + }, + { + "epoch": 0.0977077958663227, + "grad_norm": 0.13705933094024658, + "learning_rate": 0.000980780427809442, + "loss": 2.9512, + "step": 3295 + }, + { + "epoch": 0.09773744921863417, + "grad_norm": 0.15370382368564606, + "learning_rate": 0.0009807675058672327, + "loss": 2.9669, + "step": 3296 + }, + { + "epoch": 0.09776710257094565, + "grad_norm": 0.15506130456924438, + "learning_rate": 0.0009807545796677445, + "loss": 2.9925, + "step": 3297 + }, + { + "epoch": 0.09779675592325712, + "grad_norm": 0.16866092383861542, + "learning_rate": 0.0009807416492110918, + "loss": 2.9817, + "step": 3298 + }, + { + "epoch": 0.0978264092755686, + "grad_norm": 0.15216514468193054, + "learning_rate": 0.0009807287144973896, + "loss": 2.972, + "step": 3299 + }, + { + "epoch": 0.09785606262788007, + "grad_norm": 0.13273529708385468, + "learning_rate": 0.000980715775526752, + "loss": 2.9711, + "step": 3300 + }, + { + "epoch": 0.09788571598019156, + "grad_norm": 0.16576068103313446, + "learning_rate": 0.0009807028322992937, + "loss": 3.0061, + "step": 3301 + }, + { + "epoch": 0.09791536933250304, + "grad_norm": 0.18601860105991364, + "learning_rate": 0.000980689884815129, + "loss": 2.9272, + "step": 3302 + }, + { + "epoch": 0.09794502268481452, + "grad_norm": 0.17207340896129608, + "learning_rate": 0.0009806769330743732, + "loss": 2.9684, + "step": 3303 + }, + { + "epoch": 0.097974676037126, + "grad_norm": 0.19566261768341064, + "learning_rate": 0.0009806639770771407, + "loss": 2.9218, + "step": 3304 + }, + { + "epoch": 0.09800432938943747, + "grad_norm": 0.17182184755802155, + "learning_rate": 0.0009806510168235462, + "loss": 2.9606, + "step": 3305 + }, + { + "epoch": 0.09803398274174896, + "grad_norm": 0.17789116501808167, + "learning_rate": 0.0009806380523137042, + "loss": 2.9526, + "step": 3306 + }, + { + "epoch": 0.09806363609406044, + "grad_norm": 0.1834883987903595, + "learning_rate": 0.0009806250835477297, + "loss": 2.9685, + "step": 3307 + }, + { + "epoch": 0.09809328944637191, + "grad_norm": 0.15744373202323914, + "learning_rate": 0.0009806121105257377, + "loss": 2.9849, + "step": 3308 + }, + { + "epoch": 0.09812294279868339, + "grad_norm": 0.15262331068515778, + "learning_rate": 0.000980599133247843, + "loss": 2.9534, + "step": 3309 + }, + { + "epoch": 0.09815259615099486, + "grad_norm": 0.16097070276737213, + "learning_rate": 0.0009805861517141606, + "loss": 2.971, + "step": 3310 + }, + { + "epoch": 0.09818224950330635, + "grad_norm": 0.16684775054454803, + "learning_rate": 0.0009805731659248051, + "loss": 2.9862, + "step": 3311 + }, + { + "epoch": 0.09821190285561783, + "grad_norm": 0.18067990243434906, + "learning_rate": 0.000980560175879892, + "loss": 2.9384, + "step": 3312 + }, + { + "epoch": 0.0982415562079293, + "grad_norm": 0.1763729453086853, + "learning_rate": 0.0009805471815795357, + "loss": 2.9587, + "step": 3313 + }, + { + "epoch": 0.09827120956024078, + "grad_norm": 0.19512495398521423, + "learning_rate": 0.0009805341830238519, + "loss": 2.9418, + "step": 3314 + }, + { + "epoch": 0.09830086291255226, + "grad_norm": 0.18850059807300568, + "learning_rate": 0.0009805211802129553, + "loss": 2.9617, + "step": 3315 + }, + { + "epoch": 0.09833051626486375, + "grad_norm": 0.16201235353946686, + "learning_rate": 0.0009805081731469611, + "loss": 2.9784, + "step": 3316 + }, + { + "epoch": 0.09836016961717522, + "grad_norm": 0.17920222878456116, + "learning_rate": 0.0009804951618259848, + "loss": 2.9586, + "step": 3317 + }, + { + "epoch": 0.0983898229694867, + "grad_norm": 0.1769189089536667, + "learning_rate": 0.0009804821462501413, + "loss": 2.9309, + "step": 3318 + }, + { + "epoch": 0.09841947632179818, + "grad_norm": 0.17113710939884186, + "learning_rate": 0.0009804691264195457, + "loss": 2.9734, + "step": 3319 + }, + { + "epoch": 0.09844912967410965, + "grad_norm": 0.191463440656662, + "learning_rate": 0.0009804561023343138, + "loss": 2.9305, + "step": 3320 + }, + { + "epoch": 0.09847878302642114, + "grad_norm": 0.18378089368343353, + "learning_rate": 0.0009804430739945604, + "loss": 2.9871, + "step": 3321 + }, + { + "epoch": 0.09850843637873262, + "grad_norm": 0.19083349406719208, + "learning_rate": 0.0009804300414004014, + "loss": 2.9542, + "step": 3322 + }, + { + "epoch": 0.0985380897310441, + "grad_norm": 0.17879442870616913, + "learning_rate": 0.0009804170045519516, + "loss": 2.9316, + "step": 3323 + }, + { + "epoch": 0.09856774308335557, + "grad_norm": 0.14747320115566254, + "learning_rate": 0.000980403963449327, + "loss": 2.9508, + "step": 3324 + }, + { + "epoch": 0.09859739643566705, + "grad_norm": 0.1532939225435257, + "learning_rate": 0.0009803909180926425, + "loss": 2.9412, + "step": 3325 + }, + { + "epoch": 0.09862704978797852, + "grad_norm": 0.13200654089450836, + "learning_rate": 0.0009803778684820143, + "loss": 2.9735, + "step": 3326 + }, + { + "epoch": 0.09865670314029001, + "grad_norm": 0.16110633313655853, + "learning_rate": 0.0009803648146175575, + "loss": 2.9086, + "step": 3327 + }, + { + "epoch": 0.09868635649260149, + "grad_norm": 0.172915980219841, + "learning_rate": 0.0009803517564993878, + "loss": 2.9511, + "step": 3328 + }, + { + "epoch": 0.09871600984491297, + "grad_norm": 0.16833774745464325, + "learning_rate": 0.000980338694127621, + "loss": 2.9827, + "step": 3329 + }, + { + "epoch": 0.09874566319722444, + "grad_norm": 0.14545844495296478, + "learning_rate": 0.0009803256275023723, + "loss": 2.9445, + "step": 3330 + }, + { + "epoch": 0.09877531654953592, + "grad_norm": 0.1342565417289734, + "learning_rate": 0.000980312556623758, + "loss": 2.9337, + "step": 3331 + }, + { + "epoch": 0.09880496990184741, + "grad_norm": 0.1366616040468216, + "learning_rate": 0.0009802994814918935, + "loss": 2.9646, + "step": 3332 + }, + { + "epoch": 0.09883462325415888, + "grad_norm": 0.14674364030361176, + "learning_rate": 0.0009802864021068946, + "loss": 2.9615, + "step": 3333 + }, + { + "epoch": 0.09886427660647036, + "grad_norm": 0.13954688608646393, + "learning_rate": 0.0009802733184688772, + "loss": 2.9342, + "step": 3334 + }, + { + "epoch": 0.09889392995878184, + "grad_norm": 0.1448228359222412, + "learning_rate": 0.0009802602305779573, + "loss": 2.964, + "step": 3335 + }, + { + "epoch": 0.09892358331109331, + "grad_norm": 0.16149581968784332, + "learning_rate": 0.0009802471384342505, + "loss": 2.9839, + "step": 3336 + }, + { + "epoch": 0.0989532366634048, + "grad_norm": 0.16909250617027283, + "learning_rate": 0.0009802340420378725, + "loss": 2.9042, + "step": 3337 + }, + { + "epoch": 0.09898289001571628, + "grad_norm": 0.15078799426555634, + "learning_rate": 0.00098022094138894, + "loss": 2.9625, + "step": 3338 + }, + { + "epoch": 0.09901254336802776, + "grad_norm": 0.1319853961467743, + "learning_rate": 0.0009802078364875685, + "loss": 2.9588, + "step": 3339 + }, + { + "epoch": 0.09904219672033923, + "grad_norm": 0.13361236453056335, + "learning_rate": 0.0009801947273338743, + "loss": 2.9685, + "step": 3340 + }, + { + "epoch": 0.09907185007265071, + "grad_norm": 0.13920234143733978, + "learning_rate": 0.0009801816139279732, + "loss": 2.9336, + "step": 3341 + }, + { + "epoch": 0.0991015034249622, + "grad_norm": 0.15703454613685608, + "learning_rate": 0.0009801684962699817, + "loss": 2.9452, + "step": 3342 + }, + { + "epoch": 0.09913115677727367, + "grad_norm": 0.1787896603345871, + "learning_rate": 0.0009801553743600158, + "loss": 2.9806, + "step": 3343 + }, + { + "epoch": 0.09916081012958515, + "grad_norm": 0.18933241069316864, + "learning_rate": 0.0009801422481981914, + "loss": 2.9302, + "step": 3344 + }, + { + "epoch": 0.09919046348189663, + "grad_norm": 0.19761356711387634, + "learning_rate": 0.000980129117784625, + "loss": 2.9498, + "step": 3345 + }, + { + "epoch": 0.0992201168342081, + "grad_norm": 0.19434455037117004, + "learning_rate": 0.000980115983119433, + "loss": 2.9602, + "step": 3346 + }, + { + "epoch": 0.09924977018651959, + "grad_norm": 0.17856809496879578, + "learning_rate": 0.0009801028442027316, + "loss": 2.9619, + "step": 3347 + }, + { + "epoch": 0.09927942353883107, + "grad_norm": 0.190278097987175, + "learning_rate": 0.0009800897010346368, + "loss": 2.9314, + "step": 3348 + }, + { + "epoch": 0.09930907689114254, + "grad_norm": 0.20762771368026733, + "learning_rate": 0.0009800765536152657, + "loss": 2.9464, + "step": 3349 + }, + { + "epoch": 0.09933873024345402, + "grad_norm": 0.21511228382587433, + "learning_rate": 0.0009800634019447338, + "loss": 2.9606, + "step": 3350 + }, + { + "epoch": 0.0993683835957655, + "grad_norm": 0.18002188205718994, + "learning_rate": 0.0009800502460231583, + "loss": 2.9754, + "step": 3351 + }, + { + "epoch": 0.09939803694807697, + "grad_norm": 0.1666654348373413, + "learning_rate": 0.0009800370858506558, + "loss": 2.9849, + "step": 3352 + }, + { + "epoch": 0.09942769030038846, + "grad_norm": 0.1614130586385727, + "learning_rate": 0.000980023921427342, + "loss": 2.9609, + "step": 3353 + }, + { + "epoch": 0.09945734365269994, + "grad_norm": 0.1622416228055954, + "learning_rate": 0.0009800107527533344, + "loss": 2.9919, + "step": 3354 + }, + { + "epoch": 0.09948699700501142, + "grad_norm": 0.16184353828430176, + "learning_rate": 0.000979997579828749, + "loss": 2.9864, + "step": 3355 + }, + { + "epoch": 0.09951665035732289, + "grad_norm": 0.14193148910999298, + "learning_rate": 0.0009799844026537026, + "loss": 2.9539, + "step": 3356 + }, + { + "epoch": 0.09954630370963437, + "grad_norm": 0.15202613174915314, + "learning_rate": 0.000979971221228312, + "loss": 2.9586, + "step": 3357 + }, + { + "epoch": 0.09957595706194586, + "grad_norm": 0.14432232081890106, + "learning_rate": 0.0009799580355526938, + "loss": 2.9407, + "step": 3358 + }, + { + "epoch": 0.09960561041425733, + "grad_norm": 0.13379336893558502, + "learning_rate": 0.0009799448456269649, + "loss": 2.9331, + "step": 3359 + }, + { + "epoch": 0.09963526376656881, + "grad_norm": 0.1266937106847763, + "learning_rate": 0.000979931651451242, + "loss": 2.9258, + "step": 3360 + }, + { + "epoch": 0.09966491711888029, + "grad_norm": 0.14042793214321136, + "learning_rate": 0.0009799184530256417, + "loss": 2.9465, + "step": 3361 + }, + { + "epoch": 0.09969457047119176, + "grad_norm": 0.1563592404127121, + "learning_rate": 0.0009799052503502814, + "loss": 2.9406, + "step": 3362 + }, + { + "epoch": 0.09972422382350325, + "grad_norm": 0.17790937423706055, + "learning_rate": 0.0009798920434252777, + "loss": 2.9479, + "step": 3363 + }, + { + "epoch": 0.09975387717581473, + "grad_norm": 0.19139744341373444, + "learning_rate": 0.0009798788322507475, + "loss": 3.0178, + "step": 3364 + }, + { + "epoch": 0.0997835305281262, + "grad_norm": 0.1717638224363327, + "learning_rate": 0.0009798656168268078, + "loss": 2.9473, + "step": 3365 + }, + { + "epoch": 0.09981318388043768, + "grad_norm": 0.16685007512569427, + "learning_rate": 0.0009798523971535759, + "loss": 2.9582, + "step": 3366 + }, + { + "epoch": 0.09984283723274916, + "grad_norm": 0.18653425574302673, + "learning_rate": 0.0009798391732311685, + "loss": 2.9599, + "step": 3367 + }, + { + "epoch": 0.09987249058506065, + "grad_norm": 0.16198495030403137, + "learning_rate": 0.000979825945059703, + "loss": 2.9914, + "step": 3368 + }, + { + "epoch": 0.09990214393737212, + "grad_norm": 0.14335636794567108, + "learning_rate": 0.0009798127126392964, + "loss": 2.9578, + "step": 3369 + }, + { + "epoch": 0.0999317972896836, + "grad_norm": 0.16320320963859558, + "learning_rate": 0.0009797994759700656, + "loss": 2.957, + "step": 3370 + }, + { + "epoch": 0.09996145064199508, + "grad_norm": 0.17214664816856384, + "learning_rate": 0.0009797862350521282, + "loss": 2.9877, + "step": 3371 + }, + { + "epoch": 0.09999110399430655, + "grad_norm": 0.15453949570655823, + "learning_rate": 0.0009797729898856015, + "loss": 2.9841, + "step": 3372 + }, + { + "epoch": 0.10002075734661804, + "grad_norm": 0.16884790360927582, + "learning_rate": 0.0009797597404706026, + "loss": 2.9539, + "step": 3373 + }, + { + "epoch": 0.10005041069892952, + "grad_norm": 0.18957722187042236, + "learning_rate": 0.0009797464868072487, + "loss": 2.9534, + "step": 3374 + }, + { + "epoch": 0.100080064051241, + "grad_norm": 0.14956098794937134, + "learning_rate": 0.0009797332288956574, + "loss": 2.9721, + "step": 3375 + }, + { + "epoch": 0.10010971740355247, + "grad_norm": 0.14747777581214905, + "learning_rate": 0.000979719966735946, + "loss": 2.9737, + "step": 3376 + }, + { + "epoch": 0.10013937075586395, + "grad_norm": 0.1458701342344284, + "learning_rate": 0.000979706700328232, + "loss": 2.9217, + "step": 3377 + }, + { + "epoch": 0.10016902410817542, + "grad_norm": 0.13160152733325958, + "learning_rate": 0.0009796934296726328, + "loss": 2.9573, + "step": 3378 + }, + { + "epoch": 0.10019867746048691, + "grad_norm": 0.1475238800048828, + "learning_rate": 0.0009796801547692657, + "loss": 2.9751, + "step": 3379 + }, + { + "epoch": 0.10022833081279839, + "grad_norm": 0.14310531318187714, + "learning_rate": 0.0009796668756182488, + "loss": 2.9658, + "step": 3380 + }, + { + "epoch": 0.10025798416510986, + "grad_norm": 0.13180074095726013, + "learning_rate": 0.0009796535922196993, + "loss": 2.9557, + "step": 3381 + }, + { + "epoch": 0.10028763751742134, + "grad_norm": 0.16193661093711853, + "learning_rate": 0.000979640304573735, + "loss": 2.9653, + "step": 3382 + }, + { + "epoch": 0.10031729086973282, + "grad_norm": 0.17199626564979553, + "learning_rate": 0.0009796270126804735, + "loss": 2.9486, + "step": 3383 + }, + { + "epoch": 0.10034694422204431, + "grad_norm": 0.19276733696460724, + "learning_rate": 0.0009796137165400322, + "loss": 2.924, + "step": 3384 + }, + { + "epoch": 0.10037659757435578, + "grad_norm": 0.19005261361598969, + "learning_rate": 0.000979600416152529, + "loss": 2.9664, + "step": 3385 + }, + { + "epoch": 0.10040625092666726, + "grad_norm": 0.1910150647163391, + "learning_rate": 0.0009795871115180824, + "loss": 2.9606, + "step": 3386 + }, + { + "epoch": 0.10043590427897874, + "grad_norm": 0.22016198933124542, + "learning_rate": 0.000979573802636809, + "loss": 2.9724, + "step": 3387 + }, + { + "epoch": 0.10046555763129021, + "grad_norm": 0.22258266806602478, + "learning_rate": 0.0009795604895088278, + "loss": 2.9916, + "step": 3388 + }, + { + "epoch": 0.1004952109836017, + "grad_norm": 0.1771932691335678, + "learning_rate": 0.0009795471721342557, + "loss": 2.8999, + "step": 3389 + }, + { + "epoch": 0.10052486433591318, + "grad_norm": 0.1767386645078659, + "learning_rate": 0.0009795338505132114, + "loss": 2.9637, + "step": 3390 + }, + { + "epoch": 0.10055451768822465, + "grad_norm": 0.17722921073436737, + "learning_rate": 0.0009795205246458123, + "loss": 2.9575, + "step": 3391 + }, + { + "epoch": 0.10058417104053613, + "grad_norm": 0.1801147311925888, + "learning_rate": 0.0009795071945321767, + "loss": 2.9695, + "step": 3392 + }, + { + "epoch": 0.1006138243928476, + "grad_norm": 0.17974716424942017, + "learning_rate": 0.0009794938601724226, + "loss": 2.9866, + "step": 3393 + }, + { + "epoch": 0.1006434777451591, + "grad_norm": 0.16465134918689728, + "learning_rate": 0.0009794805215666681, + "loss": 2.9496, + "step": 3394 + }, + { + "epoch": 0.10067313109747057, + "grad_norm": 0.1370932161808014, + "learning_rate": 0.0009794671787150314, + "loss": 2.9231, + "step": 3395 + }, + { + "epoch": 0.10070278444978205, + "grad_norm": 0.13604114949703217, + "learning_rate": 0.0009794538316176304, + "loss": 2.9794, + "step": 3396 + }, + { + "epoch": 0.10073243780209352, + "grad_norm": 0.14625917375087738, + "learning_rate": 0.0009794404802745834, + "loss": 2.9785, + "step": 3397 + }, + { + "epoch": 0.100762091154405, + "grad_norm": 0.1620011031627655, + "learning_rate": 0.0009794271246860086, + "loss": 2.9642, + "step": 3398 + }, + { + "epoch": 0.10079174450671649, + "grad_norm": 0.129896342754364, + "learning_rate": 0.0009794137648520245, + "loss": 2.9228, + "step": 3399 + }, + { + "epoch": 0.10082139785902797, + "grad_norm": 0.14420007169246674, + "learning_rate": 0.000979400400772749, + "loss": 2.9532, + "step": 3400 + }, + { + "epoch": 0.10085105121133944, + "grad_norm": 0.14851556718349457, + "learning_rate": 0.000979387032448301, + "loss": 3.0122, + "step": 3401 + }, + { + "epoch": 0.10088070456365092, + "grad_norm": 0.11422236263751984, + "learning_rate": 0.000979373659878798, + "loss": 2.9121, + "step": 3402 + }, + { + "epoch": 0.1009103579159624, + "grad_norm": 0.12219250202178955, + "learning_rate": 0.0009793602830643596, + "loss": 2.9515, + "step": 3403 + }, + { + "epoch": 0.10094001126827387, + "grad_norm": 0.11253809183835983, + "learning_rate": 0.000979346902005103, + "loss": 2.9425, + "step": 3404 + }, + { + "epoch": 0.10096966462058536, + "grad_norm": 0.10833992809057236, + "learning_rate": 0.0009793335167011478, + "loss": 2.9283, + "step": 3405 + }, + { + "epoch": 0.10099931797289684, + "grad_norm": 0.11366299539804459, + "learning_rate": 0.0009793201271526117, + "loss": 2.9775, + "step": 3406 + }, + { + "epoch": 0.10102897132520831, + "grad_norm": 0.11549575626850128, + "learning_rate": 0.0009793067333596138, + "loss": 2.9754, + "step": 3407 + }, + { + "epoch": 0.10105862467751979, + "grad_norm": 0.12546899914741516, + "learning_rate": 0.0009792933353222726, + "loss": 2.9416, + "step": 3408 + }, + { + "epoch": 0.10108827802983127, + "grad_norm": 0.13529753684997559, + "learning_rate": 0.0009792799330407063, + "loss": 2.9803, + "step": 3409 + }, + { + "epoch": 0.10111793138214276, + "grad_norm": 0.15451166033744812, + "learning_rate": 0.0009792665265150342, + "loss": 2.9502, + "step": 3410 + }, + { + "epoch": 0.10114758473445423, + "grad_norm": 0.17264437675476074, + "learning_rate": 0.0009792531157453745, + "loss": 2.98, + "step": 3411 + }, + { + "epoch": 0.10117723808676571, + "grad_norm": 0.2104850709438324, + "learning_rate": 0.0009792397007318465, + "loss": 2.9714, + "step": 3412 + }, + { + "epoch": 0.10120689143907718, + "grad_norm": 0.27313467860221863, + "learning_rate": 0.0009792262814745684, + "loss": 2.9567, + "step": 3413 + }, + { + "epoch": 0.10123654479138866, + "grad_norm": 0.2534255087375641, + "learning_rate": 0.0009792128579736595, + "loss": 2.959, + "step": 3414 + }, + { + "epoch": 0.10126619814370015, + "grad_norm": 0.23581840097904205, + "learning_rate": 0.0009791994302292386, + "loss": 2.973, + "step": 3415 + }, + { + "epoch": 0.10129585149601163, + "grad_norm": 0.22990092635154724, + "learning_rate": 0.0009791859982414242, + "loss": 2.9734, + "step": 3416 + }, + { + "epoch": 0.1013255048483231, + "grad_norm": 0.1948527842760086, + "learning_rate": 0.0009791725620103358, + "loss": 2.9452, + "step": 3417 + }, + { + "epoch": 0.10135515820063458, + "grad_norm": 0.18018470704555511, + "learning_rate": 0.0009791591215360918, + "loss": 2.9867, + "step": 3418 + }, + { + "epoch": 0.10138481155294606, + "grad_norm": 0.1676059514284134, + "learning_rate": 0.0009791456768188118, + "loss": 2.9687, + "step": 3419 + }, + { + "epoch": 0.10141446490525755, + "grad_norm": 0.1585478037595749, + "learning_rate": 0.0009791322278586145, + "loss": 2.9257, + "step": 3420 + }, + { + "epoch": 0.10144411825756902, + "grad_norm": 0.15524114668369293, + "learning_rate": 0.0009791187746556191, + "loss": 2.9357, + "step": 3421 + }, + { + "epoch": 0.1014737716098805, + "grad_norm": 0.15729965269565582, + "learning_rate": 0.0009791053172099446, + "loss": 2.9729, + "step": 3422 + }, + { + "epoch": 0.10150342496219197, + "grad_norm": 0.16147907078266144, + "learning_rate": 0.0009790918555217106, + "loss": 2.944, + "step": 3423 + }, + { + "epoch": 0.10153307831450345, + "grad_norm": 0.16300909221172333, + "learning_rate": 0.0009790783895910356, + "loss": 2.9424, + "step": 3424 + }, + { + "epoch": 0.10156273166681494, + "grad_norm": 0.1487790048122406, + "learning_rate": 0.0009790649194180395, + "loss": 2.9381, + "step": 3425 + }, + { + "epoch": 0.10159238501912642, + "grad_norm": 0.16722561419010162, + "learning_rate": 0.000979051445002841, + "loss": 2.9928, + "step": 3426 + }, + { + "epoch": 0.10162203837143789, + "grad_norm": 0.19488446414470673, + "learning_rate": 0.0009790379663455599, + "loss": 2.9385, + "step": 3427 + }, + { + "epoch": 0.10165169172374937, + "grad_norm": 0.16035059094429016, + "learning_rate": 0.0009790244834463155, + "loss": 2.9637, + "step": 3428 + }, + { + "epoch": 0.10168134507606084, + "grad_norm": 0.1378241330385208, + "learning_rate": 0.0009790109963052271, + "loss": 2.9914, + "step": 3429 + }, + { + "epoch": 0.10171099842837232, + "grad_norm": 0.1567986160516739, + "learning_rate": 0.0009789975049224139, + "loss": 2.9447, + "step": 3430 + }, + { + "epoch": 0.10174065178068381, + "grad_norm": 0.14219577610492706, + "learning_rate": 0.0009789840092979958, + "loss": 2.9133, + "step": 3431 + }, + { + "epoch": 0.10177030513299529, + "grad_norm": 0.14227741956710815, + "learning_rate": 0.0009789705094320918, + "loss": 2.9754, + "step": 3432 + }, + { + "epoch": 0.10179995848530676, + "grad_norm": 0.14192020893096924, + "learning_rate": 0.0009789570053248219, + "loss": 2.9599, + "step": 3433 + }, + { + "epoch": 0.10182961183761824, + "grad_norm": 0.14189067482948303, + "learning_rate": 0.0009789434969763055, + "loss": 2.9434, + "step": 3434 + }, + { + "epoch": 0.10185926518992972, + "grad_norm": 0.148954838514328, + "learning_rate": 0.0009789299843866622, + "loss": 2.9635, + "step": 3435 + }, + { + "epoch": 0.1018889185422412, + "grad_norm": 0.18272194266319275, + "learning_rate": 0.0009789164675560115, + "loss": 2.9699, + "step": 3436 + }, + { + "epoch": 0.10191857189455268, + "grad_norm": 0.18653079867362976, + "learning_rate": 0.0009789029464844737, + "loss": 2.9813, + "step": 3437 + }, + { + "epoch": 0.10194822524686416, + "grad_norm": 0.17652466893196106, + "learning_rate": 0.0009788894211721678, + "loss": 2.9639, + "step": 3438 + }, + { + "epoch": 0.10197787859917563, + "grad_norm": 0.18006201088428497, + "learning_rate": 0.000978875891619214, + "loss": 2.9338, + "step": 3439 + }, + { + "epoch": 0.10200753195148711, + "grad_norm": 0.17627963423728943, + "learning_rate": 0.0009788623578257318, + "loss": 2.9296, + "step": 3440 + }, + { + "epoch": 0.1020371853037986, + "grad_norm": 0.18426619470119476, + "learning_rate": 0.0009788488197918414, + "loss": 2.9599, + "step": 3441 + }, + { + "epoch": 0.10206683865611008, + "grad_norm": 0.16830484569072723, + "learning_rate": 0.0009788352775176622, + "loss": 2.9529, + "step": 3442 + }, + { + "epoch": 0.10209649200842155, + "grad_norm": 0.16008178889751434, + "learning_rate": 0.000978821731003315, + "loss": 2.9338, + "step": 3443 + }, + { + "epoch": 0.10212614536073303, + "grad_norm": 0.1538759022951126, + "learning_rate": 0.0009788081802489187, + "loss": 2.9503, + "step": 3444 + }, + { + "epoch": 0.1021557987130445, + "grad_norm": 0.1362283080816269, + "learning_rate": 0.000978794625254594, + "loss": 2.9471, + "step": 3445 + }, + { + "epoch": 0.102185452065356, + "grad_norm": 0.14933642745018005, + "learning_rate": 0.0009787810660204605, + "loss": 2.9547, + "step": 3446 + }, + { + "epoch": 0.10221510541766747, + "grad_norm": 0.15534500777721405, + "learning_rate": 0.0009787675025466388, + "loss": 2.9149, + "step": 3447 + }, + { + "epoch": 0.10224475876997895, + "grad_norm": 0.14325447380542755, + "learning_rate": 0.0009787539348332485, + "loss": 2.9526, + "step": 3448 + }, + { + "epoch": 0.10227441212229042, + "grad_norm": 0.15452808141708374, + "learning_rate": 0.0009787403628804098, + "loss": 2.9343, + "step": 3449 + }, + { + "epoch": 0.1023040654746019, + "grad_norm": 0.15612250566482544, + "learning_rate": 0.0009787267866882433, + "loss": 2.9657, + "step": 3450 + }, + { + "epoch": 0.10233371882691339, + "grad_norm": 0.1456039696931839, + "learning_rate": 0.0009787132062568688, + "loss": 2.9742, + "step": 3451 + }, + { + "epoch": 0.10236337217922487, + "grad_norm": 0.1586751937866211, + "learning_rate": 0.0009786996215864067, + "loss": 2.952, + "step": 3452 + }, + { + "epoch": 0.10239302553153634, + "grad_norm": 0.17164981365203857, + "learning_rate": 0.000978686032676977, + "loss": 2.9503, + "step": 3453 + }, + { + "epoch": 0.10242267888384782, + "grad_norm": 0.17269791662693024, + "learning_rate": 0.0009786724395287009, + "loss": 2.9104, + "step": 3454 + }, + { + "epoch": 0.1024523322361593, + "grad_norm": 0.17387771606445312, + "learning_rate": 0.0009786588421416978, + "loss": 2.9362, + "step": 3455 + }, + { + "epoch": 0.10248198558847077, + "grad_norm": 0.20100334286689758, + "learning_rate": 0.0009786452405160884, + "loss": 2.9489, + "step": 3456 + }, + { + "epoch": 0.10251163894078226, + "grad_norm": 0.21561002731323242, + "learning_rate": 0.0009786316346519935, + "loss": 2.9904, + "step": 3457 + }, + { + "epoch": 0.10254129229309374, + "grad_norm": 0.20717878639698029, + "learning_rate": 0.000978618024549533, + "loss": 2.9507, + "step": 3458 + }, + { + "epoch": 0.10257094564540521, + "grad_norm": 0.20047083497047424, + "learning_rate": 0.0009786044102088282, + "loss": 2.9472, + "step": 3459 + }, + { + "epoch": 0.10260059899771669, + "grad_norm": 0.21357807517051697, + "learning_rate": 0.0009785907916299989, + "loss": 2.9621, + "step": 3460 + }, + { + "epoch": 0.10263025235002816, + "grad_norm": 0.16381578147411346, + "learning_rate": 0.000978577168813166, + "loss": 2.9406, + "step": 3461 + }, + { + "epoch": 0.10265990570233965, + "grad_norm": 0.16813522577285767, + "learning_rate": 0.0009785635417584502, + "loss": 2.9913, + "step": 3462 + }, + { + "epoch": 0.10268955905465113, + "grad_norm": 0.16864991188049316, + "learning_rate": 0.000978549910465972, + "loss": 2.9488, + "step": 3463 + }, + { + "epoch": 0.1027192124069626, + "grad_norm": 0.1410435289144516, + "learning_rate": 0.0009785362749358522, + "loss": 2.9288, + "step": 3464 + }, + { + "epoch": 0.10274886575927408, + "grad_norm": 0.15575136244297028, + "learning_rate": 0.0009785226351682115, + "loss": 2.9698, + "step": 3465 + }, + { + "epoch": 0.10277851911158556, + "grad_norm": 0.135243758559227, + "learning_rate": 0.000978508991163171, + "loss": 2.9479, + "step": 3466 + }, + { + "epoch": 0.10280817246389705, + "grad_norm": 0.1287325620651245, + "learning_rate": 0.000978495342920851, + "loss": 2.9482, + "step": 3467 + }, + { + "epoch": 0.10283782581620853, + "grad_norm": 0.1321336179971695, + "learning_rate": 0.0009784816904413727, + "loss": 2.9343, + "step": 3468 + }, + { + "epoch": 0.10286747916852, + "grad_norm": 0.1372213512659073, + "learning_rate": 0.0009784680337248568, + "loss": 2.96, + "step": 3469 + }, + { + "epoch": 0.10289713252083148, + "grad_norm": 0.1415642946958542, + "learning_rate": 0.0009784543727714245, + "loss": 2.9662, + "step": 3470 + }, + { + "epoch": 0.10292678587314295, + "grad_norm": 0.1851942539215088, + "learning_rate": 0.0009784407075811965, + "loss": 2.9399, + "step": 3471 + }, + { + "epoch": 0.10295643922545444, + "grad_norm": 0.16251049935817719, + "learning_rate": 0.000978427038154294, + "loss": 2.9232, + "step": 3472 + }, + { + "epoch": 0.10298609257776592, + "grad_norm": 0.15216007828712463, + "learning_rate": 0.0009784133644908377, + "loss": 2.9321, + "step": 3473 + }, + { + "epoch": 0.1030157459300774, + "grad_norm": 0.14852380752563477, + "learning_rate": 0.0009783996865909493, + "loss": 2.9639, + "step": 3474 + }, + { + "epoch": 0.10304539928238887, + "grad_norm": 0.16529947519302368, + "learning_rate": 0.0009783860044547492, + "loss": 2.9408, + "step": 3475 + }, + { + "epoch": 0.10307505263470035, + "grad_norm": 0.1660824716091156, + "learning_rate": 0.0009783723180823592, + "loss": 2.948, + "step": 3476 + }, + { + "epoch": 0.10310470598701184, + "grad_norm": 0.16698846220970154, + "learning_rate": 0.0009783586274739, + "loss": 2.9552, + "step": 3477 + }, + { + "epoch": 0.10313435933932331, + "grad_norm": 0.1687716692686081, + "learning_rate": 0.0009783449326294933, + "loss": 2.9767, + "step": 3478 + }, + { + "epoch": 0.10316401269163479, + "grad_norm": 0.14219971001148224, + "learning_rate": 0.0009783312335492598, + "loss": 2.9504, + "step": 3479 + }, + { + "epoch": 0.10319366604394627, + "grad_norm": 0.14973269402980804, + "learning_rate": 0.0009783175302333215, + "loss": 2.9118, + "step": 3480 + }, + { + "epoch": 0.10322331939625774, + "grad_norm": 0.13330131769180298, + "learning_rate": 0.000978303822681799, + "loss": 2.9446, + "step": 3481 + }, + { + "epoch": 0.10325297274856922, + "grad_norm": 0.14671222865581512, + "learning_rate": 0.0009782901108948143, + "loss": 2.9565, + "step": 3482 + }, + { + "epoch": 0.10328262610088071, + "grad_norm": 0.15898439288139343, + "learning_rate": 0.0009782763948724884, + "loss": 2.9611, + "step": 3483 + }, + { + "epoch": 0.10331227945319219, + "grad_norm": 0.1668369174003601, + "learning_rate": 0.000978262674614943, + "loss": 2.9385, + "step": 3484 + }, + { + "epoch": 0.10334193280550366, + "grad_norm": 0.16168446838855743, + "learning_rate": 0.0009782489501222996, + "loss": 2.9618, + "step": 3485 + }, + { + "epoch": 0.10337158615781514, + "grad_norm": 0.14308470487594604, + "learning_rate": 0.0009782352213946795, + "loss": 2.9528, + "step": 3486 + }, + { + "epoch": 0.10340123951012661, + "grad_norm": 0.14896653592586517, + "learning_rate": 0.0009782214884322047, + "loss": 2.9757, + "step": 3487 + }, + { + "epoch": 0.1034308928624381, + "grad_norm": 0.17187471687793732, + "learning_rate": 0.0009782077512349963, + "loss": 2.9864, + "step": 3488 + }, + { + "epoch": 0.10346054621474958, + "grad_norm": 0.15762417018413544, + "learning_rate": 0.000978194009803176, + "loss": 2.9675, + "step": 3489 + }, + { + "epoch": 0.10349019956706106, + "grad_norm": 0.15235452353954315, + "learning_rate": 0.000978180264136866, + "loss": 2.9323, + "step": 3490 + }, + { + "epoch": 0.10351985291937253, + "grad_norm": 0.16271136701107025, + "learning_rate": 0.0009781665142361876, + "loss": 2.9509, + "step": 3491 + }, + { + "epoch": 0.10354950627168401, + "grad_norm": 0.15817205607891083, + "learning_rate": 0.0009781527601012627, + "loss": 2.9447, + "step": 3492 + }, + { + "epoch": 0.1035791596239955, + "grad_norm": 0.16840769350528717, + "learning_rate": 0.000978139001732213, + "loss": 2.9405, + "step": 3493 + }, + { + "epoch": 0.10360881297630697, + "grad_norm": 0.1877451092004776, + "learning_rate": 0.00097812523912916, + "loss": 2.9114, + "step": 3494 + }, + { + "epoch": 0.10363846632861845, + "grad_norm": 0.18461652100086212, + "learning_rate": 0.0009781114722922264, + "loss": 2.9601, + "step": 3495 + }, + { + "epoch": 0.10366811968092993, + "grad_norm": 0.2050796002149582, + "learning_rate": 0.0009780977012215336, + "loss": 2.9631, + "step": 3496 + }, + { + "epoch": 0.1036977730332414, + "grad_norm": 0.20181527733802795, + "learning_rate": 0.0009780839259172034, + "loss": 2.9619, + "step": 3497 + }, + { + "epoch": 0.10372742638555289, + "grad_norm": 0.16516296565532684, + "learning_rate": 0.000978070146379358, + "loss": 2.9548, + "step": 3498 + }, + { + "epoch": 0.10375707973786437, + "grad_norm": 0.19105318188667297, + "learning_rate": 0.0009780563626081197, + "loss": 2.932, + "step": 3499 + }, + { + "epoch": 0.10378673309017585, + "grad_norm": 0.2177199423313141, + "learning_rate": 0.0009780425746036098, + "loss": 2.9258, + "step": 3500 + }, + { + "epoch": 0.10381638644248732, + "grad_norm": 0.24548453092575073, + "learning_rate": 0.000978028782365951, + "loss": 2.9557, + "step": 3501 + }, + { + "epoch": 0.1038460397947988, + "grad_norm": 0.2535386383533478, + "learning_rate": 0.0009780149858952653, + "loss": 2.9425, + "step": 3502 + }, + { + "epoch": 0.10387569314711029, + "grad_norm": 0.22209268808364868, + "learning_rate": 0.000978001185191675, + "loss": 2.927, + "step": 3503 + }, + { + "epoch": 0.10390534649942176, + "grad_norm": 0.17930319905281067, + "learning_rate": 0.0009779873802553021, + "loss": 2.9441, + "step": 3504 + }, + { + "epoch": 0.10393499985173324, + "grad_norm": 0.14683732390403748, + "learning_rate": 0.0009779735710862688, + "loss": 2.9988, + "step": 3505 + }, + { + "epoch": 0.10396465320404472, + "grad_norm": 0.12826813757419586, + "learning_rate": 0.0009779597576846976, + "loss": 2.9502, + "step": 3506 + }, + { + "epoch": 0.10399430655635619, + "grad_norm": 0.13478600978851318, + "learning_rate": 0.0009779459400507105, + "loss": 2.9348, + "step": 3507 + }, + { + "epoch": 0.10402395990866767, + "grad_norm": 0.11923857778310776, + "learning_rate": 0.0009779321181844303, + "loss": 2.9342, + "step": 3508 + }, + { + "epoch": 0.10405361326097916, + "grad_norm": 0.12547893822193146, + "learning_rate": 0.000977918292085979, + "loss": 2.9362, + "step": 3509 + }, + { + "epoch": 0.10408326661329063, + "grad_norm": 0.11113981902599335, + "learning_rate": 0.0009779044617554793, + "loss": 2.9654, + "step": 3510 + }, + { + "epoch": 0.10411291996560211, + "grad_norm": 0.12207842618227005, + "learning_rate": 0.0009778906271930535, + "loss": 2.9337, + "step": 3511 + }, + { + "epoch": 0.10414257331791359, + "grad_norm": 0.15175575017929077, + "learning_rate": 0.0009778767883988242, + "loss": 2.9581, + "step": 3512 + }, + { + "epoch": 0.10417222667022506, + "grad_norm": 0.18094773590564728, + "learning_rate": 0.0009778629453729138, + "loss": 2.9288, + "step": 3513 + }, + { + "epoch": 0.10420188002253655, + "grad_norm": 0.15631632506847382, + "learning_rate": 0.000977849098115445, + "loss": 3.0059, + "step": 3514 + }, + { + "epoch": 0.10423153337484803, + "grad_norm": 0.16196517646312714, + "learning_rate": 0.0009778352466265406, + "loss": 2.9012, + "step": 3515 + }, + { + "epoch": 0.1042611867271595, + "grad_norm": 0.17984351515769958, + "learning_rate": 0.000977821390906323, + "loss": 2.9761, + "step": 3516 + }, + { + "epoch": 0.10429084007947098, + "grad_norm": 0.17154169082641602, + "learning_rate": 0.000977807530954915, + "loss": 2.938, + "step": 3517 + }, + { + "epoch": 0.10432049343178246, + "grad_norm": 0.15594589710235596, + "learning_rate": 0.0009777936667724391, + "loss": 2.9237, + "step": 3518 + }, + { + "epoch": 0.10435014678409395, + "grad_norm": 0.1459856778383255, + "learning_rate": 0.0009777797983590185, + "loss": 2.9475, + "step": 3519 + }, + { + "epoch": 0.10437980013640542, + "grad_norm": 0.1414250284433365, + "learning_rate": 0.0009777659257147757, + "loss": 2.9426, + "step": 3520 + }, + { + "epoch": 0.1044094534887169, + "grad_norm": 0.13926897943019867, + "learning_rate": 0.0009777520488398336, + "loss": 2.9794, + "step": 3521 + }, + { + "epoch": 0.10443910684102838, + "grad_norm": 0.15293250977993011, + "learning_rate": 0.000977738167734315, + "loss": 2.9718, + "step": 3522 + }, + { + "epoch": 0.10446876019333985, + "grad_norm": 0.15621760487556458, + "learning_rate": 0.0009777242823983431, + "loss": 2.967, + "step": 3523 + }, + { + "epoch": 0.10449841354565134, + "grad_norm": 0.15032747387886047, + "learning_rate": 0.0009777103928320405, + "loss": 2.9505, + "step": 3524 + }, + { + "epoch": 0.10452806689796282, + "grad_norm": 0.15598715841770172, + "learning_rate": 0.0009776964990355307, + "loss": 2.9457, + "step": 3525 + }, + { + "epoch": 0.1045577202502743, + "grad_norm": 0.1781998574733734, + "learning_rate": 0.000977682601008936, + "loss": 2.954, + "step": 3526 + }, + { + "epoch": 0.10458737360258577, + "grad_norm": 0.18333503603935242, + "learning_rate": 0.00097766869875238, + "loss": 2.9561, + "step": 3527 + }, + { + "epoch": 0.10461702695489725, + "grad_norm": 0.15871044993400574, + "learning_rate": 0.0009776547922659858, + "loss": 2.9386, + "step": 3528 + }, + { + "epoch": 0.10464668030720874, + "grad_norm": 0.18203182518482208, + "learning_rate": 0.0009776408815498764, + "loss": 2.9513, + "step": 3529 + }, + { + "epoch": 0.10467633365952021, + "grad_norm": 0.19013351202011108, + "learning_rate": 0.0009776269666041748, + "loss": 2.9738, + "step": 3530 + }, + { + "epoch": 0.10470598701183169, + "grad_norm": 0.15939095616340637, + "learning_rate": 0.0009776130474290047, + "loss": 2.9528, + "step": 3531 + }, + { + "epoch": 0.10473564036414316, + "grad_norm": 0.14195537567138672, + "learning_rate": 0.000977599124024489, + "loss": 2.948, + "step": 3532 + }, + { + "epoch": 0.10476529371645464, + "grad_norm": 0.13967320322990417, + "learning_rate": 0.000977585196390751, + "loss": 2.956, + "step": 3533 + }, + { + "epoch": 0.10479494706876612, + "grad_norm": 0.14353784918785095, + "learning_rate": 0.0009775712645279143, + "loss": 2.9357, + "step": 3534 + }, + { + "epoch": 0.10482460042107761, + "grad_norm": 0.16876599192619324, + "learning_rate": 0.0009775573284361019, + "loss": 2.9662, + "step": 3535 + }, + { + "epoch": 0.10485425377338908, + "grad_norm": 0.20293359458446503, + "learning_rate": 0.0009775433881154373, + "loss": 2.9716, + "step": 3536 + }, + { + "epoch": 0.10488390712570056, + "grad_norm": 0.20669957995414734, + "learning_rate": 0.0009775294435660441, + "loss": 2.9265, + "step": 3537 + }, + { + "epoch": 0.10491356047801204, + "grad_norm": 0.20803388953208923, + "learning_rate": 0.000977515494788046, + "loss": 2.9335, + "step": 3538 + }, + { + "epoch": 0.10494321383032351, + "grad_norm": 0.18168140947818756, + "learning_rate": 0.000977501541781566, + "loss": 2.9446, + "step": 3539 + }, + { + "epoch": 0.104972867182635, + "grad_norm": 0.19305506348609924, + "learning_rate": 0.0009774875845467278, + "loss": 2.9765, + "step": 3540 + }, + { + "epoch": 0.10500252053494648, + "grad_norm": 0.17285272479057312, + "learning_rate": 0.0009774736230836552, + "loss": 2.9428, + "step": 3541 + }, + { + "epoch": 0.10503217388725795, + "grad_norm": 0.16861332952976227, + "learning_rate": 0.0009774596573924716, + "loss": 2.9364, + "step": 3542 + }, + { + "epoch": 0.10506182723956943, + "grad_norm": 0.13485410809516907, + "learning_rate": 0.000977445687473301, + "loss": 2.964, + "step": 3543 + }, + { + "epoch": 0.1050914805918809, + "grad_norm": 0.14682742953300476, + "learning_rate": 0.0009774317133262667, + "loss": 2.9417, + "step": 3544 + }, + { + "epoch": 0.1051211339441924, + "grad_norm": 0.14166513085365295, + "learning_rate": 0.0009774177349514926, + "loss": 2.9416, + "step": 3545 + }, + { + "epoch": 0.10515078729650387, + "grad_norm": 0.1424960494041443, + "learning_rate": 0.0009774037523491027, + "loss": 2.9662, + "step": 3546 + }, + { + "epoch": 0.10518044064881535, + "grad_norm": 0.129830464720726, + "learning_rate": 0.0009773897655192204, + "loss": 2.9032, + "step": 3547 + }, + { + "epoch": 0.10521009400112682, + "grad_norm": 0.1178160235285759, + "learning_rate": 0.00097737577446197, + "loss": 2.9459, + "step": 3548 + }, + { + "epoch": 0.1052397473534383, + "grad_norm": 0.13118162751197815, + "learning_rate": 0.0009773617791774749, + "loss": 2.9792, + "step": 3549 + }, + { + "epoch": 0.10526940070574979, + "grad_norm": 0.1356925070285797, + "learning_rate": 0.0009773477796658596, + "loss": 2.9413, + "step": 3550 + }, + { + "epoch": 0.10529905405806127, + "grad_norm": 0.1379304826259613, + "learning_rate": 0.0009773337759272475, + "loss": 2.975, + "step": 3551 + }, + { + "epoch": 0.10532870741037274, + "grad_norm": 0.15779976546764374, + "learning_rate": 0.0009773197679617631, + "loss": 2.9702, + "step": 3552 + }, + { + "epoch": 0.10535836076268422, + "grad_norm": 0.16222739219665527, + "learning_rate": 0.0009773057557695302, + "loss": 2.9312, + "step": 3553 + }, + { + "epoch": 0.1053880141149957, + "grad_norm": 0.18102434277534485, + "learning_rate": 0.0009772917393506728, + "loss": 2.9425, + "step": 3554 + }, + { + "epoch": 0.10541766746730719, + "grad_norm": 0.1572980135679245, + "learning_rate": 0.0009772777187053152, + "loss": 2.9646, + "step": 3555 + }, + { + "epoch": 0.10544732081961866, + "grad_norm": 0.14238926768302917, + "learning_rate": 0.0009772636938335814, + "loss": 2.9296, + "step": 3556 + }, + { + "epoch": 0.10547697417193014, + "grad_norm": 0.1627216339111328, + "learning_rate": 0.000977249664735596, + "loss": 2.9243, + "step": 3557 + }, + { + "epoch": 0.10550662752424161, + "grad_norm": 0.15766312181949615, + "learning_rate": 0.0009772356314114825, + "loss": 2.9378, + "step": 3558 + }, + { + "epoch": 0.10553628087655309, + "grad_norm": 0.13370990753173828, + "learning_rate": 0.0009772215938613656, + "loss": 2.9306, + "step": 3559 + }, + { + "epoch": 0.10556593422886457, + "grad_norm": 0.1433338224887848, + "learning_rate": 0.0009772075520853697, + "loss": 2.9723, + "step": 3560 + }, + { + "epoch": 0.10559558758117606, + "grad_norm": 0.14272911846637726, + "learning_rate": 0.000977193506083619, + "loss": 2.9561, + "step": 3561 + }, + { + "epoch": 0.10562524093348753, + "grad_norm": 0.15765202045440674, + "learning_rate": 0.0009771794558562379, + "loss": 2.9598, + "step": 3562 + }, + { + "epoch": 0.10565489428579901, + "grad_norm": 0.1857604682445526, + "learning_rate": 0.0009771654014033508, + "loss": 2.9334, + "step": 3563 + }, + { + "epoch": 0.10568454763811048, + "grad_norm": 0.193972647190094, + "learning_rate": 0.0009771513427250821, + "loss": 2.9394, + "step": 3564 + }, + { + "epoch": 0.10571420099042196, + "grad_norm": 0.18693307042121887, + "learning_rate": 0.0009771372798215564, + "loss": 2.9608, + "step": 3565 + }, + { + "epoch": 0.10574385434273345, + "grad_norm": 0.19384267926216125, + "learning_rate": 0.0009771232126928981, + "loss": 2.9352, + "step": 3566 + }, + { + "epoch": 0.10577350769504493, + "grad_norm": 0.16638396680355072, + "learning_rate": 0.000977109141339232, + "loss": 2.9353, + "step": 3567 + }, + { + "epoch": 0.1058031610473564, + "grad_norm": 0.1559734344482422, + "learning_rate": 0.0009770950657606826, + "loss": 2.9565, + "step": 3568 + }, + { + "epoch": 0.10583281439966788, + "grad_norm": 0.1442573219537735, + "learning_rate": 0.0009770809859573743, + "loss": 2.9316, + "step": 3569 + }, + { + "epoch": 0.10586246775197936, + "grad_norm": 0.1300106644630432, + "learning_rate": 0.0009770669019294324, + "loss": 2.9444, + "step": 3570 + }, + { + "epoch": 0.10589212110429085, + "grad_norm": 0.1401672512292862, + "learning_rate": 0.0009770528136769808, + "loss": 2.937, + "step": 3571 + }, + { + "epoch": 0.10592177445660232, + "grad_norm": 0.1323709338903427, + "learning_rate": 0.0009770387212001447, + "loss": 2.9367, + "step": 3572 + }, + { + "epoch": 0.1059514278089138, + "grad_norm": 0.11538819968700409, + "learning_rate": 0.0009770246244990488, + "loss": 2.9163, + "step": 3573 + }, + { + "epoch": 0.10598108116122527, + "grad_norm": 0.1191568449139595, + "learning_rate": 0.000977010523573818, + "loss": 2.9378, + "step": 3574 + }, + { + "epoch": 0.10601073451353675, + "grad_norm": 0.1446761041879654, + "learning_rate": 0.0009769964184245773, + "loss": 2.9549, + "step": 3575 + }, + { + "epoch": 0.10604038786584824, + "grad_norm": 0.17943990230560303, + "learning_rate": 0.0009769823090514513, + "loss": 2.931, + "step": 3576 + }, + { + "epoch": 0.10607004121815972, + "grad_norm": 0.21452952921390533, + "learning_rate": 0.000976968195454565, + "loss": 2.9706, + "step": 3577 + }, + { + "epoch": 0.10609969457047119, + "grad_norm": 0.21611851453781128, + "learning_rate": 0.0009769540776340434, + "loss": 2.9681, + "step": 3578 + }, + { + "epoch": 0.10612934792278267, + "grad_norm": 0.1853756308555603, + "learning_rate": 0.0009769399555900119, + "loss": 2.9336, + "step": 3579 + }, + { + "epoch": 0.10615900127509414, + "grad_norm": 0.19656331837177277, + "learning_rate": 0.0009769258293225952, + "loss": 2.9383, + "step": 3580 + }, + { + "epoch": 0.10618865462740563, + "grad_norm": 0.22067545354366302, + "learning_rate": 0.0009769116988319181, + "loss": 2.9269, + "step": 3581 + }, + { + "epoch": 0.10621830797971711, + "grad_norm": 0.21135982871055603, + "learning_rate": 0.0009768975641181064, + "loss": 2.9458, + "step": 3582 + }, + { + "epoch": 0.10624796133202859, + "grad_norm": 0.17845730483531952, + "learning_rate": 0.0009768834251812845, + "loss": 2.972, + "step": 3583 + }, + { + "epoch": 0.10627761468434006, + "grad_norm": 0.14968733489513397, + "learning_rate": 0.0009768692820215784, + "loss": 2.9384, + "step": 3584 + }, + { + "epoch": 0.10630726803665154, + "grad_norm": 0.1548340916633606, + "learning_rate": 0.0009768551346391128, + "loss": 2.9257, + "step": 3585 + }, + { + "epoch": 0.10633692138896302, + "grad_norm": 0.14744408428668976, + "learning_rate": 0.0009768409830340132, + "loss": 2.9359, + "step": 3586 + }, + { + "epoch": 0.1063665747412745, + "grad_norm": 0.15153944492340088, + "learning_rate": 0.0009768268272064048, + "loss": 2.9374, + "step": 3587 + }, + { + "epoch": 0.10639622809358598, + "grad_norm": 0.16003437340259552, + "learning_rate": 0.0009768126671564129, + "loss": 2.9771, + "step": 3588 + }, + { + "epoch": 0.10642588144589746, + "grad_norm": 0.1704113632440567, + "learning_rate": 0.000976798502884163, + "loss": 2.9598, + "step": 3589 + }, + { + "epoch": 0.10645553479820893, + "grad_norm": 0.17690221965312958, + "learning_rate": 0.0009767843343897807, + "loss": 2.9429, + "step": 3590 + }, + { + "epoch": 0.10648518815052041, + "grad_norm": 0.17671607434749603, + "learning_rate": 0.000976770161673391, + "loss": 2.9177, + "step": 3591 + }, + { + "epoch": 0.1065148415028319, + "grad_norm": 0.16344031691551208, + "learning_rate": 0.00097675598473512, + "loss": 2.9343, + "step": 3592 + }, + { + "epoch": 0.10654449485514338, + "grad_norm": 0.17047274112701416, + "learning_rate": 0.0009767418035750927, + "loss": 2.9336, + "step": 3593 + }, + { + "epoch": 0.10657414820745485, + "grad_norm": 0.1717672049999237, + "learning_rate": 0.000976727618193435, + "loss": 2.9201, + "step": 3594 + }, + { + "epoch": 0.10660380155976633, + "grad_norm": 0.14435285329818726, + "learning_rate": 0.0009767134285902724, + "loss": 2.9636, + "step": 3595 + }, + { + "epoch": 0.1066334549120778, + "grad_norm": 0.14212223887443542, + "learning_rate": 0.0009766992347657307, + "loss": 2.9123, + "step": 3596 + }, + { + "epoch": 0.1066631082643893, + "grad_norm": 0.1606818288564682, + "learning_rate": 0.0009766850367199352, + "loss": 2.9307, + "step": 3597 + }, + { + "epoch": 0.10669276161670077, + "grad_norm": 0.14956046640872955, + "learning_rate": 0.000976670834453012, + "loss": 2.9498, + "step": 3598 + }, + { + "epoch": 0.10672241496901225, + "grad_norm": 0.16999253630638123, + "learning_rate": 0.0009766566279650866, + "loss": 2.9475, + "step": 3599 + }, + { + "epoch": 0.10675206832132372, + "grad_norm": 0.17775794863700867, + "learning_rate": 0.000976642417256285, + "loss": 2.9385, + "step": 3600 + }, + { + "epoch": 0.1067817216736352, + "grad_norm": 0.17093852162361145, + "learning_rate": 0.0009766282023267333, + "loss": 2.9879, + "step": 3601 + }, + { + "epoch": 0.10681137502594669, + "grad_norm": 0.16493988037109375, + "learning_rate": 0.0009766139831765565, + "loss": 2.9584, + "step": 3602 + }, + { + "epoch": 0.10684102837825817, + "grad_norm": 0.15904569625854492, + "learning_rate": 0.0009765997598058815, + "loss": 2.9457, + "step": 3603 + }, + { + "epoch": 0.10687068173056964, + "grad_norm": 0.14979998767375946, + "learning_rate": 0.0009765855322148337, + "loss": 2.9243, + "step": 3604 + }, + { + "epoch": 0.10690033508288112, + "grad_norm": 0.1279379278421402, + "learning_rate": 0.0009765713004035391, + "loss": 2.9385, + "step": 3605 + }, + { + "epoch": 0.1069299884351926, + "grad_norm": 0.14371679723262787, + "learning_rate": 0.000976557064372124, + "loss": 2.9761, + "step": 3606 + }, + { + "epoch": 0.10695964178750408, + "grad_norm": 0.14642076194286346, + "learning_rate": 0.0009765428241207142, + "loss": 2.9189, + "step": 3607 + }, + { + "epoch": 0.10698929513981556, + "grad_norm": 0.14939601719379425, + "learning_rate": 0.0009765285796494359, + "loss": 2.9441, + "step": 3608 + }, + { + "epoch": 0.10701894849212704, + "grad_norm": 0.16206921637058258, + "learning_rate": 0.0009765143309584152, + "loss": 2.9581, + "step": 3609 + }, + { + "epoch": 0.10704860184443851, + "grad_norm": 0.15495698153972626, + "learning_rate": 0.0009765000780477784, + "loss": 2.9457, + "step": 3610 + }, + { + "epoch": 0.10707825519674999, + "grad_norm": 0.16241461038589478, + "learning_rate": 0.0009764858209176517, + "loss": 2.9279, + "step": 3611 + }, + { + "epoch": 0.10710790854906146, + "grad_norm": 0.1561061441898346, + "learning_rate": 0.000976471559568161, + "loss": 2.9173, + "step": 3612 + }, + { + "epoch": 0.10713756190137295, + "grad_norm": 0.17560705542564392, + "learning_rate": 0.0009764572939994331, + "loss": 2.9412, + "step": 3613 + }, + { + "epoch": 0.10716721525368443, + "grad_norm": 0.1544586718082428, + "learning_rate": 0.0009764430242115939, + "loss": 2.9639, + "step": 3614 + }, + { + "epoch": 0.10719686860599591, + "grad_norm": 0.1521182656288147, + "learning_rate": 0.00097642875020477, + "loss": 2.9751, + "step": 3615 + }, + { + "epoch": 0.10722652195830738, + "grad_norm": 0.16991320252418518, + "learning_rate": 0.0009764144719790878, + "loss": 2.9805, + "step": 3616 + }, + { + "epoch": 0.10725617531061886, + "grad_norm": 0.15570957958698273, + "learning_rate": 0.0009764001895346736, + "loss": 2.9096, + "step": 3617 + }, + { + "epoch": 0.10728582866293035, + "grad_norm": 0.15609031915664673, + "learning_rate": 0.0009763859028716539, + "loss": 2.9433, + "step": 3618 + }, + { + "epoch": 0.10731548201524183, + "grad_norm": 0.14207719266414642, + "learning_rate": 0.0009763716119901555, + "loss": 2.9107, + "step": 3619 + }, + { + "epoch": 0.1073451353675533, + "grad_norm": 0.137034073472023, + "learning_rate": 0.0009763573168903045, + "loss": 2.9419, + "step": 3620 + }, + { + "epoch": 0.10737478871986478, + "grad_norm": 0.16401609778404236, + "learning_rate": 0.0009763430175722277, + "loss": 2.9323, + "step": 3621 + }, + { + "epoch": 0.10740444207217625, + "grad_norm": 0.16115137934684753, + "learning_rate": 0.0009763287140360517, + "loss": 2.9766, + "step": 3622 + }, + { + "epoch": 0.10743409542448774, + "grad_norm": 0.13544835150241852, + "learning_rate": 0.0009763144062819033, + "loss": 2.943, + "step": 3623 + }, + { + "epoch": 0.10746374877679922, + "grad_norm": 0.15968303382396698, + "learning_rate": 0.000976300094309909, + "loss": 2.973, + "step": 3624 + }, + { + "epoch": 0.1074934021291107, + "grad_norm": 0.17991599440574646, + "learning_rate": 0.0009762857781201956, + "loss": 2.9492, + "step": 3625 + }, + { + "epoch": 0.10752305548142217, + "grad_norm": 0.1743585169315338, + "learning_rate": 0.00097627145771289, + "loss": 2.9601, + "step": 3626 + }, + { + "epoch": 0.10755270883373365, + "grad_norm": 0.19485867023468018, + "learning_rate": 0.0009762571330881187, + "loss": 2.9523, + "step": 3627 + }, + { + "epoch": 0.10758236218604514, + "grad_norm": 0.22181600332260132, + "learning_rate": 0.0009762428042460088, + "loss": 2.9293, + "step": 3628 + }, + { + "epoch": 0.10761201553835661, + "grad_norm": 0.20760945975780487, + "learning_rate": 0.0009762284711866871, + "loss": 2.9568, + "step": 3629 + }, + { + "epoch": 0.10764166889066809, + "grad_norm": 0.17942743003368378, + "learning_rate": 0.0009762141339102805, + "loss": 2.9512, + "step": 3630 + }, + { + "epoch": 0.10767132224297957, + "grad_norm": 0.1872245818376541, + "learning_rate": 0.0009761997924169162, + "loss": 2.9357, + "step": 3631 + }, + { + "epoch": 0.10770097559529104, + "grad_norm": 0.14627978205680847, + "learning_rate": 0.000976185446706721, + "loss": 2.9298, + "step": 3632 + }, + { + "epoch": 0.10773062894760253, + "grad_norm": 0.15689362585544586, + "learning_rate": 0.0009761710967798217, + "loss": 2.9417, + "step": 3633 + }, + { + "epoch": 0.10776028229991401, + "grad_norm": 0.19029273092746735, + "learning_rate": 0.0009761567426363458, + "loss": 2.9604, + "step": 3634 + }, + { + "epoch": 0.10778993565222549, + "grad_norm": 0.1880149394273758, + "learning_rate": 0.0009761423842764201, + "loss": 2.9329, + "step": 3635 + }, + { + "epoch": 0.10781958900453696, + "grad_norm": 0.1574995219707489, + "learning_rate": 0.0009761280217001719, + "loss": 2.9451, + "step": 3636 + }, + { + "epoch": 0.10784924235684844, + "grad_norm": 0.14267711341381073, + "learning_rate": 0.0009761136549077283, + "loss": 2.9263, + "step": 3637 + }, + { + "epoch": 0.10787889570915991, + "grad_norm": 0.14362278580665588, + "learning_rate": 0.0009760992838992167, + "loss": 2.9665, + "step": 3638 + }, + { + "epoch": 0.1079085490614714, + "grad_norm": 0.1391342431306839, + "learning_rate": 0.0009760849086747641, + "loss": 2.948, + "step": 3639 + }, + { + "epoch": 0.10793820241378288, + "grad_norm": 0.13671064376831055, + "learning_rate": 0.0009760705292344979, + "loss": 2.9505, + "step": 3640 + }, + { + "epoch": 0.10796785576609436, + "grad_norm": 0.15027423202991486, + "learning_rate": 0.0009760561455785455, + "loss": 2.9537, + "step": 3641 + }, + { + "epoch": 0.10799750911840583, + "grad_norm": 0.17171186208724976, + "learning_rate": 0.0009760417577070341, + "loss": 2.9326, + "step": 3642 + }, + { + "epoch": 0.10802716247071731, + "grad_norm": 0.17413659393787384, + "learning_rate": 0.0009760273656200915, + "loss": 2.9432, + "step": 3643 + }, + { + "epoch": 0.1080568158230288, + "grad_norm": 0.16176728904247284, + "learning_rate": 0.0009760129693178446, + "loss": 2.9106, + "step": 3644 + }, + { + "epoch": 0.10808646917534027, + "grad_norm": 0.15205571055412292, + "learning_rate": 0.0009759985688004214, + "loss": 2.9554, + "step": 3645 + }, + { + "epoch": 0.10811612252765175, + "grad_norm": 0.15744240581989288, + "learning_rate": 0.0009759841640679488, + "loss": 2.9653, + "step": 3646 + }, + { + "epoch": 0.10814577587996323, + "grad_norm": 0.13845059275627136, + "learning_rate": 0.0009759697551205551, + "loss": 2.9263, + "step": 3647 + }, + { + "epoch": 0.1081754292322747, + "grad_norm": 0.15365970134735107, + "learning_rate": 0.0009759553419583674, + "loss": 2.9369, + "step": 3648 + }, + { + "epoch": 0.10820508258458619, + "grad_norm": 0.15877234935760498, + "learning_rate": 0.0009759409245815132, + "loss": 2.9299, + "step": 3649 + }, + { + "epoch": 0.10823473593689767, + "grad_norm": 0.14520642161369324, + "learning_rate": 0.0009759265029901208, + "loss": 2.9618, + "step": 3650 + }, + { + "epoch": 0.10826438928920915, + "grad_norm": 0.12489362061023712, + "learning_rate": 0.0009759120771843173, + "loss": 2.9453, + "step": 3651 + }, + { + "epoch": 0.10829404264152062, + "grad_norm": 0.14025777578353882, + "learning_rate": 0.0009758976471642307, + "loss": 2.9584, + "step": 3652 + }, + { + "epoch": 0.1083236959938321, + "grad_norm": 0.14994646608829498, + "learning_rate": 0.0009758832129299888, + "loss": 2.9055, + "step": 3653 + }, + { + "epoch": 0.10835334934614359, + "grad_norm": 0.16357649862766266, + "learning_rate": 0.0009758687744817193, + "loss": 2.9357, + "step": 3654 + }, + { + "epoch": 0.10838300269845506, + "grad_norm": 0.1737295389175415, + "learning_rate": 0.0009758543318195501, + "loss": 2.9404, + "step": 3655 + }, + { + "epoch": 0.10841265605076654, + "grad_norm": 0.1802111715078354, + "learning_rate": 0.0009758398849436092, + "loss": 2.9378, + "step": 3656 + }, + { + "epoch": 0.10844230940307802, + "grad_norm": 0.1641305387020111, + "learning_rate": 0.0009758254338540245, + "loss": 2.9396, + "step": 3657 + }, + { + "epoch": 0.10847196275538949, + "grad_norm": 0.17000971734523773, + "learning_rate": 0.0009758109785509237, + "loss": 2.9537, + "step": 3658 + }, + { + "epoch": 0.10850161610770098, + "grad_norm": 0.20643194019794464, + "learning_rate": 0.0009757965190344351, + "loss": 2.9561, + "step": 3659 + }, + { + "epoch": 0.10853126946001246, + "grad_norm": 0.20538541674613953, + "learning_rate": 0.0009757820553046867, + "loss": 2.9591, + "step": 3660 + }, + { + "epoch": 0.10856092281232393, + "grad_norm": 0.159511536359787, + "learning_rate": 0.0009757675873618067, + "loss": 2.9428, + "step": 3661 + }, + { + "epoch": 0.10859057616463541, + "grad_norm": 0.16453097760677338, + "learning_rate": 0.0009757531152059227, + "loss": 2.9011, + "step": 3662 + }, + { + "epoch": 0.10862022951694689, + "grad_norm": 0.1857786774635315, + "learning_rate": 0.0009757386388371634, + "loss": 2.9401, + "step": 3663 + }, + { + "epoch": 0.10864988286925836, + "grad_norm": 0.1658896952867508, + "learning_rate": 0.0009757241582556567, + "loss": 2.9403, + "step": 3664 + }, + { + "epoch": 0.10867953622156985, + "grad_norm": 0.15523378551006317, + "learning_rate": 0.0009757096734615311, + "loss": 2.9102, + "step": 3665 + }, + { + "epoch": 0.10870918957388133, + "grad_norm": 0.14469346404075623, + "learning_rate": 0.0009756951844549145, + "loss": 2.8937, + "step": 3666 + }, + { + "epoch": 0.1087388429261928, + "grad_norm": 0.1409071534872055, + "learning_rate": 0.0009756806912359355, + "loss": 2.9288, + "step": 3667 + }, + { + "epoch": 0.10876849627850428, + "grad_norm": 0.1487325131893158, + "learning_rate": 0.0009756661938047222, + "loss": 2.9583, + "step": 3668 + }, + { + "epoch": 0.10879814963081576, + "grad_norm": 0.13779108226299286, + "learning_rate": 0.0009756516921614033, + "loss": 2.9482, + "step": 3669 + }, + { + "epoch": 0.10882780298312725, + "grad_norm": 0.13572049140930176, + "learning_rate": 0.0009756371863061068, + "loss": 2.9543, + "step": 3670 + }, + { + "epoch": 0.10885745633543872, + "grad_norm": 0.13815240561962128, + "learning_rate": 0.0009756226762389615, + "loss": 2.9319, + "step": 3671 + }, + { + "epoch": 0.1088871096877502, + "grad_norm": 0.13271519541740417, + "learning_rate": 0.0009756081619600958, + "loss": 2.9559, + "step": 3672 + }, + { + "epoch": 0.10891676304006168, + "grad_norm": 0.13813965022563934, + "learning_rate": 0.0009755936434696382, + "loss": 2.9432, + "step": 3673 + }, + { + "epoch": 0.10894641639237315, + "grad_norm": 0.14026302099227905, + "learning_rate": 0.0009755791207677172, + "loss": 2.9232, + "step": 3674 + }, + { + "epoch": 0.10897606974468464, + "grad_norm": 0.1354328989982605, + "learning_rate": 0.0009755645938544615, + "loss": 2.9592, + "step": 3675 + }, + { + "epoch": 0.10900572309699612, + "grad_norm": 0.1453804075717926, + "learning_rate": 0.0009755500627299996, + "loss": 2.9512, + "step": 3676 + }, + { + "epoch": 0.1090353764493076, + "grad_norm": 0.14338089525699615, + "learning_rate": 0.0009755355273944603, + "loss": 2.9675, + "step": 3677 + }, + { + "epoch": 0.10906502980161907, + "grad_norm": 0.15389032661914825, + "learning_rate": 0.0009755209878479723, + "loss": 2.9375, + "step": 3678 + }, + { + "epoch": 0.10909468315393055, + "grad_norm": 0.16342796385288239, + "learning_rate": 0.0009755064440906642, + "loss": 2.9353, + "step": 3679 + }, + { + "epoch": 0.10912433650624204, + "grad_norm": 0.17382024228572845, + "learning_rate": 0.0009754918961226651, + "loss": 2.9517, + "step": 3680 + }, + { + "epoch": 0.10915398985855351, + "grad_norm": 0.16455301642417908, + "learning_rate": 0.0009754773439441035, + "loss": 2.9343, + "step": 3681 + }, + { + "epoch": 0.10918364321086499, + "grad_norm": 0.13895352184772491, + "learning_rate": 0.0009754627875551085, + "loss": 2.9392, + "step": 3682 + }, + { + "epoch": 0.10921329656317647, + "grad_norm": 0.14392612874507904, + "learning_rate": 0.0009754482269558089, + "loss": 2.9347, + "step": 3683 + }, + { + "epoch": 0.10924294991548794, + "grad_norm": 0.14226102828979492, + "learning_rate": 0.0009754336621463337, + "loss": 2.9403, + "step": 3684 + }, + { + "epoch": 0.10927260326779943, + "grad_norm": 0.14655716717243195, + "learning_rate": 0.0009754190931268118, + "loss": 2.9051, + "step": 3685 + }, + { + "epoch": 0.10930225662011091, + "grad_norm": 0.1322093904018402, + "learning_rate": 0.0009754045198973721, + "loss": 2.9432, + "step": 3686 + }, + { + "epoch": 0.10933190997242238, + "grad_norm": 0.1388329118490219, + "learning_rate": 0.0009753899424581439, + "loss": 2.9512, + "step": 3687 + }, + { + "epoch": 0.10936156332473386, + "grad_norm": 0.14846855401992798, + "learning_rate": 0.0009753753608092561, + "loss": 2.9331, + "step": 3688 + }, + { + "epoch": 0.10939121667704534, + "grad_norm": 0.1396811455488205, + "learning_rate": 0.0009753607749508379, + "loss": 2.9449, + "step": 3689 + }, + { + "epoch": 0.10942087002935681, + "grad_norm": 0.13235697150230408, + "learning_rate": 0.0009753461848830183, + "loss": 2.9033, + "step": 3690 + }, + { + "epoch": 0.1094505233816683, + "grad_norm": 0.14851601421833038, + "learning_rate": 0.0009753315906059268, + "loss": 2.9564, + "step": 3691 + }, + { + "epoch": 0.10948017673397978, + "grad_norm": 0.1701756715774536, + "learning_rate": 0.0009753169921196924, + "loss": 2.9435, + "step": 3692 + }, + { + "epoch": 0.10950983008629125, + "grad_norm": 0.22457675635814667, + "learning_rate": 0.0009753023894244446, + "loss": 2.9201, + "step": 3693 + }, + { + "epoch": 0.10953948343860273, + "grad_norm": 0.2956860363483429, + "learning_rate": 0.0009752877825203124, + "loss": 2.9593, + "step": 3694 + }, + { + "epoch": 0.1095691367909142, + "grad_norm": 0.25625428557395935, + "learning_rate": 0.0009752731714074254, + "loss": 2.9567, + "step": 3695 + }, + { + "epoch": 0.1095987901432257, + "grad_norm": 0.23603232204914093, + "learning_rate": 0.0009752585560859129, + "loss": 2.9721, + "step": 3696 + }, + { + "epoch": 0.10962844349553717, + "grad_norm": 0.2607174515724182, + "learning_rate": 0.0009752439365559041, + "loss": 2.9907, + "step": 3697 + }, + { + "epoch": 0.10965809684784865, + "grad_norm": 0.18761110305786133, + "learning_rate": 0.0009752293128175289, + "loss": 2.9335, + "step": 3698 + }, + { + "epoch": 0.10968775020016013, + "grad_norm": 0.1680179238319397, + "learning_rate": 0.0009752146848709165, + "loss": 2.933, + "step": 3699 + }, + { + "epoch": 0.1097174035524716, + "grad_norm": 0.16391611099243164, + "learning_rate": 0.0009752000527161964, + "loss": 2.9773, + "step": 3700 + }, + { + "epoch": 0.10974705690478309, + "grad_norm": 0.15498994290828705, + "learning_rate": 0.0009751854163534984, + "loss": 2.9096, + "step": 3701 + }, + { + "epoch": 0.10977671025709457, + "grad_norm": 0.15977323055267334, + "learning_rate": 0.000975170775782952, + "loss": 2.9325, + "step": 3702 + }, + { + "epoch": 0.10980636360940604, + "grad_norm": 0.1335441917181015, + "learning_rate": 0.0009751561310046866, + "loss": 2.9423, + "step": 3703 + }, + { + "epoch": 0.10983601696171752, + "grad_norm": 0.15185384452342987, + "learning_rate": 0.0009751414820188322, + "loss": 2.9461, + "step": 3704 + }, + { + "epoch": 0.109865670314029, + "grad_norm": 0.16897043585777283, + "learning_rate": 0.0009751268288255186, + "loss": 2.9186, + "step": 3705 + }, + { + "epoch": 0.10989532366634049, + "grad_norm": 0.16265298426151276, + "learning_rate": 0.0009751121714248751, + "loss": 2.9309, + "step": 3706 + }, + { + "epoch": 0.10992497701865196, + "grad_norm": 0.15187543630599976, + "learning_rate": 0.0009750975098170321, + "loss": 2.9397, + "step": 3707 + }, + { + "epoch": 0.10995463037096344, + "grad_norm": 0.14703217148780823, + "learning_rate": 0.0009750828440021188, + "loss": 2.9294, + "step": 3708 + }, + { + "epoch": 0.10998428372327491, + "grad_norm": 0.1458263397216797, + "learning_rate": 0.0009750681739802654, + "loss": 2.9373, + "step": 3709 + }, + { + "epoch": 0.11001393707558639, + "grad_norm": 0.1736963838338852, + "learning_rate": 0.0009750534997516019, + "loss": 2.9265, + "step": 3710 + }, + { + "epoch": 0.11004359042789788, + "grad_norm": 0.1449538618326187, + "learning_rate": 0.000975038821316258, + "loss": 2.9464, + "step": 3711 + }, + { + "epoch": 0.11007324378020936, + "grad_norm": 0.12278815358877182, + "learning_rate": 0.0009750241386743639, + "loss": 2.9394, + "step": 3712 + }, + { + "epoch": 0.11010289713252083, + "grad_norm": 0.14793430268764496, + "learning_rate": 0.0009750094518260495, + "loss": 2.9388, + "step": 3713 + }, + { + "epoch": 0.11013255048483231, + "grad_norm": 0.1371871680021286, + "learning_rate": 0.0009749947607714449, + "loss": 2.9399, + "step": 3714 + }, + { + "epoch": 0.11016220383714379, + "grad_norm": 0.13791222870349884, + "learning_rate": 0.0009749800655106801, + "loss": 2.9127, + "step": 3715 + }, + { + "epoch": 0.11019185718945526, + "grad_norm": 0.14762426912784576, + "learning_rate": 0.0009749653660438853, + "loss": 2.9538, + "step": 3716 + }, + { + "epoch": 0.11022151054176675, + "grad_norm": 0.14378361403942108, + "learning_rate": 0.0009749506623711906, + "loss": 2.8812, + "step": 3717 + }, + { + "epoch": 0.11025116389407823, + "grad_norm": 0.13395360112190247, + "learning_rate": 0.0009749359544927263, + "loss": 2.9247, + "step": 3718 + }, + { + "epoch": 0.1102808172463897, + "grad_norm": 0.12394485622644424, + "learning_rate": 0.0009749212424086227, + "loss": 2.9392, + "step": 3719 + }, + { + "epoch": 0.11031047059870118, + "grad_norm": 0.12935303151607513, + "learning_rate": 0.0009749065261190099, + "loss": 2.9577, + "step": 3720 + }, + { + "epoch": 0.11034012395101266, + "grad_norm": 0.15789619088172913, + "learning_rate": 0.0009748918056240182, + "loss": 2.934, + "step": 3721 + }, + { + "epoch": 0.11036977730332415, + "grad_norm": 0.17692138254642487, + "learning_rate": 0.0009748770809237782, + "loss": 2.9226, + "step": 3722 + }, + { + "epoch": 0.11039943065563562, + "grad_norm": 0.19035829603672028, + "learning_rate": 0.0009748623520184201, + "loss": 2.9355, + "step": 3723 + }, + { + "epoch": 0.1104290840079471, + "grad_norm": 0.20476004481315613, + "learning_rate": 0.0009748476189080744, + "loss": 2.9497, + "step": 3724 + }, + { + "epoch": 0.11045873736025857, + "grad_norm": 0.17104116082191467, + "learning_rate": 0.0009748328815928713, + "loss": 2.9503, + "step": 3725 + }, + { + "epoch": 0.11048839071257005, + "grad_norm": 0.1655658632516861, + "learning_rate": 0.0009748181400729418, + "loss": 2.9476, + "step": 3726 + }, + { + "epoch": 0.11051804406488154, + "grad_norm": 0.17664605379104614, + "learning_rate": 0.000974803394348416, + "loss": 2.9015, + "step": 3727 + }, + { + "epoch": 0.11054769741719302, + "grad_norm": 0.16671505570411682, + "learning_rate": 0.0009747886444194247, + "loss": 2.92, + "step": 3728 + }, + { + "epoch": 0.11057735076950449, + "grad_norm": 0.16114194691181183, + "learning_rate": 0.0009747738902860985, + "loss": 2.9416, + "step": 3729 + }, + { + "epoch": 0.11060700412181597, + "grad_norm": 0.13387230038642883, + "learning_rate": 0.000974759131948568, + "loss": 2.9378, + "step": 3730 + }, + { + "epoch": 0.11063665747412745, + "grad_norm": 0.1469026505947113, + "learning_rate": 0.0009747443694069638, + "loss": 2.8943, + "step": 3731 + }, + { + "epoch": 0.11066631082643894, + "grad_norm": 0.15223850309848785, + "learning_rate": 0.0009747296026614169, + "loss": 2.9704, + "step": 3732 + }, + { + "epoch": 0.11069596417875041, + "grad_norm": 0.14666061103343964, + "learning_rate": 0.0009747148317120577, + "loss": 2.9546, + "step": 3733 + }, + { + "epoch": 0.11072561753106189, + "grad_norm": 0.16371142864227295, + "learning_rate": 0.0009747000565590172, + "loss": 2.8995, + "step": 3734 + }, + { + "epoch": 0.11075527088337336, + "grad_norm": 0.17309436202049255, + "learning_rate": 0.0009746852772024264, + "loss": 2.9563, + "step": 3735 + }, + { + "epoch": 0.11078492423568484, + "grad_norm": 0.1815778613090515, + "learning_rate": 0.0009746704936424158, + "loss": 2.9465, + "step": 3736 + }, + { + "epoch": 0.11081457758799633, + "grad_norm": 0.17988541722297668, + "learning_rate": 0.0009746557058791166, + "loss": 2.9391, + "step": 3737 + }, + { + "epoch": 0.1108442309403078, + "grad_norm": 0.16340942680835724, + "learning_rate": 0.0009746409139126596, + "loss": 2.9303, + "step": 3738 + }, + { + "epoch": 0.11087388429261928, + "grad_norm": 0.1292443722486496, + "learning_rate": 0.0009746261177431759, + "loss": 2.9375, + "step": 3739 + }, + { + "epoch": 0.11090353764493076, + "grad_norm": 0.1482861340045929, + "learning_rate": 0.0009746113173707963, + "loss": 2.9097, + "step": 3740 + }, + { + "epoch": 0.11093319099724223, + "grad_norm": 0.1471356749534607, + "learning_rate": 0.0009745965127956522, + "loss": 2.901, + "step": 3741 + }, + { + "epoch": 0.11096284434955371, + "grad_norm": 0.13679644465446472, + "learning_rate": 0.0009745817040178744, + "loss": 2.9364, + "step": 3742 + }, + { + "epoch": 0.1109924977018652, + "grad_norm": 0.15962634980678558, + "learning_rate": 0.0009745668910375942, + "loss": 2.8982, + "step": 3743 + }, + { + "epoch": 0.11102215105417668, + "grad_norm": 0.16919571161270142, + "learning_rate": 0.0009745520738549427, + "loss": 2.9554, + "step": 3744 + }, + { + "epoch": 0.11105180440648815, + "grad_norm": 0.19945842027664185, + "learning_rate": 0.0009745372524700512, + "loss": 2.9178, + "step": 3745 + }, + { + "epoch": 0.11108145775879963, + "grad_norm": 0.21358272433280945, + "learning_rate": 0.0009745224268830508, + "loss": 2.9442, + "step": 3746 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.17491759359836578, + "learning_rate": 0.0009745075970940729, + "loss": 2.9281, + "step": 3747 + }, + { + "epoch": 0.1111407644634226, + "grad_norm": 0.13934306800365448, + "learning_rate": 0.0009744927631032488, + "loss": 2.9132, + "step": 3748 + }, + { + "epoch": 0.11117041781573407, + "grad_norm": 0.1454830765724182, + "learning_rate": 0.0009744779249107097, + "loss": 2.9606, + "step": 3749 + }, + { + "epoch": 0.11120007116804555, + "grad_norm": 0.13385102152824402, + "learning_rate": 0.0009744630825165874, + "loss": 2.9255, + "step": 3750 + }, + { + "epoch": 0.11122972452035702, + "grad_norm": 0.12260439991950989, + "learning_rate": 0.0009744482359210127, + "loss": 2.9635, + "step": 3751 + }, + { + "epoch": 0.1112593778726685, + "grad_norm": 0.11593838036060333, + "learning_rate": 0.0009744333851241177, + "loss": 2.9287, + "step": 3752 + }, + { + "epoch": 0.11128903122497999, + "grad_norm": 0.12828056514263153, + "learning_rate": 0.0009744185301260336, + "loss": 2.8981, + "step": 3753 + }, + { + "epoch": 0.11131868457729147, + "grad_norm": 0.14310666918754578, + "learning_rate": 0.0009744036709268919, + "loss": 2.9626, + "step": 3754 + }, + { + "epoch": 0.11134833792960294, + "grad_norm": 0.1736077517271042, + "learning_rate": 0.0009743888075268243, + "loss": 2.9409, + "step": 3755 + }, + { + "epoch": 0.11137799128191442, + "grad_norm": 0.1741245687007904, + "learning_rate": 0.0009743739399259624, + "loss": 2.9362, + "step": 3756 + }, + { + "epoch": 0.1114076446342259, + "grad_norm": 0.1621016263961792, + "learning_rate": 0.0009743590681244379, + "loss": 2.9558, + "step": 3757 + }, + { + "epoch": 0.11143729798653738, + "grad_norm": 0.14904209971427917, + "learning_rate": 0.0009743441921223824, + "loss": 2.9453, + "step": 3758 + }, + { + "epoch": 0.11146695133884886, + "grad_norm": 0.16614167392253876, + "learning_rate": 0.0009743293119199276, + "loss": 2.9257, + "step": 3759 + }, + { + "epoch": 0.11149660469116034, + "grad_norm": 0.18074741959571838, + "learning_rate": 0.0009743144275172053, + "loss": 2.9342, + "step": 3760 + }, + { + "epoch": 0.11152625804347181, + "grad_norm": 0.1842638999223709, + "learning_rate": 0.0009742995389143474, + "loss": 2.967, + "step": 3761 + }, + { + "epoch": 0.11155591139578329, + "grad_norm": 0.17380216717720032, + "learning_rate": 0.0009742846461114856, + "loss": 2.9482, + "step": 3762 + }, + { + "epoch": 0.11158556474809478, + "grad_norm": 0.1717093139886856, + "learning_rate": 0.000974269749108752, + "loss": 2.9566, + "step": 3763 + }, + { + "epoch": 0.11161521810040625, + "grad_norm": 0.16294439136981964, + "learning_rate": 0.0009742548479062783, + "loss": 2.9309, + "step": 3764 + }, + { + "epoch": 0.11164487145271773, + "grad_norm": 0.1547192931175232, + "learning_rate": 0.0009742399425041963, + "loss": 2.9289, + "step": 3765 + }, + { + "epoch": 0.11167452480502921, + "grad_norm": 0.1515028029680252, + "learning_rate": 0.0009742250329026385, + "loss": 2.8955, + "step": 3766 + }, + { + "epoch": 0.11170417815734068, + "grad_norm": 0.15482871234416962, + "learning_rate": 0.0009742101191017365, + "loss": 2.9617, + "step": 3767 + }, + { + "epoch": 0.11173383150965216, + "grad_norm": 0.1550643891096115, + "learning_rate": 0.0009741952011016224, + "loss": 2.9147, + "step": 3768 + }, + { + "epoch": 0.11176348486196365, + "grad_norm": 0.15562553703784943, + "learning_rate": 0.0009741802789024286, + "loss": 2.9455, + "step": 3769 + }, + { + "epoch": 0.11179313821427513, + "grad_norm": 0.16575822234153748, + "learning_rate": 0.000974165352504287, + "loss": 2.907, + "step": 3770 + }, + { + "epoch": 0.1118227915665866, + "grad_norm": 0.15670467913150787, + "learning_rate": 0.0009741504219073297, + "loss": 2.9423, + "step": 3771 + }, + { + "epoch": 0.11185244491889808, + "grad_norm": 0.15368157625198364, + "learning_rate": 0.0009741354871116892, + "loss": 2.9066, + "step": 3772 + }, + { + "epoch": 0.11188209827120955, + "grad_norm": 0.1656104326248169, + "learning_rate": 0.0009741205481174974, + "loss": 2.9305, + "step": 3773 + }, + { + "epoch": 0.11191175162352104, + "grad_norm": 0.167758971452713, + "learning_rate": 0.0009741056049248868, + "loss": 2.915, + "step": 3774 + }, + { + "epoch": 0.11194140497583252, + "grad_norm": 0.18287144601345062, + "learning_rate": 0.0009740906575339898, + "loss": 2.8943, + "step": 3775 + }, + { + "epoch": 0.111971058328144, + "grad_norm": 0.18998734652996063, + "learning_rate": 0.0009740757059449386, + "loss": 2.9601, + "step": 3776 + }, + { + "epoch": 0.11200071168045547, + "grad_norm": 0.17185360193252563, + "learning_rate": 0.0009740607501578655, + "loss": 2.9229, + "step": 3777 + }, + { + "epoch": 0.11203036503276695, + "grad_norm": 0.14509223401546478, + "learning_rate": 0.0009740457901729033, + "loss": 2.927, + "step": 3778 + }, + { + "epoch": 0.11206001838507844, + "grad_norm": 0.16452226042747498, + "learning_rate": 0.0009740308259901842, + "loss": 2.9501, + "step": 3779 + }, + { + "epoch": 0.11208967173738991, + "grad_norm": 0.1540575474500656, + "learning_rate": 0.0009740158576098407, + "loss": 2.9089, + "step": 3780 + }, + { + "epoch": 0.11211932508970139, + "grad_norm": 0.13177311420440674, + "learning_rate": 0.0009740008850320054, + "loss": 2.9442, + "step": 3781 + }, + { + "epoch": 0.11214897844201287, + "grad_norm": 0.14155367016792297, + "learning_rate": 0.000973985908256811, + "loss": 2.9181, + "step": 3782 + }, + { + "epoch": 0.11217863179432434, + "grad_norm": 0.12856563925743103, + "learning_rate": 0.00097397092728439, + "loss": 2.9385, + "step": 3783 + }, + { + "epoch": 0.11220828514663583, + "grad_norm": 0.1398635059595108, + "learning_rate": 0.000973955942114875, + "loss": 2.9283, + "step": 3784 + }, + { + "epoch": 0.11223793849894731, + "grad_norm": 0.13988789916038513, + "learning_rate": 0.0009739409527483989, + "loss": 2.9448, + "step": 3785 + }, + { + "epoch": 0.11226759185125879, + "grad_norm": 0.1469191014766693, + "learning_rate": 0.0009739259591850941, + "loss": 2.9467, + "step": 3786 + }, + { + "epoch": 0.11229724520357026, + "grad_norm": 0.1566626876592636, + "learning_rate": 0.0009739109614250939, + "loss": 2.9146, + "step": 3787 + }, + { + "epoch": 0.11232689855588174, + "grad_norm": 0.14900262653827667, + "learning_rate": 0.0009738959594685306, + "loss": 2.9126, + "step": 3788 + }, + { + "epoch": 0.11235655190819323, + "grad_norm": 0.1741170883178711, + "learning_rate": 0.0009738809533155373, + "loss": 2.9601, + "step": 3789 + }, + { + "epoch": 0.1123862052605047, + "grad_norm": 0.17925718426704407, + "learning_rate": 0.0009738659429662468, + "loss": 2.8994, + "step": 3790 + }, + { + "epoch": 0.11241585861281618, + "grad_norm": 0.163352370262146, + "learning_rate": 0.0009738509284207919, + "loss": 2.9276, + "step": 3791 + }, + { + "epoch": 0.11244551196512766, + "grad_norm": 0.15797443687915802, + "learning_rate": 0.0009738359096793059, + "loss": 2.9327, + "step": 3792 + }, + { + "epoch": 0.11247516531743913, + "grad_norm": 0.18405014276504517, + "learning_rate": 0.0009738208867419216, + "loss": 2.9224, + "step": 3793 + }, + { + "epoch": 0.11250481866975061, + "grad_norm": 0.16795533895492554, + "learning_rate": 0.0009738058596087718, + "loss": 2.9136, + "step": 3794 + }, + { + "epoch": 0.1125344720220621, + "grad_norm": 0.17037440836429596, + "learning_rate": 0.0009737908282799898, + "loss": 2.934, + "step": 3795 + }, + { + "epoch": 0.11256412537437357, + "grad_norm": 0.21018283069133759, + "learning_rate": 0.0009737757927557089, + "loss": 2.9562, + "step": 3796 + }, + { + "epoch": 0.11259377872668505, + "grad_norm": 0.18720968067646027, + "learning_rate": 0.000973760753036062, + "loss": 2.9231, + "step": 3797 + }, + { + "epoch": 0.11262343207899653, + "grad_norm": 0.16910764575004578, + "learning_rate": 0.0009737457091211822, + "loss": 2.9711, + "step": 3798 + }, + { + "epoch": 0.112653085431308, + "grad_norm": 0.19926482439041138, + "learning_rate": 0.0009737306610112029, + "loss": 2.9441, + "step": 3799 + }, + { + "epoch": 0.1126827387836195, + "grad_norm": 0.1947413831949234, + "learning_rate": 0.0009737156087062573, + "loss": 2.9399, + "step": 3800 + }, + { + "epoch": 0.11271239213593097, + "grad_norm": 0.24545031785964966, + "learning_rate": 0.0009737005522064785, + "loss": 2.9634, + "step": 3801 + }, + { + "epoch": 0.11274204548824245, + "grad_norm": 0.21926364302635193, + "learning_rate": 0.0009736854915120001, + "loss": 2.9243, + "step": 3802 + }, + { + "epoch": 0.11277169884055392, + "grad_norm": 0.19033806025981903, + "learning_rate": 0.0009736704266229554, + "loss": 2.9385, + "step": 3803 + }, + { + "epoch": 0.1128013521928654, + "grad_norm": 0.19469569623470306, + "learning_rate": 0.0009736553575394778, + "loss": 2.9228, + "step": 3804 + }, + { + "epoch": 0.11283100554517689, + "grad_norm": 0.15520043671131134, + "learning_rate": 0.0009736402842617007, + "loss": 2.9375, + "step": 3805 + }, + { + "epoch": 0.11286065889748836, + "grad_norm": 0.15920373797416687, + "learning_rate": 0.0009736252067897575, + "loss": 2.8897, + "step": 3806 + }, + { + "epoch": 0.11289031224979984, + "grad_norm": 0.14855241775512695, + "learning_rate": 0.0009736101251237819, + "loss": 2.9338, + "step": 3807 + }, + { + "epoch": 0.11291996560211132, + "grad_norm": 0.1121029183268547, + "learning_rate": 0.0009735950392639073, + "loss": 2.9308, + "step": 3808 + }, + { + "epoch": 0.11294961895442279, + "grad_norm": 0.13439945876598358, + "learning_rate": 0.0009735799492102673, + "loss": 2.8977, + "step": 3809 + }, + { + "epoch": 0.11297927230673428, + "grad_norm": 0.13575336337089539, + "learning_rate": 0.0009735648549629956, + "loss": 2.9807, + "step": 3810 + }, + { + "epoch": 0.11300892565904576, + "grad_norm": 0.1555628627538681, + "learning_rate": 0.0009735497565222258, + "loss": 2.939, + "step": 3811 + }, + { + "epoch": 0.11303857901135723, + "grad_norm": 0.16948941349983215, + "learning_rate": 0.0009735346538880916, + "loss": 2.9563, + "step": 3812 + }, + { + "epoch": 0.11306823236366871, + "grad_norm": 0.17867527902126312, + "learning_rate": 0.0009735195470607269, + "loss": 2.9507, + "step": 3813 + }, + { + "epoch": 0.11309788571598019, + "grad_norm": 0.17293907701969147, + "learning_rate": 0.0009735044360402651, + "loss": 2.8889, + "step": 3814 + }, + { + "epoch": 0.11312753906829168, + "grad_norm": 0.16056384146213531, + "learning_rate": 0.0009734893208268405, + "loss": 2.9528, + "step": 3815 + }, + { + "epoch": 0.11315719242060315, + "grad_norm": 0.168413445353508, + "learning_rate": 0.0009734742014205865, + "loss": 2.9126, + "step": 3816 + }, + { + "epoch": 0.11318684577291463, + "grad_norm": 0.17126916348934174, + "learning_rate": 0.0009734590778216372, + "loss": 2.9128, + "step": 3817 + }, + { + "epoch": 0.1132164991252261, + "grad_norm": 0.1528923213481903, + "learning_rate": 0.0009734439500301267, + "loss": 2.8929, + "step": 3818 + }, + { + "epoch": 0.11324615247753758, + "grad_norm": 0.1443394124507904, + "learning_rate": 0.0009734288180461885, + "loss": 2.9025, + "step": 3819 + }, + { + "epoch": 0.11327580582984906, + "grad_norm": 0.13678234815597534, + "learning_rate": 0.0009734136818699569, + "loss": 2.9458, + "step": 3820 + }, + { + "epoch": 0.11330545918216055, + "grad_norm": 0.11764630675315857, + "learning_rate": 0.0009733985415015659, + "loss": 2.9201, + "step": 3821 + }, + { + "epoch": 0.11333511253447202, + "grad_norm": 0.12390736490488052, + "learning_rate": 0.0009733833969411496, + "loss": 2.9488, + "step": 3822 + }, + { + "epoch": 0.1133647658867835, + "grad_norm": 0.13311518728733063, + "learning_rate": 0.000973368248188842, + "loss": 2.9379, + "step": 3823 + }, + { + "epoch": 0.11339441923909498, + "grad_norm": 0.16166633367538452, + "learning_rate": 0.0009733530952447775, + "loss": 2.8927, + "step": 3824 + }, + { + "epoch": 0.11342407259140645, + "grad_norm": 0.16370798647403717, + "learning_rate": 0.0009733379381090899, + "loss": 2.9354, + "step": 3825 + }, + { + "epoch": 0.11345372594371794, + "grad_norm": 0.17467983067035675, + "learning_rate": 0.0009733227767819137, + "loss": 2.9481, + "step": 3826 + }, + { + "epoch": 0.11348337929602942, + "grad_norm": 0.18318408727645874, + "learning_rate": 0.000973307611263383, + "loss": 2.9063, + "step": 3827 + }, + { + "epoch": 0.1135130326483409, + "grad_norm": 0.16566890478134155, + "learning_rate": 0.0009732924415536322, + "loss": 2.9028, + "step": 3828 + }, + { + "epoch": 0.11354268600065237, + "grad_norm": 0.17418301105499268, + "learning_rate": 0.0009732772676527956, + "loss": 2.8955, + "step": 3829 + }, + { + "epoch": 0.11357233935296385, + "grad_norm": 0.1662023663520813, + "learning_rate": 0.0009732620895610075, + "loss": 2.8786, + "step": 3830 + }, + { + "epoch": 0.11360199270527534, + "grad_norm": 0.15775655210018158, + "learning_rate": 0.0009732469072784023, + "loss": 2.9373, + "step": 3831 + }, + { + "epoch": 0.11363164605758681, + "grad_norm": 0.15569300949573517, + "learning_rate": 0.0009732317208051147, + "loss": 2.9099, + "step": 3832 + }, + { + "epoch": 0.11366129940989829, + "grad_norm": 0.12645357847213745, + "learning_rate": 0.000973216530141279, + "loss": 2.9139, + "step": 3833 + }, + { + "epoch": 0.11369095276220977, + "grad_norm": 0.13269490003585815, + "learning_rate": 0.0009732013352870295, + "loss": 2.9535, + "step": 3834 + }, + { + "epoch": 0.11372060611452124, + "grad_norm": 0.13766269385814667, + "learning_rate": 0.0009731861362425009, + "loss": 2.9, + "step": 3835 + }, + { + "epoch": 0.11375025946683273, + "grad_norm": 0.13626961410045624, + "learning_rate": 0.0009731709330078281, + "loss": 2.9433, + "step": 3836 + }, + { + "epoch": 0.11377991281914421, + "grad_norm": 0.14806842803955078, + "learning_rate": 0.0009731557255831454, + "loss": 2.9625, + "step": 3837 + }, + { + "epoch": 0.11380956617145568, + "grad_norm": 0.15892964601516724, + "learning_rate": 0.0009731405139685874, + "loss": 2.9546, + "step": 3838 + }, + { + "epoch": 0.11383921952376716, + "grad_norm": 0.1685255914926529, + "learning_rate": 0.0009731252981642891, + "loss": 2.9268, + "step": 3839 + }, + { + "epoch": 0.11386887287607864, + "grad_norm": 0.17265401780605316, + "learning_rate": 0.000973110078170385, + "loss": 2.9046, + "step": 3840 + }, + { + "epoch": 0.11389852622839013, + "grad_norm": 0.18338140845298767, + "learning_rate": 0.0009730948539870099, + "loss": 2.925, + "step": 3841 + }, + { + "epoch": 0.1139281795807016, + "grad_norm": 0.17897425591945648, + "learning_rate": 0.0009730796256142986, + "loss": 2.9158, + "step": 3842 + }, + { + "epoch": 0.11395783293301308, + "grad_norm": 0.18069550395011902, + "learning_rate": 0.0009730643930523863, + "loss": 2.9073, + "step": 3843 + }, + { + "epoch": 0.11398748628532455, + "grad_norm": 0.19904743134975433, + "learning_rate": 0.0009730491563014073, + "loss": 2.9534, + "step": 3844 + }, + { + "epoch": 0.11401713963763603, + "grad_norm": 0.18420132994651794, + "learning_rate": 0.000973033915361497, + "loss": 2.9422, + "step": 3845 + }, + { + "epoch": 0.1140467929899475, + "grad_norm": 0.15427079796791077, + "learning_rate": 0.0009730186702327901, + "loss": 2.9309, + "step": 3846 + }, + { + "epoch": 0.114076446342259, + "grad_norm": 0.19032776355743408, + "learning_rate": 0.0009730034209154217, + "loss": 2.9186, + "step": 3847 + }, + { + "epoch": 0.11410609969457047, + "grad_norm": 0.1983938366174698, + "learning_rate": 0.0009729881674095269, + "loss": 2.94, + "step": 3848 + }, + { + "epoch": 0.11413575304688195, + "grad_norm": 0.1666628122329712, + "learning_rate": 0.0009729729097152405, + "loss": 2.942, + "step": 3849 + }, + { + "epoch": 0.11416540639919343, + "grad_norm": 0.15568050742149353, + "learning_rate": 0.0009729576478326981, + "loss": 2.9368, + "step": 3850 + }, + { + "epoch": 0.1141950597515049, + "grad_norm": 0.1538112759590149, + "learning_rate": 0.0009729423817620342, + "loss": 2.9434, + "step": 3851 + }, + { + "epoch": 0.11422471310381639, + "grad_norm": 0.14914840459823608, + "learning_rate": 0.0009729271115033845, + "loss": 2.9632, + "step": 3852 + }, + { + "epoch": 0.11425436645612787, + "grad_norm": 0.15999431908130646, + "learning_rate": 0.0009729118370568841, + "loss": 2.9196, + "step": 3853 + }, + { + "epoch": 0.11428401980843934, + "grad_norm": 0.1686503291130066, + "learning_rate": 0.0009728965584226681, + "loss": 2.9549, + "step": 3854 + }, + { + "epoch": 0.11431367316075082, + "grad_norm": 0.17729707062244415, + "learning_rate": 0.0009728812756008721, + "loss": 2.9013, + "step": 3855 + }, + { + "epoch": 0.1143433265130623, + "grad_norm": 0.1404321938753128, + "learning_rate": 0.000972865988591631, + "loss": 2.91, + "step": 3856 + }, + { + "epoch": 0.11437297986537379, + "grad_norm": 0.12706561386585236, + "learning_rate": 0.0009728506973950805, + "loss": 2.9121, + "step": 3857 + }, + { + "epoch": 0.11440263321768526, + "grad_norm": 0.13626888394355774, + "learning_rate": 0.0009728354020113559, + "loss": 2.9322, + "step": 3858 + }, + { + "epoch": 0.11443228656999674, + "grad_norm": 0.12396631389856339, + "learning_rate": 0.0009728201024405927, + "loss": 2.9466, + "step": 3859 + }, + { + "epoch": 0.11446193992230821, + "grad_norm": 0.12519563734531403, + "learning_rate": 0.0009728047986829263, + "loss": 2.9268, + "step": 3860 + }, + { + "epoch": 0.11449159327461969, + "grad_norm": 0.1410100907087326, + "learning_rate": 0.0009727894907384922, + "loss": 2.9265, + "step": 3861 + }, + { + "epoch": 0.11452124662693118, + "grad_norm": 0.157411590218544, + "learning_rate": 0.000972774178607426, + "loss": 2.9128, + "step": 3862 + }, + { + "epoch": 0.11455089997924266, + "grad_norm": 0.14842061698436737, + "learning_rate": 0.0009727588622898633, + "loss": 2.9274, + "step": 3863 + }, + { + "epoch": 0.11458055333155413, + "grad_norm": 0.1454889178276062, + "learning_rate": 0.0009727435417859399, + "loss": 2.9536, + "step": 3864 + }, + { + "epoch": 0.11461020668386561, + "grad_norm": 0.1414448618888855, + "learning_rate": 0.0009727282170957912, + "loss": 2.9182, + "step": 3865 + }, + { + "epoch": 0.11463986003617709, + "grad_norm": 0.14724813401699066, + "learning_rate": 0.0009727128882195528, + "loss": 2.9181, + "step": 3866 + }, + { + "epoch": 0.11466951338848858, + "grad_norm": 0.16415280103683472, + "learning_rate": 0.000972697555157361, + "loss": 2.9186, + "step": 3867 + }, + { + "epoch": 0.11469916674080005, + "grad_norm": 0.1755082756280899, + "learning_rate": 0.0009726822179093508, + "loss": 2.9523, + "step": 3868 + }, + { + "epoch": 0.11472882009311153, + "grad_norm": 0.18462471663951874, + "learning_rate": 0.0009726668764756587, + "loss": 2.9565, + "step": 3869 + }, + { + "epoch": 0.114758473445423, + "grad_norm": 0.1710841953754425, + "learning_rate": 0.0009726515308564202, + "loss": 2.9525, + "step": 3870 + }, + { + "epoch": 0.11478812679773448, + "grad_norm": 0.15416322648525238, + "learning_rate": 0.0009726361810517714, + "loss": 2.9108, + "step": 3871 + }, + { + "epoch": 0.11481778015004596, + "grad_norm": 0.18473491072654724, + "learning_rate": 0.0009726208270618479, + "loss": 2.8969, + "step": 3872 + }, + { + "epoch": 0.11484743350235745, + "grad_norm": 0.198618084192276, + "learning_rate": 0.0009726054688867859, + "loss": 2.9473, + "step": 3873 + }, + { + "epoch": 0.11487708685466892, + "grad_norm": 0.18342067301273346, + "learning_rate": 0.0009725901065267213, + "loss": 2.944, + "step": 3874 + }, + { + "epoch": 0.1149067402069804, + "grad_norm": 0.16118936240673065, + "learning_rate": 0.0009725747399817904, + "loss": 2.9251, + "step": 3875 + }, + { + "epoch": 0.11493639355929187, + "grad_norm": 0.1688440591096878, + "learning_rate": 0.0009725593692521289, + "loss": 2.9097, + "step": 3876 + }, + { + "epoch": 0.11496604691160335, + "grad_norm": 0.15683898329734802, + "learning_rate": 0.0009725439943378731, + "loss": 2.891, + "step": 3877 + }, + { + "epoch": 0.11499570026391484, + "grad_norm": 0.13707378506660461, + "learning_rate": 0.000972528615239159, + "loss": 2.9102, + "step": 3878 + }, + { + "epoch": 0.11502535361622632, + "grad_norm": 0.1499914824962616, + "learning_rate": 0.0009725132319561231, + "loss": 2.916, + "step": 3879 + }, + { + "epoch": 0.11505500696853779, + "grad_norm": 0.13681118190288544, + "learning_rate": 0.0009724978444889012, + "loss": 2.9252, + "step": 3880 + }, + { + "epoch": 0.11508466032084927, + "grad_norm": 0.1561567783355713, + "learning_rate": 0.00097248245283763, + "loss": 2.9166, + "step": 3881 + }, + { + "epoch": 0.11511431367316075, + "grad_norm": 0.16960926353931427, + "learning_rate": 0.0009724670570024455, + "loss": 2.9146, + "step": 3882 + }, + { + "epoch": 0.11514396702547224, + "grad_norm": 0.14615611732006073, + "learning_rate": 0.0009724516569834842, + "loss": 2.9247, + "step": 3883 + }, + { + "epoch": 0.11517362037778371, + "grad_norm": 0.1413535177707672, + "learning_rate": 0.0009724362527808822, + "loss": 2.9442, + "step": 3884 + }, + { + "epoch": 0.11520327373009519, + "grad_norm": 0.13668407499790192, + "learning_rate": 0.0009724208443947762, + "loss": 2.9145, + "step": 3885 + }, + { + "epoch": 0.11523292708240666, + "grad_norm": 0.12257909029722214, + "learning_rate": 0.0009724054318253026, + "loss": 2.9318, + "step": 3886 + }, + { + "epoch": 0.11526258043471814, + "grad_norm": 0.12399961054325104, + "learning_rate": 0.0009723900150725977, + "loss": 2.9278, + "step": 3887 + }, + { + "epoch": 0.11529223378702963, + "grad_norm": 0.13241514563560486, + "learning_rate": 0.000972374594136798, + "loss": 2.9163, + "step": 3888 + }, + { + "epoch": 0.1153218871393411, + "grad_norm": 0.1342909038066864, + "learning_rate": 0.0009723591690180405, + "loss": 2.9745, + "step": 3889 + }, + { + "epoch": 0.11535154049165258, + "grad_norm": 0.13756301999092102, + "learning_rate": 0.0009723437397164612, + "loss": 2.9376, + "step": 3890 + }, + { + "epoch": 0.11538119384396406, + "grad_norm": 0.14531521499156952, + "learning_rate": 0.0009723283062321972, + "loss": 2.9379, + "step": 3891 + }, + { + "epoch": 0.11541084719627553, + "grad_norm": 0.16528238356113434, + "learning_rate": 0.000972312868565385, + "loss": 2.9464, + "step": 3892 + }, + { + "epoch": 0.11544050054858702, + "grad_norm": 0.17279762029647827, + "learning_rate": 0.0009722974267161612, + "loss": 2.8905, + "step": 3893 + }, + { + "epoch": 0.1154701539008985, + "grad_norm": 0.1843959093093872, + "learning_rate": 0.0009722819806846626, + "loss": 2.9276, + "step": 3894 + }, + { + "epoch": 0.11549980725320998, + "grad_norm": 0.16900323331356049, + "learning_rate": 0.000972266530471026, + "loss": 2.9575, + "step": 3895 + }, + { + "epoch": 0.11552946060552145, + "grad_norm": 0.15523134171962738, + "learning_rate": 0.0009722510760753882, + "loss": 2.9169, + "step": 3896 + }, + { + "epoch": 0.11555911395783293, + "grad_norm": 0.18693755567073822, + "learning_rate": 0.000972235617497886, + "loss": 2.9276, + "step": 3897 + }, + { + "epoch": 0.1155887673101444, + "grad_norm": 0.23599448800086975, + "learning_rate": 0.0009722201547386564, + "loss": 2.9454, + "step": 3898 + }, + { + "epoch": 0.1156184206624559, + "grad_norm": 0.20554998517036438, + "learning_rate": 0.0009722046877978363, + "loss": 2.912, + "step": 3899 + }, + { + "epoch": 0.11564807401476737, + "grad_norm": 0.16907049715518951, + "learning_rate": 0.0009721892166755627, + "loss": 2.931, + "step": 3900 + }, + { + "epoch": 0.11567772736707885, + "grad_norm": 0.19296114146709442, + "learning_rate": 0.0009721737413719725, + "loss": 2.8887, + "step": 3901 + }, + { + "epoch": 0.11570738071939032, + "grad_norm": 0.1846199631690979, + "learning_rate": 0.0009721582618872027, + "loss": 2.9208, + "step": 3902 + }, + { + "epoch": 0.1157370340717018, + "grad_norm": 0.16031381487846375, + "learning_rate": 0.0009721427782213905, + "loss": 2.8938, + "step": 3903 + }, + { + "epoch": 0.11576668742401329, + "grad_norm": 0.14235074818134308, + "learning_rate": 0.0009721272903746729, + "loss": 2.9164, + "step": 3904 + }, + { + "epoch": 0.11579634077632477, + "grad_norm": 0.14609146118164062, + "learning_rate": 0.0009721117983471872, + "loss": 2.9324, + "step": 3905 + }, + { + "epoch": 0.11582599412863624, + "grad_norm": 0.157633975148201, + "learning_rate": 0.0009720963021390704, + "loss": 2.9213, + "step": 3906 + }, + { + "epoch": 0.11585564748094772, + "grad_norm": 0.14726753532886505, + "learning_rate": 0.0009720808017504599, + "loss": 2.9231, + "step": 3907 + }, + { + "epoch": 0.1158853008332592, + "grad_norm": 0.13911612331867218, + "learning_rate": 0.0009720652971814928, + "loss": 2.9197, + "step": 3908 + }, + { + "epoch": 0.11591495418557068, + "grad_norm": 0.17368276417255402, + "learning_rate": 0.0009720497884323064, + "loss": 2.9121, + "step": 3909 + }, + { + "epoch": 0.11594460753788216, + "grad_norm": 0.1657918393611908, + "learning_rate": 0.0009720342755030382, + "loss": 2.9271, + "step": 3910 + }, + { + "epoch": 0.11597426089019364, + "grad_norm": 0.15287119150161743, + "learning_rate": 0.0009720187583938254, + "loss": 2.9238, + "step": 3911 + }, + { + "epoch": 0.11600391424250511, + "grad_norm": 0.14749905467033386, + "learning_rate": 0.0009720032371048056, + "loss": 2.9108, + "step": 3912 + }, + { + "epoch": 0.11603356759481659, + "grad_norm": 0.1556137204170227, + "learning_rate": 0.000971987711636116, + "loss": 2.9123, + "step": 3913 + }, + { + "epoch": 0.11606322094712808, + "grad_norm": 0.14838893711566925, + "learning_rate": 0.0009719721819878941, + "loss": 2.905, + "step": 3914 + }, + { + "epoch": 0.11609287429943956, + "grad_norm": 0.14972160756587982, + "learning_rate": 0.0009719566481602778, + "loss": 2.8759, + "step": 3915 + }, + { + "epoch": 0.11612252765175103, + "grad_norm": 0.13789409399032593, + "learning_rate": 0.0009719411101534041, + "loss": 2.9481, + "step": 3916 + }, + { + "epoch": 0.11615218100406251, + "grad_norm": 0.16148535907268524, + "learning_rate": 0.0009719255679674111, + "loss": 2.8973, + "step": 3917 + }, + { + "epoch": 0.11618183435637398, + "grad_norm": 0.1657019704580307, + "learning_rate": 0.000971910021602436, + "loss": 2.9398, + "step": 3918 + }, + { + "epoch": 0.11621148770868547, + "grad_norm": 0.1516784429550171, + "learning_rate": 0.0009718944710586169, + "loss": 2.8972, + "step": 3919 + }, + { + "epoch": 0.11624114106099695, + "grad_norm": 0.1247127577662468, + "learning_rate": 0.0009718789163360909, + "loss": 2.9278, + "step": 3920 + }, + { + "epoch": 0.11627079441330843, + "grad_norm": 0.1318385750055313, + "learning_rate": 0.0009718633574349963, + "loss": 2.9123, + "step": 3921 + }, + { + "epoch": 0.1163004477656199, + "grad_norm": 0.14234161376953125, + "learning_rate": 0.0009718477943554707, + "loss": 2.8851, + "step": 3922 + }, + { + "epoch": 0.11633010111793138, + "grad_norm": 0.14427681267261505, + "learning_rate": 0.0009718322270976518, + "loss": 2.9277, + "step": 3923 + }, + { + "epoch": 0.11635975447024285, + "grad_norm": 0.15567666292190552, + "learning_rate": 0.0009718166556616776, + "loss": 2.8989, + "step": 3924 + }, + { + "epoch": 0.11638940782255434, + "grad_norm": 0.1613239347934723, + "learning_rate": 0.0009718010800476859, + "loss": 2.9049, + "step": 3925 + }, + { + "epoch": 0.11641906117486582, + "grad_norm": 0.15881191194057465, + "learning_rate": 0.0009717855002558147, + "loss": 2.9251, + "step": 3926 + }, + { + "epoch": 0.1164487145271773, + "grad_norm": 0.16969478130340576, + "learning_rate": 0.0009717699162862019, + "loss": 2.9285, + "step": 3927 + }, + { + "epoch": 0.11647836787948877, + "grad_norm": 0.16538918018341064, + "learning_rate": 0.0009717543281389855, + "loss": 2.9408, + "step": 3928 + }, + { + "epoch": 0.11650802123180025, + "grad_norm": 0.17019008100032806, + "learning_rate": 0.0009717387358143035, + "loss": 2.9483, + "step": 3929 + }, + { + "epoch": 0.11653767458411174, + "grad_norm": 0.20147809386253357, + "learning_rate": 0.0009717231393122941, + "loss": 2.9164, + "step": 3930 + }, + { + "epoch": 0.11656732793642322, + "grad_norm": 0.20654594898223877, + "learning_rate": 0.0009717075386330953, + "loss": 2.9157, + "step": 3931 + }, + { + "epoch": 0.11659698128873469, + "grad_norm": 0.1741906702518463, + "learning_rate": 0.0009716919337768452, + "loss": 2.936, + "step": 3932 + }, + { + "epoch": 0.11662663464104617, + "grad_norm": 0.17229577898979187, + "learning_rate": 0.0009716763247436821, + "loss": 2.9257, + "step": 3933 + }, + { + "epoch": 0.11665628799335764, + "grad_norm": 0.1565244048833847, + "learning_rate": 0.0009716607115337443, + "loss": 2.9269, + "step": 3934 + }, + { + "epoch": 0.11668594134566913, + "grad_norm": 0.15334312617778778, + "learning_rate": 0.0009716450941471699, + "loss": 2.9288, + "step": 3935 + }, + { + "epoch": 0.11671559469798061, + "grad_norm": 0.16479338705539703, + "learning_rate": 0.0009716294725840972, + "loss": 2.9199, + "step": 3936 + }, + { + "epoch": 0.11674524805029209, + "grad_norm": 0.18211403489112854, + "learning_rate": 0.0009716138468446646, + "loss": 2.9027, + "step": 3937 + }, + { + "epoch": 0.11677490140260356, + "grad_norm": 0.17904749512672424, + "learning_rate": 0.0009715982169290103, + "loss": 2.919, + "step": 3938 + }, + { + "epoch": 0.11680455475491504, + "grad_norm": 0.1738811433315277, + "learning_rate": 0.0009715825828372729, + "loss": 2.8549, + "step": 3939 + }, + { + "epoch": 0.11683420810722653, + "grad_norm": 0.16994725167751312, + "learning_rate": 0.0009715669445695909, + "loss": 2.9397, + "step": 3940 + }, + { + "epoch": 0.116863861459538, + "grad_norm": 0.16567565500736237, + "learning_rate": 0.0009715513021261026, + "loss": 2.9127, + "step": 3941 + }, + { + "epoch": 0.11689351481184948, + "grad_norm": 0.1549907922744751, + "learning_rate": 0.0009715356555069465, + "loss": 2.9416, + "step": 3942 + }, + { + "epoch": 0.11692316816416096, + "grad_norm": 0.15709146857261658, + "learning_rate": 0.0009715200047122613, + "loss": 2.9113, + "step": 3943 + }, + { + "epoch": 0.11695282151647243, + "grad_norm": 0.17609745264053345, + "learning_rate": 0.0009715043497421856, + "loss": 2.9382, + "step": 3944 + }, + { + "epoch": 0.11698247486878392, + "grad_norm": 0.17805024981498718, + "learning_rate": 0.0009714886905968579, + "loss": 2.9087, + "step": 3945 + }, + { + "epoch": 0.1170121282210954, + "grad_norm": 0.15281713008880615, + "learning_rate": 0.0009714730272764167, + "loss": 2.8901, + "step": 3946 + }, + { + "epoch": 0.11704178157340688, + "grad_norm": 0.14260250329971313, + "learning_rate": 0.0009714573597810012, + "loss": 2.9077, + "step": 3947 + }, + { + "epoch": 0.11707143492571835, + "grad_norm": 0.1543053388595581, + "learning_rate": 0.0009714416881107498, + "loss": 2.9245, + "step": 3948 + }, + { + "epoch": 0.11710108827802983, + "grad_norm": 0.1460719257593155, + "learning_rate": 0.0009714260122658012, + "loss": 2.9447, + "step": 3949 + }, + { + "epoch": 0.1171307416303413, + "grad_norm": 0.13625168800354004, + "learning_rate": 0.0009714103322462944, + "loss": 2.9314, + "step": 3950 + }, + { + "epoch": 0.1171603949826528, + "grad_norm": 0.12782834470272064, + "learning_rate": 0.0009713946480523684, + "loss": 2.8894, + "step": 3951 + }, + { + "epoch": 0.11719004833496427, + "grad_norm": 0.1426776796579361, + "learning_rate": 0.0009713789596841615, + "loss": 2.916, + "step": 3952 + }, + { + "epoch": 0.11721970168727575, + "grad_norm": 0.16036413609981537, + "learning_rate": 0.0009713632671418133, + "loss": 2.9333, + "step": 3953 + }, + { + "epoch": 0.11724935503958722, + "grad_norm": 0.153816819190979, + "learning_rate": 0.0009713475704254623, + "loss": 2.9058, + "step": 3954 + }, + { + "epoch": 0.1172790083918987, + "grad_norm": 0.13387979567050934, + "learning_rate": 0.0009713318695352478, + "loss": 2.9371, + "step": 3955 + }, + { + "epoch": 0.11730866174421019, + "grad_norm": 0.12466281652450562, + "learning_rate": 0.0009713161644713085, + "loss": 2.9377, + "step": 3956 + }, + { + "epoch": 0.11733831509652166, + "grad_norm": 0.13588450849056244, + "learning_rate": 0.0009713004552337839, + "loss": 2.9624, + "step": 3957 + }, + { + "epoch": 0.11736796844883314, + "grad_norm": 0.13703976571559906, + "learning_rate": 0.0009712847418228126, + "loss": 2.9091, + "step": 3958 + }, + { + "epoch": 0.11739762180114462, + "grad_norm": 0.1562979370355606, + "learning_rate": 0.0009712690242385342, + "loss": 2.8955, + "step": 3959 + }, + { + "epoch": 0.11742727515345609, + "grad_norm": 0.20264367759227753, + "learning_rate": 0.0009712533024810876, + "loss": 2.9672, + "step": 3960 + }, + { + "epoch": 0.11745692850576758, + "grad_norm": 0.19947125017642975, + "learning_rate": 0.0009712375765506122, + "loss": 2.9199, + "step": 3961 + }, + { + "epoch": 0.11748658185807906, + "grad_norm": 0.17703565955162048, + "learning_rate": 0.0009712218464472471, + "loss": 2.904, + "step": 3962 + }, + { + "epoch": 0.11751623521039053, + "grad_norm": 0.1636650562286377, + "learning_rate": 0.0009712061121711317, + "loss": 2.9371, + "step": 3963 + }, + { + "epoch": 0.11754588856270201, + "grad_norm": 0.166131854057312, + "learning_rate": 0.0009711903737224054, + "loss": 2.9161, + "step": 3964 + }, + { + "epoch": 0.11757554191501349, + "grad_norm": 0.193747416138649, + "learning_rate": 0.0009711746311012073, + "loss": 2.9084, + "step": 3965 + }, + { + "epoch": 0.11760519526732498, + "grad_norm": 0.16867217421531677, + "learning_rate": 0.0009711588843076771, + "loss": 2.9197, + "step": 3966 + }, + { + "epoch": 0.11763484861963645, + "grad_norm": 0.15913423895835876, + "learning_rate": 0.0009711431333419541, + "loss": 2.9595, + "step": 3967 + }, + { + "epoch": 0.11766450197194793, + "grad_norm": 0.14053255319595337, + "learning_rate": 0.0009711273782041776, + "loss": 2.9163, + "step": 3968 + }, + { + "epoch": 0.1176941553242594, + "grad_norm": 0.14198285341262817, + "learning_rate": 0.0009711116188944874, + "loss": 2.9458, + "step": 3969 + }, + { + "epoch": 0.11772380867657088, + "grad_norm": 0.1405138075351715, + "learning_rate": 0.0009710958554130229, + "loss": 2.9275, + "step": 3970 + }, + { + "epoch": 0.11775346202888237, + "grad_norm": 0.1490321308374405, + "learning_rate": 0.0009710800877599239, + "loss": 2.9409, + "step": 3971 + }, + { + "epoch": 0.11778311538119385, + "grad_norm": 0.1621212512254715, + "learning_rate": 0.0009710643159353299, + "loss": 2.8871, + "step": 3972 + }, + { + "epoch": 0.11781276873350532, + "grad_norm": 0.14684468507766724, + "learning_rate": 0.0009710485399393803, + "loss": 2.9126, + "step": 3973 + }, + { + "epoch": 0.1178424220858168, + "grad_norm": 0.1738077700138092, + "learning_rate": 0.000971032759772215, + "loss": 2.915, + "step": 3974 + }, + { + "epoch": 0.11787207543812828, + "grad_norm": 0.18567368388175964, + "learning_rate": 0.0009710169754339739, + "loss": 2.9262, + "step": 3975 + }, + { + "epoch": 0.11790172879043975, + "grad_norm": 0.18740206956863403, + "learning_rate": 0.0009710011869247967, + "loss": 2.9607, + "step": 3976 + }, + { + "epoch": 0.11793138214275124, + "grad_norm": 0.17552898824214935, + "learning_rate": 0.000970985394244823, + "loss": 2.8869, + "step": 3977 + }, + { + "epoch": 0.11796103549506272, + "grad_norm": 0.1743534356355667, + "learning_rate": 0.0009709695973941928, + "loss": 2.9265, + "step": 3978 + }, + { + "epoch": 0.1179906888473742, + "grad_norm": 0.16357091069221497, + "learning_rate": 0.0009709537963730462, + "loss": 2.9119, + "step": 3979 + }, + { + "epoch": 0.11802034219968567, + "grad_norm": 0.17076292634010315, + "learning_rate": 0.0009709379911815226, + "loss": 2.915, + "step": 3980 + }, + { + "epoch": 0.11804999555199715, + "grad_norm": 0.14374905824661255, + "learning_rate": 0.0009709221818197624, + "loss": 2.8994, + "step": 3981 + }, + { + "epoch": 0.11807964890430864, + "grad_norm": 0.12974202632904053, + "learning_rate": 0.0009709063682879054, + "loss": 2.9496, + "step": 3982 + }, + { + "epoch": 0.11810930225662011, + "grad_norm": 0.1451283097267151, + "learning_rate": 0.0009708905505860917, + "loss": 2.897, + "step": 3983 + }, + { + "epoch": 0.11813895560893159, + "grad_norm": 0.1478685587644577, + "learning_rate": 0.0009708747287144612, + "loss": 2.9541, + "step": 3984 + }, + { + "epoch": 0.11816860896124307, + "grad_norm": 0.1457318812608719, + "learning_rate": 0.0009708589026731544, + "loss": 2.935, + "step": 3985 + }, + { + "epoch": 0.11819826231355454, + "grad_norm": 0.15003760159015656, + "learning_rate": 0.0009708430724623112, + "loss": 2.9294, + "step": 3986 + }, + { + "epoch": 0.11822791566586603, + "grad_norm": 0.15923041105270386, + "learning_rate": 0.0009708272380820715, + "loss": 2.9205, + "step": 3987 + }, + { + "epoch": 0.11825756901817751, + "grad_norm": 0.16435213387012482, + "learning_rate": 0.000970811399532576, + "loss": 2.917, + "step": 3988 + }, + { + "epoch": 0.11828722237048898, + "grad_norm": 0.15483491122722626, + "learning_rate": 0.0009707955568139647, + "loss": 2.9061, + "step": 3989 + }, + { + "epoch": 0.11831687572280046, + "grad_norm": 0.12359076738357544, + "learning_rate": 0.000970779709926378, + "loss": 2.9048, + "step": 3990 + }, + { + "epoch": 0.11834652907511194, + "grad_norm": 0.14010651409626007, + "learning_rate": 0.0009707638588699561, + "loss": 2.9225, + "step": 3991 + }, + { + "epoch": 0.11837618242742343, + "grad_norm": 0.15987810492515564, + "learning_rate": 0.0009707480036448393, + "loss": 2.9193, + "step": 3992 + }, + { + "epoch": 0.1184058357797349, + "grad_norm": 0.1564309149980545, + "learning_rate": 0.0009707321442511683, + "loss": 2.9173, + "step": 3993 + }, + { + "epoch": 0.11843548913204638, + "grad_norm": 0.1577070951461792, + "learning_rate": 0.0009707162806890832, + "loss": 2.8918, + "step": 3994 + }, + { + "epoch": 0.11846514248435785, + "grad_norm": 0.159605473279953, + "learning_rate": 0.000970700412958725, + "loss": 2.8966, + "step": 3995 + }, + { + "epoch": 0.11849479583666933, + "grad_norm": 0.20233598351478577, + "learning_rate": 0.0009706845410602334, + "loss": 2.9046, + "step": 3996 + }, + { + "epoch": 0.11852444918898082, + "grad_norm": 0.22584685683250427, + "learning_rate": 0.0009706686649937496, + "loss": 2.945, + "step": 3997 + }, + { + "epoch": 0.1185541025412923, + "grad_norm": 0.1954154521226883, + "learning_rate": 0.0009706527847594139, + "loss": 2.9312, + "step": 3998 + }, + { + "epoch": 0.11858375589360377, + "grad_norm": 0.16425196826457977, + "learning_rate": 0.0009706369003573672, + "loss": 2.9194, + "step": 3999 + }, + { + "epoch": 0.11861340924591525, + "grad_norm": 0.1767295002937317, + "learning_rate": 0.0009706210117877498, + "loss": 2.9232, + "step": 4000 + }, + { + "epoch": 0.11864306259822673, + "grad_norm": 0.1648510992527008, + "learning_rate": 0.0009706051190507026, + "loss": 2.9479, + "step": 4001 + }, + { + "epoch": 0.1186727159505382, + "grad_norm": 0.15980049967765808, + "learning_rate": 0.0009705892221463663, + "loss": 2.9151, + "step": 4002 + }, + { + "epoch": 0.11870236930284969, + "grad_norm": 0.15925557911396027, + "learning_rate": 0.0009705733210748816, + "loss": 2.9189, + "step": 4003 + }, + { + "epoch": 0.11873202265516117, + "grad_norm": 0.15672709047794342, + "learning_rate": 0.0009705574158363894, + "loss": 2.9459, + "step": 4004 + }, + { + "epoch": 0.11876167600747264, + "grad_norm": 0.1490585207939148, + "learning_rate": 0.0009705415064310306, + "loss": 2.912, + "step": 4005 + }, + { + "epoch": 0.11879132935978412, + "grad_norm": 0.15927983820438385, + "learning_rate": 0.0009705255928589458, + "loss": 2.9025, + "step": 4006 + }, + { + "epoch": 0.1188209827120956, + "grad_norm": 0.16095556318759918, + "learning_rate": 0.0009705096751202763, + "loss": 2.9269, + "step": 4007 + }, + { + "epoch": 0.11885063606440709, + "grad_norm": 0.19152981042861938, + "learning_rate": 0.0009704937532151628, + "loss": 2.9402, + "step": 4008 + }, + { + "epoch": 0.11888028941671856, + "grad_norm": 0.20014071464538574, + "learning_rate": 0.0009704778271437465, + "loss": 2.9322, + "step": 4009 + }, + { + "epoch": 0.11890994276903004, + "grad_norm": 0.18190865218639374, + "learning_rate": 0.0009704618969061681, + "loss": 2.9031, + "step": 4010 + }, + { + "epoch": 0.11893959612134151, + "grad_norm": 0.1760558933019638, + "learning_rate": 0.0009704459625025688, + "loss": 2.9192, + "step": 4011 + }, + { + "epoch": 0.11896924947365299, + "grad_norm": 0.19023217260837555, + "learning_rate": 0.0009704300239330899, + "loss": 2.9431, + "step": 4012 + }, + { + "epoch": 0.11899890282596448, + "grad_norm": 0.1626746505498886, + "learning_rate": 0.0009704140811978724, + "loss": 2.8991, + "step": 4013 + }, + { + "epoch": 0.11902855617827596, + "grad_norm": 0.1445513367652893, + "learning_rate": 0.0009703981342970572, + "loss": 2.9055, + "step": 4014 + }, + { + "epoch": 0.11905820953058743, + "grad_norm": 0.13711197674274445, + "learning_rate": 0.000970382183230786, + "loss": 2.927, + "step": 4015 + }, + { + "epoch": 0.11908786288289891, + "grad_norm": 0.1397210955619812, + "learning_rate": 0.0009703662279991998, + "loss": 2.9, + "step": 4016 + }, + { + "epoch": 0.11911751623521039, + "grad_norm": 0.13574501872062683, + "learning_rate": 0.0009703502686024399, + "loss": 2.9079, + "step": 4017 + }, + { + "epoch": 0.11914716958752188, + "grad_norm": 0.12799963355064392, + "learning_rate": 0.0009703343050406477, + "loss": 2.9159, + "step": 4018 + }, + { + "epoch": 0.11917682293983335, + "grad_norm": 0.13293933868408203, + "learning_rate": 0.0009703183373139645, + "loss": 2.9203, + "step": 4019 + }, + { + "epoch": 0.11920647629214483, + "grad_norm": 0.12859204411506653, + "learning_rate": 0.0009703023654225316, + "loss": 2.8967, + "step": 4020 + }, + { + "epoch": 0.1192361296444563, + "grad_norm": 0.1342332363128662, + "learning_rate": 0.0009702863893664907, + "loss": 2.8924, + "step": 4021 + }, + { + "epoch": 0.11926578299676778, + "grad_norm": 0.16236256062984467, + "learning_rate": 0.0009702704091459829, + "loss": 2.9392, + "step": 4022 + }, + { + "epoch": 0.11929543634907927, + "grad_norm": 0.1788494735956192, + "learning_rate": 0.0009702544247611499, + "loss": 2.9229, + "step": 4023 + }, + { + "epoch": 0.11932508970139075, + "grad_norm": 0.16520118713378906, + "learning_rate": 0.0009702384362121333, + "loss": 2.908, + "step": 4024 + }, + { + "epoch": 0.11935474305370222, + "grad_norm": 0.13056842982769012, + "learning_rate": 0.0009702224434990748, + "loss": 2.9073, + "step": 4025 + }, + { + "epoch": 0.1193843964060137, + "grad_norm": 0.13324801623821259, + "learning_rate": 0.0009702064466221156, + "loss": 2.9052, + "step": 4026 + }, + { + "epoch": 0.11941404975832517, + "grad_norm": 0.13179485499858856, + "learning_rate": 0.0009701904455813976, + "loss": 2.9305, + "step": 4027 + }, + { + "epoch": 0.11944370311063665, + "grad_norm": 0.15850265324115753, + "learning_rate": 0.0009701744403770627, + "loss": 2.9182, + "step": 4028 + }, + { + "epoch": 0.11947335646294814, + "grad_norm": 0.16929826140403748, + "learning_rate": 0.0009701584310092524, + "loss": 2.9292, + "step": 4029 + }, + { + "epoch": 0.11950300981525962, + "grad_norm": 0.17312785983085632, + "learning_rate": 0.0009701424174781084, + "loss": 2.946, + "step": 4030 + }, + { + "epoch": 0.1195326631675711, + "grad_norm": 0.14893954992294312, + "learning_rate": 0.0009701263997837726, + "loss": 2.8965, + "step": 4031 + }, + { + "epoch": 0.11956231651988257, + "grad_norm": 0.16363511979579926, + "learning_rate": 0.0009701103779263868, + "loss": 2.9266, + "step": 4032 + }, + { + "epoch": 0.11959196987219405, + "grad_norm": 0.178573340177536, + "learning_rate": 0.0009700943519060929, + "loss": 2.8603, + "step": 4033 + }, + { + "epoch": 0.11962162322450554, + "grad_norm": 0.1468447595834732, + "learning_rate": 0.000970078321723033, + "loss": 2.8907, + "step": 4034 + }, + { + "epoch": 0.11965127657681701, + "grad_norm": 0.12497743219137192, + "learning_rate": 0.0009700622873773489, + "loss": 2.9144, + "step": 4035 + }, + { + "epoch": 0.11968092992912849, + "grad_norm": 0.1152876690030098, + "learning_rate": 0.0009700462488691823, + "loss": 2.8933, + "step": 4036 + }, + { + "epoch": 0.11971058328143996, + "grad_norm": 0.11102735996246338, + "learning_rate": 0.0009700302061986756, + "loss": 2.9615, + "step": 4037 + }, + { + "epoch": 0.11974023663375144, + "grad_norm": 0.12939392030239105, + "learning_rate": 0.0009700141593659708, + "loss": 2.9302, + "step": 4038 + }, + { + "epoch": 0.11976988998606293, + "grad_norm": 0.1575719565153122, + "learning_rate": 0.0009699981083712098, + "loss": 2.8891, + "step": 4039 + }, + { + "epoch": 0.1197995433383744, + "grad_norm": 0.17948131263256073, + "learning_rate": 0.0009699820532145351, + "loss": 2.9331, + "step": 4040 + }, + { + "epoch": 0.11982919669068588, + "grad_norm": 0.18543189764022827, + "learning_rate": 0.0009699659938960884, + "loss": 2.9206, + "step": 4041 + }, + { + "epoch": 0.11985885004299736, + "grad_norm": 0.17760640382766724, + "learning_rate": 0.0009699499304160124, + "loss": 2.895, + "step": 4042 + }, + { + "epoch": 0.11988850339530883, + "grad_norm": 0.19468794763088226, + "learning_rate": 0.000969933862774449, + "loss": 2.9126, + "step": 4043 + }, + { + "epoch": 0.11991815674762032, + "grad_norm": 0.21359993517398834, + "learning_rate": 0.0009699177909715404, + "loss": 2.923, + "step": 4044 + }, + { + "epoch": 0.1199478100999318, + "grad_norm": 0.17809970676898956, + "learning_rate": 0.0009699017150074293, + "loss": 2.9217, + "step": 4045 + }, + { + "epoch": 0.11997746345224328, + "grad_norm": 0.16492986679077148, + "learning_rate": 0.0009698856348822577, + "loss": 2.944, + "step": 4046 + }, + { + "epoch": 0.12000711680455475, + "grad_norm": 0.15653066337108612, + "learning_rate": 0.0009698695505961683, + "loss": 2.9038, + "step": 4047 + }, + { + "epoch": 0.12003677015686623, + "grad_norm": 0.1750083565711975, + "learning_rate": 0.0009698534621493033, + "loss": 2.9335, + "step": 4048 + }, + { + "epoch": 0.12006642350917772, + "grad_norm": 0.18247559666633606, + "learning_rate": 0.0009698373695418054, + "loss": 2.9199, + "step": 4049 + }, + { + "epoch": 0.1200960768614892, + "grad_norm": 0.17172135412693024, + "learning_rate": 0.0009698212727738168, + "loss": 2.9659, + "step": 4050 + }, + { + "epoch": 0.12012573021380067, + "grad_norm": 0.18350441753864288, + "learning_rate": 0.0009698051718454802, + "loss": 2.9172, + "step": 4051 + }, + { + "epoch": 0.12015538356611215, + "grad_norm": 0.17948871850967407, + "learning_rate": 0.0009697890667569383, + "loss": 2.8938, + "step": 4052 + }, + { + "epoch": 0.12018503691842362, + "grad_norm": 0.16197656095027924, + "learning_rate": 0.0009697729575083334, + "loss": 2.8882, + "step": 4053 + }, + { + "epoch": 0.1202146902707351, + "grad_norm": 0.14424192905426025, + "learning_rate": 0.0009697568440998084, + "loss": 2.9024, + "step": 4054 + }, + { + "epoch": 0.12024434362304659, + "grad_norm": 0.1539389193058014, + "learning_rate": 0.0009697407265315058, + "loss": 2.9569, + "step": 4055 + }, + { + "epoch": 0.12027399697535807, + "grad_norm": 0.1437801867723465, + "learning_rate": 0.0009697246048035686, + "loss": 2.9021, + "step": 4056 + }, + { + "epoch": 0.12030365032766954, + "grad_norm": 0.12417270243167877, + "learning_rate": 0.0009697084789161392, + "loss": 2.9595, + "step": 4057 + }, + { + "epoch": 0.12033330367998102, + "grad_norm": 0.12473884224891663, + "learning_rate": 0.0009696923488693608, + "loss": 2.8821, + "step": 4058 + }, + { + "epoch": 0.1203629570322925, + "grad_norm": 0.12304659187793732, + "learning_rate": 0.000969676214663376, + "loss": 2.9136, + "step": 4059 + }, + { + "epoch": 0.12039261038460398, + "grad_norm": 0.12272222340106964, + "learning_rate": 0.0009696600762983277, + "loss": 2.9203, + "step": 4060 + }, + { + "epoch": 0.12042226373691546, + "grad_norm": 0.13841597735881805, + "learning_rate": 0.0009696439337743586, + "loss": 2.9315, + "step": 4061 + }, + { + "epoch": 0.12045191708922694, + "grad_norm": 0.13326121866703033, + "learning_rate": 0.0009696277870916121, + "loss": 2.9063, + "step": 4062 + }, + { + "epoch": 0.12048157044153841, + "grad_norm": 0.14809690415859222, + "learning_rate": 0.0009696116362502308, + "loss": 2.9529, + "step": 4063 + }, + { + "epoch": 0.12051122379384989, + "grad_norm": 0.15363241732120514, + "learning_rate": 0.0009695954812503578, + "loss": 2.9099, + "step": 4064 + }, + { + "epoch": 0.12054087714616138, + "grad_norm": 0.16373397409915924, + "learning_rate": 0.0009695793220921364, + "loss": 2.9047, + "step": 4065 + }, + { + "epoch": 0.12057053049847286, + "grad_norm": 0.17617833614349365, + "learning_rate": 0.0009695631587757095, + "loss": 2.9019, + "step": 4066 + }, + { + "epoch": 0.12060018385078433, + "grad_norm": 0.1865970492362976, + "learning_rate": 0.00096954699130122, + "loss": 2.9113, + "step": 4067 + }, + { + "epoch": 0.12062983720309581, + "grad_norm": 0.16579240560531616, + "learning_rate": 0.0009695308196688115, + "loss": 2.9361, + "step": 4068 + }, + { + "epoch": 0.12065949055540728, + "grad_norm": 0.1812818944454193, + "learning_rate": 0.0009695146438786268, + "loss": 2.9243, + "step": 4069 + }, + { + "epoch": 0.12068914390771877, + "grad_norm": 0.1693504899740219, + "learning_rate": 0.0009694984639308095, + "loss": 2.9082, + "step": 4070 + }, + { + "epoch": 0.12071879726003025, + "grad_norm": 0.18656766414642334, + "learning_rate": 0.0009694822798255027, + "loss": 2.9001, + "step": 4071 + }, + { + "epoch": 0.12074845061234173, + "grad_norm": 0.18431316316127777, + "learning_rate": 0.0009694660915628497, + "loss": 2.9281, + "step": 4072 + }, + { + "epoch": 0.1207781039646532, + "grad_norm": 0.17501407861709595, + "learning_rate": 0.0009694498991429938, + "loss": 2.9465, + "step": 4073 + }, + { + "epoch": 0.12080775731696468, + "grad_norm": 0.16343364119529724, + "learning_rate": 0.0009694337025660787, + "loss": 2.9269, + "step": 4074 + }, + { + "epoch": 0.12083741066927617, + "grad_norm": 0.15345576405525208, + "learning_rate": 0.0009694175018322473, + "loss": 2.9096, + "step": 4075 + }, + { + "epoch": 0.12086706402158764, + "grad_norm": 0.1762371063232422, + "learning_rate": 0.0009694012969416436, + "loss": 2.9019, + "step": 4076 + }, + { + "epoch": 0.12089671737389912, + "grad_norm": 0.15459255874156952, + "learning_rate": 0.0009693850878944106, + "loss": 2.8957, + "step": 4077 + }, + { + "epoch": 0.1209263707262106, + "grad_norm": 0.15600387752056122, + "learning_rate": 0.0009693688746906923, + "loss": 2.9274, + "step": 4078 + }, + { + "epoch": 0.12095602407852207, + "grad_norm": 0.15349824726581573, + "learning_rate": 0.0009693526573306319, + "loss": 2.9413, + "step": 4079 + }, + { + "epoch": 0.12098567743083355, + "grad_norm": 0.15503013134002686, + "learning_rate": 0.0009693364358143732, + "loss": 2.9383, + "step": 4080 + }, + { + "epoch": 0.12101533078314504, + "grad_norm": 0.1860615313053131, + "learning_rate": 0.0009693202101420599, + "loss": 2.9238, + "step": 4081 + }, + { + "epoch": 0.12104498413545652, + "grad_norm": 0.17568179965019226, + "learning_rate": 0.0009693039803138355, + "loss": 2.8886, + "step": 4082 + }, + { + "epoch": 0.12107463748776799, + "grad_norm": 0.1771457940340042, + "learning_rate": 0.0009692877463298437, + "loss": 2.9141, + "step": 4083 + }, + { + "epoch": 0.12110429084007947, + "grad_norm": 0.1914258897304535, + "learning_rate": 0.0009692715081902285, + "loss": 2.9015, + "step": 4084 + }, + { + "epoch": 0.12113394419239094, + "grad_norm": 0.18880628049373627, + "learning_rate": 0.0009692552658951334, + "loss": 2.8827, + "step": 4085 + }, + { + "epoch": 0.12116359754470243, + "grad_norm": 0.17174308001995087, + "learning_rate": 0.0009692390194447025, + "loss": 2.8784, + "step": 4086 + }, + { + "epoch": 0.12119325089701391, + "grad_norm": 0.15937931835651398, + "learning_rate": 0.0009692227688390796, + "loss": 2.9251, + "step": 4087 + }, + { + "epoch": 0.12122290424932539, + "grad_norm": 0.16328933835029602, + "learning_rate": 0.0009692065140784083, + "loss": 2.8988, + "step": 4088 + }, + { + "epoch": 0.12125255760163686, + "grad_norm": 0.14978323876857758, + "learning_rate": 0.0009691902551628329, + "loss": 2.9628, + "step": 4089 + }, + { + "epoch": 0.12128221095394834, + "grad_norm": 0.167361319065094, + "learning_rate": 0.0009691739920924974, + "loss": 2.9099, + "step": 4090 + }, + { + "epoch": 0.12131186430625983, + "grad_norm": 0.168547585606575, + "learning_rate": 0.0009691577248675454, + "loss": 2.9329, + "step": 4091 + }, + { + "epoch": 0.1213415176585713, + "grad_norm": 0.19052189588546753, + "learning_rate": 0.0009691414534881215, + "loss": 2.9331, + "step": 4092 + }, + { + "epoch": 0.12137117101088278, + "grad_norm": 0.14796975255012512, + "learning_rate": 0.0009691251779543693, + "loss": 2.8888, + "step": 4093 + }, + { + "epoch": 0.12140082436319426, + "grad_norm": 0.12809711694717407, + "learning_rate": 0.0009691088982664331, + "loss": 2.903, + "step": 4094 + }, + { + "epoch": 0.12143047771550573, + "grad_norm": 0.1449379324913025, + "learning_rate": 0.0009690926144244571, + "loss": 2.9204, + "step": 4095 + }, + { + "epoch": 0.12146013106781722, + "grad_norm": 0.1245410218834877, + "learning_rate": 0.0009690763264285857, + "loss": 2.8862, + "step": 4096 + }, + { + "epoch": 0.1214897844201287, + "grad_norm": 0.10878316313028336, + "learning_rate": 0.0009690600342789627, + "loss": 2.9139, + "step": 4097 + }, + { + "epoch": 0.12151943777244018, + "grad_norm": 0.12305592745542526, + "learning_rate": 0.0009690437379757327, + "loss": 2.9044, + "step": 4098 + }, + { + "epoch": 0.12154909112475165, + "grad_norm": 0.11984182149171829, + "learning_rate": 0.0009690274375190398, + "loss": 2.9278, + "step": 4099 + }, + { + "epoch": 0.12157874447706313, + "grad_norm": 0.11184778064489365, + "learning_rate": 0.0009690111329090285, + "loss": 2.916, + "step": 4100 + }, + { + "epoch": 0.12160839782937462, + "grad_norm": 0.11897645145654678, + "learning_rate": 0.0009689948241458431, + "loss": 2.8492, + "step": 4101 + }, + { + "epoch": 0.1216380511816861, + "grad_norm": 0.12411756068468094, + "learning_rate": 0.000968978511229628, + "loss": 2.9162, + "step": 4102 + }, + { + "epoch": 0.12166770453399757, + "grad_norm": 0.12178720533847809, + "learning_rate": 0.0009689621941605278, + "loss": 2.9136, + "step": 4103 + }, + { + "epoch": 0.12169735788630905, + "grad_norm": 0.13175533711910248, + "learning_rate": 0.0009689458729386866, + "loss": 2.9378, + "step": 4104 + }, + { + "epoch": 0.12172701123862052, + "grad_norm": 0.1396850198507309, + "learning_rate": 0.0009689295475642495, + "loss": 2.9299, + "step": 4105 + }, + { + "epoch": 0.121756664590932, + "grad_norm": 0.1365397721529007, + "learning_rate": 0.0009689132180373606, + "loss": 2.9099, + "step": 4106 + }, + { + "epoch": 0.12178631794324349, + "grad_norm": 0.1549454778432846, + "learning_rate": 0.0009688968843581647, + "loss": 2.8608, + "step": 4107 + }, + { + "epoch": 0.12181597129555496, + "grad_norm": 0.17944833636283875, + "learning_rate": 0.0009688805465268064, + "loss": 2.9029, + "step": 4108 + }, + { + "epoch": 0.12184562464786644, + "grad_norm": 0.17539706826210022, + "learning_rate": 0.0009688642045434304, + "loss": 2.9177, + "step": 4109 + }, + { + "epoch": 0.12187527800017792, + "grad_norm": 0.15337850153446198, + "learning_rate": 0.0009688478584081813, + "loss": 2.9089, + "step": 4110 + }, + { + "epoch": 0.12190493135248939, + "grad_norm": 0.16841720044612885, + "learning_rate": 0.000968831508121204, + "loss": 2.9057, + "step": 4111 + }, + { + "epoch": 0.12193458470480088, + "grad_norm": 0.21770475804805756, + "learning_rate": 0.0009688151536826433, + "loss": 2.8729, + "step": 4112 + }, + { + "epoch": 0.12196423805711236, + "grad_norm": 0.21245643496513367, + "learning_rate": 0.000968798795092644, + "loss": 2.9164, + "step": 4113 + }, + { + "epoch": 0.12199389140942384, + "grad_norm": 0.18157881498336792, + "learning_rate": 0.0009687824323513506, + "loss": 2.9293, + "step": 4114 + }, + { + "epoch": 0.12202354476173531, + "grad_norm": 0.17739304900169373, + "learning_rate": 0.0009687660654589085, + "loss": 2.9384, + "step": 4115 + }, + { + "epoch": 0.12205319811404679, + "grad_norm": 0.1977706104516983, + "learning_rate": 0.0009687496944154625, + "loss": 2.9394, + "step": 4116 + }, + { + "epoch": 0.12208285146635828, + "grad_norm": 0.19758862257003784, + "learning_rate": 0.0009687333192211574, + "loss": 2.9361, + "step": 4117 + }, + { + "epoch": 0.12211250481866975, + "grad_norm": 0.22031599283218384, + "learning_rate": 0.0009687169398761382, + "loss": 2.9223, + "step": 4118 + }, + { + "epoch": 0.12214215817098123, + "grad_norm": 0.1942421793937683, + "learning_rate": 0.0009687005563805502, + "loss": 2.9336, + "step": 4119 + }, + { + "epoch": 0.1221718115232927, + "grad_norm": 0.16650956869125366, + "learning_rate": 0.0009686841687345382, + "loss": 2.9276, + "step": 4120 + }, + { + "epoch": 0.12220146487560418, + "grad_norm": 0.18495309352874756, + "learning_rate": 0.0009686677769382474, + "loss": 2.9181, + "step": 4121 + }, + { + "epoch": 0.12223111822791567, + "grad_norm": 0.18968158960342407, + "learning_rate": 0.0009686513809918232, + "loss": 2.9418, + "step": 4122 + }, + { + "epoch": 0.12226077158022715, + "grad_norm": 0.15455549955368042, + "learning_rate": 0.0009686349808954105, + "loss": 2.9152, + "step": 4123 + }, + { + "epoch": 0.12229042493253862, + "grad_norm": 0.15500985085964203, + "learning_rate": 0.0009686185766491546, + "loss": 2.9041, + "step": 4124 + }, + { + "epoch": 0.1223200782848501, + "grad_norm": 0.1514420211315155, + "learning_rate": 0.0009686021682532007, + "loss": 2.9162, + "step": 4125 + }, + { + "epoch": 0.12234973163716158, + "grad_norm": 0.166293203830719, + "learning_rate": 0.0009685857557076942, + "loss": 2.9375, + "step": 4126 + }, + { + "epoch": 0.12237938498947307, + "grad_norm": 0.17085343599319458, + "learning_rate": 0.0009685693390127805, + "loss": 2.8862, + "step": 4127 + }, + { + "epoch": 0.12240903834178454, + "grad_norm": 0.1716010421514511, + "learning_rate": 0.0009685529181686048, + "loss": 2.9033, + "step": 4128 + }, + { + "epoch": 0.12243869169409602, + "grad_norm": 0.16534991562366486, + "learning_rate": 0.0009685364931753124, + "loss": 2.9209, + "step": 4129 + }, + { + "epoch": 0.1224683450464075, + "grad_norm": 0.14885497093200684, + "learning_rate": 0.0009685200640330491, + "loss": 2.9412, + "step": 4130 + }, + { + "epoch": 0.12249799839871897, + "grad_norm": 0.15863333642482758, + "learning_rate": 0.0009685036307419604, + "loss": 2.9277, + "step": 4131 + }, + { + "epoch": 0.12252765175103045, + "grad_norm": 0.14783763885498047, + "learning_rate": 0.0009684871933021913, + "loss": 2.9098, + "step": 4132 + }, + { + "epoch": 0.12255730510334194, + "grad_norm": 0.12818056344985962, + "learning_rate": 0.0009684707517138879, + "loss": 2.8984, + "step": 4133 + }, + { + "epoch": 0.12258695845565341, + "grad_norm": 0.14552368223667145, + "learning_rate": 0.0009684543059771955, + "loss": 2.9096, + "step": 4134 + }, + { + "epoch": 0.12261661180796489, + "grad_norm": 0.13028880953788757, + "learning_rate": 0.0009684378560922597, + "loss": 2.9252, + "step": 4135 + }, + { + "epoch": 0.12264626516027637, + "grad_norm": 0.1288769245147705, + "learning_rate": 0.0009684214020592265, + "loss": 2.9481, + "step": 4136 + }, + { + "epoch": 0.12267591851258784, + "grad_norm": 0.13301637768745422, + "learning_rate": 0.0009684049438782413, + "loss": 2.9094, + "step": 4137 + }, + { + "epoch": 0.12270557186489933, + "grad_norm": 0.14224915206432343, + "learning_rate": 0.0009683884815494499, + "loss": 2.9197, + "step": 4138 + }, + { + "epoch": 0.12273522521721081, + "grad_norm": 0.11614123731851578, + "learning_rate": 0.0009683720150729981, + "loss": 2.94, + "step": 4139 + }, + { + "epoch": 0.12276487856952228, + "grad_norm": 0.11581072956323624, + "learning_rate": 0.0009683555444490317, + "loss": 2.9135, + "step": 4140 + }, + { + "epoch": 0.12279453192183376, + "grad_norm": 0.13031870126724243, + "learning_rate": 0.0009683390696776966, + "loss": 2.9322, + "step": 4141 + }, + { + "epoch": 0.12282418527414524, + "grad_norm": 0.12929610908031464, + "learning_rate": 0.0009683225907591386, + "loss": 2.9247, + "step": 4142 + }, + { + "epoch": 0.12285383862645673, + "grad_norm": 0.12995561957359314, + "learning_rate": 0.0009683061076935037, + "loss": 2.8805, + "step": 4143 + }, + { + "epoch": 0.1228834919787682, + "grad_norm": 0.1244012862443924, + "learning_rate": 0.0009682896204809378, + "loss": 2.9149, + "step": 4144 + }, + { + "epoch": 0.12291314533107968, + "grad_norm": 0.12846046686172485, + "learning_rate": 0.000968273129121587, + "loss": 2.8848, + "step": 4145 + }, + { + "epoch": 0.12294279868339116, + "grad_norm": 0.15192444622516632, + "learning_rate": 0.0009682566336155971, + "loss": 2.9342, + "step": 4146 + }, + { + "epoch": 0.12297245203570263, + "grad_norm": 0.17074693739414215, + "learning_rate": 0.0009682401339631146, + "loss": 2.9399, + "step": 4147 + }, + { + "epoch": 0.12300210538801412, + "grad_norm": 0.16189777851104736, + "learning_rate": 0.0009682236301642852, + "loss": 2.9642, + "step": 4148 + }, + { + "epoch": 0.1230317587403256, + "grad_norm": 0.16595104336738586, + "learning_rate": 0.0009682071222192551, + "loss": 2.908, + "step": 4149 + }, + { + "epoch": 0.12306141209263707, + "grad_norm": 0.1607166826725006, + "learning_rate": 0.0009681906101281707, + "loss": 2.9481, + "step": 4150 + }, + { + "epoch": 0.12309106544494855, + "grad_norm": 0.1668585240840912, + "learning_rate": 0.0009681740938911781, + "loss": 2.9051, + "step": 4151 + }, + { + "epoch": 0.12312071879726003, + "grad_norm": 0.18142102658748627, + "learning_rate": 0.0009681575735084233, + "loss": 2.8867, + "step": 4152 + }, + { + "epoch": 0.12315037214957152, + "grad_norm": 0.20793196558952332, + "learning_rate": 0.000968141048980053, + "loss": 2.9244, + "step": 4153 + }, + { + "epoch": 0.12318002550188299, + "grad_norm": 0.21354687213897705, + "learning_rate": 0.0009681245203062135, + "loss": 2.8962, + "step": 4154 + }, + { + "epoch": 0.12320967885419447, + "grad_norm": 0.20385828614234924, + "learning_rate": 0.0009681079874870508, + "loss": 2.9232, + "step": 4155 + }, + { + "epoch": 0.12323933220650594, + "grad_norm": 0.22101177275180817, + "learning_rate": 0.0009680914505227116, + "loss": 2.9484, + "step": 4156 + }, + { + "epoch": 0.12326898555881742, + "grad_norm": 0.1972762495279312, + "learning_rate": 0.0009680749094133423, + "loss": 2.9025, + "step": 4157 + }, + { + "epoch": 0.1232986389111289, + "grad_norm": 0.22598569095134735, + "learning_rate": 0.0009680583641590892, + "loss": 2.9317, + "step": 4158 + }, + { + "epoch": 0.12332829226344039, + "grad_norm": 0.19147951900959015, + "learning_rate": 0.0009680418147600991, + "loss": 2.9334, + "step": 4159 + }, + { + "epoch": 0.12335794561575186, + "grad_norm": 0.135511577129364, + "learning_rate": 0.0009680252612165183, + "loss": 2.9263, + "step": 4160 + }, + { + "epoch": 0.12338759896806334, + "grad_norm": 0.18476948142051697, + "learning_rate": 0.0009680087035284935, + "loss": 2.9243, + "step": 4161 + }, + { + "epoch": 0.12341725232037482, + "grad_norm": 0.16834676265716553, + "learning_rate": 0.0009679921416961713, + "loss": 2.8932, + "step": 4162 + }, + { + "epoch": 0.12344690567268629, + "grad_norm": 0.1603792905807495, + "learning_rate": 0.0009679755757196984, + "loss": 2.9086, + "step": 4163 + }, + { + "epoch": 0.12347655902499778, + "grad_norm": 0.1660943329334259, + "learning_rate": 0.0009679590055992213, + "loss": 2.9024, + "step": 4164 + }, + { + "epoch": 0.12350621237730926, + "grad_norm": 0.16347649693489075, + "learning_rate": 0.000967942431334887, + "loss": 2.9043, + "step": 4165 + }, + { + "epoch": 0.12353586572962073, + "grad_norm": 0.12731648981571198, + "learning_rate": 0.0009679258529268422, + "loss": 2.9516, + "step": 4166 + }, + { + "epoch": 0.12356551908193221, + "grad_norm": 0.13476380705833435, + "learning_rate": 0.0009679092703752336, + "loss": 2.8998, + "step": 4167 + }, + { + "epoch": 0.12359517243424369, + "grad_norm": 0.13649238646030426, + "learning_rate": 0.000967892683680208, + "loss": 2.9444, + "step": 4168 + }, + { + "epoch": 0.12362482578655518, + "grad_norm": 0.12912462651729584, + "learning_rate": 0.0009678760928419124, + "loss": 2.8816, + "step": 4169 + }, + { + "epoch": 0.12365447913886665, + "grad_norm": 0.12559537589550018, + "learning_rate": 0.0009678594978604938, + "loss": 2.8906, + "step": 4170 + }, + { + "epoch": 0.12368413249117813, + "grad_norm": 0.1496659368276596, + "learning_rate": 0.000967842898736099, + "loss": 2.9037, + "step": 4171 + }, + { + "epoch": 0.1237137858434896, + "grad_norm": 0.14985297620296478, + "learning_rate": 0.000967826295468875, + "loss": 2.9294, + "step": 4172 + }, + { + "epoch": 0.12374343919580108, + "grad_norm": 0.13624955713748932, + "learning_rate": 0.0009678096880589689, + "loss": 2.8881, + "step": 4173 + }, + { + "epoch": 0.12377309254811257, + "grad_norm": 0.13229775428771973, + "learning_rate": 0.0009677930765065277, + "loss": 2.9223, + "step": 4174 + }, + { + "epoch": 0.12380274590042405, + "grad_norm": 0.15435175597667694, + "learning_rate": 0.0009677764608116984, + "loss": 2.9003, + "step": 4175 + }, + { + "epoch": 0.12383239925273552, + "grad_norm": 0.17428897321224213, + "learning_rate": 0.0009677598409746285, + "loss": 2.9143, + "step": 4176 + }, + { + "epoch": 0.123862052605047, + "grad_norm": 0.1832716464996338, + "learning_rate": 0.0009677432169954646, + "loss": 2.9255, + "step": 4177 + }, + { + "epoch": 0.12389170595735848, + "grad_norm": 0.17643338441848755, + "learning_rate": 0.0009677265888743545, + "loss": 2.923, + "step": 4178 + }, + { + "epoch": 0.12392135930966997, + "grad_norm": 0.1623879075050354, + "learning_rate": 0.0009677099566114449, + "loss": 2.8973, + "step": 4179 + }, + { + "epoch": 0.12395101266198144, + "grad_norm": 0.14139297604560852, + "learning_rate": 0.0009676933202068836, + "loss": 2.924, + "step": 4180 + }, + { + "epoch": 0.12398066601429292, + "grad_norm": 0.13967932760715485, + "learning_rate": 0.0009676766796608175, + "loss": 2.9052, + "step": 4181 + }, + { + "epoch": 0.1240103193666044, + "grad_norm": 0.13260199129581451, + "learning_rate": 0.0009676600349733942, + "loss": 2.8561, + "step": 4182 + }, + { + "epoch": 0.12403997271891587, + "grad_norm": 0.14569272100925446, + "learning_rate": 0.0009676433861447612, + "loss": 2.9098, + "step": 4183 + }, + { + "epoch": 0.12406962607122735, + "grad_norm": 0.17188304662704468, + "learning_rate": 0.0009676267331750654, + "loss": 2.9682, + "step": 4184 + }, + { + "epoch": 0.12409927942353884, + "grad_norm": 0.19191773235797882, + "learning_rate": 0.0009676100760644548, + "loss": 2.9242, + "step": 4185 + }, + { + "epoch": 0.12412893277585031, + "grad_norm": 0.1859394907951355, + "learning_rate": 0.0009675934148130767, + "loss": 2.9175, + "step": 4186 + }, + { + "epoch": 0.12415858612816179, + "grad_norm": 0.1887684315443039, + "learning_rate": 0.0009675767494210785, + "loss": 2.8949, + "step": 4187 + }, + { + "epoch": 0.12418823948047326, + "grad_norm": 0.19791625440120697, + "learning_rate": 0.0009675600798886082, + "loss": 2.86, + "step": 4188 + }, + { + "epoch": 0.12421789283278474, + "grad_norm": 0.21662932634353638, + "learning_rate": 0.000967543406215813, + "loss": 2.8979, + "step": 4189 + }, + { + "epoch": 0.12424754618509623, + "grad_norm": 0.19225995242595673, + "learning_rate": 0.0009675267284028407, + "loss": 2.8888, + "step": 4190 + }, + { + "epoch": 0.1242771995374077, + "grad_norm": 0.1699885129928589, + "learning_rate": 0.000967510046449839, + "loss": 2.9004, + "step": 4191 + }, + { + "epoch": 0.12430685288971918, + "grad_norm": 0.14343850314617157, + "learning_rate": 0.0009674933603569555, + "loss": 2.894, + "step": 4192 + }, + { + "epoch": 0.12433650624203066, + "grad_norm": 0.17096573114395142, + "learning_rate": 0.0009674766701243381, + "loss": 2.8988, + "step": 4193 + }, + { + "epoch": 0.12436615959434213, + "grad_norm": 0.13145902752876282, + "learning_rate": 0.0009674599757521345, + "loss": 2.8754, + "step": 4194 + }, + { + "epoch": 0.12439581294665362, + "grad_norm": 0.1394783854484558, + "learning_rate": 0.0009674432772404926, + "loss": 2.8997, + "step": 4195 + }, + { + "epoch": 0.1244254662989651, + "grad_norm": 0.12878680229187012, + "learning_rate": 0.0009674265745895602, + "loss": 2.8786, + "step": 4196 + }, + { + "epoch": 0.12445511965127658, + "grad_norm": 0.1554274559020996, + "learning_rate": 0.0009674098677994854, + "loss": 2.895, + "step": 4197 + }, + { + "epoch": 0.12448477300358805, + "grad_norm": 0.16016924381256104, + "learning_rate": 0.000967393156870416, + "loss": 2.9295, + "step": 4198 + }, + { + "epoch": 0.12451442635589953, + "grad_norm": 0.14937549829483032, + "learning_rate": 0.0009673764418024997, + "loss": 2.8642, + "step": 4199 + }, + { + "epoch": 0.12454407970821102, + "grad_norm": 0.13676613569259644, + "learning_rate": 0.000967359722595885, + "loss": 2.9148, + "step": 4200 + }, + { + "epoch": 0.1245737330605225, + "grad_norm": 0.14228668808937073, + "learning_rate": 0.0009673429992507197, + "loss": 2.923, + "step": 4201 + }, + { + "epoch": 0.12460338641283397, + "grad_norm": 0.13573423027992249, + "learning_rate": 0.000967326271767152, + "loss": 2.9298, + "step": 4202 + }, + { + "epoch": 0.12463303976514545, + "grad_norm": 0.13814912736415863, + "learning_rate": 0.0009673095401453298, + "loss": 2.8935, + "step": 4203 + }, + { + "epoch": 0.12466269311745692, + "grad_norm": 0.1250903159379959, + "learning_rate": 0.0009672928043854014, + "loss": 2.9185, + "step": 4204 + }, + { + "epoch": 0.12469234646976841, + "grad_norm": 0.1378580778837204, + "learning_rate": 0.0009672760644875151, + "loss": 2.9189, + "step": 4205 + }, + { + "epoch": 0.12472199982207989, + "grad_norm": 0.13773661851882935, + "learning_rate": 0.000967259320451819, + "loss": 2.8903, + "step": 4206 + }, + { + "epoch": 0.12475165317439137, + "grad_norm": 0.14692947268486023, + "learning_rate": 0.0009672425722784615, + "loss": 2.9138, + "step": 4207 + }, + { + "epoch": 0.12478130652670284, + "grad_norm": 0.1434909850358963, + "learning_rate": 0.0009672258199675907, + "loss": 2.9617, + "step": 4208 + }, + { + "epoch": 0.12481095987901432, + "grad_norm": 0.15397629141807556, + "learning_rate": 0.0009672090635193552, + "loss": 2.9573, + "step": 4209 + }, + { + "epoch": 0.1248406132313258, + "grad_norm": 0.15407299995422363, + "learning_rate": 0.0009671923029339032, + "loss": 2.9043, + "step": 4210 + }, + { + "epoch": 0.12487026658363728, + "grad_norm": 0.15757018327713013, + "learning_rate": 0.000967175538211383, + "loss": 2.9321, + "step": 4211 + }, + { + "epoch": 0.12489991993594876, + "grad_norm": 0.16315029561519623, + "learning_rate": 0.0009671587693519435, + "loss": 2.8979, + "step": 4212 + }, + { + "epoch": 0.12492957328826024, + "grad_norm": 0.17218202352523804, + "learning_rate": 0.0009671419963557326, + "loss": 2.9283, + "step": 4213 + }, + { + "epoch": 0.12495922664057171, + "grad_norm": 0.1539960503578186, + "learning_rate": 0.0009671252192228994, + "loss": 2.9065, + "step": 4214 + }, + { + "epoch": 0.12498887999288319, + "grad_norm": 0.1412593275308609, + "learning_rate": 0.0009671084379535922, + "loss": 2.8971, + "step": 4215 + }, + { + "epoch": 0.12501853334519467, + "grad_norm": 0.1333884298801422, + "learning_rate": 0.0009670916525479594, + "loss": 2.9303, + "step": 4216 + }, + { + "epoch": 0.12504818669750614, + "grad_norm": 0.15028877556324005, + "learning_rate": 0.0009670748630061499, + "loss": 2.9193, + "step": 4217 + }, + { + "epoch": 0.12507784004981765, + "grad_norm": 0.15185153484344482, + "learning_rate": 0.0009670580693283124, + "loss": 2.9009, + "step": 4218 + }, + { + "epoch": 0.12510749340212912, + "grad_norm": 0.17596372961997986, + "learning_rate": 0.0009670412715145955, + "loss": 2.9167, + "step": 4219 + }, + { + "epoch": 0.1251371467544406, + "grad_norm": 0.20296363532543182, + "learning_rate": 0.000967024469565148, + "loss": 2.9014, + "step": 4220 + }, + { + "epoch": 0.12516680010675207, + "grad_norm": 0.22714807093143463, + "learning_rate": 0.0009670076634801186, + "loss": 2.9031, + "step": 4221 + }, + { + "epoch": 0.12519645345906355, + "grad_norm": 0.21553419530391693, + "learning_rate": 0.0009669908532596562, + "loss": 2.9202, + "step": 4222 + }, + { + "epoch": 0.12522610681137503, + "grad_norm": 0.19403885304927826, + "learning_rate": 0.0009669740389039097, + "loss": 2.9092, + "step": 4223 + }, + { + "epoch": 0.1252557601636865, + "grad_norm": 0.18893055617809296, + "learning_rate": 0.0009669572204130278, + "loss": 2.9329, + "step": 4224 + }, + { + "epoch": 0.12528541351599798, + "grad_norm": 0.1584555208683014, + "learning_rate": 0.0009669403977871596, + "loss": 2.8885, + "step": 4225 + }, + { + "epoch": 0.12531506686830945, + "grad_norm": 0.14705903828144073, + "learning_rate": 0.0009669235710264542, + "loss": 2.8907, + "step": 4226 + }, + { + "epoch": 0.12534472022062093, + "grad_norm": 0.16788682341575623, + "learning_rate": 0.0009669067401310602, + "loss": 2.9362, + "step": 4227 + }, + { + "epoch": 0.1253743735729324, + "grad_norm": 0.156332865357399, + "learning_rate": 0.0009668899051011269, + "loss": 2.8932, + "step": 4228 + }, + { + "epoch": 0.1254040269252439, + "grad_norm": 0.151275172829628, + "learning_rate": 0.0009668730659368035, + "loss": 2.8969, + "step": 4229 + }, + { + "epoch": 0.1254336802775554, + "grad_norm": 0.16234277188777924, + "learning_rate": 0.0009668562226382388, + "loss": 2.8957, + "step": 4230 + }, + { + "epoch": 0.12546333362986686, + "grad_norm": 0.16336627304553986, + "learning_rate": 0.0009668393752055821, + "loss": 2.9296, + "step": 4231 + }, + { + "epoch": 0.12549298698217834, + "grad_norm": 0.17019978165626526, + "learning_rate": 0.0009668225236389829, + "loss": 2.9049, + "step": 4232 + }, + { + "epoch": 0.12552264033448982, + "grad_norm": 0.14664137363433838, + "learning_rate": 0.0009668056679385898, + "loss": 2.9312, + "step": 4233 + }, + { + "epoch": 0.1255522936868013, + "grad_norm": 0.1473105549812317, + "learning_rate": 0.0009667888081045525, + "loss": 2.9208, + "step": 4234 + }, + { + "epoch": 0.12558194703911277, + "grad_norm": 0.15856795012950897, + "learning_rate": 0.0009667719441370201, + "loss": 2.9172, + "step": 4235 + }, + { + "epoch": 0.12561160039142424, + "grad_norm": 0.16839316487312317, + "learning_rate": 0.0009667550760361422, + "loss": 2.9392, + "step": 4236 + }, + { + "epoch": 0.12564125374373572, + "grad_norm": 0.18520401418209076, + "learning_rate": 0.0009667382038020679, + "loss": 2.9244, + "step": 4237 + }, + { + "epoch": 0.1256709070960472, + "grad_norm": 0.16107213497161865, + "learning_rate": 0.0009667213274349467, + "loss": 2.8946, + "step": 4238 + }, + { + "epoch": 0.1257005604483587, + "grad_norm": 0.14792965352535248, + "learning_rate": 0.000966704446934928, + "loss": 2.9054, + "step": 4239 + }, + { + "epoch": 0.12573021380067018, + "grad_norm": 0.17473018169403076, + "learning_rate": 0.0009666875623021613, + "loss": 2.9187, + "step": 4240 + }, + { + "epoch": 0.12575986715298165, + "grad_norm": 0.1620602309703827, + "learning_rate": 0.0009666706735367961, + "loss": 2.921, + "step": 4241 + }, + { + "epoch": 0.12578952050529313, + "grad_norm": 0.15606237947940826, + "learning_rate": 0.000966653780638982, + "loss": 2.9129, + "step": 4242 + }, + { + "epoch": 0.1258191738576046, + "grad_norm": 0.16795900464057922, + "learning_rate": 0.0009666368836088686, + "loss": 2.9172, + "step": 4243 + }, + { + "epoch": 0.12584882720991608, + "grad_norm": 0.16895850002765656, + "learning_rate": 0.0009666199824466056, + "loss": 2.9252, + "step": 4244 + }, + { + "epoch": 0.12587848056222756, + "grad_norm": 0.15924540162086487, + "learning_rate": 0.0009666030771523424, + "loss": 2.9085, + "step": 4245 + }, + { + "epoch": 0.12590813391453903, + "grad_norm": 0.14358140528202057, + "learning_rate": 0.0009665861677262289, + "loss": 2.8672, + "step": 4246 + }, + { + "epoch": 0.1259377872668505, + "grad_norm": 0.1290850043296814, + "learning_rate": 0.0009665692541684147, + "loss": 2.881, + "step": 4247 + }, + { + "epoch": 0.12596744061916199, + "grad_norm": 0.14071033895015717, + "learning_rate": 0.0009665523364790499, + "loss": 2.9284, + "step": 4248 + }, + { + "epoch": 0.12599709397147346, + "grad_norm": 0.12611864507198334, + "learning_rate": 0.000966535414658284, + "loss": 2.9272, + "step": 4249 + }, + { + "epoch": 0.12602674732378497, + "grad_norm": 0.12561944127082825, + "learning_rate": 0.000966518488706267, + "loss": 2.8872, + "step": 4250 + }, + { + "epoch": 0.12605640067609644, + "grad_norm": 0.1550622284412384, + "learning_rate": 0.0009665015586231485, + "loss": 2.8919, + "step": 4251 + }, + { + "epoch": 0.12608605402840792, + "grad_norm": 0.15381476283073425, + "learning_rate": 0.0009664846244090787, + "loss": 2.9562, + "step": 4252 + }, + { + "epoch": 0.1261157073807194, + "grad_norm": 0.1694144606590271, + "learning_rate": 0.0009664676860642074, + "loss": 2.9141, + "step": 4253 + }, + { + "epoch": 0.12614536073303087, + "grad_norm": 0.18804024159908295, + "learning_rate": 0.0009664507435886849, + "loss": 2.8941, + "step": 4254 + }, + { + "epoch": 0.12617501408534235, + "grad_norm": 0.17549091577529907, + "learning_rate": 0.0009664337969826609, + "loss": 2.9283, + "step": 4255 + }, + { + "epoch": 0.12620466743765382, + "grad_norm": 0.1659145951271057, + "learning_rate": 0.0009664168462462855, + "loss": 2.8897, + "step": 4256 + }, + { + "epoch": 0.1262343207899653, + "grad_norm": 0.1871439665555954, + "learning_rate": 0.0009663998913797089, + "loss": 2.8919, + "step": 4257 + }, + { + "epoch": 0.12626397414227677, + "grad_norm": 0.15693101286888123, + "learning_rate": 0.0009663829323830811, + "loss": 2.9206, + "step": 4258 + }, + { + "epoch": 0.12629362749458825, + "grad_norm": 0.14534536004066467, + "learning_rate": 0.0009663659692565525, + "loss": 2.8775, + "step": 4259 + }, + { + "epoch": 0.12632328084689975, + "grad_norm": 0.14108800888061523, + "learning_rate": 0.0009663490020002732, + "loss": 2.9068, + "step": 4260 + }, + { + "epoch": 0.12635293419921123, + "grad_norm": 0.1513291895389557, + "learning_rate": 0.0009663320306143935, + "loss": 2.9086, + "step": 4261 + }, + { + "epoch": 0.1263825875515227, + "grad_norm": 0.1733872890472412, + "learning_rate": 0.0009663150550990636, + "loss": 2.8827, + "step": 4262 + }, + { + "epoch": 0.12641224090383418, + "grad_norm": 0.1626979559659958, + "learning_rate": 0.0009662980754544337, + "loss": 2.9088, + "step": 4263 + }, + { + "epoch": 0.12644189425614566, + "grad_norm": 0.1700531393289566, + "learning_rate": 0.0009662810916806543, + "loss": 2.9305, + "step": 4264 + }, + { + "epoch": 0.12647154760845714, + "grad_norm": 0.1628144234418869, + "learning_rate": 0.000966264103777876, + "loss": 2.9299, + "step": 4265 + }, + { + "epoch": 0.1265012009607686, + "grad_norm": 0.14651311933994293, + "learning_rate": 0.0009662471117462489, + "loss": 2.889, + "step": 4266 + }, + { + "epoch": 0.1265308543130801, + "grad_norm": 0.13824951648712158, + "learning_rate": 0.0009662301155859236, + "loss": 2.909, + "step": 4267 + }, + { + "epoch": 0.12656050766539156, + "grad_norm": 0.12674163281917572, + "learning_rate": 0.0009662131152970506, + "loss": 2.9035, + "step": 4268 + }, + { + "epoch": 0.12659016101770304, + "grad_norm": 0.1487300992012024, + "learning_rate": 0.0009661961108797805, + "loss": 2.8648, + "step": 4269 + }, + { + "epoch": 0.12661981437001454, + "grad_norm": 0.16395339369773865, + "learning_rate": 0.0009661791023342637, + "loss": 2.9084, + "step": 4270 + }, + { + "epoch": 0.12664946772232602, + "grad_norm": 0.13810352981090546, + "learning_rate": 0.0009661620896606511, + "loss": 2.9529, + "step": 4271 + }, + { + "epoch": 0.1266791210746375, + "grad_norm": 0.16657422482967377, + "learning_rate": 0.0009661450728590931, + "loss": 2.9034, + "step": 4272 + }, + { + "epoch": 0.12670877442694897, + "grad_norm": 0.1577882021665573, + "learning_rate": 0.0009661280519297404, + "loss": 2.9291, + "step": 4273 + }, + { + "epoch": 0.12673842777926045, + "grad_norm": 0.1395055055618286, + "learning_rate": 0.0009661110268727438, + "loss": 2.9405, + "step": 4274 + }, + { + "epoch": 0.12676808113157192, + "grad_norm": 0.15144555270671844, + "learning_rate": 0.0009660939976882541, + "loss": 2.9259, + "step": 4275 + }, + { + "epoch": 0.1267977344838834, + "grad_norm": 0.14760728180408478, + "learning_rate": 0.000966076964376422, + "loss": 2.9225, + "step": 4276 + }, + { + "epoch": 0.12682738783619488, + "grad_norm": 0.13816913962364197, + "learning_rate": 0.0009660599269373984, + "loss": 2.9368, + "step": 4277 + }, + { + "epoch": 0.12685704118850635, + "grad_norm": 0.1361888349056244, + "learning_rate": 0.0009660428853713343, + "loss": 2.9176, + "step": 4278 + }, + { + "epoch": 0.12688669454081783, + "grad_norm": 0.14000047743320465, + "learning_rate": 0.0009660258396783802, + "loss": 2.9274, + "step": 4279 + }, + { + "epoch": 0.1269163478931293, + "grad_norm": 0.13045914471149445, + "learning_rate": 0.0009660087898586874, + "loss": 2.929, + "step": 4280 + }, + { + "epoch": 0.1269460012454408, + "grad_norm": 0.1486787647008896, + "learning_rate": 0.0009659917359124069, + "loss": 2.8875, + "step": 4281 + }, + { + "epoch": 0.12697565459775229, + "grad_norm": 0.1339513063430786, + "learning_rate": 0.0009659746778396894, + "loss": 2.9245, + "step": 4282 + }, + { + "epoch": 0.12700530795006376, + "grad_norm": 0.1280677765607834, + "learning_rate": 0.0009659576156406861, + "loss": 2.9147, + "step": 4283 + }, + { + "epoch": 0.12703496130237524, + "grad_norm": 0.1536700278520584, + "learning_rate": 0.0009659405493155484, + "loss": 2.9326, + "step": 4284 + }, + { + "epoch": 0.12706461465468671, + "grad_norm": 0.18044666945934296, + "learning_rate": 0.000965923478864427, + "loss": 2.9342, + "step": 4285 + }, + { + "epoch": 0.1270942680069982, + "grad_norm": 0.17796535789966583, + "learning_rate": 0.0009659064042874733, + "loss": 2.9017, + "step": 4286 + }, + { + "epoch": 0.12712392135930967, + "grad_norm": 0.16024279594421387, + "learning_rate": 0.0009658893255848382, + "loss": 2.9212, + "step": 4287 + }, + { + "epoch": 0.12715357471162114, + "grad_norm": 0.19440068304538727, + "learning_rate": 0.0009658722427566734, + "loss": 2.9147, + "step": 4288 + }, + { + "epoch": 0.12718322806393262, + "grad_norm": 0.19812938570976257, + "learning_rate": 0.0009658551558031299, + "loss": 2.9089, + "step": 4289 + }, + { + "epoch": 0.1272128814162441, + "grad_norm": 0.19952036440372467, + "learning_rate": 0.0009658380647243589, + "loss": 2.891, + "step": 4290 + }, + { + "epoch": 0.1272425347685556, + "grad_norm": 0.20623354613780975, + "learning_rate": 0.0009658209695205119, + "loss": 2.8989, + "step": 4291 + }, + { + "epoch": 0.12727218812086707, + "grad_norm": 0.17259712517261505, + "learning_rate": 0.0009658038701917403, + "loss": 2.8806, + "step": 4292 + }, + { + "epoch": 0.12730184147317855, + "grad_norm": 0.17462217807769775, + "learning_rate": 0.0009657867667381954, + "loss": 2.8949, + "step": 4293 + }, + { + "epoch": 0.12733149482549003, + "grad_norm": 0.17052100598812103, + "learning_rate": 0.0009657696591600289, + "loss": 2.9109, + "step": 4294 + }, + { + "epoch": 0.1273611481778015, + "grad_norm": 0.17049257457256317, + "learning_rate": 0.0009657525474573921, + "loss": 2.9161, + "step": 4295 + }, + { + "epoch": 0.12739080153011298, + "grad_norm": 0.17234398424625397, + "learning_rate": 0.0009657354316304364, + "loss": 2.9075, + "step": 4296 + }, + { + "epoch": 0.12742045488242446, + "grad_norm": 0.14543724060058594, + "learning_rate": 0.0009657183116793136, + "loss": 2.8874, + "step": 4297 + }, + { + "epoch": 0.12745010823473593, + "grad_norm": 0.13018448650836945, + "learning_rate": 0.0009657011876041753, + "loss": 2.9113, + "step": 4298 + }, + { + "epoch": 0.1274797615870474, + "grad_norm": 0.14632125198841095, + "learning_rate": 0.0009656840594051728, + "loss": 2.8829, + "step": 4299 + }, + { + "epoch": 0.12750941493935888, + "grad_norm": 0.14156627655029297, + "learning_rate": 0.0009656669270824582, + "loss": 2.8935, + "step": 4300 + }, + { + "epoch": 0.12753906829167036, + "grad_norm": 0.13591106235980988, + "learning_rate": 0.000965649790636183, + "loss": 2.8961, + "step": 4301 + }, + { + "epoch": 0.12756872164398186, + "grad_norm": 0.14360284805297852, + "learning_rate": 0.0009656326500664989, + "loss": 2.893, + "step": 4302 + }, + { + "epoch": 0.12759837499629334, + "grad_norm": 0.1581641435623169, + "learning_rate": 0.0009656155053735579, + "loss": 2.9343, + "step": 4303 + }, + { + "epoch": 0.12762802834860482, + "grad_norm": 0.18338459730148315, + "learning_rate": 0.0009655983565575115, + "loss": 2.8672, + "step": 4304 + }, + { + "epoch": 0.1276576817009163, + "grad_norm": 0.18501992523670197, + "learning_rate": 0.0009655812036185119, + "loss": 2.9225, + "step": 4305 + }, + { + "epoch": 0.12768733505322777, + "grad_norm": 0.1680738478899002, + "learning_rate": 0.0009655640465567108, + "loss": 2.9518, + "step": 4306 + }, + { + "epoch": 0.12771698840553924, + "grad_norm": 0.18406705558300018, + "learning_rate": 0.00096554688537226, + "loss": 2.941, + "step": 4307 + }, + { + "epoch": 0.12774664175785072, + "grad_norm": 0.1969601958990097, + "learning_rate": 0.0009655297200653119, + "loss": 2.8956, + "step": 4308 + }, + { + "epoch": 0.1277762951101622, + "grad_norm": 0.19536685943603516, + "learning_rate": 0.000965512550636018, + "loss": 2.9198, + "step": 4309 + }, + { + "epoch": 0.12780594846247367, + "grad_norm": 0.1521073281764984, + "learning_rate": 0.0009654953770845305, + "loss": 2.8827, + "step": 4310 + }, + { + "epoch": 0.12783560181478515, + "grad_norm": 0.15128193795681, + "learning_rate": 0.0009654781994110018, + "loss": 2.9117, + "step": 4311 + }, + { + "epoch": 0.12786525516709665, + "grad_norm": 0.14698350429534912, + "learning_rate": 0.0009654610176155836, + "loss": 2.8971, + "step": 4312 + }, + { + "epoch": 0.12789490851940813, + "grad_norm": 0.1343841552734375, + "learning_rate": 0.0009654438316984281, + "loss": 2.9204, + "step": 4313 + }, + { + "epoch": 0.1279245618717196, + "grad_norm": 0.13196761906147003, + "learning_rate": 0.0009654266416596878, + "loss": 2.8751, + "step": 4314 + }, + { + "epoch": 0.12795421522403108, + "grad_norm": 0.13857302069664001, + "learning_rate": 0.0009654094474995144, + "loss": 2.8994, + "step": 4315 + }, + { + "epoch": 0.12798386857634256, + "grad_norm": 0.13924771547317505, + "learning_rate": 0.0009653922492180607, + "loss": 2.8908, + "step": 4316 + }, + { + "epoch": 0.12801352192865403, + "grad_norm": 0.14078456163406372, + "learning_rate": 0.0009653750468154788, + "loss": 2.896, + "step": 4317 + }, + { + "epoch": 0.1280431752809655, + "grad_norm": 0.16217055916786194, + "learning_rate": 0.0009653578402919207, + "loss": 2.913, + "step": 4318 + }, + { + "epoch": 0.12807282863327699, + "grad_norm": 0.15836061537265778, + "learning_rate": 0.0009653406296475393, + "loss": 2.913, + "step": 4319 + }, + { + "epoch": 0.12810248198558846, + "grad_norm": 0.15967100858688354, + "learning_rate": 0.0009653234148824866, + "loss": 2.9239, + "step": 4320 + }, + { + "epoch": 0.12813213533789994, + "grad_norm": 0.14705051481723785, + "learning_rate": 0.0009653061959969152, + "loss": 2.913, + "step": 4321 + }, + { + "epoch": 0.12816178869021144, + "grad_norm": 0.14352621138095856, + "learning_rate": 0.0009652889729909776, + "loss": 2.9091, + "step": 4322 + }, + { + "epoch": 0.12819144204252292, + "grad_norm": 0.1473979502916336, + "learning_rate": 0.0009652717458648264, + "loss": 2.9125, + "step": 4323 + }, + { + "epoch": 0.1282210953948344, + "grad_norm": 0.13720813393592834, + "learning_rate": 0.0009652545146186138, + "loss": 2.9002, + "step": 4324 + }, + { + "epoch": 0.12825074874714587, + "grad_norm": 0.15828725695610046, + "learning_rate": 0.0009652372792524927, + "loss": 2.9037, + "step": 4325 + }, + { + "epoch": 0.12828040209945735, + "grad_norm": 0.17555633187294006, + "learning_rate": 0.0009652200397666157, + "loss": 2.9118, + "step": 4326 + }, + { + "epoch": 0.12831005545176882, + "grad_norm": 0.19278836250305176, + "learning_rate": 0.0009652027961611354, + "loss": 2.889, + "step": 4327 + }, + { + "epoch": 0.1283397088040803, + "grad_norm": 0.14296625554561615, + "learning_rate": 0.0009651855484362045, + "loss": 2.9421, + "step": 4328 + }, + { + "epoch": 0.12836936215639178, + "grad_norm": 0.13938894867897034, + "learning_rate": 0.0009651682965919755, + "loss": 2.9098, + "step": 4329 + }, + { + "epoch": 0.12839901550870325, + "grad_norm": 0.15289513766765594, + "learning_rate": 0.0009651510406286016, + "loss": 2.8926, + "step": 4330 + }, + { + "epoch": 0.12842866886101473, + "grad_norm": 0.13546337187290192, + "learning_rate": 0.0009651337805462354, + "loss": 2.8855, + "step": 4331 + }, + { + "epoch": 0.1284583222133262, + "grad_norm": 0.1538798213005066, + "learning_rate": 0.0009651165163450296, + "loss": 2.9071, + "step": 4332 + }, + { + "epoch": 0.1284879755656377, + "grad_norm": 0.1696440875530243, + "learning_rate": 0.0009650992480251373, + "loss": 2.9084, + "step": 4333 + }, + { + "epoch": 0.12851762891794918, + "grad_norm": 0.16889816522598267, + "learning_rate": 0.0009650819755867113, + "loss": 2.8999, + "step": 4334 + }, + { + "epoch": 0.12854728227026066, + "grad_norm": 0.16069191694259644, + "learning_rate": 0.0009650646990299046, + "loss": 2.8916, + "step": 4335 + }, + { + "epoch": 0.12857693562257214, + "grad_norm": 0.155861034989357, + "learning_rate": 0.0009650474183548701, + "loss": 2.904, + "step": 4336 + }, + { + "epoch": 0.1286065889748836, + "grad_norm": 0.17520752549171448, + "learning_rate": 0.000965030133561761, + "loss": 2.8976, + "step": 4337 + }, + { + "epoch": 0.1286362423271951, + "grad_norm": 0.17323827743530273, + "learning_rate": 0.0009650128446507302, + "loss": 2.9197, + "step": 4338 + }, + { + "epoch": 0.12866589567950656, + "grad_norm": 0.16895517706871033, + "learning_rate": 0.0009649955516219308, + "loss": 2.8696, + "step": 4339 + }, + { + "epoch": 0.12869554903181804, + "grad_norm": 0.14357861876487732, + "learning_rate": 0.000964978254475516, + "loss": 2.931, + "step": 4340 + }, + { + "epoch": 0.12872520238412952, + "grad_norm": 0.124570332467556, + "learning_rate": 0.000964960953211639, + "loss": 2.9175, + "step": 4341 + }, + { + "epoch": 0.128754855736441, + "grad_norm": 0.13406679034233093, + "learning_rate": 0.0009649436478304528, + "loss": 2.9147, + "step": 4342 + }, + { + "epoch": 0.1287845090887525, + "grad_norm": 0.12362726777791977, + "learning_rate": 0.000964926338332111, + "loss": 2.9082, + "step": 4343 + }, + { + "epoch": 0.12881416244106397, + "grad_norm": 0.1417793333530426, + "learning_rate": 0.0009649090247167664, + "loss": 2.903, + "step": 4344 + }, + { + "epoch": 0.12884381579337545, + "grad_norm": 0.14268310368061066, + "learning_rate": 0.0009648917069845728, + "loss": 2.9159, + "step": 4345 + }, + { + "epoch": 0.12887346914568693, + "grad_norm": 0.17168045043945312, + "learning_rate": 0.0009648743851356833, + "loss": 2.8867, + "step": 4346 + }, + { + "epoch": 0.1289031224979984, + "grad_norm": 0.20146924257278442, + "learning_rate": 0.0009648570591702513, + "loss": 2.9516, + "step": 4347 + }, + { + "epoch": 0.12893277585030988, + "grad_norm": 0.21651968359947205, + "learning_rate": 0.0009648397290884304, + "loss": 2.8832, + "step": 4348 + }, + { + "epoch": 0.12896242920262135, + "grad_norm": 0.20730704069137573, + "learning_rate": 0.0009648223948903736, + "loss": 2.8872, + "step": 4349 + }, + { + "epoch": 0.12899208255493283, + "grad_norm": 0.15342438220977783, + "learning_rate": 0.0009648050565762349, + "loss": 2.8908, + "step": 4350 + }, + { + "epoch": 0.1290217359072443, + "grad_norm": 0.1611441671848297, + "learning_rate": 0.0009647877141461676, + "loss": 2.8899, + "step": 4351 + }, + { + "epoch": 0.12905138925955578, + "grad_norm": 0.16708599030971527, + "learning_rate": 0.0009647703676003254, + "loss": 2.8589, + "step": 4352 + }, + { + "epoch": 0.12908104261186726, + "grad_norm": 0.14614422619342804, + "learning_rate": 0.0009647530169388617, + "loss": 2.8829, + "step": 4353 + }, + { + "epoch": 0.12911069596417876, + "grad_norm": 0.14355617761611938, + "learning_rate": 0.0009647356621619303, + "loss": 2.9274, + "step": 4354 + }, + { + "epoch": 0.12914034931649024, + "grad_norm": 0.14587801694869995, + "learning_rate": 0.0009647183032696849, + "loss": 2.8718, + "step": 4355 + }, + { + "epoch": 0.12917000266880171, + "grad_norm": 0.21172288060188293, + "learning_rate": 0.000964700940262279, + "loss": 2.8985, + "step": 4356 + }, + { + "epoch": 0.1291996560211132, + "grad_norm": 0.14984755218029022, + "learning_rate": 0.0009646835731398667, + "loss": 2.9192, + "step": 4357 + }, + { + "epoch": 0.12922930937342467, + "grad_norm": 0.15086430311203003, + "learning_rate": 0.0009646662019026016, + "loss": 2.9009, + "step": 4358 + }, + { + "epoch": 0.12925896272573614, + "grad_norm": 0.14923812448978424, + "learning_rate": 0.0009646488265506373, + "loss": 2.8918, + "step": 4359 + }, + { + "epoch": 0.12928861607804762, + "grad_norm": 0.1357213854789734, + "learning_rate": 0.000964631447084128, + "loss": 2.934, + "step": 4360 + }, + { + "epoch": 0.1293182694303591, + "grad_norm": 0.13446517288684845, + "learning_rate": 0.0009646140635032277, + "loss": 2.9039, + "step": 4361 + }, + { + "epoch": 0.12934792278267057, + "grad_norm": 0.14624276757240295, + "learning_rate": 0.0009645966758080898, + "loss": 2.9198, + "step": 4362 + }, + { + "epoch": 0.12937757613498205, + "grad_norm": 0.14550966024398804, + "learning_rate": 0.0009645792839988687, + "loss": 2.8955, + "step": 4363 + }, + { + "epoch": 0.12940722948729355, + "grad_norm": 0.13394884765148163, + "learning_rate": 0.0009645618880757183, + "loss": 2.8873, + "step": 4364 + }, + { + "epoch": 0.12943688283960503, + "grad_norm": 0.13297423720359802, + "learning_rate": 0.0009645444880387927, + "loss": 2.8769, + "step": 4365 + }, + { + "epoch": 0.1294665361919165, + "grad_norm": 0.1416965276002884, + "learning_rate": 0.0009645270838882458, + "loss": 2.9203, + "step": 4366 + }, + { + "epoch": 0.12949618954422798, + "grad_norm": 0.17079110443592072, + "learning_rate": 0.000964509675624232, + "loss": 2.9117, + "step": 4367 + }, + { + "epoch": 0.12952584289653946, + "grad_norm": 0.16956821084022522, + "learning_rate": 0.0009644922632469051, + "loss": 2.906, + "step": 4368 + }, + { + "epoch": 0.12955549624885093, + "grad_norm": 0.1496324986219406, + "learning_rate": 0.0009644748467564196, + "loss": 2.8565, + "step": 4369 + }, + { + "epoch": 0.1295851496011624, + "grad_norm": 0.17050284147262573, + "learning_rate": 0.0009644574261529295, + "loss": 2.9002, + "step": 4370 + }, + { + "epoch": 0.12961480295347388, + "grad_norm": 0.19579699635505676, + "learning_rate": 0.0009644400014365892, + "loss": 2.913, + "step": 4371 + }, + { + "epoch": 0.12964445630578536, + "grad_norm": 0.17653876543045044, + "learning_rate": 0.0009644225726075531, + "loss": 2.9506, + "step": 4372 + }, + { + "epoch": 0.12967410965809684, + "grad_norm": 0.18565697968006134, + "learning_rate": 0.0009644051396659754, + "loss": 2.9031, + "step": 4373 + }, + { + "epoch": 0.1297037630104083, + "grad_norm": 0.20479047298431396, + "learning_rate": 0.0009643877026120104, + "loss": 2.9022, + "step": 4374 + }, + { + "epoch": 0.12973341636271982, + "grad_norm": 0.20002037286758423, + "learning_rate": 0.0009643702614458126, + "loss": 2.8766, + "step": 4375 + }, + { + "epoch": 0.1297630697150313, + "grad_norm": 0.1922181397676468, + "learning_rate": 0.0009643528161675364, + "loss": 2.8949, + "step": 4376 + }, + { + "epoch": 0.12979272306734277, + "grad_norm": 0.1704670488834381, + "learning_rate": 0.0009643353667773362, + "loss": 2.8646, + "step": 4377 + }, + { + "epoch": 0.12982237641965425, + "grad_norm": 0.1627606898546219, + "learning_rate": 0.0009643179132753668, + "loss": 2.9295, + "step": 4378 + }, + { + "epoch": 0.12985202977196572, + "grad_norm": 0.16846652328968048, + "learning_rate": 0.0009643004556617825, + "loss": 2.9026, + "step": 4379 + }, + { + "epoch": 0.1298816831242772, + "grad_norm": 0.16503100097179413, + "learning_rate": 0.0009642829939367379, + "loss": 2.9283, + "step": 4380 + }, + { + "epoch": 0.12991133647658867, + "grad_norm": 0.17251496016979218, + "learning_rate": 0.0009642655281003878, + "loss": 2.867, + "step": 4381 + }, + { + "epoch": 0.12994098982890015, + "grad_norm": 0.14105740189552307, + "learning_rate": 0.0009642480581528867, + "loss": 2.9028, + "step": 4382 + }, + { + "epoch": 0.12997064318121163, + "grad_norm": 0.13317511975765228, + "learning_rate": 0.0009642305840943894, + "loss": 2.9098, + "step": 4383 + }, + { + "epoch": 0.1300002965335231, + "grad_norm": 0.1373881697654724, + "learning_rate": 0.0009642131059250506, + "loss": 2.9241, + "step": 4384 + }, + { + "epoch": 0.1300299498858346, + "grad_norm": 0.13648301362991333, + "learning_rate": 0.0009641956236450251, + "loss": 2.8999, + "step": 4385 + }, + { + "epoch": 0.13005960323814608, + "grad_norm": 0.12912844121456146, + "learning_rate": 0.0009641781372544676, + "loss": 2.8973, + "step": 4386 + }, + { + "epoch": 0.13008925659045756, + "grad_norm": 0.13011111319065094, + "learning_rate": 0.0009641606467535331, + "loss": 2.9282, + "step": 4387 + }, + { + "epoch": 0.13011890994276903, + "grad_norm": 0.15047456324100494, + "learning_rate": 0.0009641431521423763, + "loss": 2.9152, + "step": 4388 + }, + { + "epoch": 0.1301485632950805, + "grad_norm": 0.16446621716022491, + "learning_rate": 0.0009641256534211522, + "loss": 2.9238, + "step": 4389 + }, + { + "epoch": 0.130178216647392, + "grad_norm": 0.17107373476028442, + "learning_rate": 0.000964108150590016, + "loss": 2.9471, + "step": 4390 + }, + { + "epoch": 0.13020786999970346, + "grad_norm": 0.17261561751365662, + "learning_rate": 0.0009640906436491222, + "loss": 2.9122, + "step": 4391 + }, + { + "epoch": 0.13023752335201494, + "grad_norm": 0.18146730959415436, + "learning_rate": 0.0009640731325986263, + "loss": 2.8948, + "step": 4392 + }, + { + "epoch": 0.13026717670432642, + "grad_norm": 0.2327626645565033, + "learning_rate": 0.000964055617438683, + "loss": 2.9232, + "step": 4393 + }, + { + "epoch": 0.1302968300566379, + "grad_norm": 0.24715293943881989, + "learning_rate": 0.0009640380981694476, + "loss": 2.905, + "step": 4394 + }, + { + "epoch": 0.1303264834089494, + "grad_norm": 0.20056283473968506, + "learning_rate": 0.0009640205747910751, + "loss": 2.9085, + "step": 4395 + }, + { + "epoch": 0.13035613676126087, + "grad_norm": 0.16302190721035004, + "learning_rate": 0.0009640030473037209, + "loss": 2.8755, + "step": 4396 + }, + { + "epoch": 0.13038579011357235, + "grad_norm": 0.16328153014183044, + "learning_rate": 0.00096398551570754, + "loss": 2.9058, + "step": 4397 + }, + { + "epoch": 0.13041544346588382, + "grad_norm": 0.14686575531959534, + "learning_rate": 0.0009639679800026877, + "loss": 2.9016, + "step": 4398 + }, + { + "epoch": 0.1304450968181953, + "grad_norm": 0.14863577485084534, + "learning_rate": 0.0009639504401893193, + "loss": 2.8814, + "step": 4399 + }, + { + "epoch": 0.13047475017050678, + "grad_norm": 0.14814701676368713, + "learning_rate": 0.0009639328962675902, + "loss": 2.9049, + "step": 4400 + }, + { + "epoch": 0.13050440352281825, + "grad_norm": 0.16306306421756744, + "learning_rate": 0.0009639153482376557, + "loss": 2.924, + "step": 4401 + }, + { + "epoch": 0.13053405687512973, + "grad_norm": 0.14312045276165009, + "learning_rate": 0.0009638977960996711, + "loss": 2.9063, + "step": 4402 + }, + { + "epoch": 0.1305637102274412, + "grad_norm": 0.1366911679506302, + "learning_rate": 0.0009638802398537919, + "loss": 2.9395, + "step": 4403 + }, + { + "epoch": 0.13059336357975268, + "grad_norm": 0.14836789667606354, + "learning_rate": 0.0009638626795001735, + "loss": 2.9283, + "step": 4404 + }, + { + "epoch": 0.13062301693206416, + "grad_norm": 0.1282220482826233, + "learning_rate": 0.0009638451150389715, + "loss": 2.887, + "step": 4405 + }, + { + "epoch": 0.13065267028437566, + "grad_norm": 0.1261482983827591, + "learning_rate": 0.0009638275464703413, + "loss": 2.8723, + "step": 4406 + }, + { + "epoch": 0.13068232363668714, + "grad_norm": 0.13467344641685486, + "learning_rate": 0.0009638099737944386, + "loss": 2.8687, + "step": 4407 + }, + { + "epoch": 0.1307119769889986, + "grad_norm": 0.13945595920085907, + "learning_rate": 0.0009637923970114191, + "loss": 2.9003, + "step": 4408 + }, + { + "epoch": 0.1307416303413101, + "grad_norm": 0.1432110071182251, + "learning_rate": 0.0009637748161214381, + "loss": 2.921, + "step": 4409 + }, + { + "epoch": 0.13077128369362157, + "grad_norm": 0.15823446214199066, + "learning_rate": 0.0009637572311246516, + "loss": 2.9098, + "step": 4410 + }, + { + "epoch": 0.13080093704593304, + "grad_norm": 0.15149986743927002, + "learning_rate": 0.0009637396420212152, + "loss": 2.8688, + "step": 4411 + }, + { + "epoch": 0.13083059039824452, + "grad_norm": 0.16813310980796814, + "learning_rate": 0.0009637220488112847, + "loss": 2.9177, + "step": 4412 + }, + { + "epoch": 0.130860243750556, + "grad_norm": 0.1689901053905487, + "learning_rate": 0.000963704451495016, + "loss": 2.8633, + "step": 4413 + }, + { + "epoch": 0.13088989710286747, + "grad_norm": 0.18623444437980652, + "learning_rate": 0.0009636868500725646, + "loss": 2.9342, + "step": 4414 + }, + { + "epoch": 0.13091955045517895, + "grad_norm": 0.19446037709712982, + "learning_rate": 0.0009636692445440866, + "loss": 2.9022, + "step": 4415 + }, + { + "epoch": 0.13094920380749045, + "grad_norm": 0.18994951248168945, + "learning_rate": 0.0009636516349097377, + "loss": 2.8728, + "step": 4416 + }, + { + "epoch": 0.13097885715980193, + "grad_norm": 0.15564554929733276, + "learning_rate": 0.0009636340211696743, + "loss": 2.8828, + "step": 4417 + }, + { + "epoch": 0.1310085105121134, + "grad_norm": 0.1635948270559311, + "learning_rate": 0.0009636164033240517, + "loss": 2.9185, + "step": 4418 + }, + { + "epoch": 0.13103816386442488, + "grad_norm": 0.15873441100120544, + "learning_rate": 0.0009635987813730266, + "loss": 2.9114, + "step": 4419 + }, + { + "epoch": 0.13106781721673635, + "grad_norm": 0.16090606153011322, + "learning_rate": 0.0009635811553167546, + "loss": 2.8987, + "step": 4420 + }, + { + "epoch": 0.13109747056904783, + "grad_norm": 0.1710996925830841, + "learning_rate": 0.0009635635251553918, + "loss": 2.8514, + "step": 4421 + }, + { + "epoch": 0.1311271239213593, + "grad_norm": 0.14938807487487793, + "learning_rate": 0.0009635458908890946, + "loss": 2.906, + "step": 4422 + }, + { + "epoch": 0.13115677727367078, + "grad_norm": 0.1534438282251358, + "learning_rate": 0.0009635282525180189, + "loss": 2.8996, + "step": 4423 + }, + { + "epoch": 0.13118643062598226, + "grad_norm": 0.15110230445861816, + "learning_rate": 0.0009635106100423209, + "loss": 2.8932, + "step": 4424 + }, + { + "epoch": 0.13121608397829373, + "grad_norm": 0.14259618520736694, + "learning_rate": 0.0009634929634621569, + "loss": 2.879, + "step": 4425 + }, + { + "epoch": 0.1312457373306052, + "grad_norm": 0.12986043095588684, + "learning_rate": 0.0009634753127776832, + "loss": 2.9025, + "step": 4426 + }, + { + "epoch": 0.13127539068291671, + "grad_norm": 0.12858407199382782, + "learning_rate": 0.000963457657989056, + "loss": 2.861, + "step": 4427 + }, + { + "epoch": 0.1313050440352282, + "grad_norm": 0.14963583648204803, + "learning_rate": 0.0009634399990964316, + "loss": 2.9108, + "step": 4428 + }, + { + "epoch": 0.13133469738753967, + "grad_norm": 0.15329059958457947, + "learning_rate": 0.0009634223360999666, + "loss": 2.8435, + "step": 4429 + }, + { + "epoch": 0.13136435073985114, + "grad_norm": 0.13890832662582397, + "learning_rate": 0.0009634046689998173, + "loss": 2.9088, + "step": 4430 + }, + { + "epoch": 0.13139400409216262, + "grad_norm": 0.12904076278209686, + "learning_rate": 0.00096338699779614, + "loss": 2.8671, + "step": 4431 + }, + { + "epoch": 0.1314236574444741, + "grad_norm": 0.16378509998321533, + "learning_rate": 0.0009633693224890914, + "loss": 2.8963, + "step": 4432 + }, + { + "epoch": 0.13145331079678557, + "grad_norm": 0.17150837182998657, + "learning_rate": 0.0009633516430788278, + "loss": 2.8858, + "step": 4433 + }, + { + "epoch": 0.13148296414909705, + "grad_norm": 0.16683350503444672, + "learning_rate": 0.0009633339595655059, + "loss": 2.8945, + "step": 4434 + }, + { + "epoch": 0.13151261750140852, + "grad_norm": 0.15997588634490967, + "learning_rate": 0.0009633162719492823, + "loss": 2.8737, + "step": 4435 + }, + { + "epoch": 0.13154227085372, + "grad_norm": 0.15527740120887756, + "learning_rate": 0.0009632985802303136, + "loss": 2.894, + "step": 4436 + }, + { + "epoch": 0.1315719242060315, + "grad_norm": 0.13320769369602203, + "learning_rate": 0.0009632808844087564, + "loss": 2.9018, + "step": 4437 + }, + { + "epoch": 0.13160157755834298, + "grad_norm": 0.1308583766222, + "learning_rate": 0.0009632631844847673, + "loss": 2.8744, + "step": 4438 + }, + { + "epoch": 0.13163123091065446, + "grad_norm": 0.1602843850851059, + "learning_rate": 0.0009632454804585033, + "loss": 2.8941, + "step": 4439 + }, + { + "epoch": 0.13166088426296593, + "grad_norm": 0.16666699945926666, + "learning_rate": 0.0009632277723301213, + "loss": 2.9102, + "step": 4440 + }, + { + "epoch": 0.1316905376152774, + "grad_norm": 0.16312943398952484, + "learning_rate": 0.0009632100600997775, + "loss": 2.8973, + "step": 4441 + }, + { + "epoch": 0.13172019096758888, + "grad_norm": 0.17525209486484528, + "learning_rate": 0.0009631923437676294, + "loss": 2.9082, + "step": 4442 + }, + { + "epoch": 0.13174984431990036, + "grad_norm": 0.1717994213104248, + "learning_rate": 0.0009631746233338335, + "loss": 2.9021, + "step": 4443 + }, + { + "epoch": 0.13177949767221184, + "grad_norm": 0.16309097409248352, + "learning_rate": 0.0009631568987985466, + "loss": 2.9171, + "step": 4444 + }, + { + "epoch": 0.1318091510245233, + "grad_norm": 0.15386314690113068, + "learning_rate": 0.0009631391701619261, + "loss": 2.8989, + "step": 4445 + }, + { + "epoch": 0.1318388043768348, + "grad_norm": 0.16398070752620697, + "learning_rate": 0.0009631214374241287, + "loss": 2.8622, + "step": 4446 + }, + { + "epoch": 0.1318684577291463, + "grad_norm": 0.17689545452594757, + "learning_rate": 0.0009631037005853114, + "loss": 2.8991, + "step": 4447 + }, + { + "epoch": 0.13189811108145777, + "grad_norm": 0.14718718826770782, + "learning_rate": 0.0009630859596456314, + "loss": 2.8896, + "step": 4448 + }, + { + "epoch": 0.13192776443376925, + "grad_norm": 0.15462954342365265, + "learning_rate": 0.0009630682146052458, + "loss": 2.8587, + "step": 4449 + }, + { + "epoch": 0.13195741778608072, + "grad_norm": 0.13964706659317017, + "learning_rate": 0.0009630504654643115, + "loss": 2.8766, + "step": 4450 + }, + { + "epoch": 0.1319870711383922, + "grad_norm": 0.13381262123584747, + "learning_rate": 0.000963032712222986, + "loss": 2.8188, + "step": 4451 + }, + { + "epoch": 0.13201672449070367, + "grad_norm": 0.13142478466033936, + "learning_rate": 0.0009630149548814263, + "loss": 2.9092, + "step": 4452 + }, + { + "epoch": 0.13204637784301515, + "grad_norm": 0.15588581562042236, + "learning_rate": 0.0009629971934397897, + "loss": 2.8868, + "step": 4453 + }, + { + "epoch": 0.13207603119532663, + "grad_norm": 0.17741063237190247, + "learning_rate": 0.0009629794278982335, + "loss": 2.9187, + "step": 4454 + }, + { + "epoch": 0.1321056845476381, + "grad_norm": 0.18873244524002075, + "learning_rate": 0.0009629616582569149, + "loss": 2.9262, + "step": 4455 + }, + { + "epoch": 0.13213533789994958, + "grad_norm": 0.15178129076957703, + "learning_rate": 0.0009629438845159914, + "loss": 2.9154, + "step": 4456 + }, + { + "epoch": 0.13216499125226105, + "grad_norm": 0.1252005696296692, + "learning_rate": 0.0009629261066756205, + "loss": 2.894, + "step": 4457 + }, + { + "epoch": 0.13219464460457256, + "grad_norm": 0.14139442145824432, + "learning_rate": 0.0009629083247359593, + "loss": 2.8723, + "step": 4458 + }, + { + "epoch": 0.13222429795688403, + "grad_norm": 0.1371673047542572, + "learning_rate": 0.0009628905386971655, + "loss": 2.916, + "step": 4459 + }, + { + "epoch": 0.1322539513091955, + "grad_norm": 0.14130011200904846, + "learning_rate": 0.0009628727485593965, + "loss": 2.8884, + "step": 4460 + }, + { + "epoch": 0.132283604661507, + "grad_norm": 0.14431653916835785, + "learning_rate": 0.0009628549543228098, + "loss": 2.877, + "step": 4461 + }, + { + "epoch": 0.13231325801381846, + "grad_norm": 0.16179470717906952, + "learning_rate": 0.0009628371559875632, + "loss": 2.8744, + "step": 4462 + }, + { + "epoch": 0.13234291136612994, + "grad_norm": 0.1745164394378662, + "learning_rate": 0.0009628193535538139, + "loss": 2.9015, + "step": 4463 + }, + { + "epoch": 0.13237256471844142, + "grad_norm": 0.17823287844657898, + "learning_rate": 0.0009628015470217199, + "loss": 2.918, + "step": 4464 + }, + { + "epoch": 0.1324022180707529, + "grad_norm": 0.15369944274425507, + "learning_rate": 0.0009627837363914389, + "loss": 2.886, + "step": 4465 + }, + { + "epoch": 0.13243187142306437, + "grad_norm": 0.1436958611011505, + "learning_rate": 0.0009627659216631284, + "loss": 2.8775, + "step": 4466 + }, + { + "epoch": 0.13246152477537584, + "grad_norm": 0.15398384630680084, + "learning_rate": 0.000962748102836946, + "loss": 2.9199, + "step": 4467 + }, + { + "epoch": 0.13249117812768735, + "grad_norm": 0.16496653854846954, + "learning_rate": 0.00096273027991305, + "loss": 2.9161, + "step": 4468 + }, + { + "epoch": 0.13252083147999882, + "grad_norm": 0.17318135499954224, + "learning_rate": 0.0009627124528915978, + "loss": 2.8835, + "step": 4469 + }, + { + "epoch": 0.1325504848323103, + "grad_norm": 0.1808551549911499, + "learning_rate": 0.0009626946217727475, + "loss": 2.8908, + "step": 4470 + }, + { + "epoch": 0.13258013818462178, + "grad_norm": 0.17128004133701324, + "learning_rate": 0.0009626767865566568, + "loss": 2.8635, + "step": 4471 + }, + { + "epoch": 0.13260979153693325, + "grad_norm": 0.14187230169773102, + "learning_rate": 0.0009626589472434838, + "loss": 2.857, + "step": 4472 + }, + { + "epoch": 0.13263944488924473, + "grad_norm": 0.16007322072982788, + "learning_rate": 0.0009626411038333864, + "loss": 2.9189, + "step": 4473 + }, + { + "epoch": 0.1326690982415562, + "grad_norm": 0.13423103094100952, + "learning_rate": 0.0009626232563265227, + "loss": 2.8639, + "step": 4474 + }, + { + "epoch": 0.13269875159386768, + "grad_norm": 0.1236073300242424, + "learning_rate": 0.0009626054047230506, + "loss": 2.9136, + "step": 4475 + }, + { + "epoch": 0.13272840494617916, + "grad_norm": 0.1270204484462738, + "learning_rate": 0.0009625875490231282, + "loss": 2.8786, + "step": 4476 + }, + { + "epoch": 0.13275805829849063, + "grad_norm": 0.1260475218296051, + "learning_rate": 0.0009625696892269135, + "loss": 2.8831, + "step": 4477 + }, + { + "epoch": 0.1327877116508021, + "grad_norm": 0.143777534365654, + "learning_rate": 0.0009625518253345651, + "loss": 2.8666, + "step": 4478 + }, + { + "epoch": 0.1328173650031136, + "grad_norm": 0.15401475131511688, + "learning_rate": 0.0009625339573462406, + "loss": 2.9079, + "step": 4479 + }, + { + "epoch": 0.1328470183554251, + "grad_norm": 0.1933249682188034, + "learning_rate": 0.0009625160852620987, + "loss": 2.889, + "step": 4480 + }, + { + "epoch": 0.13287667170773657, + "grad_norm": 0.18008652329444885, + "learning_rate": 0.0009624982090822975, + "loss": 2.9282, + "step": 4481 + }, + { + "epoch": 0.13290632506004804, + "grad_norm": 0.16193778812885284, + "learning_rate": 0.0009624803288069952, + "loss": 2.8827, + "step": 4482 + }, + { + "epoch": 0.13293597841235952, + "grad_norm": 0.15099744498729706, + "learning_rate": 0.0009624624444363502, + "loss": 2.8726, + "step": 4483 + }, + { + "epoch": 0.132965631764671, + "grad_norm": 0.14861232042312622, + "learning_rate": 0.0009624445559705208, + "loss": 2.8843, + "step": 4484 + }, + { + "epoch": 0.13299528511698247, + "grad_norm": 0.14967533946037292, + "learning_rate": 0.0009624266634096657, + "loss": 2.9407, + "step": 4485 + }, + { + "epoch": 0.13302493846929395, + "grad_norm": 0.14515121281147003, + "learning_rate": 0.000962408766753943, + "loss": 2.9086, + "step": 4486 + }, + { + "epoch": 0.13305459182160542, + "grad_norm": 0.13093078136444092, + "learning_rate": 0.0009623908660035112, + "loss": 2.9106, + "step": 4487 + }, + { + "epoch": 0.1330842451739169, + "grad_norm": 0.14552882313728333, + "learning_rate": 0.000962372961158529, + "loss": 2.9048, + "step": 4488 + }, + { + "epoch": 0.1331138985262284, + "grad_norm": 0.15924878418445587, + "learning_rate": 0.0009623550522191549, + "loss": 2.8918, + "step": 4489 + }, + { + "epoch": 0.13314355187853988, + "grad_norm": 0.1925780326128006, + "learning_rate": 0.0009623371391855475, + "loss": 2.9339, + "step": 4490 + }, + { + "epoch": 0.13317320523085135, + "grad_norm": 0.20582172274589539, + "learning_rate": 0.0009623192220578652, + "loss": 2.8985, + "step": 4491 + }, + { + "epoch": 0.13320285858316283, + "grad_norm": 0.1815221607685089, + "learning_rate": 0.0009623013008362669, + "loss": 2.8691, + "step": 4492 + }, + { + "epoch": 0.1332325119354743, + "grad_norm": 0.148500457406044, + "learning_rate": 0.0009622833755209113, + "loss": 2.8886, + "step": 4493 + }, + { + "epoch": 0.13326216528778578, + "grad_norm": 0.17315301299095154, + "learning_rate": 0.000962265446111957, + "loss": 2.905, + "step": 4494 + }, + { + "epoch": 0.13329181864009726, + "grad_norm": 0.14900769293308258, + "learning_rate": 0.0009622475126095629, + "loss": 2.8996, + "step": 4495 + }, + { + "epoch": 0.13332147199240874, + "grad_norm": 0.1517738550901413, + "learning_rate": 0.0009622295750138876, + "loss": 2.9168, + "step": 4496 + }, + { + "epoch": 0.1333511253447202, + "grad_norm": 0.15671518445014954, + "learning_rate": 0.0009622116333250901, + "loss": 2.8814, + "step": 4497 + }, + { + "epoch": 0.1333807786970317, + "grad_norm": 0.15605655312538147, + "learning_rate": 0.0009621936875433293, + "loss": 2.8988, + "step": 4498 + }, + { + "epoch": 0.1334104320493432, + "grad_norm": 0.157119020819664, + "learning_rate": 0.0009621757376687641, + "loss": 2.8736, + "step": 4499 + }, + { + "epoch": 0.13344008540165467, + "grad_norm": 0.15535521507263184, + "learning_rate": 0.0009621577837015534, + "loss": 2.8746, + "step": 4500 + }, + { + "epoch": 0.13346973875396614, + "grad_norm": 0.14149752259254456, + "learning_rate": 0.0009621398256418561, + "loss": 2.8974, + "step": 4501 + }, + { + "epoch": 0.13349939210627762, + "grad_norm": 0.14112131297588348, + "learning_rate": 0.0009621218634898314, + "loss": 2.9278, + "step": 4502 + }, + { + "epoch": 0.1335290454585891, + "grad_norm": 0.14891569316387177, + "learning_rate": 0.0009621038972456383, + "loss": 2.8968, + "step": 4503 + }, + { + "epoch": 0.13355869881090057, + "grad_norm": 0.18643346428871155, + "learning_rate": 0.0009620859269094357, + "loss": 2.8844, + "step": 4504 + }, + { + "epoch": 0.13358835216321205, + "grad_norm": 0.1996704787015915, + "learning_rate": 0.0009620679524813831, + "loss": 2.8676, + "step": 4505 + }, + { + "epoch": 0.13361800551552352, + "grad_norm": 0.18747088313102722, + "learning_rate": 0.0009620499739616395, + "loss": 2.9102, + "step": 4506 + }, + { + "epoch": 0.133647658867835, + "grad_norm": 0.1818915605545044, + "learning_rate": 0.0009620319913503639, + "loss": 2.8867, + "step": 4507 + }, + { + "epoch": 0.13367731222014648, + "grad_norm": 0.17449846863746643, + "learning_rate": 0.0009620140046477157, + "loss": 2.8874, + "step": 4508 + }, + { + "epoch": 0.13370696557245795, + "grad_norm": 0.17158159613609314, + "learning_rate": 0.0009619960138538544, + "loss": 2.8989, + "step": 4509 + }, + { + "epoch": 0.13373661892476946, + "grad_norm": 0.17767907679080963, + "learning_rate": 0.0009619780189689389, + "loss": 2.9396, + "step": 4510 + }, + { + "epoch": 0.13376627227708093, + "grad_norm": 0.14684171974658966, + "learning_rate": 0.0009619600199931289, + "loss": 2.9269, + "step": 4511 + }, + { + "epoch": 0.1337959256293924, + "grad_norm": 0.1437506079673767, + "learning_rate": 0.0009619420169265834, + "loss": 2.9196, + "step": 4512 + }, + { + "epoch": 0.13382557898170389, + "grad_norm": 0.12049921602010727, + "learning_rate": 0.0009619240097694622, + "loss": 2.8815, + "step": 4513 + }, + { + "epoch": 0.13385523233401536, + "grad_norm": 0.1256992071866989, + "learning_rate": 0.0009619059985219246, + "loss": 2.8806, + "step": 4514 + }, + { + "epoch": 0.13388488568632684, + "grad_norm": 0.15253885090351105, + "learning_rate": 0.0009618879831841301, + "loss": 2.9243, + "step": 4515 + }, + { + "epoch": 0.1339145390386383, + "grad_norm": 0.16794782876968384, + "learning_rate": 0.0009618699637562383, + "loss": 2.9186, + "step": 4516 + }, + { + "epoch": 0.1339441923909498, + "grad_norm": 0.2017570585012436, + "learning_rate": 0.0009618519402384085, + "loss": 2.8798, + "step": 4517 + }, + { + "epoch": 0.13397384574326127, + "grad_norm": 0.2000197172164917, + "learning_rate": 0.0009618339126308006, + "loss": 2.9486, + "step": 4518 + }, + { + "epoch": 0.13400349909557274, + "grad_norm": 0.1712108850479126, + "learning_rate": 0.0009618158809335742, + "loss": 2.8963, + "step": 4519 + }, + { + "epoch": 0.13403315244788425, + "grad_norm": 0.17838186025619507, + "learning_rate": 0.0009617978451468887, + "loss": 2.8725, + "step": 4520 + }, + { + "epoch": 0.13406280580019572, + "grad_norm": 0.1356251984834671, + "learning_rate": 0.0009617798052709043, + "loss": 2.8741, + "step": 4521 + }, + { + "epoch": 0.1340924591525072, + "grad_norm": 0.1331903338432312, + "learning_rate": 0.0009617617613057803, + "loss": 2.8983, + "step": 4522 + }, + { + "epoch": 0.13412211250481867, + "grad_norm": 0.1337653547525406, + "learning_rate": 0.0009617437132516766, + "loss": 2.8637, + "step": 4523 + }, + { + "epoch": 0.13415176585713015, + "grad_norm": 0.12621735036373138, + "learning_rate": 0.000961725661108753, + "loss": 2.9095, + "step": 4524 + }, + { + "epoch": 0.13418141920944163, + "grad_norm": 0.1183653473854065, + "learning_rate": 0.0009617076048771695, + "loss": 2.8916, + "step": 4525 + }, + { + "epoch": 0.1342110725617531, + "grad_norm": 0.1390220820903778, + "learning_rate": 0.0009616895445570861, + "loss": 2.8512, + "step": 4526 + }, + { + "epoch": 0.13424072591406458, + "grad_norm": 0.12957802414894104, + "learning_rate": 0.0009616714801486624, + "loss": 2.8867, + "step": 4527 + }, + { + "epoch": 0.13427037926637606, + "grad_norm": 0.12591008841991425, + "learning_rate": 0.0009616534116520584, + "loss": 2.8642, + "step": 4528 + }, + { + "epoch": 0.13430003261868753, + "grad_norm": 0.1196334958076477, + "learning_rate": 0.0009616353390674344, + "loss": 2.9106, + "step": 4529 + }, + { + "epoch": 0.134329685970999, + "grad_norm": 0.14934486150741577, + "learning_rate": 0.00096161726239495, + "loss": 2.874, + "step": 4530 + }, + { + "epoch": 0.1343593393233105, + "grad_norm": 0.12124255299568176, + "learning_rate": 0.0009615991816347655, + "loss": 2.9155, + "step": 4531 + }, + { + "epoch": 0.134388992675622, + "grad_norm": 0.12542769312858582, + "learning_rate": 0.0009615810967870414, + "loss": 2.8445, + "step": 4532 + }, + { + "epoch": 0.13441864602793346, + "grad_norm": 0.14687226712703705, + "learning_rate": 0.0009615630078519371, + "loss": 2.9354, + "step": 4533 + }, + { + "epoch": 0.13444829938024494, + "grad_norm": 0.1480223685503006, + "learning_rate": 0.0009615449148296132, + "loss": 2.8807, + "step": 4534 + }, + { + "epoch": 0.13447795273255642, + "grad_norm": 0.1479090005159378, + "learning_rate": 0.0009615268177202298, + "loss": 2.9366, + "step": 4535 + }, + { + "epoch": 0.1345076060848679, + "grad_norm": 0.1554870456457138, + "learning_rate": 0.0009615087165239473, + "loss": 2.9193, + "step": 4536 + }, + { + "epoch": 0.13453725943717937, + "grad_norm": 0.16643990576267242, + "learning_rate": 0.0009614906112409259, + "loss": 2.886, + "step": 4537 + }, + { + "epoch": 0.13456691278949084, + "grad_norm": 0.15994447469711304, + "learning_rate": 0.000961472501871326, + "loss": 2.9215, + "step": 4538 + }, + { + "epoch": 0.13459656614180232, + "grad_norm": 0.16088470816612244, + "learning_rate": 0.0009614543884153078, + "loss": 2.8913, + "step": 4539 + }, + { + "epoch": 0.1346262194941138, + "grad_norm": 0.17936459183692932, + "learning_rate": 0.0009614362708730318, + "loss": 2.9001, + "step": 4540 + }, + { + "epoch": 0.1346558728464253, + "grad_norm": 0.17473916709423065, + "learning_rate": 0.0009614181492446583, + "loss": 2.9137, + "step": 4541 + }, + { + "epoch": 0.13468552619873678, + "grad_norm": 0.17335845530033112, + "learning_rate": 0.000961400023530348, + "loss": 2.8687, + "step": 4542 + }, + { + "epoch": 0.13471517955104825, + "grad_norm": 0.1657850444316864, + "learning_rate": 0.0009613818937302612, + "loss": 2.8871, + "step": 4543 + }, + { + "epoch": 0.13474483290335973, + "grad_norm": 0.18114784359931946, + "learning_rate": 0.0009613637598445586, + "loss": 2.8799, + "step": 4544 + }, + { + "epoch": 0.1347744862556712, + "grad_norm": 0.1897517442703247, + "learning_rate": 0.0009613456218734008, + "loss": 2.881, + "step": 4545 + }, + { + "epoch": 0.13480413960798268, + "grad_norm": 0.149736687541008, + "learning_rate": 0.0009613274798169482, + "loss": 2.8831, + "step": 4546 + }, + { + "epoch": 0.13483379296029416, + "grad_norm": 0.13100233674049377, + "learning_rate": 0.0009613093336753617, + "loss": 2.9113, + "step": 4547 + }, + { + "epoch": 0.13486344631260563, + "grad_norm": 0.12603513896465302, + "learning_rate": 0.0009612911834488018, + "loss": 2.9095, + "step": 4548 + }, + { + "epoch": 0.1348930996649171, + "grad_norm": 0.12989117205142975, + "learning_rate": 0.0009612730291374292, + "loss": 2.8832, + "step": 4549 + }, + { + "epoch": 0.13492275301722859, + "grad_norm": 0.13960285484790802, + "learning_rate": 0.0009612548707414048, + "loss": 2.8998, + "step": 4550 + }, + { + "epoch": 0.1349524063695401, + "grad_norm": 0.13228924572467804, + "learning_rate": 0.0009612367082608895, + "loss": 2.8771, + "step": 4551 + }, + { + "epoch": 0.13498205972185157, + "grad_norm": 0.12871046364307404, + "learning_rate": 0.0009612185416960439, + "loss": 2.8754, + "step": 4552 + }, + { + "epoch": 0.13501171307416304, + "grad_norm": 0.1494869738817215, + "learning_rate": 0.0009612003710470289, + "loss": 2.8691, + "step": 4553 + }, + { + "epoch": 0.13504136642647452, + "grad_norm": 0.15480607748031616, + "learning_rate": 0.0009611821963140055, + "loss": 2.8838, + "step": 4554 + }, + { + "epoch": 0.135071019778786, + "grad_norm": 0.18332889676094055, + "learning_rate": 0.0009611640174971345, + "loss": 2.9223, + "step": 4555 + }, + { + "epoch": 0.13510067313109747, + "grad_norm": 0.19113686680793762, + "learning_rate": 0.000961145834596577, + "loss": 2.8999, + "step": 4556 + }, + { + "epoch": 0.13513032648340895, + "grad_norm": 0.15489636361598969, + "learning_rate": 0.0009611276476124939, + "loss": 2.8711, + "step": 4557 + }, + { + "epoch": 0.13515997983572042, + "grad_norm": 0.15172766149044037, + "learning_rate": 0.0009611094565450466, + "loss": 2.8815, + "step": 4558 + }, + { + "epoch": 0.1351896331880319, + "grad_norm": 0.1763870269060135, + "learning_rate": 0.0009610912613943957, + "loss": 2.9303, + "step": 4559 + }, + { + "epoch": 0.13521928654034338, + "grad_norm": 0.17492155730724335, + "learning_rate": 0.0009610730621607026, + "loss": 2.8924, + "step": 4560 + }, + { + "epoch": 0.13524893989265485, + "grad_norm": 0.15386168658733368, + "learning_rate": 0.0009610548588441283, + "loss": 2.8807, + "step": 4561 + }, + { + "epoch": 0.13527859324496636, + "grad_norm": 0.16473248600959778, + "learning_rate": 0.0009610366514448342, + "loss": 2.8947, + "step": 4562 + }, + { + "epoch": 0.13530824659727783, + "grad_norm": 0.15827295184135437, + "learning_rate": 0.0009610184399629813, + "loss": 2.8939, + "step": 4563 + }, + { + "epoch": 0.1353378999495893, + "grad_norm": 0.14805921912193298, + "learning_rate": 0.0009610002243987311, + "loss": 2.87, + "step": 4564 + }, + { + "epoch": 0.13536755330190078, + "grad_norm": 0.15981824696063995, + "learning_rate": 0.0009609820047522448, + "loss": 2.8976, + "step": 4565 + }, + { + "epoch": 0.13539720665421226, + "grad_norm": 0.15828780829906464, + "learning_rate": 0.0009609637810236837, + "loss": 2.8853, + "step": 4566 + }, + { + "epoch": 0.13542686000652374, + "grad_norm": 0.17897160351276398, + "learning_rate": 0.0009609455532132091, + "loss": 2.8935, + "step": 4567 + }, + { + "epoch": 0.1354565133588352, + "grad_norm": 0.20299552381038666, + "learning_rate": 0.0009609273213209826, + "loss": 2.8934, + "step": 4568 + }, + { + "epoch": 0.1354861667111467, + "grad_norm": 0.2119504064321518, + "learning_rate": 0.0009609090853471654, + "loss": 2.8948, + "step": 4569 + }, + { + "epoch": 0.13551582006345816, + "grad_norm": 0.17507658898830414, + "learning_rate": 0.0009608908452919194, + "loss": 2.9007, + "step": 4570 + }, + { + "epoch": 0.13554547341576964, + "grad_norm": 0.16117899119853973, + "learning_rate": 0.0009608726011554056, + "loss": 2.9286, + "step": 4571 + }, + { + "epoch": 0.13557512676808114, + "grad_norm": 0.1767672449350357, + "learning_rate": 0.000960854352937786, + "loss": 2.8956, + "step": 4572 + }, + { + "epoch": 0.13560478012039262, + "grad_norm": 0.15695686638355255, + "learning_rate": 0.0009608361006392219, + "loss": 2.8919, + "step": 4573 + }, + { + "epoch": 0.1356344334727041, + "grad_norm": 0.14453692734241486, + "learning_rate": 0.000960817844259875, + "loss": 2.8835, + "step": 4574 + }, + { + "epoch": 0.13566408682501557, + "grad_norm": 0.14479991793632507, + "learning_rate": 0.0009607995837999071, + "loss": 2.8553, + "step": 4575 + }, + { + "epoch": 0.13569374017732705, + "grad_norm": 0.1437324434518814, + "learning_rate": 0.0009607813192594796, + "loss": 2.8895, + "step": 4576 + }, + { + "epoch": 0.13572339352963853, + "grad_norm": 0.16016536951065063, + "learning_rate": 0.0009607630506387546, + "loss": 2.887, + "step": 4577 + }, + { + "epoch": 0.13575304688195, + "grad_norm": 0.1511707752943039, + "learning_rate": 0.0009607447779378937, + "loss": 2.9175, + "step": 4578 + }, + { + "epoch": 0.13578270023426148, + "grad_norm": 0.1383364498615265, + "learning_rate": 0.0009607265011570585, + "loss": 2.8909, + "step": 4579 + }, + { + "epoch": 0.13581235358657295, + "grad_norm": 0.14534227550029755, + "learning_rate": 0.0009607082202964112, + "loss": 2.8887, + "step": 4580 + }, + { + "epoch": 0.13584200693888443, + "grad_norm": 0.1485057920217514, + "learning_rate": 0.0009606899353561136, + "loss": 2.8806, + "step": 4581 + }, + { + "epoch": 0.1358716602911959, + "grad_norm": 0.1317228078842163, + "learning_rate": 0.0009606716463363274, + "loss": 2.9013, + "step": 4582 + }, + { + "epoch": 0.1359013136435074, + "grad_norm": 0.13592250645160675, + "learning_rate": 0.0009606533532372148, + "loss": 2.8728, + "step": 4583 + }, + { + "epoch": 0.13593096699581889, + "grad_norm": 0.15094874799251556, + "learning_rate": 0.0009606350560589377, + "loss": 2.8772, + "step": 4584 + }, + { + "epoch": 0.13596062034813036, + "grad_norm": 0.1646319031715393, + "learning_rate": 0.0009606167548016581, + "loss": 2.9013, + "step": 4585 + }, + { + "epoch": 0.13599027370044184, + "grad_norm": 0.23578862845897675, + "learning_rate": 0.0009605984494655379, + "loss": 2.9032, + "step": 4586 + }, + { + "epoch": 0.13601992705275331, + "grad_norm": 0.17779457569122314, + "learning_rate": 0.0009605801400507397, + "loss": 2.865, + "step": 4587 + }, + { + "epoch": 0.1360495804050648, + "grad_norm": 0.14644113183021545, + "learning_rate": 0.0009605618265574251, + "loss": 2.8811, + "step": 4588 + }, + { + "epoch": 0.13607923375737627, + "grad_norm": 0.159868985414505, + "learning_rate": 0.0009605435089857564, + "loss": 2.8846, + "step": 4589 + }, + { + "epoch": 0.13610888710968774, + "grad_norm": 0.1634291112422943, + "learning_rate": 0.000960525187335896, + "loss": 2.8892, + "step": 4590 + }, + { + "epoch": 0.13613854046199922, + "grad_norm": 0.16443653404712677, + "learning_rate": 0.000960506861608006, + "loss": 2.8902, + "step": 4591 + }, + { + "epoch": 0.1361681938143107, + "grad_norm": 0.1429525464773178, + "learning_rate": 0.0009604885318022487, + "loss": 2.9077, + "step": 4592 + }, + { + "epoch": 0.1361978471666222, + "grad_norm": 0.15562982857227325, + "learning_rate": 0.0009604701979187864, + "loss": 2.9071, + "step": 4593 + }, + { + "epoch": 0.13622750051893368, + "grad_norm": 0.16843706369400024, + "learning_rate": 0.0009604518599577814, + "loss": 2.9128, + "step": 4594 + }, + { + "epoch": 0.13625715387124515, + "grad_norm": 0.14389744400978088, + "learning_rate": 0.0009604335179193962, + "loss": 2.907, + "step": 4595 + }, + { + "epoch": 0.13628680722355663, + "grad_norm": 0.14911237359046936, + "learning_rate": 0.0009604151718037933, + "loss": 2.8967, + "step": 4596 + }, + { + "epoch": 0.1363164605758681, + "grad_norm": 0.13844527304172516, + "learning_rate": 0.0009603968216111348, + "loss": 2.8838, + "step": 4597 + }, + { + "epoch": 0.13634611392817958, + "grad_norm": 0.1474885195493698, + "learning_rate": 0.0009603784673415834, + "loss": 2.8856, + "step": 4598 + }, + { + "epoch": 0.13637576728049106, + "grad_norm": 0.14853207767009735, + "learning_rate": 0.0009603601089953018, + "loss": 2.9273, + "step": 4599 + }, + { + "epoch": 0.13640542063280253, + "grad_norm": 0.1307155042886734, + "learning_rate": 0.0009603417465724525, + "loss": 2.8823, + "step": 4600 + }, + { + "epoch": 0.136435073985114, + "grad_norm": 0.12562605738639832, + "learning_rate": 0.0009603233800731978, + "loss": 2.8974, + "step": 4601 + }, + { + "epoch": 0.13646472733742548, + "grad_norm": 0.13993258774280548, + "learning_rate": 0.0009603050094977006, + "loss": 2.9232, + "step": 4602 + }, + { + "epoch": 0.136494380689737, + "grad_norm": 0.15220080316066742, + "learning_rate": 0.0009602866348461236, + "loss": 2.9085, + "step": 4603 + }, + { + "epoch": 0.13652403404204846, + "grad_norm": 0.1397281140089035, + "learning_rate": 0.0009602682561186294, + "loss": 2.8919, + "step": 4604 + }, + { + "epoch": 0.13655368739435994, + "grad_norm": 0.12316953390836716, + "learning_rate": 0.0009602498733153809, + "loss": 2.8928, + "step": 4605 + }, + { + "epoch": 0.13658334074667142, + "grad_norm": 0.13052088022232056, + "learning_rate": 0.0009602314864365404, + "loss": 2.8729, + "step": 4606 + }, + { + "epoch": 0.1366129940989829, + "grad_norm": 0.15020789206027985, + "learning_rate": 0.0009602130954822714, + "loss": 2.8853, + "step": 4607 + }, + { + "epoch": 0.13664264745129437, + "grad_norm": 0.18263666331768036, + "learning_rate": 0.0009601947004527364, + "loss": 2.8718, + "step": 4608 + }, + { + "epoch": 0.13667230080360585, + "grad_norm": 0.2033710479736328, + "learning_rate": 0.0009601763013480984, + "loss": 2.9185, + "step": 4609 + }, + { + "epoch": 0.13670195415591732, + "grad_norm": 0.24239417910575867, + "learning_rate": 0.0009601578981685201, + "loss": 2.9181, + "step": 4610 + }, + { + "epoch": 0.1367316075082288, + "grad_norm": 0.2263757884502411, + "learning_rate": 0.0009601394909141648, + "loss": 2.8609, + "step": 4611 + }, + { + "epoch": 0.13676126086054027, + "grad_norm": 0.17107759416103363, + "learning_rate": 0.0009601210795851953, + "loss": 2.8731, + "step": 4612 + }, + { + "epoch": 0.13679091421285175, + "grad_norm": 0.19055712223052979, + "learning_rate": 0.0009601026641817747, + "loss": 2.8864, + "step": 4613 + }, + { + "epoch": 0.13682056756516325, + "grad_norm": 0.16341063380241394, + "learning_rate": 0.0009600842447040659, + "loss": 2.8891, + "step": 4614 + }, + { + "epoch": 0.13685022091747473, + "grad_norm": 0.16223716735839844, + "learning_rate": 0.0009600658211522322, + "loss": 2.9012, + "step": 4615 + }, + { + "epoch": 0.1368798742697862, + "grad_norm": 0.1531602293252945, + "learning_rate": 0.0009600473935264367, + "loss": 2.8597, + "step": 4616 + }, + { + "epoch": 0.13690952762209768, + "grad_norm": 0.16048042476177216, + "learning_rate": 0.0009600289618268425, + "loss": 2.8807, + "step": 4617 + }, + { + "epoch": 0.13693918097440916, + "grad_norm": 0.13601744174957275, + "learning_rate": 0.000960010526053613, + "loss": 2.9186, + "step": 4618 + }, + { + "epoch": 0.13696883432672063, + "grad_norm": 0.15008504688739777, + "learning_rate": 0.0009599920862069112, + "loss": 2.9022, + "step": 4619 + }, + { + "epoch": 0.1369984876790321, + "grad_norm": 0.15029798448085785, + "learning_rate": 0.0009599736422869006, + "loss": 2.903, + "step": 4620 + }, + { + "epoch": 0.1370281410313436, + "grad_norm": 0.1365303099155426, + "learning_rate": 0.0009599551942937444, + "loss": 2.9216, + "step": 4621 + }, + { + "epoch": 0.13705779438365506, + "grad_norm": 0.1488114297389984, + "learning_rate": 0.000959936742227606, + "loss": 2.8732, + "step": 4622 + }, + { + "epoch": 0.13708744773596654, + "grad_norm": 0.16413335502147675, + "learning_rate": 0.0009599182860886488, + "loss": 2.8649, + "step": 4623 + }, + { + "epoch": 0.13711710108827804, + "grad_norm": 0.1651233583688736, + "learning_rate": 0.0009598998258770362, + "loss": 2.9161, + "step": 4624 + }, + { + "epoch": 0.13714675444058952, + "grad_norm": 0.1577628254890442, + "learning_rate": 0.0009598813615929318, + "loss": 2.8737, + "step": 4625 + }, + { + "epoch": 0.137176407792901, + "grad_norm": 0.13416887819766998, + "learning_rate": 0.000959862893236499, + "loss": 2.8944, + "step": 4626 + }, + { + "epoch": 0.13720606114521247, + "grad_norm": 0.12834922969341278, + "learning_rate": 0.0009598444208079013, + "loss": 2.8774, + "step": 4627 + }, + { + "epoch": 0.13723571449752395, + "grad_norm": 0.14294908940792084, + "learning_rate": 0.0009598259443073023, + "loss": 2.9216, + "step": 4628 + }, + { + "epoch": 0.13726536784983542, + "grad_norm": 0.13949203491210938, + "learning_rate": 0.0009598074637348657, + "loss": 2.8865, + "step": 4629 + }, + { + "epoch": 0.1372950212021469, + "grad_norm": 0.1432061791419983, + "learning_rate": 0.000959788979090755, + "loss": 2.8815, + "step": 4630 + }, + { + "epoch": 0.13732467455445838, + "grad_norm": 0.13769233226776123, + "learning_rate": 0.000959770490375134, + "loss": 2.9083, + "step": 4631 + }, + { + "epoch": 0.13735432790676985, + "grad_norm": 0.14903655648231506, + "learning_rate": 0.0009597519975881665, + "loss": 2.8867, + "step": 4632 + }, + { + "epoch": 0.13738398125908133, + "grad_norm": 0.17263461649417877, + "learning_rate": 0.000959733500730016, + "loss": 2.8865, + "step": 4633 + }, + { + "epoch": 0.1374136346113928, + "grad_norm": 0.18396471440792084, + "learning_rate": 0.0009597149998008466, + "loss": 2.8683, + "step": 4634 + }, + { + "epoch": 0.1374432879637043, + "grad_norm": 0.1668914258480072, + "learning_rate": 0.0009596964948008217, + "loss": 2.9205, + "step": 4635 + }, + { + "epoch": 0.13747294131601578, + "grad_norm": 0.14992786943912506, + "learning_rate": 0.0009596779857301056, + "loss": 2.8736, + "step": 4636 + }, + { + "epoch": 0.13750259466832726, + "grad_norm": 0.15512625873088837, + "learning_rate": 0.0009596594725888621, + "loss": 2.8881, + "step": 4637 + }, + { + "epoch": 0.13753224802063874, + "grad_norm": 0.13779576122760773, + "learning_rate": 0.000959640955377255, + "loss": 2.9018, + "step": 4638 + }, + { + "epoch": 0.1375619013729502, + "grad_norm": 0.14932841062545776, + "learning_rate": 0.0009596224340954482, + "loss": 2.903, + "step": 4639 + }, + { + "epoch": 0.1375915547252617, + "grad_norm": 0.1508997678756714, + "learning_rate": 0.000959603908743606, + "loss": 2.9032, + "step": 4640 + }, + { + "epoch": 0.13762120807757316, + "grad_norm": 0.12551742792129517, + "learning_rate": 0.0009595853793218923, + "loss": 2.9025, + "step": 4641 + }, + { + "epoch": 0.13765086142988464, + "grad_norm": 0.1558608114719391, + "learning_rate": 0.0009595668458304711, + "loss": 2.83, + "step": 4642 + }, + { + "epoch": 0.13768051478219612, + "grad_norm": 0.17195650935173035, + "learning_rate": 0.0009595483082695068, + "loss": 2.9131, + "step": 4643 + }, + { + "epoch": 0.1377101681345076, + "grad_norm": 0.1603497564792633, + "learning_rate": 0.0009595297666391632, + "loss": 2.907, + "step": 4644 + }, + { + "epoch": 0.1377398214868191, + "grad_norm": 0.16886396706104279, + "learning_rate": 0.0009595112209396046, + "loss": 2.8851, + "step": 4645 + }, + { + "epoch": 0.13776947483913057, + "grad_norm": 0.19019612669944763, + "learning_rate": 0.0009594926711709953, + "loss": 2.8909, + "step": 4646 + }, + { + "epoch": 0.13779912819144205, + "grad_norm": 0.172607883810997, + "learning_rate": 0.0009594741173334996, + "loss": 2.8757, + "step": 4647 + }, + { + "epoch": 0.13782878154375353, + "grad_norm": 0.1613563448190689, + "learning_rate": 0.0009594555594272816, + "loss": 2.8792, + "step": 4648 + }, + { + "epoch": 0.137858434896065, + "grad_norm": 0.1463330239057541, + "learning_rate": 0.0009594369974525059, + "loss": 2.9035, + "step": 4649 + }, + { + "epoch": 0.13788808824837648, + "grad_norm": 0.15116825699806213, + "learning_rate": 0.0009594184314093366, + "loss": 2.9069, + "step": 4650 + }, + { + "epoch": 0.13791774160068795, + "grad_norm": 0.15935291349887848, + "learning_rate": 0.0009593998612979383, + "loss": 2.9222, + "step": 4651 + }, + { + "epoch": 0.13794739495299943, + "grad_norm": 0.16505059599876404, + "learning_rate": 0.0009593812871184754, + "loss": 2.8643, + "step": 4652 + }, + { + "epoch": 0.1379770483053109, + "grad_norm": 0.1646169126033783, + "learning_rate": 0.0009593627088711124, + "loss": 2.9147, + "step": 4653 + }, + { + "epoch": 0.13800670165762238, + "grad_norm": 0.17083950340747833, + "learning_rate": 0.0009593441265560136, + "loss": 2.9212, + "step": 4654 + }, + { + "epoch": 0.1380363550099339, + "grad_norm": 0.160907581448555, + "learning_rate": 0.0009593255401733437, + "loss": 2.8985, + "step": 4655 + }, + { + "epoch": 0.13806600836224536, + "grad_norm": 0.13904160261154175, + "learning_rate": 0.0009593069497232674, + "loss": 2.9021, + "step": 4656 + }, + { + "epoch": 0.13809566171455684, + "grad_norm": 0.15284200012683868, + "learning_rate": 0.0009592883552059493, + "loss": 2.9132, + "step": 4657 + }, + { + "epoch": 0.13812531506686831, + "grad_norm": 0.14676633477210999, + "learning_rate": 0.0009592697566215538, + "loss": 2.9061, + "step": 4658 + }, + { + "epoch": 0.1381549684191798, + "grad_norm": 0.14522089064121246, + "learning_rate": 0.0009592511539702459, + "loss": 2.8961, + "step": 4659 + }, + { + "epoch": 0.13818462177149127, + "grad_norm": 0.14225518703460693, + "learning_rate": 0.0009592325472521901, + "loss": 2.8984, + "step": 4660 + }, + { + "epoch": 0.13821427512380274, + "grad_norm": 0.13582870364189148, + "learning_rate": 0.0009592139364675514, + "loss": 2.8611, + "step": 4661 + }, + { + "epoch": 0.13824392847611422, + "grad_norm": 0.15958647429943085, + "learning_rate": 0.0009591953216164943, + "loss": 2.8763, + "step": 4662 + }, + { + "epoch": 0.1382735818284257, + "grad_norm": 0.17880778014659882, + "learning_rate": 0.000959176702699184, + "loss": 2.883, + "step": 4663 + }, + { + "epoch": 0.13830323518073717, + "grad_norm": 0.20597009360790253, + "learning_rate": 0.0009591580797157851, + "loss": 2.8899, + "step": 4664 + }, + { + "epoch": 0.13833288853304865, + "grad_norm": 0.2167087346315384, + "learning_rate": 0.0009591394526664625, + "loss": 2.9191, + "step": 4665 + }, + { + "epoch": 0.13836254188536015, + "grad_norm": 0.1989494115114212, + "learning_rate": 0.0009591208215513813, + "loss": 2.8686, + "step": 4666 + }, + { + "epoch": 0.13839219523767163, + "grad_norm": 0.16886210441589355, + "learning_rate": 0.0009591021863707065, + "loss": 2.8754, + "step": 4667 + }, + { + "epoch": 0.1384218485899831, + "grad_norm": 0.159206360578537, + "learning_rate": 0.0009590835471246029, + "loss": 2.8838, + "step": 4668 + }, + { + "epoch": 0.13845150194229458, + "grad_norm": 0.15583094954490662, + "learning_rate": 0.0009590649038132358, + "loss": 2.9135, + "step": 4669 + }, + { + "epoch": 0.13848115529460606, + "grad_norm": 0.18158142268657684, + "learning_rate": 0.0009590462564367701, + "loss": 2.8567, + "step": 4670 + }, + { + "epoch": 0.13851080864691753, + "grad_norm": 0.18432579934597015, + "learning_rate": 0.000959027604995371, + "loss": 2.8852, + "step": 4671 + }, + { + "epoch": 0.138540461999229, + "grad_norm": 0.16718396544456482, + "learning_rate": 0.0009590089494892039, + "loss": 2.8657, + "step": 4672 + }, + { + "epoch": 0.13857011535154048, + "grad_norm": 0.1812555491924286, + "learning_rate": 0.0009589902899184334, + "loss": 2.8884, + "step": 4673 + }, + { + "epoch": 0.13859976870385196, + "grad_norm": 0.17584238946437836, + "learning_rate": 0.0009589716262832253, + "loss": 2.8997, + "step": 4674 + }, + { + "epoch": 0.13862942205616344, + "grad_norm": 0.18028147518634796, + "learning_rate": 0.0009589529585837446, + "loss": 2.8639, + "step": 4675 + }, + { + "epoch": 0.13865907540847494, + "grad_norm": 0.1713123321533203, + "learning_rate": 0.0009589342868201568, + "loss": 2.924, + "step": 4676 + }, + { + "epoch": 0.13868872876078642, + "grad_norm": 0.1769685447216034, + "learning_rate": 0.0009589156109926269, + "loss": 2.8956, + "step": 4677 + }, + { + "epoch": 0.1387183821130979, + "grad_norm": 0.16474035382270813, + "learning_rate": 0.0009588969311013207, + "loss": 2.8926, + "step": 4678 + }, + { + "epoch": 0.13874803546540937, + "grad_norm": 0.15591329336166382, + "learning_rate": 0.0009588782471464033, + "loss": 2.8874, + "step": 4679 + }, + { + "epoch": 0.13877768881772085, + "grad_norm": 0.1383199691772461, + "learning_rate": 0.0009588595591280403, + "loss": 2.8911, + "step": 4680 + }, + { + "epoch": 0.13880734217003232, + "grad_norm": 0.13853617012500763, + "learning_rate": 0.0009588408670463971, + "loss": 2.9166, + "step": 4681 + }, + { + "epoch": 0.1388369955223438, + "grad_norm": 0.14217783510684967, + "learning_rate": 0.0009588221709016392, + "loss": 2.905, + "step": 4682 + }, + { + "epoch": 0.13886664887465527, + "grad_norm": 0.12925174832344055, + "learning_rate": 0.0009588034706939323, + "loss": 2.8662, + "step": 4683 + }, + { + "epoch": 0.13889630222696675, + "grad_norm": 0.1278703659772873, + "learning_rate": 0.0009587847664234419, + "loss": 2.8569, + "step": 4684 + }, + { + "epoch": 0.13892595557927823, + "grad_norm": 0.11773109436035156, + "learning_rate": 0.0009587660580903338, + "loss": 2.8719, + "step": 4685 + }, + { + "epoch": 0.1389556089315897, + "grad_norm": 0.11585850268602371, + "learning_rate": 0.0009587473456947733, + "loss": 2.851, + "step": 4686 + }, + { + "epoch": 0.1389852622839012, + "grad_norm": 0.11965195834636688, + "learning_rate": 0.0009587286292369264, + "loss": 2.8792, + "step": 4687 + }, + { + "epoch": 0.13901491563621268, + "grad_norm": 0.12431745231151581, + "learning_rate": 0.0009587099087169587, + "loss": 2.883, + "step": 4688 + }, + { + "epoch": 0.13904456898852416, + "grad_norm": 0.13463814556598663, + "learning_rate": 0.0009586911841350361, + "loss": 2.864, + "step": 4689 + }, + { + "epoch": 0.13907422234083563, + "grad_norm": 0.13949087262153625, + "learning_rate": 0.0009586724554913243, + "loss": 2.8814, + "step": 4690 + }, + { + "epoch": 0.1391038756931471, + "grad_norm": 0.12589725852012634, + "learning_rate": 0.0009586537227859892, + "loss": 2.8488, + "step": 4691 + }, + { + "epoch": 0.1391335290454586, + "grad_norm": 0.13897624611854553, + "learning_rate": 0.0009586349860191965, + "loss": 2.9027, + "step": 4692 + }, + { + "epoch": 0.13916318239777006, + "grad_norm": 0.14243827760219574, + "learning_rate": 0.0009586162451911124, + "loss": 2.8751, + "step": 4693 + }, + { + "epoch": 0.13919283575008154, + "grad_norm": 0.141870379447937, + "learning_rate": 0.0009585975003019027, + "loss": 2.8984, + "step": 4694 + }, + { + "epoch": 0.13922248910239302, + "grad_norm": 0.15358048677444458, + "learning_rate": 0.0009585787513517334, + "loss": 2.9114, + "step": 4695 + }, + { + "epoch": 0.1392521424547045, + "grad_norm": 0.18114037811756134, + "learning_rate": 0.0009585599983407707, + "loss": 2.9113, + "step": 4696 + }, + { + "epoch": 0.139281795807016, + "grad_norm": 0.20419131219387054, + "learning_rate": 0.0009585412412691805, + "loss": 2.8779, + "step": 4697 + }, + { + "epoch": 0.13931144915932747, + "grad_norm": 0.17610235512256622, + "learning_rate": 0.0009585224801371286, + "loss": 2.872, + "step": 4698 + }, + { + "epoch": 0.13934110251163895, + "grad_norm": 0.1541580706834793, + "learning_rate": 0.0009585037149447817, + "loss": 2.8664, + "step": 4699 + }, + { + "epoch": 0.13937075586395042, + "grad_norm": 0.18762102723121643, + "learning_rate": 0.0009584849456923057, + "loss": 2.8725, + "step": 4700 + }, + { + "epoch": 0.1394004092162619, + "grad_norm": 0.1592111438512802, + "learning_rate": 0.0009584661723798666, + "loss": 2.8767, + "step": 4701 + }, + { + "epoch": 0.13943006256857338, + "grad_norm": 0.18227288126945496, + "learning_rate": 0.0009584473950076312, + "loss": 2.8752, + "step": 4702 + }, + { + "epoch": 0.13945971592088485, + "grad_norm": 0.19943387806415558, + "learning_rate": 0.0009584286135757651, + "loss": 2.8739, + "step": 4703 + }, + { + "epoch": 0.13948936927319633, + "grad_norm": 0.16854433715343475, + "learning_rate": 0.000958409828084435, + "loss": 2.8994, + "step": 4704 + }, + { + "epoch": 0.1395190226255078, + "grad_norm": 0.17193305492401123, + "learning_rate": 0.0009583910385338073, + "loss": 2.8753, + "step": 4705 + }, + { + "epoch": 0.13954867597781928, + "grad_norm": 0.17498831450939178, + "learning_rate": 0.0009583722449240481, + "loss": 2.9002, + "step": 4706 + }, + { + "epoch": 0.13957832933013078, + "grad_norm": 0.14654256403446198, + "learning_rate": 0.0009583534472553241, + "loss": 2.8352, + "step": 4707 + }, + { + "epoch": 0.13960798268244226, + "grad_norm": 0.14351439476013184, + "learning_rate": 0.0009583346455278017, + "loss": 2.9081, + "step": 4708 + }, + { + "epoch": 0.13963763603475374, + "grad_norm": 0.1663573831319809, + "learning_rate": 0.0009583158397416473, + "loss": 2.8334, + "step": 4709 + }, + { + "epoch": 0.1396672893870652, + "grad_norm": 0.17271868884563446, + "learning_rate": 0.0009582970298970274, + "loss": 2.8638, + "step": 4710 + }, + { + "epoch": 0.1396969427393767, + "grad_norm": 0.16764500737190247, + "learning_rate": 0.0009582782159941088, + "loss": 2.9094, + "step": 4711 + }, + { + "epoch": 0.13972659609168817, + "grad_norm": 0.15316206216812134, + "learning_rate": 0.0009582593980330578, + "loss": 2.9048, + "step": 4712 + }, + { + "epoch": 0.13975624944399964, + "grad_norm": 0.14019671082496643, + "learning_rate": 0.0009582405760140411, + "loss": 2.9117, + "step": 4713 + }, + { + "epoch": 0.13978590279631112, + "grad_norm": 0.1486760377883911, + "learning_rate": 0.0009582217499372257, + "loss": 2.87, + "step": 4714 + }, + { + "epoch": 0.1398155561486226, + "grad_norm": 0.1469818502664566, + "learning_rate": 0.0009582029198027778, + "loss": 2.8767, + "step": 4715 + }, + { + "epoch": 0.13984520950093407, + "grad_norm": 0.13478460907936096, + "learning_rate": 0.0009581840856108646, + "loss": 2.8792, + "step": 4716 + }, + { + "epoch": 0.13987486285324555, + "grad_norm": 0.1289086788892746, + "learning_rate": 0.0009581652473616524, + "loss": 2.9149, + "step": 4717 + }, + { + "epoch": 0.13990451620555705, + "grad_norm": 0.15488725900650024, + "learning_rate": 0.0009581464050553086, + "loss": 2.9389, + "step": 4718 + }, + { + "epoch": 0.13993416955786853, + "grad_norm": 0.15254150331020355, + "learning_rate": 0.0009581275586919995, + "loss": 2.8577, + "step": 4719 + }, + { + "epoch": 0.13996382291018, + "grad_norm": 0.13650597631931305, + "learning_rate": 0.0009581087082718924, + "loss": 2.8681, + "step": 4720 + }, + { + "epoch": 0.13999347626249148, + "grad_norm": 0.1267806887626648, + "learning_rate": 0.0009580898537951539, + "loss": 2.8666, + "step": 4721 + }, + { + "epoch": 0.14002312961480295, + "grad_norm": 0.1303902119398117, + "learning_rate": 0.0009580709952619513, + "loss": 2.8749, + "step": 4722 + }, + { + "epoch": 0.14005278296711443, + "grad_norm": 0.13209272921085358, + "learning_rate": 0.0009580521326724513, + "loss": 2.8991, + "step": 4723 + }, + { + "epoch": 0.1400824363194259, + "grad_norm": 0.14966335892677307, + "learning_rate": 0.000958033266026821, + "loss": 2.9248, + "step": 4724 + }, + { + "epoch": 0.14011208967173738, + "grad_norm": 0.15561503171920776, + "learning_rate": 0.0009580143953252276, + "loss": 2.9446, + "step": 4725 + }, + { + "epoch": 0.14014174302404886, + "grad_norm": 0.1822178214788437, + "learning_rate": 0.0009579955205678381, + "loss": 2.8997, + "step": 4726 + }, + { + "epoch": 0.14017139637636034, + "grad_norm": 0.19530704617500305, + "learning_rate": 0.0009579766417548196, + "loss": 2.8657, + "step": 4727 + }, + { + "epoch": 0.14020104972867184, + "grad_norm": 0.2067132145166397, + "learning_rate": 0.0009579577588863392, + "loss": 2.9058, + "step": 4728 + }, + { + "epoch": 0.14023070308098332, + "grad_norm": 0.1687202751636505, + "learning_rate": 0.0009579388719625645, + "loss": 2.8822, + "step": 4729 + }, + { + "epoch": 0.1402603564332948, + "grad_norm": 0.165195032954216, + "learning_rate": 0.0009579199809836624, + "loss": 2.8766, + "step": 4730 + }, + { + "epoch": 0.14029000978560627, + "grad_norm": 0.18240101635456085, + "learning_rate": 0.0009579010859498003, + "loss": 2.9429, + "step": 4731 + }, + { + "epoch": 0.14031966313791774, + "grad_norm": 0.16975732147693634, + "learning_rate": 0.0009578821868611453, + "loss": 2.9016, + "step": 4732 + }, + { + "epoch": 0.14034931649022922, + "grad_norm": 0.18107688426971436, + "learning_rate": 0.0009578632837178652, + "loss": 2.8749, + "step": 4733 + }, + { + "epoch": 0.1403789698425407, + "grad_norm": 0.18066568672657013, + "learning_rate": 0.000957844376520127, + "loss": 2.8943, + "step": 4734 + }, + { + "epoch": 0.14040862319485217, + "grad_norm": 0.156778022646904, + "learning_rate": 0.0009578254652680982, + "loss": 2.8806, + "step": 4735 + }, + { + "epoch": 0.14043827654716365, + "grad_norm": 0.1532306969165802, + "learning_rate": 0.0009578065499619464, + "loss": 2.8754, + "step": 4736 + }, + { + "epoch": 0.14046792989947512, + "grad_norm": 0.13148656487464905, + "learning_rate": 0.000957787630601839, + "loss": 2.8877, + "step": 4737 + }, + { + "epoch": 0.1404975832517866, + "grad_norm": 0.14073677361011505, + "learning_rate": 0.0009577687071879435, + "loss": 2.8774, + "step": 4738 + }, + { + "epoch": 0.1405272366040981, + "grad_norm": 0.12930598855018616, + "learning_rate": 0.0009577497797204276, + "loss": 2.9028, + "step": 4739 + }, + { + "epoch": 0.14055688995640958, + "grad_norm": 0.12298066169023514, + "learning_rate": 0.0009577308481994589, + "loss": 2.8848, + "step": 4740 + }, + { + "epoch": 0.14058654330872106, + "grad_norm": 0.15802066028118134, + "learning_rate": 0.0009577119126252048, + "loss": 2.8806, + "step": 4741 + }, + { + "epoch": 0.14061619666103253, + "grad_norm": 0.15851347148418427, + "learning_rate": 0.0009576929729978332, + "loss": 2.8825, + "step": 4742 + }, + { + "epoch": 0.140645850013344, + "grad_norm": 0.15389320254325867, + "learning_rate": 0.0009576740293175118, + "loss": 2.8797, + "step": 4743 + }, + { + "epoch": 0.14067550336565549, + "grad_norm": 0.16944165527820587, + "learning_rate": 0.0009576550815844082, + "loss": 2.8745, + "step": 4744 + }, + { + "epoch": 0.14070515671796696, + "grad_norm": 0.16757628321647644, + "learning_rate": 0.0009576361297986904, + "loss": 2.8755, + "step": 4745 + }, + { + "epoch": 0.14073481007027844, + "grad_norm": 0.16107077896595, + "learning_rate": 0.0009576171739605261, + "loss": 2.8949, + "step": 4746 + }, + { + "epoch": 0.1407644634225899, + "grad_norm": 0.15754801034927368, + "learning_rate": 0.0009575982140700833, + "loss": 2.8867, + "step": 4747 + }, + { + "epoch": 0.1407941167749014, + "grad_norm": 0.16781385242938995, + "learning_rate": 0.0009575792501275295, + "loss": 2.9087, + "step": 4748 + }, + { + "epoch": 0.1408237701272129, + "grad_norm": 0.19270315766334534, + "learning_rate": 0.0009575602821330332, + "loss": 2.8959, + "step": 4749 + }, + { + "epoch": 0.14085342347952437, + "grad_norm": 0.1744469851255417, + "learning_rate": 0.0009575413100867619, + "loss": 2.8732, + "step": 4750 + }, + { + "epoch": 0.14088307683183585, + "grad_norm": 0.14669707417488098, + "learning_rate": 0.0009575223339888838, + "loss": 2.8542, + "step": 4751 + }, + { + "epoch": 0.14091273018414732, + "grad_norm": 0.15213298797607422, + "learning_rate": 0.0009575033538395669, + "loss": 2.9127, + "step": 4752 + }, + { + "epoch": 0.1409423835364588, + "grad_norm": 0.14299850165843964, + "learning_rate": 0.0009574843696389792, + "loss": 2.8643, + "step": 4753 + }, + { + "epoch": 0.14097203688877027, + "grad_norm": 0.1268918514251709, + "learning_rate": 0.0009574653813872888, + "loss": 2.894, + "step": 4754 + }, + { + "epoch": 0.14100169024108175, + "grad_norm": 0.12662680447101593, + "learning_rate": 0.0009574463890846642, + "loss": 2.8605, + "step": 4755 + }, + { + "epoch": 0.14103134359339323, + "grad_norm": 0.1257261335849762, + "learning_rate": 0.0009574273927312731, + "loss": 2.8476, + "step": 4756 + }, + { + "epoch": 0.1410609969457047, + "grad_norm": 0.1347561627626419, + "learning_rate": 0.000957408392327284, + "loss": 2.89, + "step": 4757 + }, + { + "epoch": 0.14109065029801618, + "grad_norm": 0.13896217942237854, + "learning_rate": 0.0009573893878728651, + "loss": 2.8763, + "step": 4758 + }, + { + "epoch": 0.14112030365032768, + "grad_norm": 0.15837402641773224, + "learning_rate": 0.0009573703793681846, + "loss": 2.8573, + "step": 4759 + }, + { + "epoch": 0.14114995700263916, + "grad_norm": 0.16306950151920319, + "learning_rate": 0.0009573513668134109, + "loss": 2.8889, + "step": 4760 + }, + { + "epoch": 0.14117961035495064, + "grad_norm": 0.1492929309606552, + "learning_rate": 0.0009573323502087124, + "loss": 2.8928, + "step": 4761 + }, + { + "epoch": 0.1412092637072621, + "grad_norm": 0.16249558329582214, + "learning_rate": 0.0009573133295542574, + "loss": 2.9083, + "step": 4762 + }, + { + "epoch": 0.1412389170595736, + "grad_norm": 0.16519834101200104, + "learning_rate": 0.0009572943048502143, + "loss": 2.8751, + "step": 4763 + }, + { + "epoch": 0.14126857041188506, + "grad_norm": 0.1346586048603058, + "learning_rate": 0.0009572752760967517, + "loss": 2.8681, + "step": 4764 + }, + { + "epoch": 0.14129822376419654, + "grad_norm": 0.13528753817081451, + "learning_rate": 0.000957256243294038, + "loss": 2.8768, + "step": 4765 + }, + { + "epoch": 0.14132787711650802, + "grad_norm": 0.1492493748664856, + "learning_rate": 0.0009572372064422419, + "loss": 2.8696, + "step": 4766 + }, + { + "epoch": 0.1413575304688195, + "grad_norm": 0.1362149566411972, + "learning_rate": 0.0009572181655415318, + "loss": 2.8626, + "step": 4767 + }, + { + "epoch": 0.14138718382113097, + "grad_norm": 0.13350552320480347, + "learning_rate": 0.0009571991205920763, + "loss": 2.8805, + "step": 4768 + }, + { + "epoch": 0.14141683717344244, + "grad_norm": 0.1569695919752121, + "learning_rate": 0.000957180071594044, + "loss": 2.856, + "step": 4769 + }, + { + "epoch": 0.14144649052575395, + "grad_norm": 0.17365655303001404, + "learning_rate": 0.0009571610185476039, + "loss": 2.8693, + "step": 4770 + }, + { + "epoch": 0.14147614387806542, + "grad_norm": 0.19255883991718292, + "learning_rate": 0.0009571419614529244, + "loss": 2.8689, + "step": 4771 + }, + { + "epoch": 0.1415057972303769, + "grad_norm": 0.1875816285610199, + "learning_rate": 0.0009571229003101744, + "loss": 2.8741, + "step": 4772 + }, + { + "epoch": 0.14153545058268838, + "grad_norm": 0.17593441903591156, + "learning_rate": 0.0009571038351195227, + "loss": 2.9262, + "step": 4773 + }, + { + "epoch": 0.14156510393499985, + "grad_norm": 0.18014779686927795, + "learning_rate": 0.000957084765881138, + "loss": 2.9079, + "step": 4774 + }, + { + "epoch": 0.14159475728731133, + "grad_norm": 0.18527072668075562, + "learning_rate": 0.0009570656925951893, + "loss": 2.8757, + "step": 4775 + }, + { + "epoch": 0.1416244106396228, + "grad_norm": 0.1844773143529892, + "learning_rate": 0.0009570466152618453, + "loss": 2.8812, + "step": 4776 + }, + { + "epoch": 0.14165406399193428, + "grad_norm": 0.1744680553674698, + "learning_rate": 0.0009570275338812753, + "loss": 2.896, + "step": 4777 + }, + { + "epoch": 0.14168371734424576, + "grad_norm": 0.1433253288269043, + "learning_rate": 0.0009570084484536479, + "loss": 2.8903, + "step": 4778 + }, + { + "epoch": 0.14171337069655723, + "grad_norm": 0.138149693608284, + "learning_rate": 0.0009569893589791323, + "loss": 2.8876, + "step": 4779 + }, + { + "epoch": 0.14174302404886874, + "grad_norm": 0.14356482028961182, + "learning_rate": 0.0009569702654578974, + "loss": 2.8685, + "step": 4780 + }, + { + "epoch": 0.1417726774011802, + "grad_norm": 0.1284903585910797, + "learning_rate": 0.0009569511678901123, + "loss": 2.8909, + "step": 4781 + }, + { + "epoch": 0.1418023307534917, + "grad_norm": 0.1445958912372589, + "learning_rate": 0.0009569320662759464, + "loss": 2.8687, + "step": 4782 + }, + { + "epoch": 0.14183198410580317, + "grad_norm": 0.15274986624717712, + "learning_rate": 0.0009569129606155685, + "loss": 2.872, + "step": 4783 + }, + { + "epoch": 0.14186163745811464, + "grad_norm": 0.13928957283496857, + "learning_rate": 0.0009568938509091479, + "loss": 2.8994, + "step": 4784 + }, + { + "epoch": 0.14189129081042612, + "grad_norm": 0.16139432787895203, + "learning_rate": 0.0009568747371568539, + "loss": 2.8703, + "step": 4785 + }, + { + "epoch": 0.1419209441627376, + "grad_norm": 0.15992648899555206, + "learning_rate": 0.0009568556193588556, + "loss": 2.8827, + "step": 4786 + }, + { + "epoch": 0.14195059751504907, + "grad_norm": 0.17394645512104034, + "learning_rate": 0.0009568364975153224, + "loss": 2.9345, + "step": 4787 + }, + { + "epoch": 0.14198025086736055, + "grad_norm": 0.18264321982860565, + "learning_rate": 0.0009568173716264235, + "loss": 2.9061, + "step": 4788 + }, + { + "epoch": 0.14200990421967202, + "grad_norm": 0.18893401324748993, + "learning_rate": 0.0009567982416923285, + "loss": 2.88, + "step": 4789 + }, + { + "epoch": 0.1420395575719835, + "grad_norm": 0.1532384306192398, + "learning_rate": 0.0009567791077132067, + "loss": 2.8542, + "step": 4790 + }, + { + "epoch": 0.142069210924295, + "grad_norm": 0.13414938747882843, + "learning_rate": 0.0009567599696892274, + "loss": 2.8792, + "step": 4791 + }, + { + "epoch": 0.14209886427660648, + "grad_norm": 0.15764278173446655, + "learning_rate": 0.0009567408276205602, + "loss": 2.8883, + "step": 4792 + }, + { + "epoch": 0.14212851762891796, + "grad_norm": 0.1491052508354187, + "learning_rate": 0.0009567216815073745, + "loss": 2.881, + "step": 4793 + }, + { + "epoch": 0.14215817098122943, + "grad_norm": 0.13818255066871643, + "learning_rate": 0.00095670253134984, + "loss": 2.8652, + "step": 4794 + }, + { + "epoch": 0.1421878243335409, + "grad_norm": 0.15312254428863525, + "learning_rate": 0.0009566833771481262, + "loss": 2.8944, + "step": 4795 + }, + { + "epoch": 0.14221747768585238, + "grad_norm": 0.15092000365257263, + "learning_rate": 0.0009566642189024026, + "loss": 2.9241, + "step": 4796 + }, + { + "epoch": 0.14224713103816386, + "grad_norm": 0.14505824446678162, + "learning_rate": 0.0009566450566128391, + "loss": 2.9164, + "step": 4797 + }, + { + "epoch": 0.14227678439047534, + "grad_norm": 0.1528274118900299, + "learning_rate": 0.0009566258902796051, + "loss": 2.8667, + "step": 4798 + }, + { + "epoch": 0.1423064377427868, + "grad_norm": 0.16754576563835144, + "learning_rate": 0.0009566067199028705, + "loss": 2.8737, + "step": 4799 + }, + { + "epoch": 0.1423360910950983, + "grad_norm": 0.1577879637479782, + "learning_rate": 0.000956587545482805, + "loss": 2.8716, + "step": 4800 + }, + { + "epoch": 0.1423657444474098, + "grad_norm": 0.1677362620830536, + "learning_rate": 0.0009565683670195787, + "loss": 2.8959, + "step": 4801 + }, + { + "epoch": 0.14239539779972127, + "grad_norm": 0.19365230202674866, + "learning_rate": 0.0009565491845133607, + "loss": 2.8553, + "step": 4802 + }, + { + "epoch": 0.14242505115203274, + "grad_norm": 0.22109250724315643, + "learning_rate": 0.0009565299979643217, + "loss": 2.8957, + "step": 4803 + }, + { + "epoch": 0.14245470450434422, + "grad_norm": 0.19014416635036469, + "learning_rate": 0.0009565108073726308, + "loss": 2.8723, + "step": 4804 + }, + { + "epoch": 0.1424843578566557, + "grad_norm": 0.17544639110565186, + "learning_rate": 0.0009564916127384587, + "loss": 2.8986, + "step": 4805 + }, + { + "epoch": 0.14251401120896717, + "grad_norm": 0.18366572260856628, + "learning_rate": 0.0009564724140619747, + "loss": 2.9212, + "step": 4806 + }, + { + "epoch": 0.14254366456127865, + "grad_norm": 0.14296698570251465, + "learning_rate": 0.0009564532113433493, + "loss": 2.8805, + "step": 4807 + }, + { + "epoch": 0.14257331791359013, + "grad_norm": 0.149567112326622, + "learning_rate": 0.0009564340045827524, + "loss": 2.8848, + "step": 4808 + }, + { + "epoch": 0.1426029712659016, + "grad_norm": 0.16346707940101624, + "learning_rate": 0.000956414793780354, + "loss": 2.8898, + "step": 4809 + }, + { + "epoch": 0.14263262461821308, + "grad_norm": 0.1268472820520401, + "learning_rate": 0.0009563955789363243, + "loss": 2.8852, + "step": 4810 + }, + { + "epoch": 0.14266227797052458, + "grad_norm": 0.14705109596252441, + "learning_rate": 0.0009563763600508333, + "loss": 2.8776, + "step": 4811 + }, + { + "epoch": 0.14269193132283606, + "grad_norm": 0.11884014308452606, + "learning_rate": 0.0009563571371240514, + "loss": 2.8691, + "step": 4812 + }, + { + "epoch": 0.14272158467514753, + "grad_norm": 0.11922485381364822, + "learning_rate": 0.0009563379101561487, + "loss": 2.8788, + "step": 4813 + }, + { + "epoch": 0.142751238027459, + "grad_norm": 0.13250276446342468, + "learning_rate": 0.0009563186791472954, + "loss": 2.9221, + "step": 4814 + }, + { + "epoch": 0.14278089137977049, + "grad_norm": 0.1261705756187439, + "learning_rate": 0.000956299444097662, + "loss": 2.8957, + "step": 4815 + }, + { + "epoch": 0.14281054473208196, + "grad_norm": 0.15410684049129486, + "learning_rate": 0.0009562802050074186, + "loss": 2.9171, + "step": 4816 + }, + { + "epoch": 0.14284019808439344, + "grad_norm": 0.15819144248962402, + "learning_rate": 0.0009562609618767357, + "loss": 2.9053, + "step": 4817 + }, + { + "epoch": 0.14286985143670491, + "grad_norm": 0.1563832014799118, + "learning_rate": 0.0009562417147057836, + "loss": 2.8812, + "step": 4818 + }, + { + "epoch": 0.1428995047890164, + "grad_norm": 0.1419086754322052, + "learning_rate": 0.0009562224634947329, + "loss": 2.8601, + "step": 4819 + }, + { + "epoch": 0.14292915814132787, + "grad_norm": 0.1484157145023346, + "learning_rate": 0.0009562032082437539, + "loss": 2.9137, + "step": 4820 + }, + { + "epoch": 0.14295881149363934, + "grad_norm": 0.16313084959983826, + "learning_rate": 0.0009561839489530173, + "loss": 2.9118, + "step": 4821 + }, + { + "epoch": 0.14298846484595085, + "grad_norm": 0.17332926392555237, + "learning_rate": 0.0009561646856226933, + "loss": 2.9105, + "step": 4822 + }, + { + "epoch": 0.14301811819826232, + "grad_norm": 0.19384226202964783, + "learning_rate": 0.0009561454182529529, + "loss": 2.8902, + "step": 4823 + }, + { + "epoch": 0.1430477715505738, + "grad_norm": 0.17927801609039307, + "learning_rate": 0.0009561261468439666, + "loss": 2.9252, + "step": 4824 + }, + { + "epoch": 0.14307742490288528, + "grad_norm": 0.17871667444705963, + "learning_rate": 0.0009561068713959048, + "loss": 2.892, + "step": 4825 + }, + { + "epoch": 0.14310707825519675, + "grad_norm": 0.16002494096755981, + "learning_rate": 0.0009560875919089384, + "loss": 2.8573, + "step": 4826 + }, + { + "epoch": 0.14313673160750823, + "grad_norm": 0.1468561440706253, + "learning_rate": 0.0009560683083832381, + "loss": 2.8731, + "step": 4827 + }, + { + "epoch": 0.1431663849598197, + "grad_norm": 0.14586719870567322, + "learning_rate": 0.0009560490208189747, + "loss": 2.9093, + "step": 4828 + }, + { + "epoch": 0.14319603831213118, + "grad_norm": 0.14984455704689026, + "learning_rate": 0.0009560297292163189, + "loss": 2.8845, + "step": 4829 + }, + { + "epoch": 0.14322569166444266, + "grad_norm": 0.13813655078411102, + "learning_rate": 0.0009560104335754416, + "loss": 2.8738, + "step": 4830 + }, + { + "epoch": 0.14325534501675413, + "grad_norm": 0.12797191739082336, + "learning_rate": 0.0009559911338965135, + "loss": 2.8882, + "step": 4831 + }, + { + "epoch": 0.14328499836906564, + "grad_norm": 0.12939783930778503, + "learning_rate": 0.0009559718301797058, + "loss": 2.8913, + "step": 4832 + }, + { + "epoch": 0.1433146517213771, + "grad_norm": 0.12900161743164062, + "learning_rate": 0.0009559525224251893, + "loss": 2.8691, + "step": 4833 + }, + { + "epoch": 0.1433443050736886, + "grad_norm": 0.13142050802707672, + "learning_rate": 0.0009559332106331348, + "loss": 2.8787, + "step": 4834 + }, + { + "epoch": 0.14337395842600006, + "grad_norm": 0.13934879004955292, + "learning_rate": 0.0009559138948037136, + "loss": 2.8618, + "step": 4835 + }, + { + "epoch": 0.14340361177831154, + "grad_norm": 0.1498771458864212, + "learning_rate": 0.0009558945749370964, + "loss": 2.883, + "step": 4836 + }, + { + "epoch": 0.14343326513062302, + "grad_norm": 0.14909406006336212, + "learning_rate": 0.0009558752510334548, + "loss": 2.8994, + "step": 4837 + }, + { + "epoch": 0.1434629184829345, + "grad_norm": 0.16043168306350708, + "learning_rate": 0.0009558559230929593, + "loss": 2.9201, + "step": 4838 + }, + { + "epoch": 0.14349257183524597, + "grad_norm": 0.15979653596878052, + "learning_rate": 0.0009558365911157815, + "loss": 2.8546, + "step": 4839 + }, + { + "epoch": 0.14352222518755745, + "grad_norm": 0.16610859334468842, + "learning_rate": 0.0009558172551020925, + "loss": 2.9067, + "step": 4840 + }, + { + "epoch": 0.14355187853986892, + "grad_norm": 0.18433386087417603, + "learning_rate": 0.0009557979150520633, + "loss": 2.8796, + "step": 4841 + }, + { + "epoch": 0.1435815318921804, + "grad_norm": 0.16814014315605164, + "learning_rate": 0.0009557785709658654, + "loss": 2.8383, + "step": 4842 + }, + { + "epoch": 0.1436111852444919, + "grad_norm": 0.15468092262744904, + "learning_rate": 0.00095575922284367, + "loss": 2.85, + "step": 4843 + }, + { + "epoch": 0.14364083859680338, + "grad_norm": 0.13433152437210083, + "learning_rate": 0.0009557398706856486, + "loss": 2.8892, + "step": 4844 + }, + { + "epoch": 0.14367049194911485, + "grad_norm": 0.13596761226654053, + "learning_rate": 0.0009557205144919723, + "loss": 2.9108, + "step": 4845 + }, + { + "epoch": 0.14370014530142633, + "grad_norm": 0.14660821855068207, + "learning_rate": 0.0009557011542628126, + "loss": 2.8517, + "step": 4846 + }, + { + "epoch": 0.1437297986537378, + "grad_norm": 0.15696585178375244, + "learning_rate": 0.0009556817899983409, + "loss": 2.8813, + "step": 4847 + }, + { + "epoch": 0.14375945200604928, + "grad_norm": 0.14972807466983795, + "learning_rate": 0.0009556624216987288, + "loss": 2.8992, + "step": 4848 + }, + { + "epoch": 0.14378910535836076, + "grad_norm": 0.13555656373500824, + "learning_rate": 0.0009556430493641479, + "loss": 2.8688, + "step": 4849 + }, + { + "epoch": 0.14381875871067223, + "grad_norm": 0.14049533009529114, + "learning_rate": 0.0009556236729947694, + "loss": 2.8502, + "step": 4850 + }, + { + "epoch": 0.1438484120629837, + "grad_norm": 0.16749964654445648, + "learning_rate": 0.0009556042925907651, + "loss": 2.866, + "step": 4851 + }, + { + "epoch": 0.1438780654152952, + "grad_norm": 0.16884465515613556, + "learning_rate": 0.0009555849081523066, + "loss": 2.889, + "step": 4852 + }, + { + "epoch": 0.1439077187676067, + "grad_norm": 0.17099225521087646, + "learning_rate": 0.0009555655196795657, + "loss": 2.8946, + "step": 4853 + }, + { + "epoch": 0.14393737211991817, + "grad_norm": 0.16818174719810486, + "learning_rate": 0.0009555461271727136, + "loss": 2.8965, + "step": 4854 + }, + { + "epoch": 0.14396702547222964, + "grad_norm": 0.19116194546222687, + "learning_rate": 0.0009555267306319225, + "loss": 2.8867, + "step": 4855 + }, + { + "epoch": 0.14399667882454112, + "grad_norm": 0.18329182267189026, + "learning_rate": 0.000955507330057364, + "loss": 2.8625, + "step": 4856 + }, + { + "epoch": 0.1440263321768526, + "grad_norm": 0.17002224922180176, + "learning_rate": 0.00095548792544921, + "loss": 2.8212, + "step": 4857 + }, + { + "epoch": 0.14405598552916407, + "grad_norm": 0.18809282779693604, + "learning_rate": 0.0009554685168076323, + "loss": 2.8857, + "step": 4858 + }, + { + "epoch": 0.14408563888147555, + "grad_norm": 0.1703728288412094, + "learning_rate": 0.0009554491041328023, + "loss": 2.8682, + "step": 4859 + }, + { + "epoch": 0.14411529223378702, + "grad_norm": 0.1489674299955368, + "learning_rate": 0.0009554296874248927, + "loss": 2.898, + "step": 4860 + }, + { + "epoch": 0.1441449455860985, + "grad_norm": 0.1502505987882614, + "learning_rate": 0.000955410266684075, + "loss": 2.8503, + "step": 4861 + }, + { + "epoch": 0.14417459893840998, + "grad_norm": 0.1598508507013321, + "learning_rate": 0.0009553908419105211, + "loss": 2.8516, + "step": 4862 + }, + { + "epoch": 0.14420425229072148, + "grad_norm": 0.15582478046417236, + "learning_rate": 0.0009553714131044031, + "loss": 2.8853, + "step": 4863 + }, + { + "epoch": 0.14423390564303296, + "grad_norm": 0.15133561193943024, + "learning_rate": 0.0009553519802658932, + "loss": 2.8539, + "step": 4864 + }, + { + "epoch": 0.14426355899534443, + "grad_norm": 0.15864725410938263, + "learning_rate": 0.0009553325433951633, + "loss": 2.8742, + "step": 4865 + }, + { + "epoch": 0.1442932123476559, + "grad_norm": 0.1662781834602356, + "learning_rate": 0.0009553131024923855, + "loss": 2.9018, + "step": 4866 + }, + { + "epoch": 0.14432286569996738, + "grad_norm": 0.15288224816322327, + "learning_rate": 0.0009552936575577322, + "loss": 2.8896, + "step": 4867 + }, + { + "epoch": 0.14435251905227886, + "grad_norm": 0.16256357729434967, + "learning_rate": 0.0009552742085913753, + "loss": 2.8712, + "step": 4868 + }, + { + "epoch": 0.14438217240459034, + "grad_norm": 0.16187414526939392, + "learning_rate": 0.0009552547555934872, + "loss": 2.8785, + "step": 4869 + }, + { + "epoch": 0.1444118257569018, + "grad_norm": 0.14876757562160492, + "learning_rate": 0.00095523529856424, + "loss": 2.8544, + "step": 4870 + }, + { + "epoch": 0.1444414791092133, + "grad_norm": 0.1399846374988556, + "learning_rate": 0.000955215837503806, + "loss": 2.9025, + "step": 4871 + }, + { + "epoch": 0.14447113246152476, + "grad_norm": 0.13640974462032318, + "learning_rate": 0.0009551963724123577, + "loss": 2.8979, + "step": 4872 + }, + { + "epoch": 0.14450078581383624, + "grad_norm": 0.15139274299144745, + "learning_rate": 0.0009551769032900676, + "loss": 2.8706, + "step": 4873 + }, + { + "epoch": 0.14453043916614775, + "grad_norm": 0.141072615981102, + "learning_rate": 0.0009551574301371078, + "loss": 2.8748, + "step": 4874 + }, + { + "epoch": 0.14456009251845922, + "grad_norm": 0.1442769169807434, + "learning_rate": 0.0009551379529536507, + "loss": 2.9115, + "step": 4875 + }, + { + "epoch": 0.1445897458707707, + "grad_norm": 0.14234672486782074, + "learning_rate": 0.0009551184717398689, + "loss": 2.8327, + "step": 4876 + }, + { + "epoch": 0.14461939922308217, + "grad_norm": 0.13959582149982452, + "learning_rate": 0.000955098986495935, + "loss": 2.8655, + "step": 4877 + }, + { + "epoch": 0.14464905257539365, + "grad_norm": 0.15802247822284698, + "learning_rate": 0.0009550794972220213, + "loss": 2.9098, + "step": 4878 + }, + { + "epoch": 0.14467870592770513, + "grad_norm": 0.18431790173053741, + "learning_rate": 0.0009550600039183009, + "loss": 2.9062, + "step": 4879 + }, + { + "epoch": 0.1447083592800166, + "grad_norm": 0.19290368258953094, + "learning_rate": 0.0009550405065849456, + "loss": 2.8632, + "step": 4880 + }, + { + "epoch": 0.14473801263232808, + "grad_norm": 0.1867024004459381, + "learning_rate": 0.0009550210052221288, + "loss": 2.8898, + "step": 4881 + }, + { + "epoch": 0.14476766598463955, + "grad_norm": 0.16636085510253906, + "learning_rate": 0.0009550014998300229, + "loss": 2.9197, + "step": 4882 + }, + { + "epoch": 0.14479731933695103, + "grad_norm": 0.1254986822605133, + "learning_rate": 0.0009549819904088006, + "loss": 2.8699, + "step": 4883 + }, + { + "epoch": 0.14482697268926253, + "grad_norm": 0.1409064382314682, + "learning_rate": 0.0009549624769586345, + "loss": 2.8722, + "step": 4884 + }, + { + "epoch": 0.144856626041574, + "grad_norm": 0.13285665214061737, + "learning_rate": 0.0009549429594796978, + "loss": 2.8777, + "step": 4885 + }, + { + "epoch": 0.1448862793938855, + "grad_norm": 0.12279482185840607, + "learning_rate": 0.000954923437972163, + "loss": 2.9214, + "step": 4886 + }, + { + "epoch": 0.14491593274619696, + "grad_norm": 0.12653391063213348, + "learning_rate": 0.0009549039124362031, + "loss": 2.8738, + "step": 4887 + }, + { + "epoch": 0.14494558609850844, + "grad_norm": 0.1413455307483673, + "learning_rate": 0.0009548843828719909, + "loss": 2.9004, + "step": 4888 + }, + { + "epoch": 0.14497523945081991, + "grad_norm": 0.15581250190734863, + "learning_rate": 0.0009548648492796994, + "loss": 2.9155, + "step": 4889 + }, + { + "epoch": 0.1450048928031314, + "grad_norm": 0.16453784704208374, + "learning_rate": 0.0009548453116595018, + "loss": 2.8806, + "step": 4890 + }, + { + "epoch": 0.14503454615544287, + "grad_norm": 0.1578955054283142, + "learning_rate": 0.0009548257700115706, + "loss": 2.8903, + "step": 4891 + }, + { + "epoch": 0.14506419950775434, + "grad_norm": 0.15458518266677856, + "learning_rate": 0.0009548062243360793, + "loss": 2.8537, + "step": 4892 + }, + { + "epoch": 0.14509385286006582, + "grad_norm": 0.14841194450855255, + "learning_rate": 0.0009547866746332008, + "loss": 2.8799, + "step": 4893 + }, + { + "epoch": 0.1451235062123773, + "grad_norm": 0.17536281049251556, + "learning_rate": 0.0009547671209031082, + "loss": 2.8703, + "step": 4894 + }, + { + "epoch": 0.1451531595646888, + "grad_norm": 0.17815649509429932, + "learning_rate": 0.0009547475631459748, + "loss": 2.887, + "step": 4895 + }, + { + "epoch": 0.14518281291700028, + "grad_norm": 0.1935468316078186, + "learning_rate": 0.0009547280013619734, + "loss": 2.9021, + "step": 4896 + }, + { + "epoch": 0.14521246626931175, + "grad_norm": 0.18361972272396088, + "learning_rate": 0.0009547084355512778, + "loss": 2.8955, + "step": 4897 + }, + { + "epoch": 0.14524211962162323, + "grad_norm": 0.15568117797374725, + "learning_rate": 0.0009546888657140609, + "loss": 2.9322, + "step": 4898 + }, + { + "epoch": 0.1452717729739347, + "grad_norm": 0.16813260316848755, + "learning_rate": 0.0009546692918504959, + "loss": 2.8854, + "step": 4899 + }, + { + "epoch": 0.14530142632624618, + "grad_norm": 0.19909580051898956, + "learning_rate": 0.0009546497139607564, + "loss": 2.8561, + "step": 4900 + }, + { + "epoch": 0.14533107967855766, + "grad_norm": 0.1968689113855362, + "learning_rate": 0.0009546301320450155, + "loss": 2.8809, + "step": 4901 + }, + { + "epoch": 0.14536073303086913, + "grad_norm": 0.19567622244358063, + "learning_rate": 0.0009546105461034469, + "loss": 2.8885, + "step": 4902 + }, + { + "epoch": 0.1453903863831806, + "grad_norm": 0.15263165533542633, + "learning_rate": 0.0009545909561362239, + "loss": 2.8876, + "step": 4903 + }, + { + "epoch": 0.14542003973549208, + "grad_norm": 0.15916825830936432, + "learning_rate": 0.0009545713621435197, + "loss": 2.8826, + "step": 4904 + }, + { + "epoch": 0.1454496930878036, + "grad_norm": 0.1415998339653015, + "learning_rate": 0.0009545517641255083, + "loss": 2.8623, + "step": 4905 + }, + { + "epoch": 0.14547934644011506, + "grad_norm": 0.14900018274784088, + "learning_rate": 0.000954532162082363, + "loss": 2.8755, + "step": 4906 + }, + { + "epoch": 0.14550899979242654, + "grad_norm": 0.17686894536018372, + "learning_rate": 0.0009545125560142573, + "loss": 2.9237, + "step": 4907 + }, + { + "epoch": 0.14553865314473802, + "grad_norm": 0.16639843583106995, + "learning_rate": 0.0009544929459213649, + "loss": 2.89, + "step": 4908 + }, + { + "epoch": 0.1455683064970495, + "grad_norm": 0.13242663443088531, + "learning_rate": 0.0009544733318038594, + "loss": 2.8848, + "step": 4909 + }, + { + "epoch": 0.14559795984936097, + "grad_norm": 0.1303132176399231, + "learning_rate": 0.0009544537136619147, + "loss": 2.8541, + "step": 4910 + }, + { + "epoch": 0.14562761320167245, + "grad_norm": 0.14137224853038788, + "learning_rate": 0.0009544340914957042, + "loss": 2.8702, + "step": 4911 + }, + { + "epoch": 0.14565726655398392, + "grad_norm": 0.14139008522033691, + "learning_rate": 0.0009544144653054018, + "loss": 2.9073, + "step": 4912 + }, + { + "epoch": 0.1456869199062954, + "grad_norm": 0.13914765417575836, + "learning_rate": 0.0009543948350911815, + "loss": 2.8931, + "step": 4913 + }, + { + "epoch": 0.14571657325860687, + "grad_norm": 0.1417926549911499, + "learning_rate": 0.0009543752008532169, + "loss": 2.902, + "step": 4914 + }, + { + "epoch": 0.14574622661091838, + "grad_norm": 0.16250622272491455, + "learning_rate": 0.0009543555625916817, + "loss": 2.8668, + "step": 4915 + }, + { + "epoch": 0.14577587996322985, + "grad_norm": 0.15154054760932922, + "learning_rate": 0.0009543359203067504, + "loss": 2.89, + "step": 4916 + }, + { + "epoch": 0.14580553331554133, + "grad_norm": 0.12219396978616714, + "learning_rate": 0.0009543162739985962, + "loss": 2.9188, + "step": 4917 + }, + { + "epoch": 0.1458351866678528, + "grad_norm": 0.12421330064535141, + "learning_rate": 0.0009542966236673935, + "loss": 2.8518, + "step": 4918 + }, + { + "epoch": 0.14586484002016428, + "grad_norm": 0.14511147141456604, + "learning_rate": 0.0009542769693133162, + "loss": 2.9031, + "step": 4919 + }, + { + "epoch": 0.14589449337247576, + "grad_norm": 0.14904342591762543, + "learning_rate": 0.0009542573109365385, + "loss": 2.8745, + "step": 4920 + }, + { + "epoch": 0.14592414672478723, + "grad_norm": 0.15133431553840637, + "learning_rate": 0.0009542376485372341, + "loss": 2.8961, + "step": 4921 + }, + { + "epoch": 0.1459538000770987, + "grad_norm": 0.1650116741657257, + "learning_rate": 0.0009542179821155774, + "loss": 2.8928, + "step": 4922 + }, + { + "epoch": 0.1459834534294102, + "grad_norm": 0.17175480723381042, + "learning_rate": 0.0009541983116717426, + "loss": 2.8895, + "step": 4923 + }, + { + "epoch": 0.14601310678172166, + "grad_norm": 0.16661924123764038, + "learning_rate": 0.0009541786372059038, + "loss": 2.9038, + "step": 4924 + }, + { + "epoch": 0.14604276013403314, + "grad_norm": 0.20029592514038086, + "learning_rate": 0.0009541589587182352, + "loss": 2.9061, + "step": 4925 + }, + { + "epoch": 0.14607241348634464, + "grad_norm": 0.14637711644172668, + "learning_rate": 0.000954139276208911, + "loss": 2.8851, + "step": 4926 + }, + { + "epoch": 0.14610206683865612, + "grad_norm": 0.12070702016353607, + "learning_rate": 0.0009541195896781056, + "loss": 2.8935, + "step": 4927 + }, + { + "epoch": 0.1461317201909676, + "grad_norm": 0.13697023689746857, + "learning_rate": 0.0009540998991259933, + "loss": 2.8738, + "step": 4928 + }, + { + "epoch": 0.14616137354327907, + "grad_norm": 0.14095242321491241, + "learning_rate": 0.0009540802045527485, + "loss": 2.8642, + "step": 4929 + }, + { + "epoch": 0.14619102689559055, + "grad_norm": 0.16538770496845245, + "learning_rate": 0.0009540605059585454, + "loss": 2.8976, + "step": 4930 + }, + { + "epoch": 0.14622068024790202, + "grad_norm": 0.20210227370262146, + "learning_rate": 0.0009540408033435587, + "loss": 2.8674, + "step": 4931 + }, + { + "epoch": 0.1462503336002135, + "grad_norm": 0.20028014481067657, + "learning_rate": 0.0009540210967079627, + "loss": 2.9045, + "step": 4932 + }, + { + "epoch": 0.14627998695252498, + "grad_norm": 0.16583213210105896, + "learning_rate": 0.000954001386051932, + "loss": 2.8865, + "step": 4933 + }, + { + "epoch": 0.14630964030483645, + "grad_norm": 0.1518743932247162, + "learning_rate": 0.0009539816713756411, + "loss": 2.8816, + "step": 4934 + }, + { + "epoch": 0.14633929365714793, + "grad_norm": 0.17353376746177673, + "learning_rate": 0.0009539619526792645, + "loss": 2.8818, + "step": 4935 + }, + { + "epoch": 0.14636894700945943, + "grad_norm": 0.18252041935920715, + "learning_rate": 0.0009539422299629769, + "loss": 2.8672, + "step": 4936 + }, + { + "epoch": 0.1463986003617709, + "grad_norm": 0.14269466698169708, + "learning_rate": 0.000953922503226953, + "loss": 2.9014, + "step": 4937 + }, + { + "epoch": 0.14642825371408238, + "grad_norm": 0.1400987058877945, + "learning_rate": 0.0009539027724713673, + "loss": 2.8682, + "step": 4938 + }, + { + "epoch": 0.14645790706639386, + "grad_norm": 0.1441451758146286, + "learning_rate": 0.0009538830376963947, + "loss": 2.9123, + "step": 4939 + }, + { + "epoch": 0.14648756041870534, + "grad_norm": 0.1276828497648239, + "learning_rate": 0.0009538632989022101, + "loss": 2.8609, + "step": 4940 + }, + { + "epoch": 0.1465172137710168, + "grad_norm": 0.12592583894729614, + "learning_rate": 0.0009538435560889878, + "loss": 2.8765, + "step": 4941 + }, + { + "epoch": 0.1465468671233283, + "grad_norm": 0.13127756118774414, + "learning_rate": 0.0009538238092569029, + "loss": 2.8552, + "step": 4942 + }, + { + "epoch": 0.14657652047563977, + "grad_norm": 0.13643768429756165, + "learning_rate": 0.0009538040584061305, + "loss": 2.8669, + "step": 4943 + }, + { + "epoch": 0.14660617382795124, + "grad_norm": 0.1457807421684265, + "learning_rate": 0.0009537843035368451, + "loss": 2.8953, + "step": 4944 + }, + { + "epoch": 0.14663582718026272, + "grad_norm": 0.1313750147819519, + "learning_rate": 0.0009537645446492218, + "loss": 2.8902, + "step": 4945 + }, + { + "epoch": 0.1466654805325742, + "grad_norm": 0.1273527443408966, + "learning_rate": 0.0009537447817434357, + "loss": 2.861, + "step": 4946 + }, + { + "epoch": 0.1466951338848857, + "grad_norm": 0.13880732655525208, + "learning_rate": 0.0009537250148196614, + "loss": 2.8806, + "step": 4947 + }, + { + "epoch": 0.14672478723719717, + "grad_norm": 0.13729916512966156, + "learning_rate": 0.0009537052438780744, + "loss": 2.8609, + "step": 4948 + }, + { + "epoch": 0.14675444058950865, + "grad_norm": 0.1461217850446701, + "learning_rate": 0.0009536854689188496, + "loss": 2.8663, + "step": 4949 + }, + { + "epoch": 0.14678409394182013, + "grad_norm": 0.13436229526996613, + "learning_rate": 0.0009536656899421623, + "loss": 2.8859, + "step": 4950 + }, + { + "epoch": 0.1468137472941316, + "grad_norm": 0.1616576761007309, + "learning_rate": 0.000953645906948187, + "loss": 2.8681, + "step": 4951 + }, + { + "epoch": 0.14684340064644308, + "grad_norm": 0.17331300675868988, + "learning_rate": 0.0009536261199370997, + "loss": 2.8378, + "step": 4952 + }, + { + "epoch": 0.14687305399875455, + "grad_norm": 0.15684229135513306, + "learning_rate": 0.0009536063289090751, + "loss": 2.8916, + "step": 4953 + }, + { + "epoch": 0.14690270735106603, + "grad_norm": 0.1604033261537552, + "learning_rate": 0.0009535865338642886, + "loss": 2.908, + "step": 4954 + }, + { + "epoch": 0.1469323607033775, + "grad_norm": 0.1685100793838501, + "learning_rate": 0.0009535667348029155, + "loss": 2.8806, + "step": 4955 + }, + { + "epoch": 0.14696201405568898, + "grad_norm": 0.15253666043281555, + "learning_rate": 0.0009535469317251311, + "loss": 2.9006, + "step": 4956 + }, + { + "epoch": 0.1469916674080005, + "grad_norm": 0.16566570103168488, + "learning_rate": 0.0009535271246311108, + "loss": 2.8723, + "step": 4957 + }, + { + "epoch": 0.14702132076031196, + "grad_norm": 0.17489029467105865, + "learning_rate": 0.0009535073135210299, + "loss": 2.8785, + "step": 4958 + }, + { + "epoch": 0.14705097411262344, + "grad_norm": 0.17508432269096375, + "learning_rate": 0.0009534874983950639, + "loss": 2.888, + "step": 4959 + }, + { + "epoch": 0.14708062746493492, + "grad_norm": 0.1740923374891281, + "learning_rate": 0.0009534676792533883, + "loss": 2.88, + "step": 4960 + }, + { + "epoch": 0.1471102808172464, + "grad_norm": 0.17459048330783844, + "learning_rate": 0.0009534478560961786, + "loss": 2.8807, + "step": 4961 + }, + { + "epoch": 0.14713993416955787, + "grad_norm": 0.1820840686559677, + "learning_rate": 0.0009534280289236101, + "loss": 2.8938, + "step": 4962 + }, + { + "epoch": 0.14716958752186934, + "grad_norm": 0.15334416925907135, + "learning_rate": 0.0009534081977358588, + "loss": 2.884, + "step": 4963 + }, + { + "epoch": 0.14719924087418082, + "grad_norm": 0.16409938037395477, + "learning_rate": 0.0009533883625331, + "loss": 2.8794, + "step": 4964 + }, + { + "epoch": 0.1472288942264923, + "grad_norm": 0.17826074361801147, + "learning_rate": 0.0009533685233155094, + "loss": 2.8601, + "step": 4965 + }, + { + "epoch": 0.14725854757880377, + "grad_norm": 0.18679270148277283, + "learning_rate": 0.0009533486800832628, + "loss": 2.8603, + "step": 4966 + }, + { + "epoch": 0.14728820093111528, + "grad_norm": 0.16912440955638885, + "learning_rate": 0.0009533288328365357, + "loss": 2.8638, + "step": 4967 + }, + { + "epoch": 0.14731785428342675, + "grad_norm": 0.17181181907653809, + "learning_rate": 0.0009533089815755041, + "loss": 2.8804, + "step": 4968 + }, + { + "epoch": 0.14734750763573823, + "grad_norm": 0.17550356686115265, + "learning_rate": 0.0009532891263003436, + "loss": 2.8355, + "step": 4969 + }, + { + "epoch": 0.1473771609880497, + "grad_norm": 0.15659834444522858, + "learning_rate": 0.00095326926701123, + "loss": 2.8828, + "step": 4970 + }, + { + "epoch": 0.14740681434036118, + "grad_norm": 0.14515051245689392, + "learning_rate": 0.0009532494037083394, + "loss": 2.9058, + "step": 4971 + }, + { + "epoch": 0.14743646769267266, + "grad_norm": 0.1407797634601593, + "learning_rate": 0.0009532295363918474, + "loss": 2.8679, + "step": 4972 + }, + { + "epoch": 0.14746612104498413, + "grad_norm": 0.1544780433177948, + "learning_rate": 0.0009532096650619302, + "loss": 2.8961, + "step": 4973 + }, + { + "epoch": 0.1474957743972956, + "grad_norm": 0.13661037385463715, + "learning_rate": 0.0009531897897187635, + "loss": 2.9028, + "step": 4974 + }, + { + "epoch": 0.14752542774960709, + "grad_norm": 0.1401021033525467, + "learning_rate": 0.0009531699103625235, + "loss": 2.8832, + "step": 4975 + }, + { + "epoch": 0.14755508110191856, + "grad_norm": 0.135695680975914, + "learning_rate": 0.000953150026993386, + "loss": 2.8852, + "step": 4976 + }, + { + "epoch": 0.14758473445423004, + "grad_norm": 0.13877789676189423, + "learning_rate": 0.0009531301396115273, + "loss": 2.9001, + "step": 4977 + }, + { + "epoch": 0.14761438780654154, + "grad_norm": 0.1573142409324646, + "learning_rate": 0.0009531102482171235, + "loss": 2.9005, + "step": 4978 + }, + { + "epoch": 0.14764404115885302, + "grad_norm": 0.1865428239107132, + "learning_rate": 0.0009530903528103507, + "loss": 2.8968, + "step": 4979 + }, + { + "epoch": 0.1476736945111645, + "grad_norm": 0.19050537049770355, + "learning_rate": 0.000953070453391385, + "loss": 2.8714, + "step": 4980 + }, + { + "epoch": 0.14770334786347597, + "grad_norm": 0.15984870493412018, + "learning_rate": 0.0009530505499604026, + "loss": 2.8724, + "step": 4981 + }, + { + "epoch": 0.14773300121578745, + "grad_norm": 0.15868370234966278, + "learning_rate": 0.0009530306425175798, + "loss": 2.8551, + "step": 4982 + }, + { + "epoch": 0.14776265456809892, + "grad_norm": 0.17225781083106995, + "learning_rate": 0.0009530107310630931, + "loss": 2.8927, + "step": 4983 + }, + { + "epoch": 0.1477923079204104, + "grad_norm": 0.1558752804994583, + "learning_rate": 0.0009529908155971185, + "loss": 2.8974, + "step": 4984 + }, + { + "epoch": 0.14782196127272187, + "grad_norm": 0.1393619179725647, + "learning_rate": 0.0009529708961198325, + "loss": 2.8328, + "step": 4985 + }, + { + "epoch": 0.14785161462503335, + "grad_norm": 0.13057148456573486, + "learning_rate": 0.0009529509726314114, + "loss": 2.9003, + "step": 4986 + }, + { + "epoch": 0.14788126797734483, + "grad_norm": 0.1403616964817047, + "learning_rate": 0.0009529310451320316, + "loss": 2.8838, + "step": 4987 + }, + { + "epoch": 0.14791092132965633, + "grad_norm": 0.13348272442817688, + "learning_rate": 0.0009529111136218699, + "loss": 2.8619, + "step": 4988 + }, + { + "epoch": 0.1479405746819678, + "grad_norm": 0.1315593421459198, + "learning_rate": 0.0009528911781011025, + "loss": 2.8551, + "step": 4989 + }, + { + "epoch": 0.14797022803427928, + "grad_norm": 0.12827974557876587, + "learning_rate": 0.0009528712385699059, + "loss": 2.8869, + "step": 4990 + }, + { + "epoch": 0.14799988138659076, + "grad_norm": 0.12205611169338226, + "learning_rate": 0.0009528512950284566, + "loss": 2.851, + "step": 4991 + }, + { + "epoch": 0.14802953473890224, + "grad_norm": 0.12607009708881378, + "learning_rate": 0.0009528313474769316, + "loss": 2.8884, + "step": 4992 + }, + { + "epoch": 0.1480591880912137, + "grad_norm": 0.13839460909366608, + "learning_rate": 0.0009528113959155071, + "loss": 2.8856, + "step": 4993 + }, + { + "epoch": 0.1480888414435252, + "grad_norm": 0.13371263444423676, + "learning_rate": 0.0009527914403443602, + "loss": 2.8753, + "step": 4994 + }, + { + "epoch": 0.14811849479583666, + "grad_norm": 0.1204250231385231, + "learning_rate": 0.0009527714807636672, + "loss": 2.8906, + "step": 4995 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 0.1279841959476471, + "learning_rate": 0.000952751517173605, + "loss": 2.8807, + "step": 4996 + }, + { + "epoch": 0.14817780150045962, + "grad_norm": 0.1338104009628296, + "learning_rate": 0.0009527315495743505, + "loss": 2.8773, + "step": 4997 + }, + { + "epoch": 0.1482074548527711, + "grad_norm": 0.1280151903629303, + "learning_rate": 0.0009527115779660805, + "loss": 2.8553, + "step": 4998 + }, + { + "epoch": 0.1482371082050826, + "grad_norm": 0.14376908540725708, + "learning_rate": 0.0009526916023489716, + "loss": 2.8714, + "step": 4999 + }, + { + "epoch": 0.14826676155739407, + "grad_norm": 0.15582482516765594, + "learning_rate": 0.0009526716227232009, + "loss": 2.8701, + "step": 5000 + }, + { + "epoch": 0.14829641490970555, + "grad_norm": 0.17874380946159363, + "learning_rate": 0.0009526516390889453, + "loss": 2.8732, + "step": 5001 + }, + { + "epoch": 0.14832606826201702, + "grad_norm": 0.20315201580524445, + "learning_rate": 0.0009526316514463815, + "loss": 2.8843, + "step": 5002 + }, + { + "epoch": 0.1483557216143285, + "grad_norm": 0.20215332508087158, + "learning_rate": 0.0009526116597956871, + "loss": 2.9097, + "step": 5003 + }, + { + "epoch": 0.14838537496663998, + "grad_norm": 0.22304397821426392, + "learning_rate": 0.0009525916641370386, + "loss": 2.8982, + "step": 5004 + }, + { + "epoch": 0.14841502831895145, + "grad_norm": 0.21246549487113953, + "learning_rate": 0.0009525716644706132, + "loss": 2.8758, + "step": 5005 + }, + { + "epoch": 0.14844468167126293, + "grad_norm": 0.16130591928958893, + "learning_rate": 0.0009525516607965881, + "loss": 2.8867, + "step": 5006 + }, + { + "epoch": 0.1484743350235744, + "grad_norm": 0.1641283929347992, + "learning_rate": 0.0009525316531151404, + "loss": 2.8827, + "step": 5007 + }, + { + "epoch": 0.14850398837588588, + "grad_norm": 0.17047934234142303, + "learning_rate": 0.0009525116414264472, + "loss": 2.8382, + "step": 5008 + }, + { + "epoch": 0.14853364172819739, + "grad_norm": 0.16998609900474548, + "learning_rate": 0.0009524916257306857, + "loss": 2.8794, + "step": 5009 + }, + { + "epoch": 0.14856329508050886, + "grad_norm": 0.1813494861125946, + "learning_rate": 0.0009524716060280332, + "loss": 2.8688, + "step": 5010 + }, + { + "epoch": 0.14859294843282034, + "grad_norm": 0.17814110219478607, + "learning_rate": 0.0009524515823186669, + "loss": 2.9074, + "step": 5011 + }, + { + "epoch": 0.1486226017851318, + "grad_norm": 0.1764603704214096, + "learning_rate": 0.0009524315546027642, + "loss": 2.859, + "step": 5012 + }, + { + "epoch": 0.1486522551374433, + "grad_norm": 0.15638206899166107, + "learning_rate": 0.0009524115228805026, + "loss": 2.8583, + "step": 5013 + }, + { + "epoch": 0.14868190848975477, + "grad_norm": 0.1547122746706009, + "learning_rate": 0.0009523914871520592, + "loss": 2.8652, + "step": 5014 + }, + { + "epoch": 0.14871156184206624, + "grad_norm": 0.1403396725654602, + "learning_rate": 0.0009523714474176113, + "loss": 2.9029, + "step": 5015 + }, + { + "epoch": 0.14874121519437772, + "grad_norm": 0.1391514539718628, + "learning_rate": 0.0009523514036773368, + "loss": 2.8702, + "step": 5016 + }, + { + "epoch": 0.1487708685466892, + "grad_norm": 0.13866224884986877, + "learning_rate": 0.0009523313559314131, + "loss": 2.9161, + "step": 5017 + }, + { + "epoch": 0.14880052189900067, + "grad_norm": 0.15003103017807007, + "learning_rate": 0.0009523113041800174, + "loss": 2.9066, + "step": 5018 + }, + { + "epoch": 0.14883017525131217, + "grad_norm": 0.15281276404857635, + "learning_rate": 0.0009522912484233274, + "loss": 2.8497, + "step": 5019 + }, + { + "epoch": 0.14885982860362365, + "grad_norm": 0.15151025354862213, + "learning_rate": 0.000952271188661521, + "loss": 2.8655, + "step": 5020 + }, + { + "epoch": 0.14888948195593513, + "grad_norm": 0.13427917659282684, + "learning_rate": 0.0009522511248947755, + "loss": 2.897, + "step": 5021 + }, + { + "epoch": 0.1489191353082466, + "grad_norm": 0.13990607857704163, + "learning_rate": 0.0009522310571232687, + "loss": 2.8783, + "step": 5022 + }, + { + "epoch": 0.14894878866055808, + "grad_norm": 0.15064235031604767, + "learning_rate": 0.0009522109853471781, + "loss": 2.8512, + "step": 5023 + }, + { + "epoch": 0.14897844201286956, + "grad_norm": 0.17560331523418427, + "learning_rate": 0.0009521909095666818, + "loss": 2.8695, + "step": 5024 + }, + { + "epoch": 0.14900809536518103, + "grad_norm": 0.13805390894412994, + "learning_rate": 0.0009521708297819572, + "loss": 2.8471, + "step": 5025 + }, + { + "epoch": 0.1490377487174925, + "grad_norm": 0.13245974481105804, + "learning_rate": 0.0009521507459931824, + "loss": 2.8736, + "step": 5026 + }, + { + "epoch": 0.14906740206980398, + "grad_norm": 0.14112639427185059, + "learning_rate": 0.0009521306582005351, + "loss": 2.8833, + "step": 5027 + }, + { + "epoch": 0.14909705542211546, + "grad_norm": 0.12926658987998962, + "learning_rate": 0.0009521105664041933, + "loss": 2.8607, + "step": 5028 + }, + { + "epoch": 0.14912670877442694, + "grad_norm": 0.11900953203439713, + "learning_rate": 0.0009520904706043347, + "loss": 2.8903, + "step": 5029 + }, + { + "epoch": 0.14915636212673844, + "grad_norm": 0.12894175946712494, + "learning_rate": 0.0009520703708011376, + "loss": 2.854, + "step": 5030 + }, + { + "epoch": 0.14918601547904992, + "grad_norm": 0.1395377814769745, + "learning_rate": 0.0009520502669947794, + "loss": 2.8581, + "step": 5031 + }, + { + "epoch": 0.1492156688313614, + "grad_norm": 0.167693093419075, + "learning_rate": 0.0009520301591854388, + "loss": 2.899, + "step": 5032 + }, + { + "epoch": 0.14924532218367287, + "grad_norm": 0.1805209070444107, + "learning_rate": 0.0009520100473732934, + "loss": 2.8726, + "step": 5033 + }, + { + "epoch": 0.14927497553598434, + "grad_norm": 0.1943150758743286, + "learning_rate": 0.0009519899315585215, + "loss": 2.909, + "step": 5034 + }, + { + "epoch": 0.14930462888829582, + "grad_norm": 0.17034374177455902, + "learning_rate": 0.0009519698117413011, + "loss": 2.8828, + "step": 5035 + }, + { + "epoch": 0.1493342822406073, + "grad_norm": 0.18619006872177124, + "learning_rate": 0.0009519496879218106, + "loss": 2.883, + "step": 5036 + }, + { + "epoch": 0.14936393559291877, + "grad_norm": 0.20116516947746277, + "learning_rate": 0.000951929560100228, + "loss": 2.9013, + "step": 5037 + }, + { + "epoch": 0.14939358894523025, + "grad_norm": 0.18144598603248596, + "learning_rate": 0.0009519094282767316, + "loss": 2.864, + "step": 5038 + }, + { + "epoch": 0.14942324229754173, + "grad_norm": 0.16630186140537262, + "learning_rate": 0.0009518892924514995, + "loss": 2.8593, + "step": 5039 + }, + { + "epoch": 0.14945289564985323, + "grad_norm": 0.1672411412000656, + "learning_rate": 0.0009518691526247102, + "loss": 2.8932, + "step": 5040 + }, + { + "epoch": 0.1494825490021647, + "grad_norm": 0.15464185178279877, + "learning_rate": 0.0009518490087965419, + "loss": 2.8729, + "step": 5041 + }, + { + "epoch": 0.14951220235447618, + "grad_norm": 0.14556333422660828, + "learning_rate": 0.0009518288609671733, + "loss": 2.8634, + "step": 5042 + }, + { + "epoch": 0.14954185570678766, + "grad_norm": 0.1480063647031784, + "learning_rate": 0.0009518087091367824, + "loss": 2.8454, + "step": 5043 + }, + { + "epoch": 0.14957150905909913, + "grad_norm": 0.13884879648685455, + "learning_rate": 0.0009517885533055479, + "loss": 2.8587, + "step": 5044 + }, + { + "epoch": 0.1496011624114106, + "grad_norm": 0.15066051483154297, + "learning_rate": 0.0009517683934736481, + "loss": 2.875, + "step": 5045 + }, + { + "epoch": 0.14963081576372209, + "grad_norm": 0.16510796546936035, + "learning_rate": 0.0009517482296412618, + "loss": 2.8908, + "step": 5046 + }, + { + "epoch": 0.14966046911603356, + "grad_norm": 0.14839866757392883, + "learning_rate": 0.0009517280618085673, + "loss": 2.8424, + "step": 5047 + }, + { + "epoch": 0.14969012246834504, + "grad_norm": 0.15599048137664795, + "learning_rate": 0.0009517078899757432, + "loss": 2.851, + "step": 5048 + }, + { + "epoch": 0.14971977582065651, + "grad_norm": 0.14641143381595612, + "learning_rate": 0.0009516877141429683, + "loss": 2.894, + "step": 5049 + }, + { + "epoch": 0.149749429172968, + "grad_norm": 0.1454574465751648, + "learning_rate": 0.0009516675343104212, + "loss": 2.8793, + "step": 5050 + }, + { + "epoch": 0.1497790825252795, + "grad_norm": 0.15131054818630219, + "learning_rate": 0.0009516473504782805, + "loss": 2.8754, + "step": 5051 + }, + { + "epoch": 0.14980873587759097, + "grad_norm": 0.14770784974098206, + "learning_rate": 0.000951627162646725, + "loss": 2.8731, + "step": 5052 + }, + { + "epoch": 0.14983838922990245, + "grad_norm": 0.13474571704864502, + "learning_rate": 0.0009516069708159334, + "loss": 2.8572, + "step": 5053 + }, + { + "epoch": 0.14986804258221392, + "grad_norm": 0.11961358785629272, + "learning_rate": 0.0009515867749860846, + "loss": 2.8643, + "step": 5054 + }, + { + "epoch": 0.1498976959345254, + "grad_norm": 0.12426750361919403, + "learning_rate": 0.0009515665751573574, + "loss": 2.8919, + "step": 5055 + }, + { + "epoch": 0.14992734928683688, + "grad_norm": 0.13038937747478485, + "learning_rate": 0.0009515463713299306, + "loss": 2.8523, + "step": 5056 + }, + { + "epoch": 0.14995700263914835, + "grad_norm": 0.14638970792293549, + "learning_rate": 0.0009515261635039832, + "loss": 2.8839, + "step": 5057 + }, + { + "epoch": 0.14998665599145983, + "grad_norm": 0.1633509397506714, + "learning_rate": 0.0009515059516796942, + "loss": 2.8634, + "step": 5058 + }, + { + "epoch": 0.1500163093437713, + "grad_norm": 0.19045908749103546, + "learning_rate": 0.0009514857358572424, + "loss": 2.8598, + "step": 5059 + }, + { + "epoch": 0.15004596269608278, + "grad_norm": 0.18807213008403778, + "learning_rate": 0.0009514655160368071, + "loss": 2.8747, + "step": 5060 + }, + { + "epoch": 0.15007561604839428, + "grad_norm": 0.18460121750831604, + "learning_rate": 0.000951445292218567, + "loss": 2.8275, + "step": 5061 + }, + { + "epoch": 0.15010526940070576, + "grad_norm": 0.15862466394901276, + "learning_rate": 0.0009514250644027014, + "loss": 2.8473, + "step": 5062 + }, + { + "epoch": 0.15013492275301724, + "grad_norm": 0.15881264209747314, + "learning_rate": 0.0009514048325893892, + "loss": 2.9032, + "step": 5063 + }, + { + "epoch": 0.1501645761053287, + "grad_norm": 0.18488360941410065, + "learning_rate": 0.00095138459677881, + "loss": 2.846, + "step": 5064 + }, + { + "epoch": 0.1501942294576402, + "grad_norm": 0.16616103053092957, + "learning_rate": 0.0009513643569711425, + "loss": 2.8655, + "step": 5065 + }, + { + "epoch": 0.15022388280995166, + "grad_norm": 0.16098926961421967, + "learning_rate": 0.0009513441131665662, + "loss": 2.8672, + "step": 5066 + }, + { + "epoch": 0.15025353616226314, + "grad_norm": 0.1372121423482895, + "learning_rate": 0.0009513238653652603, + "loss": 2.88, + "step": 5067 + }, + { + "epoch": 0.15028318951457462, + "grad_norm": 0.13309375941753387, + "learning_rate": 0.0009513036135674043, + "loss": 2.8711, + "step": 5068 + }, + { + "epoch": 0.1503128428668861, + "grad_norm": 0.14716701209545135, + "learning_rate": 0.000951283357773177, + "loss": 2.8836, + "step": 5069 + }, + { + "epoch": 0.15034249621919757, + "grad_norm": 0.14826588332653046, + "learning_rate": 0.0009512630979827583, + "loss": 2.8724, + "step": 5070 + }, + { + "epoch": 0.15037214957150907, + "grad_norm": 0.15623977780342102, + "learning_rate": 0.0009512428341963274, + "loss": 2.8672, + "step": 5071 + }, + { + "epoch": 0.15040180292382055, + "grad_norm": 0.15702469646930695, + "learning_rate": 0.0009512225664140637, + "loss": 2.8869, + "step": 5072 + }, + { + "epoch": 0.15043145627613203, + "grad_norm": 0.16431160271167755, + "learning_rate": 0.0009512022946361467, + "loss": 2.8326, + "step": 5073 + }, + { + "epoch": 0.1504611096284435, + "grad_norm": 0.15120504796504974, + "learning_rate": 0.000951182018862756, + "loss": 2.916, + "step": 5074 + }, + { + "epoch": 0.15049076298075498, + "grad_norm": 0.1356113702058792, + "learning_rate": 0.0009511617390940711, + "loss": 2.8879, + "step": 5075 + }, + { + "epoch": 0.15052041633306645, + "grad_norm": 0.13630764186382294, + "learning_rate": 0.0009511414553302715, + "loss": 2.9085, + "step": 5076 + }, + { + "epoch": 0.15055006968537793, + "grad_norm": 0.15071967244148254, + "learning_rate": 0.000951121167571537, + "loss": 2.8863, + "step": 5077 + }, + { + "epoch": 0.1505797230376894, + "grad_norm": 0.1376141905784607, + "learning_rate": 0.0009511008758180471, + "loss": 2.8886, + "step": 5078 + }, + { + "epoch": 0.15060937639000088, + "grad_norm": 0.1467772126197815, + "learning_rate": 0.0009510805800699813, + "loss": 2.8516, + "step": 5079 + }, + { + "epoch": 0.15063902974231236, + "grad_norm": 0.1504310816526413, + "learning_rate": 0.0009510602803275197, + "loss": 2.9033, + "step": 5080 + }, + { + "epoch": 0.15066868309462383, + "grad_norm": 0.14209383726119995, + "learning_rate": 0.0009510399765908418, + "loss": 2.8984, + "step": 5081 + }, + { + "epoch": 0.15069833644693534, + "grad_norm": 0.14338187873363495, + "learning_rate": 0.0009510196688601275, + "loss": 2.8811, + "step": 5082 + }, + { + "epoch": 0.15072798979924681, + "grad_norm": 0.15965688228607178, + "learning_rate": 0.0009509993571355568, + "loss": 2.8553, + "step": 5083 + }, + { + "epoch": 0.1507576431515583, + "grad_norm": 0.13463358581066132, + "learning_rate": 0.0009509790414173092, + "loss": 2.828, + "step": 5084 + }, + { + "epoch": 0.15078729650386977, + "grad_norm": 0.1581297218799591, + "learning_rate": 0.0009509587217055649, + "loss": 2.8586, + "step": 5085 + }, + { + "epoch": 0.15081694985618124, + "grad_norm": 0.19464100897312164, + "learning_rate": 0.0009509383980005037, + "loss": 2.8994, + "step": 5086 + }, + { + "epoch": 0.15084660320849272, + "grad_norm": 0.19473814964294434, + "learning_rate": 0.0009509180703023055, + "loss": 2.8525, + "step": 5087 + }, + { + "epoch": 0.1508762565608042, + "grad_norm": 0.17317384481430054, + "learning_rate": 0.0009508977386111504, + "loss": 2.8692, + "step": 5088 + }, + { + "epoch": 0.15090590991311567, + "grad_norm": 0.15020890533924103, + "learning_rate": 0.0009508774029272184, + "loss": 2.8818, + "step": 5089 + }, + { + "epoch": 0.15093556326542715, + "grad_norm": 0.15434378385543823, + "learning_rate": 0.0009508570632506897, + "loss": 2.9087, + "step": 5090 + }, + { + "epoch": 0.15096521661773862, + "grad_norm": 0.14659030735492706, + "learning_rate": 0.0009508367195817443, + "loss": 2.8653, + "step": 5091 + }, + { + "epoch": 0.15099486997005013, + "grad_norm": 0.14416345953941345, + "learning_rate": 0.0009508163719205622, + "loss": 2.8423, + "step": 5092 + }, + { + "epoch": 0.1510245233223616, + "grad_norm": 0.12964309751987457, + "learning_rate": 0.0009507960202673239, + "loss": 2.8373, + "step": 5093 + }, + { + "epoch": 0.15105417667467308, + "grad_norm": 0.14917249977588654, + "learning_rate": 0.0009507756646222094, + "loss": 2.8633, + "step": 5094 + }, + { + "epoch": 0.15108383002698456, + "grad_norm": 0.15624701976776123, + "learning_rate": 0.0009507553049853992, + "loss": 2.8878, + "step": 5095 + }, + { + "epoch": 0.15111348337929603, + "grad_norm": 0.15288177132606506, + "learning_rate": 0.0009507349413570732, + "loss": 2.8926, + "step": 5096 + }, + { + "epoch": 0.1511431367316075, + "grad_norm": 0.14228929579257965, + "learning_rate": 0.000950714573737412, + "loss": 2.8321, + "step": 5097 + }, + { + "epoch": 0.15117279008391898, + "grad_norm": 0.13691674172878265, + "learning_rate": 0.0009506942021265958, + "loss": 2.8602, + "step": 5098 + }, + { + "epoch": 0.15120244343623046, + "grad_norm": 0.14851325750350952, + "learning_rate": 0.0009506738265248052, + "loss": 2.8567, + "step": 5099 + }, + { + "epoch": 0.15123209678854194, + "grad_norm": 0.1456289440393448, + "learning_rate": 0.0009506534469322205, + "loss": 2.8787, + "step": 5100 + }, + { + "epoch": 0.1512617501408534, + "grad_norm": 0.1483628898859024, + "learning_rate": 0.0009506330633490221, + "loss": 2.8768, + "step": 5101 + }, + { + "epoch": 0.1512914034931649, + "grad_norm": 0.1701841652393341, + "learning_rate": 0.0009506126757753906, + "loss": 2.8985, + "step": 5102 + }, + { + "epoch": 0.1513210568454764, + "grad_norm": 0.2020481377840042, + "learning_rate": 0.0009505922842115064, + "loss": 2.8693, + "step": 5103 + }, + { + "epoch": 0.15135071019778787, + "grad_norm": 0.22186270356178284, + "learning_rate": 0.0009505718886575504, + "loss": 2.8615, + "step": 5104 + }, + { + "epoch": 0.15138036355009934, + "grad_norm": 0.1937873363494873, + "learning_rate": 0.0009505514891137028, + "loss": 2.8771, + "step": 5105 + }, + { + "epoch": 0.15141001690241082, + "grad_norm": 0.16366568207740784, + "learning_rate": 0.0009505310855801445, + "loss": 2.8891, + "step": 5106 + }, + { + "epoch": 0.1514396702547223, + "grad_norm": 0.17618440091609955, + "learning_rate": 0.0009505106780570563, + "loss": 2.8847, + "step": 5107 + }, + { + "epoch": 0.15146932360703377, + "grad_norm": 0.1631775200366974, + "learning_rate": 0.0009504902665446184, + "loss": 2.8645, + "step": 5108 + }, + { + "epoch": 0.15149897695934525, + "grad_norm": 0.1336451917886734, + "learning_rate": 0.0009504698510430121, + "loss": 2.8442, + "step": 5109 + }, + { + "epoch": 0.15152863031165673, + "grad_norm": 0.14157618582248688, + "learning_rate": 0.0009504494315524177, + "loss": 2.8914, + "step": 5110 + }, + { + "epoch": 0.1515582836639682, + "grad_norm": 0.15335127711296082, + "learning_rate": 0.0009504290080730165, + "loss": 2.8583, + "step": 5111 + }, + { + "epoch": 0.15158793701627968, + "grad_norm": 0.16547133028507233, + "learning_rate": 0.0009504085806049889, + "loss": 2.9053, + "step": 5112 + }, + { + "epoch": 0.15161759036859118, + "grad_norm": 0.17486512660980225, + "learning_rate": 0.0009503881491485163, + "loss": 2.8524, + "step": 5113 + }, + { + "epoch": 0.15164724372090266, + "grad_norm": 0.17258669435977936, + "learning_rate": 0.0009503677137037792, + "loss": 2.8972, + "step": 5114 + }, + { + "epoch": 0.15167689707321413, + "grad_norm": 0.18140164017677307, + "learning_rate": 0.0009503472742709585, + "loss": 2.8958, + "step": 5115 + }, + { + "epoch": 0.1517065504255256, + "grad_norm": 0.16787400841712952, + "learning_rate": 0.0009503268308502355, + "loss": 2.9117, + "step": 5116 + }, + { + "epoch": 0.1517362037778371, + "grad_norm": 0.15491990745067596, + "learning_rate": 0.0009503063834417913, + "loss": 2.8751, + "step": 5117 + }, + { + "epoch": 0.15176585713014856, + "grad_norm": 0.17168734967708588, + "learning_rate": 0.0009502859320458066, + "loss": 2.8777, + "step": 5118 + }, + { + "epoch": 0.15179551048246004, + "grad_norm": 0.14862732589244843, + "learning_rate": 0.0009502654766624627, + "loss": 2.8827, + "step": 5119 + }, + { + "epoch": 0.15182516383477151, + "grad_norm": 0.14619173109531403, + "learning_rate": 0.0009502450172919408, + "loss": 2.8869, + "step": 5120 + }, + { + "epoch": 0.151854817187083, + "grad_norm": 0.15484678745269775, + "learning_rate": 0.0009502245539344218, + "loss": 2.8706, + "step": 5121 + }, + { + "epoch": 0.15188447053939447, + "grad_norm": 0.15070022642612457, + "learning_rate": 0.0009502040865900873, + "loss": 2.9158, + "step": 5122 + }, + { + "epoch": 0.15191412389170597, + "grad_norm": 0.146789088845253, + "learning_rate": 0.0009501836152591182, + "loss": 2.8493, + "step": 5123 + }, + { + "epoch": 0.15194377724401745, + "grad_norm": 0.15832296013832092, + "learning_rate": 0.000950163139941696, + "loss": 2.8473, + "step": 5124 + }, + { + "epoch": 0.15197343059632892, + "grad_norm": 0.14151445031166077, + "learning_rate": 0.0009501426606380019, + "loss": 2.8945, + "step": 5125 + }, + { + "epoch": 0.1520030839486404, + "grad_norm": 0.1372232288122177, + "learning_rate": 0.0009501221773482171, + "loss": 2.8874, + "step": 5126 + }, + { + "epoch": 0.15203273730095188, + "grad_norm": 0.12491960823535919, + "learning_rate": 0.0009501016900725235, + "loss": 2.8824, + "step": 5127 + }, + { + "epoch": 0.15206239065326335, + "grad_norm": 0.13136659562587738, + "learning_rate": 0.000950081198811102, + "loss": 2.8882, + "step": 5128 + }, + { + "epoch": 0.15209204400557483, + "grad_norm": 0.155517116189003, + "learning_rate": 0.0009500607035641342, + "loss": 2.8887, + "step": 5129 + }, + { + "epoch": 0.1521216973578863, + "grad_norm": 0.16756263375282288, + "learning_rate": 0.0009500402043318017, + "loss": 2.915, + "step": 5130 + }, + { + "epoch": 0.15215135071019778, + "grad_norm": 0.1668189913034439, + "learning_rate": 0.0009500197011142858, + "loss": 2.8814, + "step": 5131 + }, + { + "epoch": 0.15218100406250926, + "grad_norm": 0.1495603322982788, + "learning_rate": 0.0009499991939117682, + "loss": 2.9015, + "step": 5132 + }, + { + "epoch": 0.15221065741482073, + "grad_norm": 0.1469481885433197, + "learning_rate": 0.0009499786827244308, + "loss": 2.8679, + "step": 5133 + }, + { + "epoch": 0.15224031076713224, + "grad_norm": 0.16184234619140625, + "learning_rate": 0.0009499581675524546, + "loss": 2.8572, + "step": 5134 + }, + { + "epoch": 0.1522699641194437, + "grad_norm": 0.1718752533197403, + "learning_rate": 0.0009499376483960218, + "loss": 2.8752, + "step": 5135 + }, + { + "epoch": 0.1522996174717552, + "grad_norm": 0.19855736196041107, + "learning_rate": 0.0009499171252553137, + "loss": 2.8641, + "step": 5136 + }, + { + "epoch": 0.15232927082406666, + "grad_norm": 0.18018344044685364, + "learning_rate": 0.0009498965981305123, + "loss": 2.8268, + "step": 5137 + }, + { + "epoch": 0.15235892417637814, + "grad_norm": 0.162482351064682, + "learning_rate": 0.0009498760670217995, + "loss": 2.8456, + "step": 5138 + }, + { + "epoch": 0.15238857752868962, + "grad_norm": 0.15579694509506226, + "learning_rate": 0.0009498555319293566, + "loss": 2.8311, + "step": 5139 + }, + { + "epoch": 0.1524182308810011, + "grad_norm": 0.15404261648654938, + "learning_rate": 0.0009498349928533658, + "loss": 2.86, + "step": 5140 + }, + { + "epoch": 0.15244788423331257, + "grad_norm": 0.15633384883403778, + "learning_rate": 0.0009498144497940091, + "loss": 2.8512, + "step": 5141 + }, + { + "epoch": 0.15247753758562405, + "grad_norm": 0.1425904631614685, + "learning_rate": 0.0009497939027514682, + "loss": 2.8553, + "step": 5142 + }, + { + "epoch": 0.15250719093793552, + "grad_norm": 0.11308647692203522, + "learning_rate": 0.000949773351725925, + "loss": 2.846, + "step": 5143 + }, + { + "epoch": 0.15253684429024703, + "grad_norm": 0.12880317866802216, + "learning_rate": 0.0009497527967175615, + "loss": 2.8558, + "step": 5144 + }, + { + "epoch": 0.1525664976425585, + "grad_norm": 0.14733648300170898, + "learning_rate": 0.0009497322377265599, + "loss": 2.8767, + "step": 5145 + }, + { + "epoch": 0.15259615099486998, + "grad_norm": 0.14292624592781067, + "learning_rate": 0.0009497116747531021, + "loss": 2.8741, + "step": 5146 + }, + { + "epoch": 0.15262580434718145, + "grad_norm": 0.14911070466041565, + "learning_rate": 0.0009496911077973703, + "loss": 2.8418, + "step": 5147 + }, + { + "epoch": 0.15265545769949293, + "grad_norm": 0.15267948806285858, + "learning_rate": 0.0009496705368595465, + "loss": 2.8319, + "step": 5148 + }, + { + "epoch": 0.1526851110518044, + "grad_norm": 0.14845576882362366, + "learning_rate": 0.0009496499619398129, + "loss": 2.8565, + "step": 5149 + }, + { + "epoch": 0.15271476440411588, + "grad_norm": 0.1628844290971756, + "learning_rate": 0.0009496293830383518, + "loss": 2.8718, + "step": 5150 + }, + { + "epoch": 0.15274441775642736, + "grad_norm": 0.17814531922340393, + "learning_rate": 0.0009496088001553451, + "loss": 2.8436, + "step": 5151 + }, + { + "epoch": 0.15277407110873883, + "grad_norm": 0.17588217556476593, + "learning_rate": 0.0009495882132909756, + "loss": 2.8943, + "step": 5152 + }, + { + "epoch": 0.1528037244610503, + "grad_norm": 0.17127054929733276, + "learning_rate": 0.0009495676224454251, + "loss": 2.8729, + "step": 5153 + }, + { + "epoch": 0.1528333778133618, + "grad_norm": 0.15698614716529846, + "learning_rate": 0.0009495470276188763, + "loss": 2.8634, + "step": 5154 + }, + { + "epoch": 0.1528630311656733, + "grad_norm": 0.19740279018878937, + "learning_rate": 0.0009495264288115112, + "loss": 2.8606, + "step": 5155 + }, + { + "epoch": 0.15289268451798477, + "grad_norm": 0.19416211545467377, + "learning_rate": 0.0009495058260235125, + "loss": 2.8771, + "step": 5156 + }, + { + "epoch": 0.15292233787029624, + "grad_norm": 0.1601772904396057, + "learning_rate": 0.0009494852192550626, + "loss": 2.8276, + "step": 5157 + }, + { + "epoch": 0.15295199122260772, + "grad_norm": 0.15792293846607208, + "learning_rate": 0.0009494646085063439, + "loss": 2.8809, + "step": 5158 + }, + { + "epoch": 0.1529816445749192, + "grad_norm": 0.14470331370830536, + "learning_rate": 0.000949443993777539, + "loss": 2.836, + "step": 5159 + }, + { + "epoch": 0.15301129792723067, + "grad_norm": 0.15142758190631866, + "learning_rate": 0.0009494233750688303, + "loss": 2.8592, + "step": 5160 + }, + { + "epoch": 0.15304095127954215, + "grad_norm": 0.15051284432411194, + "learning_rate": 0.0009494027523804004, + "loss": 2.8631, + "step": 5161 + }, + { + "epoch": 0.15307060463185362, + "grad_norm": 0.1520671844482422, + "learning_rate": 0.0009493821257124321, + "loss": 2.8479, + "step": 5162 + }, + { + "epoch": 0.1531002579841651, + "grad_norm": 0.16182218492031097, + "learning_rate": 0.000949361495065108, + "loss": 2.8642, + "step": 5163 + }, + { + "epoch": 0.15312991133647658, + "grad_norm": 0.1489800661802292, + "learning_rate": 0.0009493408604386106, + "loss": 2.9105, + "step": 5164 + }, + { + "epoch": 0.15315956468878808, + "grad_norm": 0.15370602905750275, + "learning_rate": 0.0009493202218331228, + "loss": 2.8728, + "step": 5165 + }, + { + "epoch": 0.15318921804109956, + "grad_norm": 0.15891508758068085, + "learning_rate": 0.0009492995792488273, + "loss": 2.8546, + "step": 5166 + }, + { + "epoch": 0.15321887139341103, + "grad_norm": 0.14358364045619965, + "learning_rate": 0.0009492789326859069, + "loss": 2.8252, + "step": 5167 + }, + { + "epoch": 0.1532485247457225, + "grad_norm": 0.11243793368339539, + "learning_rate": 0.0009492582821445445, + "loss": 2.8534, + "step": 5168 + }, + { + "epoch": 0.15327817809803398, + "grad_norm": 0.12575851380825043, + "learning_rate": 0.0009492376276249229, + "loss": 2.8486, + "step": 5169 + }, + { + "epoch": 0.15330783145034546, + "grad_norm": 0.13363251090049744, + "learning_rate": 0.0009492169691272249, + "loss": 2.8908, + "step": 5170 + }, + { + "epoch": 0.15333748480265694, + "grad_norm": 0.1336725801229477, + "learning_rate": 0.0009491963066516336, + "loss": 2.8392, + "step": 5171 + }, + { + "epoch": 0.1533671381549684, + "grad_norm": 0.12981046736240387, + "learning_rate": 0.0009491756401983318, + "loss": 2.8525, + "step": 5172 + }, + { + "epoch": 0.1533967915072799, + "grad_norm": 0.13609883189201355, + "learning_rate": 0.0009491549697675029, + "loss": 2.8612, + "step": 5173 + }, + { + "epoch": 0.15342644485959137, + "grad_norm": 0.13784699141979218, + "learning_rate": 0.0009491342953593292, + "loss": 2.9041, + "step": 5174 + }, + { + "epoch": 0.15345609821190287, + "grad_norm": 0.16678465902805328, + "learning_rate": 0.0009491136169739946, + "loss": 2.8869, + "step": 5175 + }, + { + "epoch": 0.15348575156421435, + "grad_norm": 0.187597393989563, + "learning_rate": 0.0009490929346116817, + "loss": 2.8732, + "step": 5176 + }, + { + "epoch": 0.15351540491652582, + "grad_norm": 0.1870621293783188, + "learning_rate": 0.0009490722482725737, + "loss": 2.9002, + "step": 5177 + }, + { + "epoch": 0.1535450582688373, + "grad_norm": 0.1613057553768158, + "learning_rate": 0.0009490515579568539, + "loss": 2.8702, + "step": 5178 + }, + { + "epoch": 0.15357471162114877, + "grad_norm": 0.15121780335903168, + "learning_rate": 0.0009490308636647056, + "loss": 2.8843, + "step": 5179 + }, + { + "epoch": 0.15360436497346025, + "grad_norm": 0.14569346606731415, + "learning_rate": 0.0009490101653963117, + "loss": 2.8936, + "step": 5180 + }, + { + "epoch": 0.15363401832577173, + "grad_norm": 0.11981548368930817, + "learning_rate": 0.0009489894631518559, + "loss": 2.8739, + "step": 5181 + }, + { + "epoch": 0.1536636716780832, + "grad_norm": 0.13064594566822052, + "learning_rate": 0.0009489687569315213, + "loss": 2.8946, + "step": 5182 + }, + { + "epoch": 0.15369332503039468, + "grad_norm": 0.12946149706840515, + "learning_rate": 0.0009489480467354912, + "loss": 2.8581, + "step": 5183 + }, + { + "epoch": 0.15372297838270615, + "grad_norm": 0.11924433708190918, + "learning_rate": 0.0009489273325639492, + "loss": 2.8556, + "step": 5184 + }, + { + "epoch": 0.15375263173501763, + "grad_norm": 0.11844466626644135, + "learning_rate": 0.0009489066144170786, + "loss": 2.8642, + "step": 5185 + }, + { + "epoch": 0.15378228508732913, + "grad_norm": 0.11557257920503616, + "learning_rate": 0.0009488858922950628, + "loss": 2.8563, + "step": 5186 + }, + { + "epoch": 0.1538119384396406, + "grad_norm": 0.11947473883628845, + "learning_rate": 0.0009488651661980854, + "loss": 2.8423, + "step": 5187 + }, + { + "epoch": 0.1538415917919521, + "grad_norm": 0.10912121087312698, + "learning_rate": 0.0009488444361263299, + "loss": 2.8778, + "step": 5188 + }, + { + "epoch": 0.15387124514426356, + "grad_norm": 0.12416655570268631, + "learning_rate": 0.0009488237020799799, + "loss": 2.8347, + "step": 5189 + }, + { + "epoch": 0.15390089849657504, + "grad_norm": 0.1306164413690567, + "learning_rate": 0.0009488029640592191, + "loss": 2.8929, + "step": 5190 + }, + { + "epoch": 0.15393055184888652, + "grad_norm": 0.12272254377603531, + "learning_rate": 0.0009487822220642308, + "loss": 2.8702, + "step": 5191 + }, + { + "epoch": 0.153960205201198, + "grad_norm": 0.14183102548122406, + "learning_rate": 0.0009487614760951991, + "loss": 2.9013, + "step": 5192 + }, + { + "epoch": 0.15398985855350947, + "grad_norm": 0.18157970905303955, + "learning_rate": 0.0009487407261523073, + "loss": 2.8551, + "step": 5193 + }, + { + "epoch": 0.15401951190582094, + "grad_norm": 0.20598390698432922, + "learning_rate": 0.0009487199722357395, + "loss": 2.8941, + "step": 5194 + }, + { + "epoch": 0.15404916525813242, + "grad_norm": 0.19499924778938293, + "learning_rate": 0.0009486992143456792, + "loss": 2.855, + "step": 5195 + }, + { + "epoch": 0.15407881861044392, + "grad_norm": 0.17376603186130524, + "learning_rate": 0.0009486784524823104, + "loss": 2.8585, + "step": 5196 + }, + { + "epoch": 0.1541084719627554, + "grad_norm": 0.18308182060718536, + "learning_rate": 0.000948657686645817, + "loss": 2.8925, + "step": 5197 + }, + { + "epoch": 0.15413812531506688, + "grad_norm": 0.22342829406261444, + "learning_rate": 0.0009486369168363825, + "loss": 2.84, + "step": 5198 + }, + { + "epoch": 0.15416777866737835, + "grad_norm": 0.17887499928474426, + "learning_rate": 0.0009486161430541913, + "loss": 2.889, + "step": 5199 + }, + { + "epoch": 0.15419743201968983, + "grad_norm": 0.17370976507663727, + "learning_rate": 0.0009485953652994271, + "loss": 2.851, + "step": 5200 + }, + { + "epoch": 0.1542270853720013, + "grad_norm": 0.18335482478141785, + "learning_rate": 0.0009485745835722739, + "loss": 2.8874, + "step": 5201 + }, + { + "epoch": 0.15425673872431278, + "grad_norm": 0.16443702578544617, + "learning_rate": 0.0009485537978729158, + "loss": 2.8855, + "step": 5202 + }, + { + "epoch": 0.15428639207662426, + "grad_norm": 0.14887720346450806, + "learning_rate": 0.0009485330082015369, + "loss": 2.8525, + "step": 5203 + }, + { + "epoch": 0.15431604542893573, + "grad_norm": 0.1728004515171051, + "learning_rate": 0.0009485122145583212, + "loss": 2.8402, + "step": 5204 + }, + { + "epoch": 0.1543456987812472, + "grad_norm": 0.15920202434062958, + "learning_rate": 0.0009484914169434527, + "loss": 2.8681, + "step": 5205 + }, + { + "epoch": 0.15437535213355869, + "grad_norm": 0.15423141419887543, + "learning_rate": 0.0009484706153571158, + "loss": 2.8589, + "step": 5206 + }, + { + "epoch": 0.1544050054858702, + "grad_norm": 0.16010896861553192, + "learning_rate": 0.0009484498097994947, + "loss": 2.904, + "step": 5207 + }, + { + "epoch": 0.15443465883818167, + "grad_norm": 0.15590853989124298, + "learning_rate": 0.0009484290002707736, + "loss": 2.8663, + "step": 5208 + }, + { + "epoch": 0.15446431219049314, + "grad_norm": 0.1530561000108719, + "learning_rate": 0.0009484081867711366, + "loss": 2.8697, + "step": 5209 + }, + { + "epoch": 0.15449396554280462, + "grad_norm": 0.1363542675971985, + "learning_rate": 0.0009483873693007682, + "loss": 2.8571, + "step": 5210 + }, + { + "epoch": 0.1545236188951161, + "grad_norm": 0.13660770654678345, + "learning_rate": 0.0009483665478598526, + "loss": 2.8824, + "step": 5211 + }, + { + "epoch": 0.15455327224742757, + "grad_norm": 0.1338651478290558, + "learning_rate": 0.0009483457224485743, + "loss": 2.8987, + "step": 5212 + }, + { + "epoch": 0.15458292559973905, + "grad_norm": 0.1517815738916397, + "learning_rate": 0.0009483248930671178, + "loss": 2.8512, + "step": 5213 + }, + { + "epoch": 0.15461257895205052, + "grad_norm": 0.15806084871292114, + "learning_rate": 0.0009483040597156673, + "loss": 2.8836, + "step": 5214 + }, + { + "epoch": 0.154642232304362, + "grad_norm": 0.15206868946552277, + "learning_rate": 0.0009482832223944073, + "loss": 2.8288, + "step": 5215 + }, + { + "epoch": 0.15467188565667347, + "grad_norm": 0.12666359543800354, + "learning_rate": 0.0009482623811035226, + "loss": 2.8694, + "step": 5216 + }, + { + "epoch": 0.15470153900898498, + "grad_norm": 0.13042128086090088, + "learning_rate": 0.0009482415358431975, + "loss": 2.8576, + "step": 5217 + }, + { + "epoch": 0.15473119236129645, + "grad_norm": 0.15857912600040436, + "learning_rate": 0.0009482206866136167, + "loss": 2.8932, + "step": 5218 + }, + { + "epoch": 0.15476084571360793, + "grad_norm": 0.15863680839538574, + "learning_rate": 0.0009481998334149646, + "loss": 2.8702, + "step": 5219 + }, + { + "epoch": 0.1547904990659194, + "grad_norm": 0.1387430727481842, + "learning_rate": 0.0009481789762474263, + "loss": 2.8392, + "step": 5220 + }, + { + "epoch": 0.15482015241823088, + "grad_norm": 0.1519538164138794, + "learning_rate": 0.0009481581151111862, + "loss": 2.8916, + "step": 5221 + }, + { + "epoch": 0.15484980577054236, + "grad_norm": 0.14238281548023224, + "learning_rate": 0.0009481372500064289, + "loss": 2.8428, + "step": 5222 + }, + { + "epoch": 0.15487945912285384, + "grad_norm": 0.14660078287124634, + "learning_rate": 0.0009481163809333394, + "loss": 2.8594, + "step": 5223 + }, + { + "epoch": 0.1549091124751653, + "grad_norm": 0.13680635392665863, + "learning_rate": 0.0009480955078921024, + "loss": 2.8704, + "step": 5224 + }, + { + "epoch": 0.1549387658274768, + "grad_norm": 0.14743226766586304, + "learning_rate": 0.0009480746308829028, + "loss": 2.893, + "step": 5225 + }, + { + "epoch": 0.15496841917978826, + "grad_norm": 0.1695554554462433, + "learning_rate": 0.0009480537499059254, + "loss": 2.8503, + "step": 5226 + }, + { + "epoch": 0.15499807253209977, + "grad_norm": 0.1706445813179016, + "learning_rate": 0.0009480328649613552, + "loss": 2.8785, + "step": 5227 + }, + { + "epoch": 0.15502772588441124, + "grad_norm": 0.17338059842586517, + "learning_rate": 0.0009480119760493771, + "loss": 2.8721, + "step": 5228 + }, + { + "epoch": 0.15505737923672272, + "grad_norm": 0.17055831849575043, + "learning_rate": 0.0009479910831701761, + "loss": 2.874, + "step": 5229 + }, + { + "epoch": 0.1550870325890342, + "grad_norm": 0.13629008829593658, + "learning_rate": 0.000947970186323937, + "loss": 2.8464, + "step": 5230 + }, + { + "epoch": 0.15511668594134567, + "grad_norm": 0.13537922501564026, + "learning_rate": 0.000947949285510845, + "loss": 2.8691, + "step": 5231 + }, + { + "epoch": 0.15514633929365715, + "grad_norm": 0.1527959108352661, + "learning_rate": 0.0009479283807310852, + "loss": 2.8703, + "step": 5232 + }, + { + "epoch": 0.15517599264596862, + "grad_norm": 0.14159463346004486, + "learning_rate": 0.0009479074719848428, + "loss": 2.8953, + "step": 5233 + }, + { + "epoch": 0.1552056459982801, + "grad_norm": 0.15565125644207, + "learning_rate": 0.000947886559272303, + "loss": 2.8786, + "step": 5234 + }, + { + "epoch": 0.15523529935059158, + "grad_norm": 0.17064756155014038, + "learning_rate": 0.0009478656425936505, + "loss": 2.8702, + "step": 5235 + }, + { + "epoch": 0.15526495270290305, + "grad_norm": 0.162956103682518, + "learning_rate": 0.0009478447219490712, + "loss": 2.8787, + "step": 5236 + }, + { + "epoch": 0.15529460605521453, + "grad_norm": 0.15849648416042328, + "learning_rate": 0.0009478237973387497, + "loss": 2.872, + "step": 5237 + }, + { + "epoch": 0.15532425940752603, + "grad_norm": 0.1624682992696762, + "learning_rate": 0.0009478028687628717, + "loss": 2.8669, + "step": 5238 + }, + { + "epoch": 0.1553539127598375, + "grad_norm": 0.14721781015396118, + "learning_rate": 0.0009477819362216224, + "loss": 2.8937, + "step": 5239 + }, + { + "epoch": 0.15538356611214899, + "grad_norm": 0.1907905638217926, + "learning_rate": 0.0009477609997151873, + "loss": 2.8765, + "step": 5240 + }, + { + "epoch": 0.15541321946446046, + "grad_norm": 0.1584164798259735, + "learning_rate": 0.0009477400592437517, + "loss": 2.8525, + "step": 5241 + }, + { + "epoch": 0.15544287281677194, + "grad_norm": 0.13435901701450348, + "learning_rate": 0.000947719114807501, + "loss": 2.895, + "step": 5242 + }, + { + "epoch": 0.1554725261690834, + "grad_norm": 0.1602506935596466, + "learning_rate": 0.0009476981664066207, + "loss": 2.8709, + "step": 5243 + }, + { + "epoch": 0.1555021795213949, + "grad_norm": 0.14041991531848907, + "learning_rate": 0.0009476772140412963, + "loss": 2.8777, + "step": 5244 + }, + { + "epoch": 0.15553183287370637, + "grad_norm": 0.13254289329051971, + "learning_rate": 0.0009476562577117131, + "loss": 2.8502, + "step": 5245 + }, + { + "epoch": 0.15556148622601784, + "grad_norm": 0.126936137676239, + "learning_rate": 0.0009476352974180573, + "loss": 2.8266, + "step": 5246 + }, + { + "epoch": 0.15559113957832932, + "grad_norm": 0.12819820642471313, + "learning_rate": 0.0009476143331605138, + "loss": 2.8113, + "step": 5247 + }, + { + "epoch": 0.15562079293064082, + "grad_norm": 0.1252467930316925, + "learning_rate": 0.0009475933649392686, + "loss": 2.8719, + "step": 5248 + }, + { + "epoch": 0.1556504462829523, + "grad_norm": 0.14738546311855316, + "learning_rate": 0.0009475723927545075, + "loss": 2.8677, + "step": 5249 + }, + { + "epoch": 0.15568009963526377, + "grad_norm": 0.14615696668624878, + "learning_rate": 0.0009475514166064157, + "loss": 2.8588, + "step": 5250 + }, + { + "epoch": 0.15570975298757525, + "grad_norm": 0.15434879064559937, + "learning_rate": 0.0009475304364951795, + "loss": 2.865, + "step": 5251 + }, + { + "epoch": 0.15573940633988673, + "grad_norm": 0.13951615989208221, + "learning_rate": 0.0009475094524209845, + "loss": 2.8719, + "step": 5252 + }, + { + "epoch": 0.1557690596921982, + "grad_norm": 0.14037083089351654, + "learning_rate": 0.0009474884643840165, + "loss": 2.8507, + "step": 5253 + }, + { + "epoch": 0.15579871304450968, + "grad_norm": 0.14288942515850067, + "learning_rate": 0.0009474674723844612, + "loss": 2.8767, + "step": 5254 + }, + { + "epoch": 0.15582836639682116, + "grad_norm": 0.1667705923318863, + "learning_rate": 0.0009474464764225046, + "loss": 2.8373, + "step": 5255 + }, + { + "epoch": 0.15585801974913263, + "grad_norm": 0.16747887432575226, + "learning_rate": 0.0009474254764983329, + "loss": 2.8713, + "step": 5256 + }, + { + "epoch": 0.1558876731014441, + "grad_norm": 0.17316733300685883, + "learning_rate": 0.0009474044726121316, + "loss": 2.8839, + "step": 5257 + }, + { + "epoch": 0.15591732645375558, + "grad_norm": 0.17142556607723236, + "learning_rate": 0.0009473834647640869, + "loss": 2.8583, + "step": 5258 + }, + { + "epoch": 0.1559469798060671, + "grad_norm": 0.16899186372756958, + "learning_rate": 0.0009473624529543849, + "loss": 2.8925, + "step": 5259 + }, + { + "epoch": 0.15597663315837856, + "grad_norm": 0.157923623919487, + "learning_rate": 0.0009473414371832116, + "loss": 2.8651, + "step": 5260 + }, + { + "epoch": 0.15600628651069004, + "grad_norm": 0.15638098120689392, + "learning_rate": 0.0009473204174507531, + "loss": 2.8601, + "step": 5261 + }, + { + "epoch": 0.15603593986300152, + "grad_norm": 0.1482105702161789, + "learning_rate": 0.0009472993937571954, + "loss": 2.8744, + "step": 5262 + }, + { + "epoch": 0.156065593215313, + "grad_norm": 0.13776171207427979, + "learning_rate": 0.0009472783661027249, + "loss": 2.8824, + "step": 5263 + }, + { + "epoch": 0.15609524656762447, + "grad_norm": 0.12966294586658478, + "learning_rate": 0.0009472573344875277, + "loss": 2.8651, + "step": 5264 + }, + { + "epoch": 0.15612489991993594, + "grad_norm": 0.1498088538646698, + "learning_rate": 0.00094723629891179, + "loss": 2.8831, + "step": 5265 + }, + { + "epoch": 0.15615455327224742, + "grad_norm": 0.1642037183046341, + "learning_rate": 0.0009472152593756981, + "loss": 2.8496, + "step": 5266 + }, + { + "epoch": 0.1561842066245589, + "grad_norm": 0.15306954085826874, + "learning_rate": 0.0009471942158794382, + "loss": 2.8531, + "step": 5267 + }, + { + "epoch": 0.15621385997687037, + "grad_norm": 0.17012326419353485, + "learning_rate": 0.000947173168423197, + "loss": 2.8628, + "step": 5268 + }, + { + "epoch": 0.15624351332918188, + "grad_norm": 0.20869183540344238, + "learning_rate": 0.0009471521170071604, + "loss": 2.8767, + "step": 5269 + }, + { + "epoch": 0.15627316668149335, + "grad_norm": 0.1924537718296051, + "learning_rate": 0.0009471310616315151, + "loss": 2.8711, + "step": 5270 + }, + { + "epoch": 0.15630282003380483, + "grad_norm": 0.17279253900051117, + "learning_rate": 0.0009471100022964476, + "loss": 2.8877, + "step": 5271 + }, + { + "epoch": 0.1563324733861163, + "grad_norm": 0.19534318149089813, + "learning_rate": 0.0009470889390021443, + "loss": 2.8692, + "step": 5272 + }, + { + "epoch": 0.15636212673842778, + "grad_norm": 0.17446006834506989, + "learning_rate": 0.0009470678717487918, + "loss": 2.8422, + "step": 5273 + }, + { + "epoch": 0.15639178009073926, + "grad_norm": 0.18119150400161743, + "learning_rate": 0.0009470468005365763, + "loss": 2.8351, + "step": 5274 + }, + { + "epoch": 0.15642143344305073, + "grad_norm": 0.19943763315677643, + "learning_rate": 0.0009470257253656847, + "loss": 2.8542, + "step": 5275 + }, + { + "epoch": 0.1564510867953622, + "grad_norm": 0.1625123769044876, + "learning_rate": 0.0009470046462363037, + "loss": 2.8865, + "step": 5276 + }, + { + "epoch": 0.15648074014767369, + "grad_norm": 0.17867378890514374, + "learning_rate": 0.0009469835631486196, + "loss": 2.8323, + "step": 5277 + }, + { + "epoch": 0.15651039349998516, + "grad_norm": 0.16200774908065796, + "learning_rate": 0.0009469624761028196, + "loss": 2.8821, + "step": 5278 + }, + { + "epoch": 0.15654004685229667, + "grad_norm": 0.15328702330589294, + "learning_rate": 0.00094694138509909, + "loss": 2.8721, + "step": 5279 + }, + { + "epoch": 0.15656970020460814, + "grad_norm": 0.1507256180047989, + "learning_rate": 0.0009469202901376177, + "loss": 2.8443, + "step": 5280 + }, + { + "epoch": 0.15659935355691962, + "grad_norm": 0.14888343214988708, + "learning_rate": 0.0009468991912185895, + "loss": 2.8242, + "step": 5281 + }, + { + "epoch": 0.1566290069092311, + "grad_norm": 0.15693886578083038, + "learning_rate": 0.0009468780883421926, + "loss": 2.873, + "step": 5282 + }, + { + "epoch": 0.15665866026154257, + "grad_norm": 0.14511895179748535, + "learning_rate": 0.0009468569815086133, + "loss": 2.8787, + "step": 5283 + }, + { + "epoch": 0.15668831361385405, + "grad_norm": 0.1473289430141449, + "learning_rate": 0.0009468358707180384, + "loss": 2.869, + "step": 5284 + }, + { + "epoch": 0.15671796696616552, + "grad_norm": 0.1349049061536789, + "learning_rate": 0.0009468147559706555, + "loss": 2.8363, + "step": 5285 + }, + { + "epoch": 0.156747620318477, + "grad_norm": 0.14106875658035278, + "learning_rate": 0.0009467936372666513, + "loss": 2.8529, + "step": 5286 + }, + { + "epoch": 0.15677727367078848, + "grad_norm": 0.13612526655197144, + "learning_rate": 0.0009467725146062125, + "loss": 2.8972, + "step": 5287 + }, + { + "epoch": 0.15680692702309995, + "grad_norm": 0.14590321481227875, + "learning_rate": 0.0009467513879895267, + "loss": 2.858, + "step": 5288 + }, + { + "epoch": 0.15683658037541143, + "grad_norm": 0.12623073160648346, + "learning_rate": 0.0009467302574167804, + "loss": 2.8546, + "step": 5289 + }, + { + "epoch": 0.15686623372772293, + "grad_norm": 0.11643873900175095, + "learning_rate": 0.000946709122888161, + "loss": 2.8502, + "step": 5290 + }, + { + "epoch": 0.1568958870800344, + "grad_norm": 0.12140580266714096, + "learning_rate": 0.0009466879844038558, + "loss": 2.8863, + "step": 5291 + }, + { + "epoch": 0.15692554043234588, + "grad_norm": 0.13790804147720337, + "learning_rate": 0.0009466668419640518, + "loss": 2.8565, + "step": 5292 + }, + { + "epoch": 0.15695519378465736, + "grad_norm": 0.1327098309993744, + "learning_rate": 0.0009466456955689362, + "loss": 2.867, + "step": 5293 + }, + { + "epoch": 0.15698484713696884, + "grad_norm": 0.14391599595546722, + "learning_rate": 0.0009466245452186963, + "loss": 2.849, + "step": 5294 + }, + { + "epoch": 0.1570145004892803, + "grad_norm": 0.155338317155838, + "learning_rate": 0.0009466033909135194, + "loss": 2.849, + "step": 5295 + }, + { + "epoch": 0.1570441538415918, + "grad_norm": 0.1703779697418213, + "learning_rate": 0.0009465822326535927, + "loss": 2.8423, + "step": 5296 + }, + { + "epoch": 0.15707380719390326, + "grad_norm": 0.19960805773735046, + "learning_rate": 0.0009465610704391039, + "loss": 2.8724, + "step": 5297 + }, + { + "epoch": 0.15710346054621474, + "grad_norm": 0.196359321475029, + "learning_rate": 0.0009465399042702401, + "loss": 2.8735, + "step": 5298 + }, + { + "epoch": 0.15713311389852622, + "grad_norm": 0.17075464129447937, + "learning_rate": 0.0009465187341471886, + "loss": 2.8788, + "step": 5299 + }, + { + "epoch": 0.15716276725083772, + "grad_norm": 0.16837766766548157, + "learning_rate": 0.0009464975600701373, + "loss": 2.8706, + "step": 5300 + }, + { + "epoch": 0.1571924206031492, + "grad_norm": 0.1589178442955017, + "learning_rate": 0.0009464763820392734, + "loss": 2.8605, + "step": 5301 + }, + { + "epoch": 0.15722207395546067, + "grad_norm": 0.167008638381958, + "learning_rate": 0.0009464552000547844, + "loss": 2.8946, + "step": 5302 + }, + { + "epoch": 0.15725172730777215, + "grad_norm": 0.1910594254732132, + "learning_rate": 0.0009464340141168582, + "loss": 2.869, + "step": 5303 + }, + { + "epoch": 0.15728138066008363, + "grad_norm": 0.18260407447814941, + "learning_rate": 0.000946412824225682, + "loss": 2.8415, + "step": 5304 + }, + { + "epoch": 0.1573110340123951, + "grad_norm": 0.1682051718235016, + "learning_rate": 0.0009463916303814436, + "loss": 2.8425, + "step": 5305 + }, + { + "epoch": 0.15734068736470658, + "grad_norm": 0.16768842935562134, + "learning_rate": 0.0009463704325843307, + "loss": 2.8714, + "step": 5306 + }, + { + "epoch": 0.15737034071701805, + "grad_norm": 0.14504878222942352, + "learning_rate": 0.000946349230834531, + "loss": 2.8853, + "step": 5307 + }, + { + "epoch": 0.15739999406932953, + "grad_norm": 0.15485446155071259, + "learning_rate": 0.0009463280251322324, + "loss": 2.8246, + "step": 5308 + }, + { + "epoch": 0.157429647421641, + "grad_norm": 0.15894855558872223, + "learning_rate": 0.0009463068154776223, + "loss": 2.8499, + "step": 5309 + }, + { + "epoch": 0.15745930077395248, + "grad_norm": 0.14547885954380035, + "learning_rate": 0.000946285601870889, + "loss": 2.8535, + "step": 5310 + }, + { + "epoch": 0.15748895412626399, + "grad_norm": 0.16824749112129211, + "learning_rate": 0.0009462643843122198, + "loss": 2.846, + "step": 5311 + }, + { + "epoch": 0.15751860747857546, + "grad_norm": 0.17435020208358765, + "learning_rate": 0.0009462431628018031, + "loss": 2.8623, + "step": 5312 + }, + { + "epoch": 0.15754826083088694, + "grad_norm": 0.15838812291622162, + "learning_rate": 0.0009462219373398264, + "loss": 2.837, + "step": 5313 + }, + { + "epoch": 0.15757791418319841, + "grad_norm": 0.15335702896118164, + "learning_rate": 0.0009462007079264781, + "loss": 2.8697, + "step": 5314 + }, + { + "epoch": 0.1576075675355099, + "grad_norm": 0.17064927518367767, + "learning_rate": 0.0009461794745619457, + "loss": 2.8495, + "step": 5315 + }, + { + "epoch": 0.15763722088782137, + "grad_norm": 0.17397266626358032, + "learning_rate": 0.0009461582372464176, + "loss": 2.8818, + "step": 5316 + }, + { + "epoch": 0.15766687424013284, + "grad_norm": 0.17461080849170685, + "learning_rate": 0.0009461369959800817, + "loss": 2.8646, + "step": 5317 + }, + { + "epoch": 0.15769652759244432, + "grad_norm": 0.1721803992986679, + "learning_rate": 0.0009461157507631261, + "loss": 2.8561, + "step": 5318 + }, + { + "epoch": 0.1577261809447558, + "grad_norm": 0.1806948184967041, + "learning_rate": 0.000946094501595739, + "loss": 2.8644, + "step": 5319 + }, + { + "epoch": 0.15775583429706727, + "grad_norm": 0.1446962058544159, + "learning_rate": 0.0009460732484781085, + "loss": 2.8586, + "step": 5320 + }, + { + "epoch": 0.15778548764937878, + "grad_norm": 0.1513892114162445, + "learning_rate": 0.0009460519914104229, + "loss": 2.8547, + "step": 5321 + }, + { + "epoch": 0.15781514100169025, + "grad_norm": 0.154036283493042, + "learning_rate": 0.0009460307303928701, + "loss": 2.9164, + "step": 5322 + }, + { + "epoch": 0.15784479435400173, + "grad_norm": 0.12338784337043762, + "learning_rate": 0.0009460094654256388, + "loss": 2.848, + "step": 5323 + }, + { + "epoch": 0.1578744477063132, + "grad_norm": 0.12059585750102997, + "learning_rate": 0.0009459881965089172, + "loss": 2.8603, + "step": 5324 + }, + { + "epoch": 0.15790410105862468, + "grad_norm": 0.11759111285209656, + "learning_rate": 0.0009459669236428935, + "loss": 2.8578, + "step": 5325 + }, + { + "epoch": 0.15793375441093616, + "grad_norm": 0.125588059425354, + "learning_rate": 0.0009459456468277561, + "loss": 2.877, + "step": 5326 + }, + { + "epoch": 0.15796340776324763, + "grad_norm": 0.12381565570831299, + "learning_rate": 0.0009459243660636935, + "loss": 2.8449, + "step": 5327 + }, + { + "epoch": 0.1579930611155591, + "grad_norm": 0.12579815089702606, + "learning_rate": 0.000945903081350894, + "loss": 2.8537, + "step": 5328 + }, + { + "epoch": 0.15802271446787058, + "grad_norm": 0.1361597180366516, + "learning_rate": 0.0009458817926895463, + "loss": 2.8528, + "step": 5329 + }, + { + "epoch": 0.15805236782018206, + "grad_norm": 0.13521981239318848, + "learning_rate": 0.0009458605000798387, + "loss": 2.8861, + "step": 5330 + }, + { + "epoch": 0.15808202117249356, + "grad_norm": 0.16203036904335022, + "learning_rate": 0.0009458392035219599, + "loss": 2.856, + "step": 5331 + }, + { + "epoch": 0.15811167452480504, + "grad_norm": 0.1693500429391861, + "learning_rate": 0.0009458179030160985, + "loss": 2.8486, + "step": 5332 + }, + { + "epoch": 0.15814132787711652, + "grad_norm": 0.17416734993457794, + "learning_rate": 0.0009457965985624428, + "loss": 2.8299, + "step": 5333 + }, + { + "epoch": 0.158170981229428, + "grad_norm": 0.16111575067043304, + "learning_rate": 0.0009457752901611819, + "loss": 2.8683, + "step": 5334 + }, + { + "epoch": 0.15820063458173947, + "grad_norm": 0.16951192915439606, + "learning_rate": 0.0009457539778125042, + "loss": 2.8538, + "step": 5335 + }, + { + "epoch": 0.15823028793405094, + "grad_norm": 0.17875665426254272, + "learning_rate": 0.0009457326615165987, + "loss": 2.9035, + "step": 5336 + }, + { + "epoch": 0.15825994128636242, + "grad_norm": 0.1620597094297409, + "learning_rate": 0.0009457113412736538, + "loss": 2.8263, + "step": 5337 + }, + { + "epoch": 0.1582895946386739, + "grad_norm": 0.16401834785938263, + "learning_rate": 0.0009456900170838585, + "loss": 2.8878, + "step": 5338 + }, + { + "epoch": 0.15831924799098537, + "grad_norm": 0.15450258553028107, + "learning_rate": 0.0009456686889474015, + "loss": 2.8708, + "step": 5339 + }, + { + "epoch": 0.15834890134329685, + "grad_norm": 0.14499962329864502, + "learning_rate": 0.0009456473568644719, + "loss": 2.8643, + "step": 5340 + }, + { + "epoch": 0.15837855469560833, + "grad_norm": 0.1411062628030777, + "learning_rate": 0.0009456260208352584, + "loss": 2.853, + "step": 5341 + }, + { + "epoch": 0.15840820804791983, + "grad_norm": 0.13038098812103271, + "learning_rate": 0.00094560468085995, + "loss": 2.853, + "step": 5342 + }, + { + "epoch": 0.1584378614002313, + "grad_norm": 0.1275714486837387, + "learning_rate": 0.0009455833369387356, + "loss": 2.8446, + "step": 5343 + }, + { + "epoch": 0.15846751475254278, + "grad_norm": 0.12171175330877304, + "learning_rate": 0.0009455619890718043, + "loss": 2.8762, + "step": 5344 + }, + { + "epoch": 0.15849716810485426, + "grad_norm": 0.13318710029125214, + "learning_rate": 0.0009455406372593453, + "loss": 2.8716, + "step": 5345 + }, + { + "epoch": 0.15852682145716573, + "grad_norm": 0.13447202742099762, + "learning_rate": 0.0009455192815015472, + "loss": 2.8911, + "step": 5346 + }, + { + "epoch": 0.1585564748094772, + "grad_norm": 0.13988927006721497, + "learning_rate": 0.0009454979217985996, + "loss": 2.874, + "step": 5347 + }, + { + "epoch": 0.1585861281617887, + "grad_norm": 0.15668097138404846, + "learning_rate": 0.0009454765581506914, + "loss": 2.8934, + "step": 5348 + }, + { + "epoch": 0.15861578151410016, + "grad_norm": 0.17745815217494965, + "learning_rate": 0.0009454551905580117, + "loss": 2.8717, + "step": 5349 + }, + { + "epoch": 0.15864543486641164, + "grad_norm": 0.18200258910655975, + "learning_rate": 0.0009454338190207498, + "loss": 2.891, + "step": 5350 + }, + { + "epoch": 0.15867508821872311, + "grad_norm": 0.17044827342033386, + "learning_rate": 0.0009454124435390952, + "loss": 2.8863, + "step": 5351 + }, + { + "epoch": 0.15870474157103462, + "grad_norm": 0.1690678894519806, + "learning_rate": 0.0009453910641132368, + "loss": 2.8798, + "step": 5352 + }, + { + "epoch": 0.1587343949233461, + "grad_norm": 0.175868421792984, + "learning_rate": 0.0009453696807433641, + "loss": 2.8406, + "step": 5353 + }, + { + "epoch": 0.15876404827565757, + "grad_norm": 0.18690335750579834, + "learning_rate": 0.0009453482934296665, + "loss": 2.8508, + "step": 5354 + }, + { + "epoch": 0.15879370162796905, + "grad_norm": 0.16517207026481628, + "learning_rate": 0.0009453269021723332, + "loss": 2.8968, + "step": 5355 + }, + { + "epoch": 0.15882335498028052, + "grad_norm": 0.13858741521835327, + "learning_rate": 0.0009453055069715537, + "loss": 2.8764, + "step": 5356 + }, + { + "epoch": 0.158853008332592, + "grad_norm": 0.1484750211238861, + "learning_rate": 0.0009452841078275177, + "loss": 2.8486, + "step": 5357 + }, + { + "epoch": 0.15888266168490348, + "grad_norm": 0.15302112698554993, + "learning_rate": 0.0009452627047404143, + "loss": 2.8973, + "step": 5358 + }, + { + "epoch": 0.15891231503721495, + "grad_norm": 0.16751638054847717, + "learning_rate": 0.0009452412977104333, + "loss": 2.878, + "step": 5359 + }, + { + "epoch": 0.15894196838952643, + "grad_norm": 0.14598895609378815, + "learning_rate": 0.0009452198867377641, + "loss": 2.8446, + "step": 5360 + }, + { + "epoch": 0.1589716217418379, + "grad_norm": 0.14582334458827972, + "learning_rate": 0.0009451984718225966, + "loss": 2.8671, + "step": 5361 + }, + { + "epoch": 0.15900127509414938, + "grad_norm": 0.1118161752820015, + "learning_rate": 0.00094517705296512, + "loss": 2.8575, + "step": 5362 + }, + { + "epoch": 0.15903092844646088, + "grad_norm": 0.13750244677066803, + "learning_rate": 0.0009451556301655244, + "loss": 2.8445, + "step": 5363 + }, + { + "epoch": 0.15906058179877236, + "grad_norm": 0.13907966017723083, + "learning_rate": 0.0009451342034239991, + "loss": 2.8445, + "step": 5364 + }, + { + "epoch": 0.15909023515108384, + "grad_norm": 0.1327579915523529, + "learning_rate": 0.000945112772740734, + "loss": 2.859, + "step": 5365 + }, + { + "epoch": 0.1591198885033953, + "grad_norm": 0.14600606262683868, + "learning_rate": 0.0009450913381159191, + "loss": 2.8562, + "step": 5366 + }, + { + "epoch": 0.1591495418557068, + "grad_norm": 0.14788687229156494, + "learning_rate": 0.0009450698995497437, + "loss": 2.844, + "step": 5367 + }, + { + "epoch": 0.15917919520801826, + "grad_norm": 0.14561352133750916, + "learning_rate": 0.0009450484570423983, + "loss": 2.821, + "step": 5368 + }, + { + "epoch": 0.15920884856032974, + "grad_norm": 0.14928744733333588, + "learning_rate": 0.0009450270105940721, + "loss": 2.8683, + "step": 5369 + }, + { + "epoch": 0.15923850191264122, + "grad_norm": 0.15273474156856537, + "learning_rate": 0.0009450055602049555, + "loss": 2.828, + "step": 5370 + }, + { + "epoch": 0.1592681552649527, + "grad_norm": 0.16216547787189484, + "learning_rate": 0.0009449841058752383, + "loss": 2.8861, + "step": 5371 + }, + { + "epoch": 0.15929780861726417, + "grad_norm": 0.17714929580688477, + "learning_rate": 0.0009449626476051104, + "loss": 2.8823, + "step": 5372 + }, + { + "epoch": 0.15932746196957567, + "grad_norm": 0.15517961978912354, + "learning_rate": 0.000944941185394762, + "loss": 2.8815, + "step": 5373 + }, + { + "epoch": 0.15935711532188715, + "grad_norm": 0.14033204317092896, + "learning_rate": 0.0009449197192443828, + "loss": 2.8676, + "step": 5374 + }, + { + "epoch": 0.15938676867419863, + "grad_norm": 0.18018680810928345, + "learning_rate": 0.0009448982491541633, + "loss": 2.8522, + "step": 5375 + }, + { + "epoch": 0.1594164220265101, + "grad_norm": 0.17413349449634552, + "learning_rate": 0.0009448767751242934, + "loss": 2.8559, + "step": 5376 + }, + { + "epoch": 0.15944607537882158, + "grad_norm": 0.17422692477703094, + "learning_rate": 0.0009448552971549632, + "loss": 2.825, + "step": 5377 + }, + { + "epoch": 0.15947572873113305, + "grad_norm": 0.17784903943538666, + "learning_rate": 0.0009448338152463631, + "loss": 2.8674, + "step": 5378 + }, + { + "epoch": 0.15950538208344453, + "grad_norm": 0.14957794547080994, + "learning_rate": 0.0009448123293986832, + "loss": 2.8868, + "step": 5379 + }, + { + "epoch": 0.159535035435756, + "grad_norm": 0.13965891301631927, + "learning_rate": 0.0009447908396121136, + "loss": 2.8481, + "step": 5380 + }, + { + "epoch": 0.15956468878806748, + "grad_norm": 0.15739700198173523, + "learning_rate": 0.0009447693458868449, + "loss": 2.8773, + "step": 5381 + }, + { + "epoch": 0.15959434214037896, + "grad_norm": 0.15088336169719696, + "learning_rate": 0.0009447478482230673, + "loss": 2.8667, + "step": 5382 + }, + { + "epoch": 0.15962399549269046, + "grad_norm": 0.1823868304491043, + "learning_rate": 0.0009447263466209712, + "loss": 2.8713, + "step": 5383 + }, + { + "epoch": 0.15965364884500194, + "grad_norm": 0.17822669446468353, + "learning_rate": 0.000944704841080747, + "loss": 2.8591, + "step": 5384 + }, + { + "epoch": 0.15968330219731341, + "grad_norm": 0.18456733226776123, + "learning_rate": 0.000944683331602585, + "loss": 2.8738, + "step": 5385 + }, + { + "epoch": 0.1597129555496249, + "grad_norm": 0.15480166673660278, + "learning_rate": 0.0009446618181866758, + "loss": 2.8151, + "step": 5386 + }, + { + "epoch": 0.15974260890193637, + "grad_norm": 0.15768945217132568, + "learning_rate": 0.0009446403008332099, + "loss": 2.8761, + "step": 5387 + }, + { + "epoch": 0.15977226225424784, + "grad_norm": 0.1621251404285431, + "learning_rate": 0.0009446187795423777, + "loss": 2.8462, + "step": 5388 + }, + { + "epoch": 0.15980191560655932, + "grad_norm": 0.15360651910305023, + "learning_rate": 0.0009445972543143702, + "loss": 2.8344, + "step": 5389 + }, + { + "epoch": 0.1598315689588708, + "grad_norm": 0.1593654900789261, + "learning_rate": 0.0009445757251493774, + "loss": 2.8724, + "step": 5390 + }, + { + "epoch": 0.15986122231118227, + "grad_norm": 0.15393871068954468, + "learning_rate": 0.0009445541920475903, + "loss": 2.8424, + "step": 5391 + }, + { + "epoch": 0.15989087566349375, + "grad_norm": 0.140742689371109, + "learning_rate": 0.0009445326550091995, + "loss": 2.8445, + "step": 5392 + }, + { + "epoch": 0.15992052901580522, + "grad_norm": 0.13828089833259583, + "learning_rate": 0.0009445111140343958, + "loss": 2.8526, + "step": 5393 + }, + { + "epoch": 0.15995018236811673, + "grad_norm": 0.13080263137817383, + "learning_rate": 0.0009444895691233699, + "loss": 2.8742, + "step": 5394 + }, + { + "epoch": 0.1599798357204282, + "grad_norm": 0.11724454909563065, + "learning_rate": 0.0009444680202763125, + "loss": 2.8651, + "step": 5395 + }, + { + "epoch": 0.16000948907273968, + "grad_norm": 0.12777301669120789, + "learning_rate": 0.0009444464674934146, + "loss": 2.8543, + "step": 5396 + }, + { + "epoch": 0.16003914242505116, + "grad_norm": 0.13082607090473175, + "learning_rate": 0.0009444249107748668, + "loss": 2.8349, + "step": 5397 + }, + { + "epoch": 0.16006879577736263, + "grad_norm": 0.14233195781707764, + "learning_rate": 0.0009444033501208602, + "loss": 2.8681, + "step": 5398 + }, + { + "epoch": 0.1600984491296741, + "grad_norm": 0.13851438462734222, + "learning_rate": 0.0009443817855315857, + "loss": 2.857, + "step": 5399 + }, + { + "epoch": 0.16012810248198558, + "grad_norm": 0.13375942409038544, + "learning_rate": 0.0009443602170072342, + "loss": 2.8883, + "step": 5400 + }, + { + "epoch": 0.16015775583429706, + "grad_norm": 0.15758958458900452, + "learning_rate": 0.0009443386445479967, + "loss": 2.8828, + "step": 5401 + }, + { + "epoch": 0.16018740918660854, + "grad_norm": 0.17662566900253296, + "learning_rate": 0.0009443170681540642, + "loss": 2.8296, + "step": 5402 + }, + { + "epoch": 0.16021706253892, + "grad_norm": 0.16476546227931976, + "learning_rate": 0.0009442954878256278, + "loss": 2.8414, + "step": 5403 + }, + { + "epoch": 0.16024671589123152, + "grad_norm": 0.15181763470172882, + "learning_rate": 0.0009442739035628784, + "loss": 2.8563, + "step": 5404 + }, + { + "epoch": 0.160276369243543, + "grad_norm": 0.17803075909614563, + "learning_rate": 0.0009442523153660076, + "loss": 2.8995, + "step": 5405 + }, + { + "epoch": 0.16030602259585447, + "grad_norm": 0.18993185460567474, + "learning_rate": 0.0009442307232352063, + "loss": 2.8281, + "step": 5406 + }, + { + "epoch": 0.16033567594816595, + "grad_norm": 0.1743001788854599, + "learning_rate": 0.0009442091271706656, + "loss": 2.8487, + "step": 5407 + }, + { + "epoch": 0.16036532930047742, + "grad_norm": 0.1638154536485672, + "learning_rate": 0.0009441875271725768, + "loss": 2.8483, + "step": 5408 + }, + { + "epoch": 0.1603949826527889, + "grad_norm": 0.1639469414949417, + "learning_rate": 0.0009441659232411313, + "loss": 2.8331, + "step": 5409 + }, + { + "epoch": 0.16042463600510037, + "grad_norm": 0.15046577155590057, + "learning_rate": 0.0009441443153765201, + "loss": 2.8706, + "step": 5410 + }, + { + "epoch": 0.16045428935741185, + "grad_norm": 0.153468519449234, + "learning_rate": 0.0009441227035789351, + "loss": 2.8781, + "step": 5411 + }, + { + "epoch": 0.16048394270972333, + "grad_norm": 0.1526574343442917, + "learning_rate": 0.000944101087848567, + "loss": 2.8407, + "step": 5412 + }, + { + "epoch": 0.1605135960620348, + "grad_norm": 0.1616378277540207, + "learning_rate": 0.0009440794681856077, + "loss": 2.8521, + "step": 5413 + }, + { + "epoch": 0.16054324941434628, + "grad_norm": 0.15628470480442047, + "learning_rate": 0.0009440578445902484, + "loss": 2.8595, + "step": 5414 + }, + { + "epoch": 0.16057290276665778, + "grad_norm": 0.1429511159658432, + "learning_rate": 0.0009440362170626809, + "loss": 2.9019, + "step": 5415 + }, + { + "epoch": 0.16060255611896926, + "grad_norm": 0.13066236674785614, + "learning_rate": 0.0009440145856030961, + "loss": 2.8528, + "step": 5416 + }, + { + "epoch": 0.16063220947128073, + "grad_norm": 0.1371588110923767, + "learning_rate": 0.0009439929502116862, + "loss": 2.8123, + "step": 5417 + }, + { + "epoch": 0.1606618628235922, + "grad_norm": 0.15715451538562775, + "learning_rate": 0.0009439713108886425, + "loss": 2.8689, + "step": 5418 + }, + { + "epoch": 0.1606915161759037, + "grad_norm": 0.15819156169891357, + "learning_rate": 0.0009439496676341565, + "loss": 2.8464, + "step": 5419 + }, + { + "epoch": 0.16072116952821516, + "grad_norm": 0.14927345514297485, + "learning_rate": 0.0009439280204484201, + "loss": 2.8401, + "step": 5420 + }, + { + "epoch": 0.16075082288052664, + "grad_norm": 0.15611979365348816, + "learning_rate": 0.0009439063693316247, + "loss": 2.8656, + "step": 5421 + }, + { + "epoch": 0.16078047623283812, + "grad_norm": 0.17288027703762054, + "learning_rate": 0.0009438847142839624, + "loss": 2.8716, + "step": 5422 + }, + { + "epoch": 0.1608101295851496, + "grad_norm": 0.14697325229644775, + "learning_rate": 0.0009438630553056247, + "loss": 2.8228, + "step": 5423 + }, + { + "epoch": 0.16083978293746107, + "grad_norm": 0.14114248752593994, + "learning_rate": 0.0009438413923968036, + "loss": 2.8653, + "step": 5424 + }, + { + "epoch": 0.16086943628977257, + "grad_norm": 0.15082645416259766, + "learning_rate": 0.0009438197255576906, + "loss": 2.8566, + "step": 5425 + }, + { + "epoch": 0.16089908964208405, + "grad_norm": 0.15611323714256287, + "learning_rate": 0.0009437980547884778, + "loss": 2.8711, + "step": 5426 + }, + { + "epoch": 0.16092874299439552, + "grad_norm": 0.16497094929218292, + "learning_rate": 0.000943776380089357, + "loss": 2.8433, + "step": 5427 + }, + { + "epoch": 0.160958396346707, + "grad_norm": 0.15808142721652985, + "learning_rate": 0.0009437547014605203, + "loss": 2.8878, + "step": 5428 + }, + { + "epoch": 0.16098804969901848, + "grad_norm": 0.15041498839855194, + "learning_rate": 0.0009437330189021594, + "loss": 2.8152, + "step": 5429 + }, + { + "epoch": 0.16101770305132995, + "grad_norm": 0.12158577889204025, + "learning_rate": 0.0009437113324144666, + "loss": 2.8369, + "step": 5430 + }, + { + "epoch": 0.16104735640364143, + "grad_norm": 0.14542581140995026, + "learning_rate": 0.0009436896419976337, + "loss": 2.8172, + "step": 5431 + }, + { + "epoch": 0.1610770097559529, + "grad_norm": 0.143648162484169, + "learning_rate": 0.000943667947651853, + "loss": 2.8286, + "step": 5432 + }, + { + "epoch": 0.16110666310826438, + "grad_norm": 0.13080935180187225, + "learning_rate": 0.0009436462493773163, + "loss": 2.8471, + "step": 5433 + }, + { + "epoch": 0.16113631646057586, + "grad_norm": 0.11957035213708878, + "learning_rate": 0.000943624547174216, + "loss": 2.8641, + "step": 5434 + }, + { + "epoch": 0.16116596981288736, + "grad_norm": 0.15092819929122925, + "learning_rate": 0.0009436028410427441, + "loss": 2.8528, + "step": 5435 + }, + { + "epoch": 0.16119562316519884, + "grad_norm": 0.1489942967891693, + "learning_rate": 0.000943581130983093, + "loss": 2.8656, + "step": 5436 + }, + { + "epoch": 0.1612252765175103, + "grad_norm": 0.15571613609790802, + "learning_rate": 0.0009435594169954548, + "loss": 2.865, + "step": 5437 + }, + { + "epoch": 0.1612549298698218, + "grad_norm": 0.1585865020751953, + "learning_rate": 0.0009435376990800218, + "loss": 2.8446, + "step": 5438 + }, + { + "epoch": 0.16128458322213327, + "grad_norm": 0.1768549084663391, + "learning_rate": 0.0009435159772369863, + "loss": 2.8415, + "step": 5439 + }, + { + "epoch": 0.16131423657444474, + "grad_norm": 0.1799280345439911, + "learning_rate": 0.0009434942514665407, + "loss": 2.8767, + "step": 5440 + }, + { + "epoch": 0.16134388992675622, + "grad_norm": 0.17592310905456543, + "learning_rate": 0.0009434725217688776, + "loss": 2.8396, + "step": 5441 + }, + { + "epoch": 0.1613735432790677, + "grad_norm": 0.1705254316329956, + "learning_rate": 0.000943450788144189, + "loss": 2.8679, + "step": 5442 + }, + { + "epoch": 0.16140319663137917, + "grad_norm": 0.19491805136203766, + "learning_rate": 0.0009434290505926676, + "loss": 2.8607, + "step": 5443 + }, + { + "epoch": 0.16143284998369065, + "grad_norm": 0.19807502627372742, + "learning_rate": 0.0009434073091145059, + "loss": 2.8881, + "step": 5444 + }, + { + "epoch": 0.16146250333600212, + "grad_norm": 0.16392187774181366, + "learning_rate": 0.0009433855637098963, + "loss": 2.8587, + "step": 5445 + }, + { + "epoch": 0.16149215668831363, + "grad_norm": 0.14326471090316772, + "learning_rate": 0.0009433638143790313, + "loss": 2.8449, + "step": 5446 + }, + { + "epoch": 0.1615218100406251, + "grad_norm": 0.17384468019008636, + "learning_rate": 0.0009433420611221037, + "loss": 2.8619, + "step": 5447 + }, + { + "epoch": 0.16155146339293658, + "grad_norm": 0.17649216949939728, + "learning_rate": 0.0009433203039393061, + "loss": 2.8538, + "step": 5448 + }, + { + "epoch": 0.16158111674524805, + "grad_norm": 0.1970578283071518, + "learning_rate": 0.0009432985428308311, + "loss": 2.8745, + "step": 5449 + }, + { + "epoch": 0.16161077009755953, + "grad_norm": 0.18557871878147125, + "learning_rate": 0.0009432767777968716, + "loss": 2.8507, + "step": 5450 + }, + { + "epoch": 0.161640423449871, + "grad_norm": 0.1718539148569107, + "learning_rate": 0.0009432550088376199, + "loss": 2.8858, + "step": 5451 + }, + { + "epoch": 0.16167007680218248, + "grad_norm": 0.16262264549732208, + "learning_rate": 0.0009432332359532691, + "loss": 2.8388, + "step": 5452 + }, + { + "epoch": 0.16169973015449396, + "grad_norm": 0.17368030548095703, + "learning_rate": 0.000943211459144012, + "loss": 2.8601, + "step": 5453 + }, + { + "epoch": 0.16172938350680544, + "grad_norm": 0.16404518485069275, + "learning_rate": 0.0009431896784100411, + "loss": 2.8883, + "step": 5454 + }, + { + "epoch": 0.1617590368591169, + "grad_norm": 0.15518240630626678, + "learning_rate": 0.0009431678937515497, + "loss": 2.8322, + "step": 5455 + }, + { + "epoch": 0.16178869021142842, + "grad_norm": 0.12909312546253204, + "learning_rate": 0.0009431461051687306, + "loss": 2.87, + "step": 5456 + }, + { + "epoch": 0.1618183435637399, + "grad_norm": 0.142075315117836, + "learning_rate": 0.0009431243126617766, + "loss": 2.8517, + "step": 5457 + }, + { + "epoch": 0.16184799691605137, + "grad_norm": 0.1522478610277176, + "learning_rate": 0.0009431025162308807, + "loss": 2.8699, + "step": 5458 + }, + { + "epoch": 0.16187765026836284, + "grad_norm": 0.13721643388271332, + "learning_rate": 0.000943080715876236, + "loss": 2.833, + "step": 5459 + }, + { + "epoch": 0.16190730362067432, + "grad_norm": 0.11429993808269501, + "learning_rate": 0.0009430589115980354, + "loss": 2.8614, + "step": 5460 + }, + { + "epoch": 0.1619369569729858, + "grad_norm": 0.11503814905881882, + "learning_rate": 0.0009430371033964722, + "loss": 2.8596, + "step": 5461 + }, + { + "epoch": 0.16196661032529727, + "grad_norm": 0.11913737654685974, + "learning_rate": 0.0009430152912717393, + "loss": 2.8751, + "step": 5462 + }, + { + "epoch": 0.16199626367760875, + "grad_norm": 0.14571191370487213, + "learning_rate": 0.0009429934752240301, + "loss": 2.8736, + "step": 5463 + }, + { + "epoch": 0.16202591702992022, + "grad_norm": 0.17586714029312134, + "learning_rate": 0.0009429716552535376, + "loss": 2.863, + "step": 5464 + }, + { + "epoch": 0.1620555703822317, + "grad_norm": 0.19468054175376892, + "learning_rate": 0.0009429498313604551, + "loss": 2.8623, + "step": 5465 + }, + { + "epoch": 0.16208522373454318, + "grad_norm": 0.17048029601573944, + "learning_rate": 0.0009429280035449757, + "loss": 2.882, + "step": 5466 + }, + { + "epoch": 0.16211487708685468, + "grad_norm": 0.17190302908420563, + "learning_rate": 0.0009429061718072929, + "loss": 2.8421, + "step": 5467 + }, + { + "epoch": 0.16214453043916616, + "grad_norm": 0.16437608003616333, + "learning_rate": 0.0009428843361475998, + "loss": 2.8437, + "step": 5468 + }, + { + "epoch": 0.16217418379147763, + "grad_norm": 0.14580467343330383, + "learning_rate": 0.0009428624965660902, + "loss": 2.8528, + "step": 5469 + }, + { + "epoch": 0.1622038371437891, + "grad_norm": 0.16603852808475494, + "learning_rate": 0.0009428406530629567, + "loss": 2.8381, + "step": 5470 + }, + { + "epoch": 0.16223349049610059, + "grad_norm": 0.14692693948745728, + "learning_rate": 0.0009428188056383936, + "loss": 2.8651, + "step": 5471 + }, + { + "epoch": 0.16226314384841206, + "grad_norm": 0.144611656665802, + "learning_rate": 0.0009427969542925938, + "loss": 2.8786, + "step": 5472 + }, + { + "epoch": 0.16229279720072354, + "grad_norm": 0.1601598560810089, + "learning_rate": 0.0009427750990257509, + "loss": 2.8965, + "step": 5473 + }, + { + "epoch": 0.162322450553035, + "grad_norm": 0.1562914252281189, + "learning_rate": 0.0009427532398380587, + "loss": 2.8484, + "step": 5474 + }, + { + "epoch": 0.1623521039053465, + "grad_norm": 0.16122908890247345, + "learning_rate": 0.0009427313767297103, + "loss": 2.8447, + "step": 5475 + }, + { + "epoch": 0.16238175725765797, + "grad_norm": 0.16788755357265472, + "learning_rate": 0.0009427095097008998, + "loss": 2.8759, + "step": 5476 + }, + { + "epoch": 0.16241141060996947, + "grad_norm": 0.16721111536026, + "learning_rate": 0.0009426876387518204, + "loss": 2.8533, + "step": 5477 + }, + { + "epoch": 0.16244106396228095, + "grad_norm": 0.16227418184280396, + "learning_rate": 0.0009426657638826661, + "loss": 2.8561, + "step": 5478 + }, + { + "epoch": 0.16247071731459242, + "grad_norm": 0.1679740995168686, + "learning_rate": 0.0009426438850936305, + "loss": 2.8246, + "step": 5479 + }, + { + "epoch": 0.1625003706669039, + "grad_norm": 0.1537511795759201, + "learning_rate": 0.0009426220023849072, + "loss": 2.8802, + "step": 5480 + }, + { + "epoch": 0.16253002401921537, + "grad_norm": 0.13533949851989746, + "learning_rate": 0.0009426001157566903, + "loss": 2.8645, + "step": 5481 + }, + { + "epoch": 0.16255967737152685, + "grad_norm": 0.1390576809644699, + "learning_rate": 0.0009425782252091733, + "loss": 2.853, + "step": 5482 + }, + { + "epoch": 0.16258933072383833, + "grad_norm": 0.16385126113891602, + "learning_rate": 0.00094255633074255, + "loss": 2.8489, + "step": 5483 + }, + { + "epoch": 0.1626189840761498, + "grad_norm": 0.16373682022094727, + "learning_rate": 0.0009425344323570145, + "loss": 2.8303, + "step": 5484 + }, + { + "epoch": 0.16264863742846128, + "grad_norm": 0.1604204922914505, + "learning_rate": 0.0009425125300527609, + "loss": 2.866, + "step": 5485 + }, + { + "epoch": 0.16267829078077276, + "grad_norm": 0.13844643533229828, + "learning_rate": 0.0009424906238299825, + "loss": 2.8779, + "step": 5486 + }, + { + "epoch": 0.16270794413308423, + "grad_norm": 0.14854282140731812, + "learning_rate": 0.0009424687136888739, + "loss": 2.8785, + "step": 5487 + }, + { + "epoch": 0.16273759748539574, + "grad_norm": 0.15251897275447845, + "learning_rate": 0.0009424467996296289, + "loss": 2.8744, + "step": 5488 + }, + { + "epoch": 0.1627672508377072, + "grad_norm": 0.14050127565860748, + "learning_rate": 0.0009424248816524415, + "loss": 2.8292, + "step": 5489 + }, + { + "epoch": 0.1627969041900187, + "grad_norm": 0.1438228338956833, + "learning_rate": 0.0009424029597575056, + "loss": 2.8481, + "step": 5490 + }, + { + "epoch": 0.16282655754233016, + "grad_norm": 0.13967102766036987, + "learning_rate": 0.0009423810339450158, + "loss": 2.881, + "step": 5491 + }, + { + "epoch": 0.16285621089464164, + "grad_norm": 0.1322658210992813, + "learning_rate": 0.000942359104215166, + "loss": 2.8625, + "step": 5492 + }, + { + "epoch": 0.16288586424695312, + "grad_norm": 0.13056516647338867, + "learning_rate": 0.0009423371705681505, + "loss": 2.8554, + "step": 5493 + }, + { + "epoch": 0.1629155175992646, + "grad_norm": 0.15050093829631805, + "learning_rate": 0.0009423152330041634, + "loss": 2.8812, + "step": 5494 + }, + { + "epoch": 0.16294517095157607, + "grad_norm": 0.16371001303195953, + "learning_rate": 0.0009422932915233988, + "loss": 2.8679, + "step": 5495 + }, + { + "epoch": 0.16297482430388754, + "grad_norm": 0.15696775913238525, + "learning_rate": 0.0009422713461260513, + "loss": 2.8391, + "step": 5496 + }, + { + "epoch": 0.16300447765619902, + "grad_norm": 0.14220573008060455, + "learning_rate": 0.0009422493968123151, + "loss": 2.8562, + "step": 5497 + }, + { + "epoch": 0.16303413100851052, + "grad_norm": 0.13236813247203827, + "learning_rate": 0.0009422274435823846, + "loss": 2.8354, + "step": 5498 + }, + { + "epoch": 0.163063784360822, + "grad_norm": 0.14503948390483856, + "learning_rate": 0.0009422054864364542, + "loss": 2.8766, + "step": 5499 + }, + { + "epoch": 0.16309343771313348, + "grad_norm": 0.1450236290693283, + "learning_rate": 0.0009421835253747182, + "loss": 2.8488, + "step": 5500 + }, + { + "epoch": 0.16312309106544495, + "grad_norm": 0.16026094555854797, + "learning_rate": 0.0009421615603973713, + "loss": 2.8819, + "step": 5501 + }, + { + "epoch": 0.16315274441775643, + "grad_norm": 0.16777147352695465, + "learning_rate": 0.0009421395915046078, + "loss": 2.8376, + "step": 5502 + }, + { + "epoch": 0.1631823977700679, + "grad_norm": 0.1773262470960617, + "learning_rate": 0.0009421176186966224, + "loss": 2.869, + "step": 5503 + }, + { + "epoch": 0.16321205112237938, + "grad_norm": 0.1651320904493332, + "learning_rate": 0.0009420956419736096, + "loss": 2.857, + "step": 5504 + }, + { + "epoch": 0.16324170447469086, + "grad_norm": 0.1843768209218979, + "learning_rate": 0.0009420736613357639, + "loss": 2.8529, + "step": 5505 + }, + { + "epoch": 0.16327135782700233, + "grad_norm": 0.1770937442779541, + "learning_rate": 0.0009420516767832802, + "loss": 2.854, + "step": 5506 + }, + { + "epoch": 0.1633010111793138, + "grad_norm": 0.15624377131462097, + "learning_rate": 0.0009420296883163529, + "loss": 2.8383, + "step": 5507 + }, + { + "epoch": 0.1633306645316253, + "grad_norm": 0.1453547328710556, + "learning_rate": 0.0009420076959351769, + "loss": 2.8323, + "step": 5508 + }, + { + "epoch": 0.1633603178839368, + "grad_norm": 0.1442757248878479, + "learning_rate": 0.0009419856996399469, + "loss": 2.8252, + "step": 5509 + }, + { + "epoch": 0.16338997123624827, + "grad_norm": 0.15941853821277618, + "learning_rate": 0.0009419636994308576, + "loss": 2.8605, + "step": 5510 + }, + { + "epoch": 0.16341962458855974, + "grad_norm": 0.15567529201507568, + "learning_rate": 0.0009419416953081039, + "loss": 2.8458, + "step": 5511 + }, + { + "epoch": 0.16344927794087122, + "grad_norm": 0.14164398610591888, + "learning_rate": 0.0009419196872718807, + "loss": 2.8707, + "step": 5512 + }, + { + "epoch": 0.1634789312931827, + "grad_norm": 0.16534507274627686, + "learning_rate": 0.0009418976753223827, + "loss": 2.8746, + "step": 5513 + }, + { + "epoch": 0.16350858464549417, + "grad_norm": 0.1709509640932083, + "learning_rate": 0.000941875659459805, + "loss": 2.8565, + "step": 5514 + }, + { + "epoch": 0.16353823799780565, + "grad_norm": 0.18277721107006073, + "learning_rate": 0.0009418536396843425, + "loss": 2.853, + "step": 5515 + }, + { + "epoch": 0.16356789135011712, + "grad_norm": 0.2044457048177719, + "learning_rate": 0.0009418316159961901, + "loss": 2.8732, + "step": 5516 + }, + { + "epoch": 0.1635975447024286, + "grad_norm": 0.17587168514728546, + "learning_rate": 0.000941809588395543, + "loss": 2.8964, + "step": 5517 + }, + { + "epoch": 0.16362719805474008, + "grad_norm": 0.16544541716575623, + "learning_rate": 0.000941787556882596, + "loss": 2.8937, + "step": 5518 + }, + { + "epoch": 0.16365685140705158, + "grad_norm": 0.1657639443874359, + "learning_rate": 0.0009417655214575446, + "loss": 2.8451, + "step": 5519 + }, + { + "epoch": 0.16368650475936306, + "grad_norm": 0.1444130837917328, + "learning_rate": 0.0009417434821205835, + "loss": 2.8632, + "step": 5520 + }, + { + "epoch": 0.16371615811167453, + "grad_norm": 0.14033572375774384, + "learning_rate": 0.0009417214388719081, + "loss": 2.829, + "step": 5521 + }, + { + "epoch": 0.163745811463986, + "grad_norm": 0.15768462419509888, + "learning_rate": 0.0009416993917117136, + "loss": 2.8436, + "step": 5522 + }, + { + "epoch": 0.16377546481629748, + "grad_norm": 0.13945992290973663, + "learning_rate": 0.000941677340640195, + "loss": 2.8852, + "step": 5523 + }, + { + "epoch": 0.16380511816860896, + "grad_norm": 0.14766672253608704, + "learning_rate": 0.0009416552856575478, + "loss": 2.8837, + "step": 5524 + }, + { + "epoch": 0.16383477152092044, + "grad_norm": 0.14988912642002106, + "learning_rate": 0.0009416332267639673, + "loss": 2.8708, + "step": 5525 + }, + { + "epoch": 0.1638644248732319, + "grad_norm": 0.13647420704364777, + "learning_rate": 0.0009416111639596488, + "loss": 2.8326, + "step": 5526 + }, + { + "epoch": 0.1638940782255434, + "grad_norm": 0.13764742016792297, + "learning_rate": 0.0009415890972447876, + "loss": 2.8793, + "step": 5527 + }, + { + "epoch": 0.16392373157785486, + "grad_norm": 0.13511912524700165, + "learning_rate": 0.0009415670266195791, + "loss": 2.8707, + "step": 5528 + }, + { + "epoch": 0.16395338493016637, + "grad_norm": 0.1584736853837967, + "learning_rate": 0.0009415449520842188, + "loss": 2.8593, + "step": 5529 + }, + { + "epoch": 0.16398303828247784, + "grad_norm": 0.12539711594581604, + "learning_rate": 0.0009415228736389021, + "loss": 2.8821, + "step": 5530 + }, + { + "epoch": 0.16401269163478932, + "grad_norm": 0.12202997505664825, + "learning_rate": 0.0009415007912838247, + "loss": 2.8537, + "step": 5531 + }, + { + "epoch": 0.1640423449871008, + "grad_norm": 0.1063196212053299, + "learning_rate": 0.000941478705019182, + "loss": 2.8423, + "step": 5532 + }, + { + "epoch": 0.16407199833941227, + "grad_norm": 0.12010415643453598, + "learning_rate": 0.0009414566148451695, + "loss": 2.8828, + "step": 5533 + }, + { + "epoch": 0.16410165169172375, + "grad_norm": 0.13044746220111847, + "learning_rate": 0.000941434520761983, + "loss": 2.8585, + "step": 5534 + }, + { + "epoch": 0.16413130504403523, + "grad_norm": 0.12451043725013733, + "learning_rate": 0.0009414124227698179, + "loss": 2.8372, + "step": 5535 + }, + { + "epoch": 0.1641609583963467, + "grad_norm": 0.12207259237766266, + "learning_rate": 0.0009413903208688701, + "loss": 2.8292, + "step": 5536 + }, + { + "epoch": 0.16419061174865818, + "grad_norm": 0.11996840685606003, + "learning_rate": 0.0009413682150593352, + "loss": 2.8635, + "step": 5537 + }, + { + "epoch": 0.16422026510096965, + "grad_norm": 0.13120803236961365, + "learning_rate": 0.0009413461053414092, + "loss": 2.849, + "step": 5538 + }, + { + "epoch": 0.16424991845328113, + "grad_norm": 0.14823105931282043, + "learning_rate": 0.0009413239917152875, + "loss": 2.853, + "step": 5539 + }, + { + "epoch": 0.16427957180559263, + "grad_norm": 0.1678457260131836, + "learning_rate": 0.0009413018741811661, + "loss": 2.8607, + "step": 5540 + }, + { + "epoch": 0.1643092251579041, + "grad_norm": 0.16464047133922577, + "learning_rate": 0.0009412797527392409, + "loss": 2.8764, + "step": 5541 + }, + { + "epoch": 0.16433887851021559, + "grad_norm": 0.16686181724071503, + "learning_rate": 0.0009412576273897078, + "loss": 2.8306, + "step": 5542 + }, + { + "epoch": 0.16436853186252706, + "grad_norm": 0.1533130407333374, + "learning_rate": 0.0009412354981327626, + "loss": 2.8721, + "step": 5543 + }, + { + "epoch": 0.16439818521483854, + "grad_norm": 0.1546948403120041, + "learning_rate": 0.0009412133649686012, + "loss": 2.8682, + "step": 5544 + }, + { + "epoch": 0.16442783856715001, + "grad_norm": 0.15283222496509552, + "learning_rate": 0.00094119122789742, + "loss": 2.8661, + "step": 5545 + }, + { + "epoch": 0.1644574919194615, + "grad_norm": 0.1582716703414917, + "learning_rate": 0.0009411690869194145, + "loss": 2.8476, + "step": 5546 + }, + { + "epoch": 0.16448714527177297, + "grad_norm": 0.15763354301452637, + "learning_rate": 0.0009411469420347811, + "loss": 2.7876, + "step": 5547 + }, + { + "epoch": 0.16451679862408444, + "grad_norm": 0.17465533316135406, + "learning_rate": 0.0009411247932437159, + "loss": 2.8824, + "step": 5548 + }, + { + "epoch": 0.16454645197639592, + "grad_norm": 0.182539701461792, + "learning_rate": 0.0009411026405464148, + "loss": 2.8576, + "step": 5549 + }, + { + "epoch": 0.16457610532870742, + "grad_norm": 0.19587448239326477, + "learning_rate": 0.0009410804839430743, + "loss": 2.8646, + "step": 5550 + }, + { + "epoch": 0.1646057586810189, + "grad_norm": 0.1804926097393036, + "learning_rate": 0.0009410583234338901, + "loss": 2.8828, + "step": 5551 + }, + { + "epoch": 0.16463541203333037, + "grad_norm": 0.17844195663928986, + "learning_rate": 0.0009410361590190589, + "loss": 2.8871, + "step": 5552 + }, + { + "epoch": 0.16466506538564185, + "grad_norm": 0.15718494355678558, + "learning_rate": 0.0009410139906987769, + "loss": 2.8368, + "step": 5553 + }, + { + "epoch": 0.16469471873795333, + "grad_norm": 0.17366673052310944, + "learning_rate": 0.0009409918184732402, + "loss": 2.8969, + "step": 5554 + }, + { + "epoch": 0.1647243720902648, + "grad_norm": 0.1851409524679184, + "learning_rate": 0.0009409696423426453, + "loss": 2.8621, + "step": 5555 + }, + { + "epoch": 0.16475402544257628, + "grad_norm": 0.19053447246551514, + "learning_rate": 0.0009409474623071885, + "loss": 2.8477, + "step": 5556 + }, + { + "epoch": 0.16478367879488776, + "grad_norm": 0.16874468326568604, + "learning_rate": 0.0009409252783670662, + "loss": 2.8863, + "step": 5557 + }, + { + "epoch": 0.16481333214719923, + "grad_norm": 0.1508275419473648, + "learning_rate": 0.0009409030905224749, + "loss": 2.8775, + "step": 5558 + }, + { + "epoch": 0.1648429854995107, + "grad_norm": 0.15500736236572266, + "learning_rate": 0.000940880898773611, + "loss": 2.858, + "step": 5559 + }, + { + "epoch": 0.1648726388518222, + "grad_norm": 0.14572712779045105, + "learning_rate": 0.0009408587031206712, + "loss": 2.8661, + "step": 5560 + }, + { + "epoch": 0.1649022922041337, + "grad_norm": 0.1296493113040924, + "learning_rate": 0.0009408365035638519, + "loss": 2.8636, + "step": 5561 + }, + { + "epoch": 0.16493194555644516, + "grad_norm": 0.14487284421920776, + "learning_rate": 0.0009408143001033496, + "loss": 2.8355, + "step": 5562 + }, + { + "epoch": 0.16496159890875664, + "grad_norm": 0.13684186339378357, + "learning_rate": 0.0009407920927393611, + "loss": 2.8626, + "step": 5563 + }, + { + "epoch": 0.16499125226106812, + "grad_norm": 0.13634389638900757, + "learning_rate": 0.0009407698814720829, + "loss": 2.8696, + "step": 5564 + }, + { + "epoch": 0.1650209056133796, + "grad_norm": 0.14308488368988037, + "learning_rate": 0.0009407476663017116, + "loss": 2.8175, + "step": 5565 + }, + { + "epoch": 0.16505055896569107, + "grad_norm": 0.13949479162693024, + "learning_rate": 0.0009407254472284444, + "loss": 2.8524, + "step": 5566 + }, + { + "epoch": 0.16508021231800254, + "grad_norm": 0.1579054296016693, + "learning_rate": 0.0009407032242524774, + "loss": 2.8306, + "step": 5567 + }, + { + "epoch": 0.16510986567031402, + "grad_norm": 0.14761526882648468, + "learning_rate": 0.0009406809973740078, + "loss": 2.8805, + "step": 5568 + }, + { + "epoch": 0.1651395190226255, + "grad_norm": 0.16670304536819458, + "learning_rate": 0.0009406587665932324, + "loss": 2.829, + "step": 5569 + }, + { + "epoch": 0.16516917237493697, + "grad_norm": 0.15842896699905396, + "learning_rate": 0.0009406365319103479, + "loss": 2.8646, + "step": 5570 + }, + { + "epoch": 0.16519882572724848, + "grad_norm": 0.16179920732975006, + "learning_rate": 0.0009406142933255512, + "loss": 2.8937, + "step": 5571 + }, + { + "epoch": 0.16522847907955995, + "grad_norm": 0.1565847098827362, + "learning_rate": 0.0009405920508390395, + "loss": 2.8311, + "step": 5572 + }, + { + "epoch": 0.16525813243187143, + "grad_norm": 0.120486319065094, + "learning_rate": 0.0009405698044510094, + "loss": 2.8573, + "step": 5573 + }, + { + "epoch": 0.1652877857841829, + "grad_norm": 0.11775941401720047, + "learning_rate": 0.0009405475541616582, + "loss": 2.8546, + "step": 5574 + }, + { + "epoch": 0.16531743913649438, + "grad_norm": 0.12276852130889893, + "learning_rate": 0.0009405252999711828, + "loss": 2.8541, + "step": 5575 + }, + { + "epoch": 0.16534709248880586, + "grad_norm": 0.13388971984386444, + "learning_rate": 0.0009405030418797802, + "loss": 2.831, + "step": 5576 + }, + { + "epoch": 0.16537674584111733, + "grad_norm": 0.1299530416727066, + "learning_rate": 0.0009404807798876475, + "loss": 2.855, + "step": 5577 + }, + { + "epoch": 0.1654063991934288, + "grad_norm": 0.14369262754917145, + "learning_rate": 0.0009404585139949819, + "loss": 2.8722, + "step": 5578 + }, + { + "epoch": 0.1654360525457403, + "grad_norm": 0.15504378080368042, + "learning_rate": 0.0009404362442019805, + "loss": 2.8506, + "step": 5579 + }, + { + "epoch": 0.16546570589805176, + "grad_norm": 0.173246368765831, + "learning_rate": 0.0009404139705088407, + "loss": 2.8669, + "step": 5580 + }, + { + "epoch": 0.16549535925036327, + "grad_norm": 0.16419173777103424, + "learning_rate": 0.0009403916929157594, + "loss": 2.8246, + "step": 5581 + }, + { + "epoch": 0.16552501260267474, + "grad_norm": 0.1491875797510147, + "learning_rate": 0.0009403694114229343, + "loss": 2.8754, + "step": 5582 + }, + { + "epoch": 0.16555466595498622, + "grad_norm": 0.15002623200416565, + "learning_rate": 0.0009403471260305624, + "loss": 2.8455, + "step": 5583 + }, + { + "epoch": 0.1655843193072977, + "grad_norm": 0.13886938989162445, + "learning_rate": 0.0009403248367388411, + "loss": 2.8476, + "step": 5584 + }, + { + "epoch": 0.16561397265960917, + "grad_norm": 0.13161207735538483, + "learning_rate": 0.0009403025435479678, + "loss": 2.8425, + "step": 5585 + }, + { + "epoch": 0.16564362601192065, + "grad_norm": 0.14208734035491943, + "learning_rate": 0.0009402802464581397, + "loss": 2.8712, + "step": 5586 + }, + { + "epoch": 0.16567327936423212, + "grad_norm": 0.16387489438056946, + "learning_rate": 0.0009402579454695547, + "loss": 2.8668, + "step": 5587 + }, + { + "epoch": 0.1657029327165436, + "grad_norm": 0.16587390005588531, + "learning_rate": 0.0009402356405824099, + "loss": 2.8664, + "step": 5588 + }, + { + "epoch": 0.16573258606885508, + "grad_norm": 0.15937702357769012, + "learning_rate": 0.0009402133317969031, + "loss": 2.891, + "step": 5589 + }, + { + "epoch": 0.16576223942116655, + "grad_norm": 0.1516825258731842, + "learning_rate": 0.0009401910191132314, + "loss": 2.8401, + "step": 5590 + }, + { + "epoch": 0.16579189277347803, + "grad_norm": 0.1567620187997818, + "learning_rate": 0.0009401687025315928, + "loss": 2.8181, + "step": 5591 + }, + { + "epoch": 0.16582154612578953, + "grad_norm": 0.17159506678581238, + "learning_rate": 0.0009401463820521849, + "loss": 2.8764, + "step": 5592 + }, + { + "epoch": 0.165851199478101, + "grad_norm": 0.1886465847492218, + "learning_rate": 0.0009401240576752052, + "loss": 2.8542, + "step": 5593 + }, + { + "epoch": 0.16588085283041248, + "grad_norm": 0.17393848299980164, + "learning_rate": 0.0009401017294008514, + "loss": 2.8705, + "step": 5594 + }, + { + "epoch": 0.16591050618272396, + "grad_norm": 0.1327427625656128, + "learning_rate": 0.0009400793972293211, + "loss": 2.8844, + "step": 5595 + }, + { + "epoch": 0.16594015953503544, + "grad_norm": 0.14261966943740845, + "learning_rate": 0.0009400570611608123, + "loss": 2.8259, + "step": 5596 + }, + { + "epoch": 0.1659698128873469, + "grad_norm": 0.14103154838085175, + "learning_rate": 0.0009400347211955226, + "loss": 2.8382, + "step": 5597 + }, + { + "epoch": 0.1659994662396584, + "grad_norm": 0.12920056283473969, + "learning_rate": 0.0009400123773336502, + "loss": 2.83, + "step": 5598 + }, + { + "epoch": 0.16602911959196986, + "grad_norm": 0.12774834036827087, + "learning_rate": 0.0009399900295753925, + "loss": 2.8531, + "step": 5599 + }, + { + "epoch": 0.16605877294428134, + "grad_norm": 0.12012398988008499, + "learning_rate": 0.0009399676779209473, + "loss": 2.8683, + "step": 5600 + }, + { + "epoch": 0.16608842629659282, + "grad_norm": 0.12686476111412048, + "learning_rate": 0.0009399453223705132, + "loss": 2.8456, + "step": 5601 + }, + { + "epoch": 0.16611807964890432, + "grad_norm": 0.13206548988819122, + "learning_rate": 0.0009399229629242876, + "loss": 2.863, + "step": 5602 + }, + { + "epoch": 0.1661477330012158, + "grad_norm": 0.15482743084430695, + "learning_rate": 0.0009399005995824687, + "loss": 2.8672, + "step": 5603 + }, + { + "epoch": 0.16617738635352727, + "grad_norm": 0.14891855418682098, + "learning_rate": 0.0009398782323452544, + "loss": 2.8331, + "step": 5604 + }, + { + "epoch": 0.16620703970583875, + "grad_norm": 0.1555085927248001, + "learning_rate": 0.000939855861212843, + "loss": 2.8518, + "step": 5605 + }, + { + "epoch": 0.16623669305815023, + "grad_norm": 0.17198851704597473, + "learning_rate": 0.0009398334861854322, + "loss": 2.8519, + "step": 5606 + }, + { + "epoch": 0.1662663464104617, + "grad_norm": 0.20140047371387482, + "learning_rate": 0.0009398111072632205, + "loss": 2.8836, + "step": 5607 + }, + { + "epoch": 0.16629599976277318, + "grad_norm": 0.1844659298658371, + "learning_rate": 0.0009397887244464061, + "loss": 2.8703, + "step": 5608 + }, + { + "epoch": 0.16632565311508465, + "grad_norm": 0.16477133333683014, + "learning_rate": 0.0009397663377351868, + "loss": 2.8578, + "step": 5609 + }, + { + "epoch": 0.16635530646739613, + "grad_norm": 0.1962490677833557, + "learning_rate": 0.0009397439471297613, + "loss": 2.8465, + "step": 5610 + }, + { + "epoch": 0.1663849598197076, + "grad_norm": 0.1713269203901291, + "learning_rate": 0.0009397215526303276, + "loss": 2.818, + "step": 5611 + }, + { + "epoch": 0.1664146131720191, + "grad_norm": 0.14625422656536102, + "learning_rate": 0.0009396991542370839, + "loss": 2.8444, + "step": 5612 + }, + { + "epoch": 0.1664442665243306, + "grad_norm": 0.1690681129693985, + "learning_rate": 0.0009396767519502289, + "loss": 2.889, + "step": 5613 + }, + { + "epoch": 0.16647391987664206, + "grad_norm": 0.16369694471359253, + "learning_rate": 0.0009396543457699609, + "loss": 2.866, + "step": 5614 + }, + { + "epoch": 0.16650357322895354, + "grad_norm": 0.16575434803962708, + "learning_rate": 0.000939631935696478, + "loss": 2.837, + "step": 5615 + }, + { + "epoch": 0.16653322658126501, + "grad_norm": 0.15387140214443207, + "learning_rate": 0.000939609521729979, + "loss": 2.8469, + "step": 5616 + }, + { + "epoch": 0.1665628799335765, + "grad_norm": 0.15905490517616272, + "learning_rate": 0.000939587103870662, + "loss": 2.8448, + "step": 5617 + }, + { + "epoch": 0.16659253328588797, + "grad_norm": 0.16308146715164185, + "learning_rate": 0.0009395646821187259, + "loss": 2.8611, + "step": 5618 + }, + { + "epoch": 0.16662218663819944, + "grad_norm": 0.18091225624084473, + "learning_rate": 0.0009395422564743691, + "loss": 2.8658, + "step": 5619 + }, + { + "epoch": 0.16665183999051092, + "grad_norm": 0.17988449335098267, + "learning_rate": 0.0009395198269377901, + "loss": 2.8773, + "step": 5620 + }, + { + "epoch": 0.1666814933428224, + "grad_norm": 0.16161279380321503, + "learning_rate": 0.0009394973935091878, + "loss": 2.8789, + "step": 5621 + }, + { + "epoch": 0.16671114669513387, + "grad_norm": 0.1659546196460724, + "learning_rate": 0.0009394749561887604, + "loss": 2.8661, + "step": 5622 + }, + { + "epoch": 0.16674080004744538, + "grad_norm": 0.15387988090515137, + "learning_rate": 0.0009394525149767068, + "loss": 2.8304, + "step": 5623 + }, + { + "epoch": 0.16677045339975685, + "grad_norm": 0.13827645778656006, + "learning_rate": 0.0009394300698732259, + "loss": 2.8385, + "step": 5624 + }, + { + "epoch": 0.16680010675206833, + "grad_norm": 0.15366452932357788, + "learning_rate": 0.0009394076208785163, + "loss": 2.8187, + "step": 5625 + }, + { + "epoch": 0.1668297601043798, + "grad_norm": 0.1450401246547699, + "learning_rate": 0.0009393851679927767, + "loss": 2.8455, + "step": 5626 + }, + { + "epoch": 0.16685941345669128, + "grad_norm": 0.1518460065126419, + "learning_rate": 0.0009393627112162061, + "loss": 2.8718, + "step": 5627 + }, + { + "epoch": 0.16688906680900276, + "grad_norm": 0.17180447280406952, + "learning_rate": 0.0009393402505490032, + "loss": 2.8453, + "step": 5628 + }, + { + "epoch": 0.16691872016131423, + "grad_norm": 0.17192649841308594, + "learning_rate": 0.0009393177859913671, + "loss": 2.845, + "step": 5629 + }, + { + "epoch": 0.1669483735136257, + "grad_norm": 0.15464423596858978, + "learning_rate": 0.0009392953175434964, + "loss": 2.8486, + "step": 5630 + }, + { + "epoch": 0.16697802686593718, + "grad_norm": 0.13724492490291595, + "learning_rate": 0.0009392728452055904, + "loss": 2.8476, + "step": 5631 + }, + { + "epoch": 0.16700768021824866, + "grad_norm": 0.1479918658733368, + "learning_rate": 0.000939250368977848, + "loss": 2.8408, + "step": 5632 + }, + { + "epoch": 0.16703733357056016, + "grad_norm": 0.15878064930438995, + "learning_rate": 0.000939227888860468, + "loss": 2.8727, + "step": 5633 + }, + { + "epoch": 0.16706698692287164, + "grad_norm": 0.15437009930610657, + "learning_rate": 0.0009392054048536498, + "loss": 2.8043, + "step": 5634 + }, + { + "epoch": 0.16709664027518312, + "grad_norm": 0.14798617362976074, + "learning_rate": 0.0009391829169575924, + "loss": 2.8558, + "step": 5635 + }, + { + "epoch": 0.1671262936274946, + "grad_norm": 0.15035949647426605, + "learning_rate": 0.0009391604251724947, + "loss": 2.8502, + "step": 5636 + }, + { + "epoch": 0.16715594697980607, + "grad_norm": 0.13952486217021942, + "learning_rate": 0.0009391379294985563, + "loss": 2.8276, + "step": 5637 + }, + { + "epoch": 0.16718560033211755, + "grad_norm": 0.1468484103679657, + "learning_rate": 0.0009391154299359758, + "loss": 2.8557, + "step": 5638 + }, + { + "epoch": 0.16721525368442902, + "grad_norm": 0.1417829394340515, + "learning_rate": 0.0009390929264849532, + "loss": 2.8847, + "step": 5639 + }, + { + "epoch": 0.1672449070367405, + "grad_norm": 0.1458262950181961, + "learning_rate": 0.0009390704191456871, + "loss": 2.8556, + "step": 5640 + }, + { + "epoch": 0.16727456038905197, + "grad_norm": 0.16023507714271545, + "learning_rate": 0.0009390479079183771, + "loss": 2.8053, + "step": 5641 + }, + { + "epoch": 0.16730421374136345, + "grad_norm": 0.1548347771167755, + "learning_rate": 0.0009390253928032226, + "loss": 2.8371, + "step": 5642 + }, + { + "epoch": 0.16733386709367493, + "grad_norm": 0.13534559309482574, + "learning_rate": 0.0009390028738004228, + "loss": 2.8836, + "step": 5643 + }, + { + "epoch": 0.16736352044598643, + "grad_norm": 0.14571663737297058, + "learning_rate": 0.0009389803509101773, + "loss": 2.8802, + "step": 5644 + }, + { + "epoch": 0.1673931737982979, + "grad_norm": 0.14622311294078827, + "learning_rate": 0.0009389578241326855, + "loss": 2.8245, + "step": 5645 + }, + { + "epoch": 0.16742282715060938, + "grad_norm": 0.13825562596321106, + "learning_rate": 0.0009389352934681467, + "loss": 2.8577, + "step": 5646 + }, + { + "epoch": 0.16745248050292086, + "grad_norm": 0.12296177446842194, + "learning_rate": 0.0009389127589167606, + "loss": 2.8898, + "step": 5647 + }, + { + "epoch": 0.16748213385523233, + "grad_norm": 0.14004626870155334, + "learning_rate": 0.0009388902204787265, + "loss": 2.8692, + "step": 5648 + }, + { + "epoch": 0.1675117872075438, + "grad_norm": 0.14281640946865082, + "learning_rate": 0.0009388676781542443, + "loss": 2.8341, + "step": 5649 + }, + { + "epoch": 0.1675414405598553, + "grad_norm": 0.14252835512161255, + "learning_rate": 0.0009388451319435135, + "loss": 2.8541, + "step": 5650 + }, + { + "epoch": 0.16757109391216676, + "grad_norm": 0.1405470222234726, + "learning_rate": 0.0009388225818467337, + "loss": 2.8435, + "step": 5651 + }, + { + "epoch": 0.16760074726447824, + "grad_norm": 0.14205531775951385, + "learning_rate": 0.0009388000278641046, + "loss": 2.8612, + "step": 5652 + }, + { + "epoch": 0.16763040061678972, + "grad_norm": 0.1399334967136383, + "learning_rate": 0.000938777469995826, + "loss": 2.8832, + "step": 5653 + }, + { + "epoch": 0.16766005396910122, + "grad_norm": 0.1742004156112671, + "learning_rate": 0.0009387549082420975, + "loss": 2.8237, + "step": 5654 + }, + { + "epoch": 0.1676897073214127, + "grad_norm": 0.18150609731674194, + "learning_rate": 0.000938732342603119, + "loss": 2.8768, + "step": 5655 + }, + { + "epoch": 0.16771936067372417, + "grad_norm": 0.17252640426158905, + "learning_rate": 0.0009387097730790904, + "loss": 2.8492, + "step": 5656 + }, + { + "epoch": 0.16774901402603565, + "grad_norm": 0.1730637401342392, + "learning_rate": 0.0009386871996702114, + "loss": 2.865, + "step": 5657 + }, + { + "epoch": 0.16777866737834712, + "grad_norm": 0.17131733894348145, + "learning_rate": 0.0009386646223766818, + "loss": 2.8272, + "step": 5658 + }, + { + "epoch": 0.1678083207306586, + "grad_norm": 0.19367463886737823, + "learning_rate": 0.0009386420411987017, + "loss": 2.8221, + "step": 5659 + }, + { + "epoch": 0.16783797408297008, + "grad_norm": 0.20180614292621613, + "learning_rate": 0.0009386194561364712, + "loss": 2.8848, + "step": 5660 + }, + { + "epoch": 0.16786762743528155, + "grad_norm": 0.15244804322719574, + "learning_rate": 0.0009385968671901901, + "loss": 2.8574, + "step": 5661 + }, + { + "epoch": 0.16789728078759303, + "grad_norm": 0.1501014083623886, + "learning_rate": 0.0009385742743600584, + "loss": 2.8361, + "step": 5662 + }, + { + "epoch": 0.1679269341399045, + "grad_norm": 0.14899373054504395, + "learning_rate": 0.0009385516776462761, + "loss": 2.8728, + "step": 5663 + }, + { + "epoch": 0.167956587492216, + "grad_norm": 0.15038946270942688, + "learning_rate": 0.0009385290770490437, + "loss": 2.8923, + "step": 5664 + }, + { + "epoch": 0.16798624084452748, + "grad_norm": 0.16507002711296082, + "learning_rate": 0.0009385064725685608, + "loss": 2.8372, + "step": 5665 + }, + { + "epoch": 0.16801589419683896, + "grad_norm": 0.14406080543994904, + "learning_rate": 0.000938483864205028, + "loss": 2.8653, + "step": 5666 + }, + { + "epoch": 0.16804554754915044, + "grad_norm": 0.16779085993766785, + "learning_rate": 0.0009384612519586453, + "loss": 2.8293, + "step": 5667 + }, + { + "epoch": 0.1680752009014619, + "grad_norm": 0.13676944375038147, + "learning_rate": 0.0009384386358296128, + "loss": 2.8812, + "step": 5668 + }, + { + "epoch": 0.1681048542537734, + "grad_norm": 0.13356506824493408, + "learning_rate": 0.000938416015818131, + "loss": 2.8801, + "step": 5669 + }, + { + "epoch": 0.16813450760608487, + "grad_norm": 0.13791891932487488, + "learning_rate": 0.0009383933919244001, + "loss": 2.8583, + "step": 5670 + }, + { + "epoch": 0.16816416095839634, + "grad_norm": 0.14865751564502716, + "learning_rate": 0.0009383707641486206, + "loss": 2.8558, + "step": 5671 + }, + { + "epoch": 0.16819381431070782, + "grad_norm": 0.141954243183136, + "learning_rate": 0.0009383481324909926, + "loss": 2.85, + "step": 5672 + }, + { + "epoch": 0.1682234676630193, + "grad_norm": 0.14241951704025269, + "learning_rate": 0.0009383254969517167, + "loss": 2.8651, + "step": 5673 + }, + { + "epoch": 0.16825312101533077, + "grad_norm": 0.12342415750026703, + "learning_rate": 0.0009383028575309932, + "loss": 2.8034, + "step": 5674 + }, + { + "epoch": 0.16828277436764227, + "grad_norm": 0.1219186782836914, + "learning_rate": 0.0009382802142290228, + "loss": 2.8427, + "step": 5675 + }, + { + "epoch": 0.16831242771995375, + "grad_norm": 0.1339523196220398, + "learning_rate": 0.0009382575670460057, + "loss": 2.8639, + "step": 5676 + }, + { + "epoch": 0.16834208107226523, + "grad_norm": 0.14085952937602997, + "learning_rate": 0.0009382349159821428, + "loss": 2.8279, + "step": 5677 + }, + { + "epoch": 0.1683717344245767, + "grad_norm": 0.169111967086792, + "learning_rate": 0.0009382122610376344, + "loss": 2.8912, + "step": 5678 + }, + { + "epoch": 0.16840138777688818, + "grad_norm": 0.17833556234836578, + "learning_rate": 0.0009381896022126813, + "loss": 2.8739, + "step": 5679 + }, + { + "epoch": 0.16843104112919965, + "grad_norm": 0.17093892395496368, + "learning_rate": 0.0009381669395074839, + "loss": 2.8629, + "step": 5680 + }, + { + "epoch": 0.16846069448151113, + "grad_norm": 0.19806943833827972, + "learning_rate": 0.0009381442729222431, + "loss": 2.8575, + "step": 5681 + }, + { + "epoch": 0.1684903478338226, + "grad_norm": 0.18647459149360657, + "learning_rate": 0.0009381216024571596, + "loss": 2.8607, + "step": 5682 + }, + { + "epoch": 0.16852000118613408, + "grad_norm": 0.1583806574344635, + "learning_rate": 0.0009380989281124342, + "loss": 2.8576, + "step": 5683 + }, + { + "epoch": 0.16854965453844556, + "grad_norm": 0.14359897375106812, + "learning_rate": 0.0009380762498882673, + "loss": 2.8789, + "step": 5684 + }, + { + "epoch": 0.16857930789075706, + "grad_norm": 0.12109455466270447, + "learning_rate": 0.0009380535677848603, + "loss": 2.8645, + "step": 5685 + }, + { + "epoch": 0.16860896124306854, + "grad_norm": 0.1262364238500595, + "learning_rate": 0.0009380308818024137, + "loss": 2.8834, + "step": 5686 + }, + { + "epoch": 0.16863861459538002, + "grad_norm": 0.13104073703289032, + "learning_rate": 0.0009380081919411284, + "loss": 2.8266, + "step": 5687 + }, + { + "epoch": 0.1686682679476915, + "grad_norm": 0.12339514493942261, + "learning_rate": 0.0009379854982012053, + "loss": 2.8208, + "step": 5688 + }, + { + "epoch": 0.16869792130000297, + "grad_norm": 0.1170005053281784, + "learning_rate": 0.0009379628005828455, + "loss": 2.819, + "step": 5689 + }, + { + "epoch": 0.16872757465231444, + "grad_norm": 0.14269830286502838, + "learning_rate": 0.0009379400990862501, + "loss": 2.8428, + "step": 5690 + }, + { + "epoch": 0.16875722800462592, + "grad_norm": 0.1678055375814438, + "learning_rate": 0.0009379173937116198, + "loss": 2.8204, + "step": 5691 + }, + { + "epoch": 0.1687868813569374, + "grad_norm": 0.17176051437854767, + "learning_rate": 0.0009378946844591558, + "loss": 2.8281, + "step": 5692 + }, + { + "epoch": 0.16881653470924887, + "grad_norm": 0.1466618776321411, + "learning_rate": 0.0009378719713290592, + "loss": 2.8407, + "step": 5693 + }, + { + "epoch": 0.16884618806156035, + "grad_norm": 0.1392417699098587, + "learning_rate": 0.0009378492543215311, + "loss": 2.8601, + "step": 5694 + }, + { + "epoch": 0.16887584141387182, + "grad_norm": 0.13533516228199005, + "learning_rate": 0.0009378265334367728, + "loss": 2.8367, + "step": 5695 + }, + { + "epoch": 0.16890549476618333, + "grad_norm": 0.1568821668624878, + "learning_rate": 0.0009378038086749853, + "loss": 2.8417, + "step": 5696 + }, + { + "epoch": 0.1689351481184948, + "grad_norm": 0.16882359981536865, + "learning_rate": 0.00093778108003637, + "loss": 2.8611, + "step": 5697 + }, + { + "epoch": 0.16896480147080628, + "grad_norm": 0.16012470424175262, + "learning_rate": 0.0009377583475211281, + "loss": 2.8383, + "step": 5698 + }, + { + "epoch": 0.16899445482311776, + "grad_norm": 0.15007001161575317, + "learning_rate": 0.0009377356111294608, + "loss": 2.8348, + "step": 5699 + }, + { + "epoch": 0.16902410817542923, + "grad_norm": 0.15729573369026184, + "learning_rate": 0.0009377128708615696, + "loss": 2.8642, + "step": 5700 + }, + { + "epoch": 0.1690537615277407, + "grad_norm": 0.1540326476097107, + "learning_rate": 0.0009376901267176558, + "loss": 2.8326, + "step": 5701 + }, + { + "epoch": 0.16908341488005219, + "grad_norm": 0.1697121560573578, + "learning_rate": 0.0009376673786979209, + "loss": 2.8448, + "step": 5702 + }, + { + "epoch": 0.16911306823236366, + "grad_norm": 0.17175325751304626, + "learning_rate": 0.000937644626802566, + "loss": 2.8495, + "step": 5703 + }, + { + "epoch": 0.16914272158467514, + "grad_norm": 0.165540874004364, + "learning_rate": 0.0009376218710317929, + "loss": 2.8735, + "step": 5704 + }, + { + "epoch": 0.1691723749369866, + "grad_norm": 0.17086978256702423, + "learning_rate": 0.0009375991113858031, + "loss": 2.842, + "step": 5705 + }, + { + "epoch": 0.16920202828929812, + "grad_norm": 0.14779040217399597, + "learning_rate": 0.000937576347864798, + "loss": 2.8839, + "step": 5706 + }, + { + "epoch": 0.1692316816416096, + "grad_norm": 0.1390128880739212, + "learning_rate": 0.0009375535804689792, + "loss": 2.8628, + "step": 5707 + }, + { + "epoch": 0.16926133499392107, + "grad_norm": 0.15195806324481964, + "learning_rate": 0.0009375308091985483, + "loss": 2.8407, + "step": 5708 + }, + { + "epoch": 0.16929098834623255, + "grad_norm": 0.15355810523033142, + "learning_rate": 0.0009375080340537072, + "loss": 2.842, + "step": 5709 + }, + { + "epoch": 0.16932064169854402, + "grad_norm": 0.1457023024559021, + "learning_rate": 0.0009374852550346572, + "loss": 2.8894, + "step": 5710 + }, + { + "epoch": 0.1693502950508555, + "grad_norm": 0.14017260074615479, + "learning_rate": 0.0009374624721416001, + "loss": 2.8082, + "step": 5711 + }, + { + "epoch": 0.16937994840316697, + "grad_norm": 0.12936533987522125, + "learning_rate": 0.0009374396853747378, + "loss": 2.8609, + "step": 5712 + }, + { + "epoch": 0.16940960175547845, + "grad_norm": 0.12353818863630295, + "learning_rate": 0.0009374168947342721, + "loss": 2.8344, + "step": 5713 + }, + { + "epoch": 0.16943925510778993, + "grad_norm": 0.1199188083410263, + "learning_rate": 0.0009373941002204046, + "loss": 2.8527, + "step": 5714 + }, + { + "epoch": 0.1694689084601014, + "grad_norm": 0.12282121926546097, + "learning_rate": 0.0009373713018333373, + "loss": 2.8559, + "step": 5715 + }, + { + "epoch": 0.1694985618124129, + "grad_norm": 0.12489745765924454, + "learning_rate": 0.000937348499573272, + "loss": 2.8641, + "step": 5716 + }, + { + "epoch": 0.16952821516472438, + "grad_norm": 0.12341960519552231, + "learning_rate": 0.0009373256934404107, + "loss": 2.8742, + "step": 5717 + }, + { + "epoch": 0.16955786851703586, + "grad_norm": 0.12469452619552612, + "learning_rate": 0.0009373028834349554, + "loss": 2.8549, + "step": 5718 + }, + { + "epoch": 0.16958752186934734, + "grad_norm": 0.11770682781934738, + "learning_rate": 0.000937280069557108, + "loss": 2.8438, + "step": 5719 + }, + { + "epoch": 0.1696171752216588, + "grad_norm": 0.12080696225166321, + "learning_rate": 0.0009372572518070704, + "loss": 2.8511, + "step": 5720 + }, + { + "epoch": 0.1696468285739703, + "grad_norm": 0.11635817587375641, + "learning_rate": 0.0009372344301850448, + "loss": 2.8501, + "step": 5721 + }, + { + "epoch": 0.16967648192628176, + "grad_norm": 0.1465320885181427, + "learning_rate": 0.0009372116046912334, + "loss": 2.858, + "step": 5722 + }, + { + "epoch": 0.16970613527859324, + "grad_norm": 0.15115326642990112, + "learning_rate": 0.0009371887753258379, + "loss": 2.8287, + "step": 5723 + }, + { + "epoch": 0.16973578863090472, + "grad_norm": 0.1328778862953186, + "learning_rate": 0.0009371659420890611, + "loss": 2.8555, + "step": 5724 + }, + { + "epoch": 0.1697654419832162, + "grad_norm": 0.14722087979316711, + "learning_rate": 0.0009371431049811046, + "loss": 2.8529, + "step": 5725 + }, + { + "epoch": 0.16979509533552767, + "grad_norm": 0.16188423335552216, + "learning_rate": 0.000937120264002171, + "loss": 2.8546, + "step": 5726 + }, + { + "epoch": 0.16982474868783917, + "grad_norm": 0.17555510997772217, + "learning_rate": 0.0009370974191524624, + "loss": 2.8135, + "step": 5727 + }, + { + "epoch": 0.16985440204015065, + "grad_norm": 0.20126350224018097, + "learning_rate": 0.0009370745704321812, + "loss": 2.8995, + "step": 5728 + }, + { + "epoch": 0.16988405539246212, + "grad_norm": 0.22794729471206665, + "learning_rate": 0.0009370517178415295, + "loss": 2.8812, + "step": 5729 + }, + { + "epoch": 0.1699137087447736, + "grad_norm": 0.21295876801013947, + "learning_rate": 0.0009370288613807098, + "loss": 2.8738, + "step": 5730 + }, + { + "epoch": 0.16994336209708508, + "grad_norm": 0.19264450669288635, + "learning_rate": 0.0009370060010499247, + "loss": 2.8387, + "step": 5731 + }, + { + "epoch": 0.16997301544939655, + "grad_norm": 0.17121832072734833, + "learning_rate": 0.0009369831368493764, + "loss": 2.8486, + "step": 5732 + }, + { + "epoch": 0.17000266880170803, + "grad_norm": 0.15513473749160767, + "learning_rate": 0.0009369602687792673, + "loss": 2.8788, + "step": 5733 + }, + { + "epoch": 0.1700323221540195, + "grad_norm": 0.1718519628047943, + "learning_rate": 0.0009369373968398002, + "loss": 2.8621, + "step": 5734 + }, + { + "epoch": 0.17006197550633098, + "grad_norm": 0.15181182324886322, + "learning_rate": 0.0009369145210311774, + "loss": 2.8515, + "step": 5735 + }, + { + "epoch": 0.17009162885864246, + "grad_norm": 0.1474560797214508, + "learning_rate": 0.0009368916413536014, + "loss": 2.8461, + "step": 5736 + }, + { + "epoch": 0.17012128221095396, + "grad_norm": 0.1221628487110138, + "learning_rate": 0.000936868757807275, + "loss": 2.8663, + "step": 5737 + }, + { + "epoch": 0.17015093556326544, + "grad_norm": 0.1453016698360443, + "learning_rate": 0.0009368458703924008, + "loss": 2.8115, + "step": 5738 + }, + { + "epoch": 0.1701805889155769, + "grad_norm": 0.12897585332393646, + "learning_rate": 0.0009368229791091813, + "loss": 2.8492, + "step": 5739 + }, + { + "epoch": 0.1702102422678884, + "grad_norm": 0.14123450219631195, + "learning_rate": 0.0009368000839578194, + "loss": 2.862, + "step": 5740 + }, + { + "epoch": 0.17023989562019987, + "grad_norm": 0.13389356434345245, + "learning_rate": 0.0009367771849385178, + "loss": 2.8427, + "step": 5741 + }, + { + "epoch": 0.17026954897251134, + "grad_norm": 0.13555388152599335, + "learning_rate": 0.0009367542820514794, + "loss": 2.8747, + "step": 5742 + }, + { + "epoch": 0.17029920232482282, + "grad_norm": 0.125331312417984, + "learning_rate": 0.0009367313752969066, + "loss": 2.861, + "step": 5743 + }, + { + "epoch": 0.1703288556771343, + "grad_norm": 0.12483334541320801, + "learning_rate": 0.0009367084646750029, + "loss": 2.862, + "step": 5744 + }, + { + "epoch": 0.17035850902944577, + "grad_norm": 0.12965965270996094, + "learning_rate": 0.0009366855501859704, + "loss": 2.8513, + "step": 5745 + }, + { + "epoch": 0.17038816238175725, + "grad_norm": 0.1513628512620926, + "learning_rate": 0.0009366626318300125, + "loss": 2.855, + "step": 5746 + }, + { + "epoch": 0.17041781573406872, + "grad_norm": 0.1730382889509201, + "learning_rate": 0.0009366397096073321, + "loss": 2.8766, + "step": 5747 + }, + { + "epoch": 0.17044746908638023, + "grad_norm": 0.20714245736598969, + "learning_rate": 0.000936616783518132, + "loss": 2.8501, + "step": 5748 + }, + { + "epoch": 0.1704771224386917, + "grad_norm": 0.2179904282093048, + "learning_rate": 0.0009365938535626155, + "loss": 2.8887, + "step": 5749 + }, + { + "epoch": 0.17050677579100318, + "grad_norm": 0.2183627039194107, + "learning_rate": 0.0009365709197409854, + "loss": 2.86, + "step": 5750 + }, + { + "epoch": 0.17053642914331466, + "grad_norm": 0.18320107460021973, + "learning_rate": 0.0009365479820534448, + "loss": 2.8371, + "step": 5751 + }, + { + "epoch": 0.17056608249562613, + "grad_norm": 0.18035665154457092, + "learning_rate": 0.0009365250405001971, + "loss": 2.8387, + "step": 5752 + }, + { + "epoch": 0.1705957358479376, + "grad_norm": 0.17472240328788757, + "learning_rate": 0.000936502095081445, + "loss": 2.8244, + "step": 5753 + }, + { + "epoch": 0.17062538920024908, + "grad_norm": 0.14225083589553833, + "learning_rate": 0.000936479145797392, + "loss": 2.8506, + "step": 5754 + }, + { + "epoch": 0.17065504255256056, + "grad_norm": 0.14982353150844574, + "learning_rate": 0.0009364561926482413, + "loss": 2.8725, + "step": 5755 + }, + { + "epoch": 0.17068469590487204, + "grad_norm": 0.15002897381782532, + "learning_rate": 0.0009364332356341962, + "loss": 2.8753, + "step": 5756 + }, + { + "epoch": 0.1707143492571835, + "grad_norm": 0.13041375577449799, + "learning_rate": 0.0009364102747554597, + "loss": 2.844, + "step": 5757 + }, + { + "epoch": 0.17074400260949502, + "grad_norm": 0.1350855529308319, + "learning_rate": 0.0009363873100122353, + "loss": 2.8737, + "step": 5758 + }, + { + "epoch": 0.1707736559618065, + "grad_norm": 0.14947594702243805, + "learning_rate": 0.0009363643414047265, + "loss": 2.8462, + "step": 5759 + }, + { + "epoch": 0.17080330931411797, + "grad_norm": 0.14068350195884705, + "learning_rate": 0.0009363413689331365, + "loss": 2.8323, + "step": 5760 + }, + { + "epoch": 0.17083296266642944, + "grad_norm": 0.13310185074806213, + "learning_rate": 0.0009363183925976687, + "loss": 2.8288, + "step": 5761 + }, + { + "epoch": 0.17086261601874092, + "grad_norm": 0.1242053434252739, + "learning_rate": 0.0009362954123985268, + "loss": 2.8436, + "step": 5762 + }, + { + "epoch": 0.1708922693710524, + "grad_norm": 0.13118809461593628, + "learning_rate": 0.000936272428335914, + "loss": 2.8566, + "step": 5763 + }, + { + "epoch": 0.17092192272336387, + "grad_norm": 0.12941202521324158, + "learning_rate": 0.0009362494404100339, + "loss": 2.8438, + "step": 5764 + }, + { + "epoch": 0.17095157607567535, + "grad_norm": 0.12423509359359741, + "learning_rate": 0.0009362264486210903, + "loss": 2.8063, + "step": 5765 + }, + { + "epoch": 0.17098122942798682, + "grad_norm": 0.13594277203083038, + "learning_rate": 0.0009362034529692866, + "loss": 2.8961, + "step": 5766 + }, + { + "epoch": 0.1710108827802983, + "grad_norm": 0.14923693239688873, + "learning_rate": 0.0009361804534548264, + "loss": 2.8466, + "step": 5767 + }, + { + "epoch": 0.1710405361326098, + "grad_norm": 0.15917442739009857, + "learning_rate": 0.0009361574500779133, + "loss": 2.8519, + "step": 5768 + }, + { + "epoch": 0.17107018948492128, + "grad_norm": 0.14834439754486084, + "learning_rate": 0.0009361344428387513, + "loss": 2.8289, + "step": 5769 + }, + { + "epoch": 0.17109984283723276, + "grad_norm": 0.14278052747249603, + "learning_rate": 0.0009361114317375438, + "loss": 2.8047, + "step": 5770 + }, + { + "epoch": 0.17112949618954423, + "grad_norm": 0.16503015160560608, + "learning_rate": 0.0009360884167744949, + "loss": 2.8524, + "step": 5771 + }, + { + "epoch": 0.1711591495418557, + "grad_norm": 0.20243121683597565, + "learning_rate": 0.000936065397949808, + "loss": 2.8612, + "step": 5772 + }, + { + "epoch": 0.17118880289416719, + "grad_norm": 0.20522288978099823, + "learning_rate": 0.0009360423752636873, + "loss": 2.8718, + "step": 5773 + }, + { + "epoch": 0.17121845624647866, + "grad_norm": 0.15345577895641327, + "learning_rate": 0.0009360193487163365, + "loss": 2.8286, + "step": 5774 + }, + { + "epoch": 0.17124810959879014, + "grad_norm": 0.13813281059265137, + "learning_rate": 0.0009359963183079596, + "loss": 2.8295, + "step": 5775 + }, + { + "epoch": 0.17127776295110161, + "grad_norm": 0.14100214838981628, + "learning_rate": 0.0009359732840387603, + "loss": 2.8451, + "step": 5776 + }, + { + "epoch": 0.1713074163034131, + "grad_norm": 0.14029933512210846, + "learning_rate": 0.0009359502459089428, + "loss": 2.8796, + "step": 5777 + }, + { + "epoch": 0.17133706965572457, + "grad_norm": 0.15232232213020325, + "learning_rate": 0.000935927203918711, + "loss": 2.8536, + "step": 5778 + }, + { + "epoch": 0.17136672300803607, + "grad_norm": 0.16073188185691833, + "learning_rate": 0.000935904158068269, + "loss": 2.8939, + "step": 5779 + }, + { + "epoch": 0.17139637636034755, + "grad_norm": 0.17852699756622314, + "learning_rate": 0.0009358811083578209, + "loss": 2.8537, + "step": 5780 + }, + { + "epoch": 0.17142602971265902, + "grad_norm": 0.16660034656524658, + "learning_rate": 0.0009358580547875708, + "loss": 2.8411, + "step": 5781 + }, + { + "epoch": 0.1714556830649705, + "grad_norm": 0.12331932783126831, + "learning_rate": 0.0009358349973577227, + "loss": 2.8379, + "step": 5782 + }, + { + "epoch": 0.17148533641728197, + "grad_norm": 0.1333582103252411, + "learning_rate": 0.000935811936068481, + "loss": 2.8696, + "step": 5783 + }, + { + "epoch": 0.17151498976959345, + "grad_norm": 0.1281343251466751, + "learning_rate": 0.0009357888709200497, + "loss": 2.855, + "step": 5784 + }, + { + "epoch": 0.17154464312190493, + "grad_norm": 0.1480851173400879, + "learning_rate": 0.0009357658019126333, + "loss": 2.8791, + "step": 5785 + }, + { + "epoch": 0.1715742964742164, + "grad_norm": 0.1496221125125885, + "learning_rate": 0.0009357427290464358, + "loss": 2.8505, + "step": 5786 + }, + { + "epoch": 0.17160394982652788, + "grad_norm": 0.1390073299407959, + "learning_rate": 0.0009357196523216616, + "loss": 2.8251, + "step": 5787 + }, + { + "epoch": 0.17163360317883936, + "grad_norm": 0.14822402596473694, + "learning_rate": 0.0009356965717385152, + "loss": 2.8401, + "step": 5788 + }, + { + "epoch": 0.17166325653115086, + "grad_norm": 0.1335679441690445, + "learning_rate": 0.0009356734872972008, + "loss": 2.8528, + "step": 5789 + }, + { + "epoch": 0.17169290988346234, + "grad_norm": 0.1416986584663391, + "learning_rate": 0.0009356503989979229, + "loss": 2.861, + "step": 5790 + }, + { + "epoch": 0.1717225632357738, + "grad_norm": 0.149526447057724, + "learning_rate": 0.000935627306840886, + "loss": 2.8897, + "step": 5791 + }, + { + "epoch": 0.1717522165880853, + "grad_norm": 0.1739521622657776, + "learning_rate": 0.0009356042108262945, + "loss": 2.8755, + "step": 5792 + }, + { + "epoch": 0.17178186994039676, + "grad_norm": 0.1622728854417801, + "learning_rate": 0.0009355811109543528, + "loss": 2.8455, + "step": 5793 + }, + { + "epoch": 0.17181152329270824, + "grad_norm": 0.1625983715057373, + "learning_rate": 0.0009355580072252658, + "loss": 2.8538, + "step": 5794 + }, + { + "epoch": 0.17184117664501972, + "grad_norm": 0.1589575558900833, + "learning_rate": 0.0009355348996392378, + "loss": 2.8466, + "step": 5795 + }, + { + "epoch": 0.1718708299973312, + "grad_norm": 0.14263378083705902, + "learning_rate": 0.0009355117881964735, + "loss": 2.835, + "step": 5796 + }, + { + "epoch": 0.17190048334964267, + "grad_norm": 0.1628137081861496, + "learning_rate": 0.0009354886728971776, + "loss": 2.8736, + "step": 5797 + }, + { + "epoch": 0.17193013670195414, + "grad_norm": 0.1661635786294937, + "learning_rate": 0.0009354655537415546, + "loss": 2.865, + "step": 5798 + }, + { + "epoch": 0.17195979005426562, + "grad_norm": 0.14024192094802856, + "learning_rate": 0.0009354424307298095, + "loss": 2.8428, + "step": 5799 + }, + { + "epoch": 0.17198944340657712, + "grad_norm": 0.13422024250030518, + "learning_rate": 0.000935419303862147, + "loss": 2.8375, + "step": 5800 + }, + { + "epoch": 0.1720190967588886, + "grad_norm": 0.16978482902050018, + "learning_rate": 0.0009353961731387717, + "loss": 2.8542, + "step": 5801 + }, + { + "epoch": 0.17204875011120008, + "grad_norm": 0.16295531392097473, + "learning_rate": 0.0009353730385598887, + "loss": 2.7972, + "step": 5802 + }, + { + "epoch": 0.17207840346351155, + "grad_norm": 0.15191160142421722, + "learning_rate": 0.0009353499001257025, + "loss": 2.8336, + "step": 5803 + }, + { + "epoch": 0.17210805681582303, + "grad_norm": 0.12305399775505066, + "learning_rate": 0.0009353267578364184, + "loss": 2.8517, + "step": 5804 + }, + { + "epoch": 0.1721377101681345, + "grad_norm": 0.13026559352874756, + "learning_rate": 0.000935303611692241, + "loss": 2.8673, + "step": 5805 + }, + { + "epoch": 0.17216736352044598, + "grad_norm": 0.13518886268138885, + "learning_rate": 0.0009352804616933754, + "loss": 2.8871, + "step": 5806 + }, + { + "epoch": 0.17219701687275746, + "grad_norm": 0.16884014010429382, + "learning_rate": 0.0009352573078400267, + "loss": 2.8363, + "step": 5807 + }, + { + "epoch": 0.17222667022506893, + "grad_norm": 0.15338559448719025, + "learning_rate": 0.0009352341501323997, + "loss": 2.8299, + "step": 5808 + }, + { + "epoch": 0.1722563235773804, + "grad_norm": 0.15033315122127533, + "learning_rate": 0.0009352109885706997, + "loss": 2.8581, + "step": 5809 + }, + { + "epoch": 0.17228597692969191, + "grad_norm": 0.15992820262908936, + "learning_rate": 0.0009351878231551317, + "loss": 2.8388, + "step": 5810 + }, + { + "epoch": 0.1723156302820034, + "grad_norm": 0.1405440717935562, + "learning_rate": 0.0009351646538859009, + "loss": 2.8552, + "step": 5811 + }, + { + "epoch": 0.17234528363431487, + "grad_norm": 0.14412744343280792, + "learning_rate": 0.0009351414807632121, + "loss": 2.8549, + "step": 5812 + }, + { + "epoch": 0.17237493698662634, + "grad_norm": 0.141239732503891, + "learning_rate": 0.000935118303787271, + "loss": 2.8581, + "step": 5813 + }, + { + "epoch": 0.17240459033893782, + "grad_norm": 0.17118829488754272, + "learning_rate": 0.0009350951229582827, + "loss": 2.8226, + "step": 5814 + }, + { + "epoch": 0.1724342436912493, + "grad_norm": 0.18337589502334595, + "learning_rate": 0.0009350719382764523, + "loss": 2.8548, + "step": 5815 + }, + { + "epoch": 0.17246389704356077, + "grad_norm": 0.1892751008272171, + "learning_rate": 0.0009350487497419852, + "loss": 2.8949, + "step": 5816 + }, + { + "epoch": 0.17249355039587225, + "grad_norm": 0.13920576870441437, + "learning_rate": 0.0009350255573550868, + "loss": 2.8549, + "step": 5817 + }, + { + "epoch": 0.17252320374818372, + "grad_norm": 0.13505172729492188, + "learning_rate": 0.0009350023611159624, + "loss": 2.8241, + "step": 5818 + }, + { + "epoch": 0.1725528571004952, + "grad_norm": 0.15165828168392181, + "learning_rate": 0.0009349791610248175, + "loss": 2.8198, + "step": 5819 + }, + { + "epoch": 0.1725825104528067, + "grad_norm": 0.1457066833972931, + "learning_rate": 0.0009349559570818574, + "loss": 2.8418, + "step": 5820 + }, + { + "epoch": 0.17261216380511818, + "grad_norm": 0.15351338684558868, + "learning_rate": 0.0009349327492872876, + "loss": 2.863, + "step": 5821 + }, + { + "epoch": 0.17264181715742966, + "grad_norm": 0.15473689138889313, + "learning_rate": 0.0009349095376413137, + "loss": 2.8568, + "step": 5822 + }, + { + "epoch": 0.17267147050974113, + "grad_norm": 0.1781812459230423, + "learning_rate": 0.000934886322144141, + "loss": 2.8096, + "step": 5823 + }, + { + "epoch": 0.1727011238620526, + "grad_norm": 0.19554664194583893, + "learning_rate": 0.0009348631027959755, + "loss": 2.8405, + "step": 5824 + }, + { + "epoch": 0.17273077721436408, + "grad_norm": 0.189342200756073, + "learning_rate": 0.0009348398795970225, + "loss": 2.8611, + "step": 5825 + }, + { + "epoch": 0.17276043056667556, + "grad_norm": 0.18544241786003113, + "learning_rate": 0.0009348166525474878, + "loss": 2.8187, + "step": 5826 + }, + { + "epoch": 0.17279008391898704, + "grad_norm": 0.14489926397800446, + "learning_rate": 0.0009347934216475769, + "loss": 2.8449, + "step": 5827 + }, + { + "epoch": 0.1728197372712985, + "grad_norm": 0.14723508059978485, + "learning_rate": 0.0009347701868974959, + "loss": 2.8652, + "step": 5828 + }, + { + "epoch": 0.17284939062361, + "grad_norm": 0.1380547285079956, + "learning_rate": 0.0009347469482974499, + "loss": 2.8512, + "step": 5829 + }, + { + "epoch": 0.17287904397592146, + "grad_norm": 0.1341559886932373, + "learning_rate": 0.0009347237058476452, + "loss": 2.8051, + "step": 5830 + }, + { + "epoch": 0.17290869732823297, + "grad_norm": 0.14076143503189087, + "learning_rate": 0.0009347004595482875, + "loss": 2.832, + "step": 5831 + }, + { + "epoch": 0.17293835068054444, + "grad_norm": 0.1292160153388977, + "learning_rate": 0.0009346772093995826, + "loss": 2.8453, + "step": 5832 + }, + { + "epoch": 0.17296800403285592, + "grad_norm": 0.12209367007017136, + "learning_rate": 0.0009346539554017363, + "loss": 2.8507, + "step": 5833 + }, + { + "epoch": 0.1729976573851674, + "grad_norm": 0.13581152260303497, + "learning_rate": 0.0009346306975549546, + "loss": 2.8528, + "step": 5834 + }, + { + "epoch": 0.17302731073747887, + "grad_norm": 0.1338278204202652, + "learning_rate": 0.0009346074358594436, + "loss": 2.8014, + "step": 5835 + }, + { + "epoch": 0.17305696408979035, + "grad_norm": 0.15946808457374573, + "learning_rate": 0.0009345841703154092, + "loss": 2.8093, + "step": 5836 + }, + { + "epoch": 0.17308661744210183, + "grad_norm": 0.14844584465026855, + "learning_rate": 0.0009345609009230572, + "loss": 2.8731, + "step": 5837 + }, + { + "epoch": 0.1731162707944133, + "grad_norm": 0.18468068540096283, + "learning_rate": 0.0009345376276825939, + "loss": 2.8546, + "step": 5838 + }, + { + "epoch": 0.17314592414672478, + "grad_norm": 0.2257634401321411, + "learning_rate": 0.0009345143505942254, + "loss": 2.8664, + "step": 5839 + }, + { + "epoch": 0.17317557749903625, + "grad_norm": 0.22644710540771484, + "learning_rate": 0.0009344910696581577, + "loss": 2.8504, + "step": 5840 + }, + { + "epoch": 0.17320523085134776, + "grad_norm": 0.1636667549610138, + "learning_rate": 0.000934467784874597, + "loss": 2.8495, + "step": 5841 + }, + { + "epoch": 0.17323488420365923, + "grad_norm": 0.13972178101539612, + "learning_rate": 0.0009344444962437494, + "loss": 2.8543, + "step": 5842 + }, + { + "epoch": 0.1732645375559707, + "grad_norm": 0.1659066528081894, + "learning_rate": 0.0009344212037658213, + "loss": 2.8687, + "step": 5843 + }, + { + "epoch": 0.1732941909082822, + "grad_norm": 0.13790105283260345, + "learning_rate": 0.0009343979074410189, + "loss": 2.8418, + "step": 5844 + }, + { + "epoch": 0.17332384426059366, + "grad_norm": 0.13339892029762268, + "learning_rate": 0.0009343746072695484, + "loss": 2.8549, + "step": 5845 + }, + { + "epoch": 0.17335349761290514, + "grad_norm": 0.1406681388616562, + "learning_rate": 0.0009343513032516162, + "loss": 2.8417, + "step": 5846 + }, + { + "epoch": 0.17338315096521661, + "grad_norm": 0.13342398405075073, + "learning_rate": 0.0009343279953874286, + "loss": 2.8468, + "step": 5847 + }, + { + "epoch": 0.1734128043175281, + "grad_norm": 0.12532538175582886, + "learning_rate": 0.0009343046836771923, + "loss": 2.8582, + "step": 5848 + }, + { + "epoch": 0.17344245766983957, + "grad_norm": 0.13232506811618805, + "learning_rate": 0.0009342813681211131, + "loss": 2.8616, + "step": 5849 + }, + { + "epoch": 0.17347211102215104, + "grad_norm": 0.15165245532989502, + "learning_rate": 0.0009342580487193981, + "loss": 2.8771, + "step": 5850 + }, + { + "epoch": 0.17350176437446252, + "grad_norm": 0.1354011744260788, + "learning_rate": 0.0009342347254722535, + "loss": 2.8449, + "step": 5851 + }, + { + "epoch": 0.17353141772677402, + "grad_norm": 0.1324632316827774, + "learning_rate": 0.0009342113983798859, + "loss": 2.8599, + "step": 5852 + }, + { + "epoch": 0.1735610710790855, + "grad_norm": 0.14994727075099945, + "learning_rate": 0.0009341880674425017, + "loss": 2.8492, + "step": 5853 + }, + { + "epoch": 0.17359072443139698, + "grad_norm": 0.16052943468093872, + "learning_rate": 0.0009341647326603078, + "loss": 2.8551, + "step": 5854 + }, + { + "epoch": 0.17362037778370845, + "grad_norm": 0.15194721519947052, + "learning_rate": 0.0009341413940335107, + "loss": 2.8552, + "step": 5855 + }, + { + "epoch": 0.17365003113601993, + "grad_norm": 0.15115217864513397, + "learning_rate": 0.0009341180515623168, + "loss": 2.8543, + "step": 5856 + }, + { + "epoch": 0.1736796844883314, + "grad_norm": 0.15587766468524933, + "learning_rate": 0.0009340947052469331, + "loss": 2.8687, + "step": 5857 + }, + { + "epoch": 0.17370933784064288, + "grad_norm": 0.17018982768058777, + "learning_rate": 0.0009340713550875663, + "loss": 2.8407, + "step": 5858 + }, + { + "epoch": 0.17373899119295436, + "grad_norm": 0.18968723714351654, + "learning_rate": 0.0009340480010844232, + "loss": 2.8531, + "step": 5859 + }, + { + "epoch": 0.17376864454526583, + "grad_norm": 0.19258619844913483, + "learning_rate": 0.0009340246432377106, + "loss": 2.8436, + "step": 5860 + }, + { + "epoch": 0.1737982978975773, + "grad_norm": 0.16416703164577484, + "learning_rate": 0.0009340012815476352, + "loss": 2.8459, + "step": 5861 + }, + { + "epoch": 0.1738279512498888, + "grad_norm": 0.15813878178596497, + "learning_rate": 0.000933977916014404, + "loss": 2.8661, + "step": 5862 + }, + { + "epoch": 0.1738576046022003, + "grad_norm": 0.16121281683444977, + "learning_rate": 0.0009339545466382238, + "loss": 2.8652, + "step": 5863 + }, + { + "epoch": 0.17388725795451176, + "grad_norm": 0.16079044342041016, + "learning_rate": 0.0009339311734193016, + "loss": 2.8644, + "step": 5864 + }, + { + "epoch": 0.17391691130682324, + "grad_norm": 0.16349059343338013, + "learning_rate": 0.0009339077963578443, + "loss": 2.8685, + "step": 5865 + }, + { + "epoch": 0.17394656465913472, + "grad_norm": 0.1548343002796173, + "learning_rate": 0.0009338844154540593, + "loss": 2.8325, + "step": 5866 + }, + { + "epoch": 0.1739762180114462, + "grad_norm": 0.14176134765148163, + "learning_rate": 0.0009338610307081531, + "loss": 2.8356, + "step": 5867 + }, + { + "epoch": 0.17400587136375767, + "grad_norm": 0.1295493096113205, + "learning_rate": 0.000933837642120333, + "loss": 2.8681, + "step": 5868 + }, + { + "epoch": 0.17403552471606915, + "grad_norm": 0.12357281893491745, + "learning_rate": 0.0009338142496908062, + "loss": 2.8681, + "step": 5869 + }, + { + "epoch": 0.17406517806838062, + "grad_norm": 0.12727351486682892, + "learning_rate": 0.0009337908534197796, + "loss": 2.8596, + "step": 5870 + }, + { + "epoch": 0.1740948314206921, + "grad_norm": 0.12495363503694534, + "learning_rate": 0.0009337674533074607, + "loss": 2.8139, + "step": 5871 + }, + { + "epoch": 0.1741244847730036, + "grad_norm": 0.12272916734218597, + "learning_rate": 0.0009337440493540565, + "loss": 2.8423, + "step": 5872 + }, + { + "epoch": 0.17415413812531508, + "grad_norm": 0.11456689238548279, + "learning_rate": 0.0009337206415597741, + "loss": 2.8671, + "step": 5873 + }, + { + "epoch": 0.17418379147762655, + "grad_norm": 0.1174861267209053, + "learning_rate": 0.0009336972299248212, + "loss": 2.8564, + "step": 5874 + }, + { + "epoch": 0.17421344482993803, + "grad_norm": 0.12330733984708786, + "learning_rate": 0.0009336738144494048, + "loss": 2.8027, + "step": 5875 + }, + { + "epoch": 0.1742430981822495, + "grad_norm": 0.12617836892604828, + "learning_rate": 0.0009336503951337324, + "loss": 2.8774, + "step": 5876 + }, + { + "epoch": 0.17427275153456098, + "grad_norm": 0.12191692739725113, + "learning_rate": 0.0009336269719780113, + "loss": 2.8199, + "step": 5877 + }, + { + "epoch": 0.17430240488687246, + "grad_norm": 0.12702621519565582, + "learning_rate": 0.0009336035449824489, + "loss": 2.8798, + "step": 5878 + }, + { + "epoch": 0.17433205823918393, + "grad_norm": 0.14094358682632446, + "learning_rate": 0.0009335801141472527, + "loss": 2.8788, + "step": 5879 + }, + { + "epoch": 0.1743617115914954, + "grad_norm": 0.13512444496154785, + "learning_rate": 0.0009335566794726302, + "loss": 2.8198, + "step": 5880 + }, + { + "epoch": 0.1743913649438069, + "grad_norm": 0.14681215584278107, + "learning_rate": 0.0009335332409587887, + "loss": 2.823, + "step": 5881 + }, + { + "epoch": 0.17442101829611836, + "grad_norm": 0.1536455750465393, + "learning_rate": 0.0009335097986059361, + "loss": 2.833, + "step": 5882 + }, + { + "epoch": 0.17445067164842987, + "grad_norm": 0.1606423407793045, + "learning_rate": 0.0009334863524142798, + "loss": 2.8653, + "step": 5883 + }, + { + "epoch": 0.17448032500074134, + "grad_norm": 0.14343571662902832, + "learning_rate": 0.0009334629023840274, + "loss": 2.8247, + "step": 5884 + }, + { + "epoch": 0.17450997835305282, + "grad_norm": 0.15112100541591644, + "learning_rate": 0.0009334394485153866, + "loss": 2.8583, + "step": 5885 + }, + { + "epoch": 0.1745396317053643, + "grad_norm": 0.15441815555095673, + "learning_rate": 0.0009334159908085651, + "loss": 2.8181, + "step": 5886 + }, + { + "epoch": 0.17456928505767577, + "grad_norm": 0.19709810614585876, + "learning_rate": 0.0009333925292637707, + "loss": 2.8564, + "step": 5887 + }, + { + "epoch": 0.17459893840998725, + "grad_norm": 0.21595728397369385, + "learning_rate": 0.000933369063881211, + "loss": 2.8503, + "step": 5888 + }, + { + "epoch": 0.17462859176229872, + "grad_norm": 0.22839900851249695, + "learning_rate": 0.0009333455946610938, + "loss": 2.796, + "step": 5889 + }, + { + "epoch": 0.1746582451146102, + "grad_norm": 0.20719875395298004, + "learning_rate": 0.0009333221216036269, + "loss": 2.8359, + "step": 5890 + }, + { + "epoch": 0.17468789846692168, + "grad_norm": 0.19263508915901184, + "learning_rate": 0.0009332986447090185, + "loss": 2.8418, + "step": 5891 + }, + { + "epoch": 0.17471755181923315, + "grad_norm": 0.17299628257751465, + "learning_rate": 0.000933275163977476, + "loss": 2.8499, + "step": 5892 + }, + { + "epoch": 0.17474720517154466, + "grad_norm": 0.16634109616279602, + "learning_rate": 0.0009332516794092077, + "loss": 2.8602, + "step": 5893 + }, + { + "epoch": 0.17477685852385613, + "grad_norm": 0.18869249522686005, + "learning_rate": 0.0009332281910044214, + "loss": 2.8395, + "step": 5894 + }, + { + "epoch": 0.1748065118761676, + "grad_norm": 0.15347003936767578, + "learning_rate": 0.000933204698763325, + "loss": 2.826, + "step": 5895 + }, + { + "epoch": 0.17483616522847908, + "grad_norm": 0.15649452805519104, + "learning_rate": 0.0009331812026861266, + "loss": 2.8437, + "step": 5896 + }, + { + "epoch": 0.17486581858079056, + "grad_norm": 0.15605853497982025, + "learning_rate": 0.0009331577027730344, + "loss": 2.8628, + "step": 5897 + }, + { + "epoch": 0.17489547193310204, + "grad_norm": 0.14579811692237854, + "learning_rate": 0.0009331341990242563, + "loss": 2.84, + "step": 5898 + }, + { + "epoch": 0.1749251252854135, + "grad_norm": 0.1366748809814453, + "learning_rate": 0.0009331106914400008, + "loss": 2.8519, + "step": 5899 + }, + { + "epoch": 0.174954778637725, + "grad_norm": 0.1498468667268753, + "learning_rate": 0.0009330871800204754, + "loss": 2.8364, + "step": 5900 + }, + { + "epoch": 0.17498443199003647, + "grad_norm": 0.15290100872516632, + "learning_rate": 0.0009330636647658889, + "loss": 2.835, + "step": 5901 + }, + { + "epoch": 0.17501408534234794, + "grad_norm": 0.15006855130195618, + "learning_rate": 0.0009330401456764492, + "loss": 2.872, + "step": 5902 + }, + { + "epoch": 0.17504373869465942, + "grad_norm": 0.1606859266757965, + "learning_rate": 0.0009330166227523649, + "loss": 2.8322, + "step": 5903 + }, + { + "epoch": 0.17507339204697092, + "grad_norm": 0.16142958402633667, + "learning_rate": 0.0009329930959938439, + "loss": 2.8469, + "step": 5904 + }, + { + "epoch": 0.1751030453992824, + "grad_norm": 0.17448313534259796, + "learning_rate": 0.0009329695654010947, + "loss": 2.8272, + "step": 5905 + }, + { + "epoch": 0.17513269875159387, + "grad_norm": 0.15292197465896606, + "learning_rate": 0.0009329460309743257, + "loss": 2.8217, + "step": 5906 + }, + { + "epoch": 0.17516235210390535, + "grad_norm": 0.14323802292346954, + "learning_rate": 0.0009329224927137453, + "loss": 2.8485, + "step": 5907 + }, + { + "epoch": 0.17519200545621683, + "grad_norm": 0.15522082149982452, + "learning_rate": 0.000932898950619562, + "loss": 2.8783, + "step": 5908 + }, + { + "epoch": 0.1752216588085283, + "grad_norm": 0.16224315762519836, + "learning_rate": 0.000932875404691984, + "loss": 2.8493, + "step": 5909 + }, + { + "epoch": 0.17525131216083978, + "grad_norm": 0.13944140076637268, + "learning_rate": 0.0009328518549312202, + "loss": 2.838, + "step": 5910 + }, + { + "epoch": 0.17528096551315125, + "grad_norm": 0.1389239877462387, + "learning_rate": 0.0009328283013374788, + "loss": 2.8084, + "step": 5911 + }, + { + "epoch": 0.17531061886546273, + "grad_norm": 0.14693240821361542, + "learning_rate": 0.0009328047439109685, + "loss": 2.8558, + "step": 5912 + }, + { + "epoch": 0.1753402722177742, + "grad_norm": 0.1338277906179428, + "learning_rate": 0.0009327811826518979, + "loss": 2.8488, + "step": 5913 + }, + { + "epoch": 0.1753699255700857, + "grad_norm": 0.1477571576833725, + "learning_rate": 0.0009327576175604756, + "loss": 2.8734, + "step": 5914 + }, + { + "epoch": 0.1753995789223972, + "grad_norm": 0.13508915901184082, + "learning_rate": 0.0009327340486369104, + "loss": 2.8608, + "step": 5915 + }, + { + "epoch": 0.17542923227470866, + "grad_norm": 0.1298222541809082, + "learning_rate": 0.0009327104758814109, + "loss": 2.849, + "step": 5916 + }, + { + "epoch": 0.17545888562702014, + "grad_norm": 0.14123544096946716, + "learning_rate": 0.000932686899294186, + "loss": 2.8362, + "step": 5917 + }, + { + "epoch": 0.17548853897933162, + "grad_norm": 0.1456347107887268, + "learning_rate": 0.0009326633188754441, + "loss": 2.8441, + "step": 5918 + }, + { + "epoch": 0.1755181923316431, + "grad_norm": 0.1383744776248932, + "learning_rate": 0.0009326397346253943, + "loss": 2.8147, + "step": 5919 + }, + { + "epoch": 0.17554784568395457, + "grad_norm": 0.13211052119731903, + "learning_rate": 0.0009326161465442455, + "loss": 2.8581, + "step": 5920 + }, + { + "epoch": 0.17557749903626604, + "grad_norm": 0.13102063536643982, + "learning_rate": 0.0009325925546322064, + "loss": 2.8523, + "step": 5921 + }, + { + "epoch": 0.17560715238857752, + "grad_norm": 0.11090844869613647, + "learning_rate": 0.0009325689588894859, + "loss": 2.8436, + "step": 5922 + }, + { + "epoch": 0.175636805740889, + "grad_norm": 0.11613520234823227, + "learning_rate": 0.0009325453593162931, + "loss": 2.8246, + "step": 5923 + }, + { + "epoch": 0.1756664590932005, + "grad_norm": 0.1262034773826599, + "learning_rate": 0.000932521755912837, + "loss": 2.833, + "step": 5924 + }, + { + "epoch": 0.17569611244551198, + "grad_norm": 0.14559239149093628, + "learning_rate": 0.0009324981486793263, + "loss": 2.8306, + "step": 5925 + }, + { + "epoch": 0.17572576579782345, + "grad_norm": 0.14305241405963898, + "learning_rate": 0.0009324745376159705, + "loss": 2.875, + "step": 5926 + }, + { + "epoch": 0.17575541915013493, + "grad_norm": 0.13499943912029266, + "learning_rate": 0.0009324509227229781, + "loss": 2.8464, + "step": 5927 + }, + { + "epoch": 0.1757850725024464, + "grad_norm": 0.1297602504491806, + "learning_rate": 0.0009324273040005589, + "loss": 2.8241, + "step": 5928 + }, + { + "epoch": 0.17581472585475788, + "grad_norm": 0.13916105031967163, + "learning_rate": 0.0009324036814489214, + "loss": 2.8143, + "step": 5929 + }, + { + "epoch": 0.17584437920706936, + "grad_norm": 0.13887372612953186, + "learning_rate": 0.0009323800550682753, + "loss": 2.8259, + "step": 5930 + }, + { + "epoch": 0.17587403255938083, + "grad_norm": 0.13175034523010254, + "learning_rate": 0.0009323564248588294, + "loss": 2.8381, + "step": 5931 + }, + { + "epoch": 0.1759036859116923, + "grad_norm": 0.13959962129592896, + "learning_rate": 0.0009323327908207934, + "loss": 2.8485, + "step": 5932 + }, + { + "epoch": 0.17593333926400379, + "grad_norm": 0.1489269733428955, + "learning_rate": 0.0009323091529543761, + "loss": 2.859, + "step": 5933 + }, + { + "epoch": 0.17596299261631526, + "grad_norm": 0.16629241406917572, + "learning_rate": 0.0009322855112597873, + "loss": 2.7838, + "step": 5934 + }, + { + "epoch": 0.17599264596862677, + "grad_norm": 0.18149051070213318, + "learning_rate": 0.0009322618657372358, + "loss": 2.8332, + "step": 5935 + }, + { + "epoch": 0.17602229932093824, + "grad_norm": 0.15710030496120453, + "learning_rate": 0.0009322382163869314, + "loss": 2.8512, + "step": 5936 + }, + { + "epoch": 0.17605195267324972, + "grad_norm": 0.17001274228096008, + "learning_rate": 0.0009322145632090835, + "loss": 2.8193, + "step": 5937 + }, + { + "epoch": 0.1760816060255612, + "grad_norm": 0.15134881436824799, + "learning_rate": 0.0009321909062039014, + "loss": 2.866, + "step": 5938 + }, + { + "epoch": 0.17611125937787267, + "grad_norm": 0.14914767444133759, + "learning_rate": 0.0009321672453715945, + "loss": 2.8534, + "step": 5939 + }, + { + "epoch": 0.17614091273018415, + "grad_norm": 0.15245096385478973, + "learning_rate": 0.0009321435807123726, + "loss": 2.844, + "step": 5940 + }, + { + "epoch": 0.17617056608249562, + "grad_norm": 0.1428571194410324, + "learning_rate": 0.000932119912226445, + "loss": 2.8575, + "step": 5941 + }, + { + "epoch": 0.1762002194348071, + "grad_norm": 0.14770476520061493, + "learning_rate": 0.0009320962399140216, + "loss": 2.8179, + "step": 5942 + }, + { + "epoch": 0.17622987278711857, + "grad_norm": 0.14459338784217834, + "learning_rate": 0.0009320725637753115, + "loss": 2.8666, + "step": 5943 + }, + { + "epoch": 0.17625952613943005, + "grad_norm": 0.1423092633485794, + "learning_rate": 0.000932048883810525, + "loss": 2.8307, + "step": 5944 + }, + { + "epoch": 0.17628917949174155, + "grad_norm": 0.14966274797916412, + "learning_rate": 0.0009320252000198715, + "loss": 2.8686, + "step": 5945 + }, + { + "epoch": 0.17631883284405303, + "grad_norm": 0.1810445785522461, + "learning_rate": 0.0009320015124035606, + "loss": 2.8307, + "step": 5946 + }, + { + "epoch": 0.1763484861963645, + "grad_norm": 0.21356135606765747, + "learning_rate": 0.000931977820961802, + "loss": 2.8571, + "step": 5947 + }, + { + "epoch": 0.17637813954867598, + "grad_norm": 0.2366858273744583, + "learning_rate": 0.0009319541256948058, + "loss": 2.8366, + "step": 5948 + }, + { + "epoch": 0.17640779290098746, + "grad_norm": 0.21878457069396973, + "learning_rate": 0.0009319304266027817, + "loss": 2.8321, + "step": 5949 + }, + { + "epoch": 0.17643744625329894, + "grad_norm": 0.19902054965496063, + "learning_rate": 0.0009319067236859394, + "loss": 2.8306, + "step": 5950 + }, + { + "epoch": 0.1764670996056104, + "grad_norm": 0.17144599556922913, + "learning_rate": 0.0009318830169444891, + "loss": 2.8574, + "step": 5951 + }, + { + "epoch": 0.1764967529579219, + "grad_norm": 0.17076238989830017, + "learning_rate": 0.0009318593063786405, + "loss": 2.835, + "step": 5952 + }, + { + "epoch": 0.17652640631023336, + "grad_norm": 0.14985665678977966, + "learning_rate": 0.0009318355919886034, + "loss": 2.8481, + "step": 5953 + }, + { + "epoch": 0.17655605966254484, + "grad_norm": 0.1446927934885025, + "learning_rate": 0.0009318118737745882, + "loss": 2.8493, + "step": 5954 + }, + { + "epoch": 0.17658571301485632, + "grad_norm": 0.14481636881828308, + "learning_rate": 0.0009317881517368048, + "loss": 2.8148, + "step": 5955 + }, + { + "epoch": 0.17661536636716782, + "grad_norm": 0.15547814965248108, + "learning_rate": 0.0009317644258754632, + "loss": 2.8396, + "step": 5956 + }, + { + "epoch": 0.1766450197194793, + "grad_norm": 0.13728222250938416, + "learning_rate": 0.0009317406961907732, + "loss": 2.8018, + "step": 5957 + }, + { + "epoch": 0.17667467307179077, + "grad_norm": 0.12886333465576172, + "learning_rate": 0.0009317169626829456, + "loss": 2.8158, + "step": 5958 + }, + { + "epoch": 0.17670432642410225, + "grad_norm": 0.11723355203866959, + "learning_rate": 0.0009316932253521901, + "loss": 2.8522, + "step": 5959 + }, + { + "epoch": 0.17673397977641372, + "grad_norm": 0.12521466612815857, + "learning_rate": 0.0009316694841987168, + "loss": 2.8391, + "step": 5960 + }, + { + "epoch": 0.1767636331287252, + "grad_norm": 0.10865224152803421, + "learning_rate": 0.0009316457392227363, + "loss": 2.8113, + "step": 5961 + }, + { + "epoch": 0.17679328648103668, + "grad_norm": 0.11993139237165451, + "learning_rate": 0.0009316219904244587, + "loss": 2.8313, + "step": 5962 + }, + { + "epoch": 0.17682293983334815, + "grad_norm": 0.12470467388629913, + "learning_rate": 0.0009315982378040942, + "loss": 2.862, + "step": 5963 + }, + { + "epoch": 0.17685259318565963, + "grad_norm": 0.13029560446739197, + "learning_rate": 0.0009315744813618532, + "loss": 2.8639, + "step": 5964 + }, + { + "epoch": 0.1768822465379711, + "grad_norm": 0.1400073766708374, + "learning_rate": 0.0009315507210979462, + "loss": 2.8548, + "step": 5965 + }, + { + "epoch": 0.1769118998902826, + "grad_norm": 0.15890218317508698, + "learning_rate": 0.0009315269570125833, + "loss": 2.8471, + "step": 5966 + }, + { + "epoch": 0.17694155324259409, + "grad_norm": 0.157538041472435, + "learning_rate": 0.0009315031891059753, + "loss": 2.8651, + "step": 5967 + }, + { + "epoch": 0.17697120659490556, + "grad_norm": 0.15501712262630463, + "learning_rate": 0.0009314794173783326, + "loss": 2.8441, + "step": 5968 + }, + { + "epoch": 0.17700085994721704, + "grad_norm": 0.16408877074718475, + "learning_rate": 0.0009314556418298654, + "loss": 2.8784, + "step": 5969 + }, + { + "epoch": 0.1770305132995285, + "grad_norm": 0.14925214648246765, + "learning_rate": 0.0009314318624607845, + "loss": 2.8407, + "step": 5970 + }, + { + "epoch": 0.17706016665184, + "grad_norm": 0.14370054006576538, + "learning_rate": 0.0009314080792713004, + "loss": 2.8467, + "step": 5971 + }, + { + "epoch": 0.17708982000415147, + "grad_norm": 0.15138986706733704, + "learning_rate": 0.0009313842922616236, + "loss": 2.8604, + "step": 5972 + }, + { + "epoch": 0.17711947335646294, + "grad_norm": 0.1365489512681961, + "learning_rate": 0.000931360501431965, + "loss": 2.8538, + "step": 5973 + }, + { + "epoch": 0.17714912670877442, + "grad_norm": 0.15811091661453247, + "learning_rate": 0.000931336706782535, + "loss": 2.8545, + "step": 5974 + }, + { + "epoch": 0.1771787800610859, + "grad_norm": 0.1740988790988922, + "learning_rate": 0.0009313129083135445, + "loss": 2.8622, + "step": 5975 + }, + { + "epoch": 0.1772084334133974, + "grad_norm": 0.1704033464193344, + "learning_rate": 0.0009312891060252041, + "loss": 2.8646, + "step": 5976 + }, + { + "epoch": 0.17723808676570887, + "grad_norm": 0.17263084650039673, + "learning_rate": 0.0009312652999177247, + "loss": 2.8348, + "step": 5977 + }, + { + "epoch": 0.17726774011802035, + "grad_norm": 0.16708068549633026, + "learning_rate": 0.0009312414899913171, + "loss": 2.8307, + "step": 5978 + }, + { + "epoch": 0.17729739347033183, + "grad_norm": 0.17281299829483032, + "learning_rate": 0.0009312176762461919, + "loss": 2.8165, + "step": 5979 + }, + { + "epoch": 0.1773270468226433, + "grad_norm": 0.17093797028064728, + "learning_rate": 0.0009311938586825604, + "loss": 2.8139, + "step": 5980 + }, + { + "epoch": 0.17735670017495478, + "grad_norm": 0.1674107015132904, + "learning_rate": 0.0009311700373006331, + "loss": 2.8384, + "step": 5981 + }, + { + "epoch": 0.17738635352726626, + "grad_norm": 0.17182262241840363, + "learning_rate": 0.0009311462121006211, + "loss": 2.837, + "step": 5982 + }, + { + "epoch": 0.17741600687957773, + "grad_norm": 0.17345114052295685, + "learning_rate": 0.0009311223830827353, + "loss": 2.8429, + "step": 5983 + }, + { + "epoch": 0.1774456602318892, + "grad_norm": 0.13463392853736877, + "learning_rate": 0.000931098550247187, + "loss": 2.8316, + "step": 5984 + }, + { + "epoch": 0.17747531358420068, + "grad_norm": 0.13578933477401733, + "learning_rate": 0.0009310747135941869, + "loss": 2.8599, + "step": 5985 + }, + { + "epoch": 0.17750496693651216, + "grad_norm": 0.1475619524717331, + "learning_rate": 0.0009310508731239464, + "loss": 2.8518, + "step": 5986 + }, + { + "epoch": 0.17753462028882366, + "grad_norm": 0.14752306044101715, + "learning_rate": 0.0009310270288366762, + "loss": 2.8625, + "step": 5987 + }, + { + "epoch": 0.17756427364113514, + "grad_norm": 0.14401738345623016, + "learning_rate": 0.0009310031807325879, + "loss": 2.8438, + "step": 5988 + }, + { + "epoch": 0.17759392699344662, + "grad_norm": 0.1275354027748108, + "learning_rate": 0.0009309793288118923, + "loss": 2.842, + "step": 5989 + }, + { + "epoch": 0.1776235803457581, + "grad_norm": 0.12588754296302795, + "learning_rate": 0.0009309554730748009, + "loss": 2.8345, + "step": 5990 + }, + { + "epoch": 0.17765323369806957, + "grad_norm": 0.16443875432014465, + "learning_rate": 0.0009309316135215247, + "loss": 2.8625, + "step": 5991 + }, + { + "epoch": 0.17768288705038104, + "grad_norm": 0.1527591049671173, + "learning_rate": 0.0009309077501522751, + "loss": 2.8462, + "step": 5992 + }, + { + "epoch": 0.17771254040269252, + "grad_norm": 0.12274147570133209, + "learning_rate": 0.0009308838829672633, + "loss": 2.8471, + "step": 5993 + }, + { + "epoch": 0.177742193755004, + "grad_norm": 0.14081501960754395, + "learning_rate": 0.000930860011966701, + "loss": 2.8556, + "step": 5994 + }, + { + "epoch": 0.17777184710731547, + "grad_norm": 0.14916113018989563, + "learning_rate": 0.0009308361371507992, + "loss": 2.8565, + "step": 5995 + }, + { + "epoch": 0.17780150045962695, + "grad_norm": 0.12838052213191986, + "learning_rate": 0.0009308122585197693, + "loss": 2.8583, + "step": 5996 + }, + { + "epoch": 0.17783115381193845, + "grad_norm": 0.1440245658159256, + "learning_rate": 0.0009307883760738231, + "loss": 2.8574, + "step": 5997 + }, + { + "epoch": 0.17786080716424993, + "grad_norm": 0.16442233324050903, + "learning_rate": 0.000930764489813172, + "loss": 2.8494, + "step": 5998 + }, + { + "epoch": 0.1778904605165614, + "grad_norm": 0.18914301693439484, + "learning_rate": 0.0009307405997380271, + "loss": 2.8357, + "step": 5999 + }, + { + "epoch": 0.17792011386887288, + "grad_norm": 0.21865683794021606, + "learning_rate": 0.0009307167058486005, + "loss": 2.8416, + "step": 6000 + }, + { + "epoch": 0.17794976722118436, + "grad_norm": 0.19126272201538086, + "learning_rate": 0.0009306928081451035, + "loss": 2.8316, + "step": 6001 + }, + { + "epoch": 0.17797942057349583, + "grad_norm": 0.16416847705841064, + "learning_rate": 0.0009306689066277478, + "loss": 2.8894, + "step": 6002 + }, + { + "epoch": 0.1780090739258073, + "grad_norm": 0.16434559226036072, + "learning_rate": 0.0009306450012967448, + "loss": 2.8478, + "step": 6003 + }, + { + "epoch": 0.17803872727811879, + "grad_norm": 0.14443233609199524, + "learning_rate": 0.0009306210921523066, + "loss": 2.8605, + "step": 6004 + }, + { + "epoch": 0.17806838063043026, + "grad_norm": 0.13827575743198395, + "learning_rate": 0.0009305971791946446, + "loss": 2.8563, + "step": 6005 + }, + { + "epoch": 0.17809803398274174, + "grad_norm": 0.13653117418289185, + "learning_rate": 0.0009305732624239707, + "loss": 2.8252, + "step": 6006 + }, + { + "epoch": 0.17812768733505321, + "grad_norm": 0.13726605474948883, + "learning_rate": 0.0009305493418404967, + "loss": 2.883, + "step": 6007 + }, + { + "epoch": 0.17815734068736472, + "grad_norm": 0.14232781529426575, + "learning_rate": 0.0009305254174444342, + "loss": 2.8331, + "step": 6008 + }, + { + "epoch": 0.1781869940396762, + "grad_norm": 0.14711929857730865, + "learning_rate": 0.0009305014892359954, + "loss": 2.8399, + "step": 6009 + }, + { + "epoch": 0.17821664739198767, + "grad_norm": 0.1478482037782669, + "learning_rate": 0.000930477557215392, + "loss": 2.8519, + "step": 6010 + }, + { + "epoch": 0.17824630074429915, + "grad_norm": 0.11537513881921768, + "learning_rate": 0.0009304536213828358, + "loss": 2.8511, + "step": 6011 + }, + { + "epoch": 0.17827595409661062, + "grad_norm": 0.11935592442750931, + "learning_rate": 0.0009304296817385392, + "loss": 2.8585, + "step": 6012 + }, + { + "epoch": 0.1783056074489221, + "grad_norm": 0.13391973078250885, + "learning_rate": 0.0009304057382827135, + "loss": 2.8219, + "step": 6013 + }, + { + "epoch": 0.17833526080123357, + "grad_norm": 0.1286795437335968, + "learning_rate": 0.0009303817910155714, + "loss": 2.8238, + "step": 6014 + }, + { + "epoch": 0.17836491415354505, + "grad_norm": 0.1372746378183365, + "learning_rate": 0.0009303578399373245, + "loss": 2.8213, + "step": 6015 + }, + { + "epoch": 0.17839456750585653, + "grad_norm": 0.1442512422800064, + "learning_rate": 0.0009303338850481853, + "loss": 2.8416, + "step": 6016 + }, + { + "epoch": 0.178424220858168, + "grad_norm": 0.19557341933250427, + "learning_rate": 0.0009303099263483655, + "loss": 2.854, + "step": 6017 + }, + { + "epoch": 0.1784538742104795, + "grad_norm": 0.22420282661914825, + "learning_rate": 0.0009302859638380774, + "loss": 2.8661, + "step": 6018 + }, + { + "epoch": 0.17848352756279098, + "grad_norm": 0.20625624060630798, + "learning_rate": 0.0009302619975175332, + "loss": 2.8361, + "step": 6019 + }, + { + "epoch": 0.17851318091510246, + "grad_norm": 0.21199357509613037, + "learning_rate": 0.0009302380273869453, + "loss": 2.85, + "step": 6020 + }, + { + "epoch": 0.17854283426741394, + "grad_norm": 0.2004774808883667, + "learning_rate": 0.0009302140534465258, + "loss": 2.8591, + "step": 6021 + }, + { + "epoch": 0.1785724876197254, + "grad_norm": 0.15480002760887146, + "learning_rate": 0.000930190075696487, + "loss": 2.8445, + "step": 6022 + }, + { + "epoch": 0.1786021409720369, + "grad_norm": 0.14983759820461273, + "learning_rate": 0.0009301660941370411, + "loss": 2.8207, + "step": 6023 + }, + { + "epoch": 0.17863179432434836, + "grad_norm": 0.1316765397787094, + "learning_rate": 0.0009301421087684008, + "loss": 2.8632, + "step": 6024 + }, + { + "epoch": 0.17866144767665984, + "grad_norm": 0.1491585075855255, + "learning_rate": 0.0009301181195907782, + "loss": 2.8673, + "step": 6025 + }, + { + "epoch": 0.17869110102897132, + "grad_norm": 0.14189887046813965, + "learning_rate": 0.0009300941266043857, + "loss": 2.8499, + "step": 6026 + }, + { + "epoch": 0.1787207543812828, + "grad_norm": 0.13577266037464142, + "learning_rate": 0.0009300701298094361, + "loss": 2.8672, + "step": 6027 + }, + { + "epoch": 0.1787504077335943, + "grad_norm": 0.128592848777771, + "learning_rate": 0.0009300461292061415, + "loss": 2.8622, + "step": 6028 + }, + { + "epoch": 0.17878006108590577, + "grad_norm": 0.1404239684343338, + "learning_rate": 0.0009300221247947147, + "loss": 2.8466, + "step": 6029 + }, + { + "epoch": 0.17880971443821725, + "grad_norm": 0.16282601654529572, + "learning_rate": 0.0009299981165753683, + "loss": 2.8525, + "step": 6030 + }, + { + "epoch": 0.17883936779052872, + "grad_norm": 0.16192960739135742, + "learning_rate": 0.0009299741045483145, + "loss": 2.8307, + "step": 6031 + }, + { + "epoch": 0.1788690211428402, + "grad_norm": 0.14214003086090088, + "learning_rate": 0.0009299500887137664, + "loss": 2.8513, + "step": 6032 + }, + { + "epoch": 0.17889867449515168, + "grad_norm": 0.13527897000312805, + "learning_rate": 0.0009299260690719364, + "loss": 2.813, + "step": 6033 + }, + { + "epoch": 0.17892832784746315, + "grad_norm": 0.12475290149450302, + "learning_rate": 0.0009299020456230373, + "loss": 2.837, + "step": 6034 + }, + { + "epoch": 0.17895798119977463, + "grad_norm": 0.12357307225465775, + "learning_rate": 0.0009298780183672817, + "loss": 2.8517, + "step": 6035 + }, + { + "epoch": 0.1789876345520861, + "grad_norm": 0.1269390881061554, + "learning_rate": 0.0009298539873048826, + "loss": 2.8553, + "step": 6036 + }, + { + "epoch": 0.17901728790439758, + "grad_norm": 0.13746418058872223, + "learning_rate": 0.0009298299524360525, + "loss": 2.8368, + "step": 6037 + }, + { + "epoch": 0.17904694125670906, + "grad_norm": 0.13555380702018738, + "learning_rate": 0.0009298059137610045, + "loss": 2.8502, + "step": 6038 + }, + { + "epoch": 0.17907659460902056, + "grad_norm": 0.13470715284347534, + "learning_rate": 0.0009297818712799514, + "loss": 2.8823, + "step": 6039 + }, + { + "epoch": 0.17910624796133204, + "grad_norm": 0.16247305274009705, + "learning_rate": 0.000929757824993106, + "loss": 2.839, + "step": 6040 + }, + { + "epoch": 0.17913590131364351, + "grad_norm": 0.17733430862426758, + "learning_rate": 0.0009297337749006812, + "loss": 2.8656, + "step": 6041 + }, + { + "epoch": 0.179165554665955, + "grad_norm": 0.16945600509643555, + "learning_rate": 0.0009297097210028902, + "loss": 2.8834, + "step": 6042 + }, + { + "epoch": 0.17919520801826647, + "grad_norm": 0.17363350093364716, + "learning_rate": 0.0009296856632999458, + "loss": 2.8528, + "step": 6043 + }, + { + "epoch": 0.17922486137057794, + "grad_norm": 0.1589580625295639, + "learning_rate": 0.0009296616017920612, + "loss": 2.8827, + "step": 6044 + }, + { + "epoch": 0.17925451472288942, + "grad_norm": 0.1488560438156128, + "learning_rate": 0.0009296375364794492, + "loss": 2.8257, + "step": 6045 + }, + { + "epoch": 0.1792841680752009, + "grad_norm": 0.14117784798145294, + "learning_rate": 0.0009296134673623231, + "loss": 2.8566, + "step": 6046 + }, + { + "epoch": 0.17931382142751237, + "grad_norm": 0.15570612251758575, + "learning_rate": 0.0009295893944408959, + "loss": 2.853, + "step": 6047 + }, + { + "epoch": 0.17934347477982385, + "grad_norm": 0.15811029076576233, + "learning_rate": 0.0009295653177153811, + "loss": 2.8176, + "step": 6048 + }, + { + "epoch": 0.17937312813213535, + "grad_norm": 0.13709478080272675, + "learning_rate": 0.0009295412371859918, + "loss": 2.8133, + "step": 6049 + }, + { + "epoch": 0.17940278148444683, + "grad_norm": 0.14202970266342163, + "learning_rate": 0.0009295171528529407, + "loss": 2.8806, + "step": 6050 + }, + { + "epoch": 0.1794324348367583, + "grad_norm": 0.1674574464559555, + "learning_rate": 0.0009294930647164417, + "loss": 2.83, + "step": 6051 + }, + { + "epoch": 0.17946208818906978, + "grad_norm": 0.1334260106086731, + "learning_rate": 0.000929468972776708, + "loss": 2.8267, + "step": 6052 + }, + { + "epoch": 0.17949174154138126, + "grad_norm": 0.14716289937496185, + "learning_rate": 0.0009294448770339526, + "loss": 2.811, + "step": 6053 + }, + { + "epoch": 0.17952139489369273, + "grad_norm": 0.15520089864730835, + "learning_rate": 0.0009294207774883892, + "loss": 2.8575, + "step": 6054 + }, + { + "epoch": 0.1795510482460042, + "grad_norm": 0.1573234647512436, + "learning_rate": 0.0009293966741402311, + "loss": 2.8617, + "step": 6055 + }, + { + "epoch": 0.17958070159831568, + "grad_norm": 0.17404617369174957, + "learning_rate": 0.0009293725669896918, + "loss": 2.8242, + "step": 6056 + }, + { + "epoch": 0.17961035495062716, + "grad_norm": 0.16425299644470215, + "learning_rate": 0.0009293484560369847, + "loss": 2.8318, + "step": 6057 + }, + { + "epoch": 0.17964000830293864, + "grad_norm": 0.13770024478435516, + "learning_rate": 0.0009293243412823234, + "loss": 2.8303, + "step": 6058 + }, + { + "epoch": 0.1796696616552501, + "grad_norm": 0.15413595736026764, + "learning_rate": 0.0009293002227259211, + "loss": 2.855, + "step": 6059 + }, + { + "epoch": 0.17969931500756162, + "grad_norm": 0.15834780037403107, + "learning_rate": 0.000929276100367992, + "loss": 2.8846, + "step": 6060 + }, + { + "epoch": 0.1797289683598731, + "grad_norm": 0.1813373565673828, + "learning_rate": 0.0009292519742087491, + "loss": 2.8502, + "step": 6061 + }, + { + "epoch": 0.17975862171218457, + "grad_norm": 0.18375734984874725, + "learning_rate": 0.0009292278442484063, + "loss": 2.8851, + "step": 6062 + }, + { + "epoch": 0.17978827506449604, + "grad_norm": 0.13974955677986145, + "learning_rate": 0.0009292037104871773, + "loss": 2.7921, + "step": 6063 + }, + { + "epoch": 0.17981792841680752, + "grad_norm": 0.14973211288452148, + "learning_rate": 0.0009291795729252759, + "loss": 2.7997, + "step": 6064 + }, + { + "epoch": 0.179847581769119, + "grad_norm": 0.1584392488002777, + "learning_rate": 0.0009291554315629156, + "loss": 2.8302, + "step": 6065 + }, + { + "epoch": 0.17987723512143047, + "grad_norm": 0.14686210453510284, + "learning_rate": 0.0009291312864003102, + "loss": 2.8579, + "step": 6066 + }, + { + "epoch": 0.17990688847374195, + "grad_norm": 0.13626030087471008, + "learning_rate": 0.0009291071374376736, + "loss": 2.871, + "step": 6067 + }, + { + "epoch": 0.17993654182605343, + "grad_norm": 0.14394713938236237, + "learning_rate": 0.0009290829846752197, + "loss": 2.8482, + "step": 6068 + }, + { + "epoch": 0.1799661951783649, + "grad_norm": 0.13299399614334106, + "learning_rate": 0.0009290588281131624, + "loss": 2.8472, + "step": 6069 + }, + { + "epoch": 0.1799958485306764, + "grad_norm": 0.12455529719591141, + "learning_rate": 0.0009290346677517155, + "loss": 2.7964, + "step": 6070 + }, + { + "epoch": 0.18002550188298788, + "grad_norm": 0.12406260520219803, + "learning_rate": 0.000929010503591093, + "loss": 2.8185, + "step": 6071 + }, + { + "epoch": 0.18005515523529936, + "grad_norm": 0.14562390744686127, + "learning_rate": 0.0009289863356315087, + "loss": 2.828, + "step": 6072 + }, + { + "epoch": 0.18008480858761083, + "grad_norm": 0.1487995833158493, + "learning_rate": 0.0009289621638731769, + "loss": 2.8532, + "step": 6073 + }, + { + "epoch": 0.1801144619399223, + "grad_norm": 0.16492605209350586, + "learning_rate": 0.0009289379883163116, + "loss": 2.8402, + "step": 6074 + }, + { + "epoch": 0.1801441152922338, + "grad_norm": 0.14933642745018005, + "learning_rate": 0.0009289138089611267, + "loss": 2.8726, + "step": 6075 + }, + { + "epoch": 0.18017376864454526, + "grad_norm": 0.1507013887166977, + "learning_rate": 0.0009288896258078363, + "loss": 2.8302, + "step": 6076 + }, + { + "epoch": 0.18020342199685674, + "grad_norm": 0.15491992235183716, + "learning_rate": 0.0009288654388566546, + "loss": 2.8867, + "step": 6077 + }, + { + "epoch": 0.18023307534916821, + "grad_norm": 0.16433098912239075, + "learning_rate": 0.000928841248107796, + "loss": 2.8466, + "step": 6078 + }, + { + "epoch": 0.1802627287014797, + "grad_norm": 0.14896391332149506, + "learning_rate": 0.0009288170535614745, + "loss": 2.8227, + "step": 6079 + }, + { + "epoch": 0.1802923820537912, + "grad_norm": 0.1471327543258667, + "learning_rate": 0.0009287928552179043, + "loss": 2.8468, + "step": 6080 + }, + { + "epoch": 0.18032203540610267, + "grad_norm": 0.13540008664131165, + "learning_rate": 0.0009287686530772999, + "loss": 2.8294, + "step": 6081 + }, + { + "epoch": 0.18035168875841415, + "grad_norm": 0.14567959308624268, + "learning_rate": 0.0009287444471398754, + "loss": 2.8332, + "step": 6082 + }, + { + "epoch": 0.18038134211072562, + "grad_norm": 0.13859324157238007, + "learning_rate": 0.0009287202374058453, + "loss": 2.839, + "step": 6083 + }, + { + "epoch": 0.1804109954630371, + "grad_norm": 0.14214038848876953, + "learning_rate": 0.0009286960238754238, + "loss": 2.865, + "step": 6084 + }, + { + "epoch": 0.18044064881534858, + "grad_norm": 0.15387308597564697, + "learning_rate": 0.0009286718065488253, + "loss": 2.8429, + "step": 6085 + }, + { + "epoch": 0.18047030216766005, + "grad_norm": 0.13428764045238495, + "learning_rate": 0.0009286475854262646, + "loss": 2.815, + "step": 6086 + }, + { + "epoch": 0.18049995551997153, + "grad_norm": 0.14917628467082977, + "learning_rate": 0.0009286233605079559, + "loss": 2.8379, + "step": 6087 + }, + { + "epoch": 0.180529608872283, + "grad_norm": 0.14382942020893097, + "learning_rate": 0.0009285991317941138, + "loss": 2.8158, + "step": 6088 + }, + { + "epoch": 0.18055926222459448, + "grad_norm": 0.13184262812137604, + "learning_rate": 0.0009285748992849528, + "loss": 2.838, + "step": 6089 + }, + { + "epoch": 0.18058891557690596, + "grad_norm": 0.14614412188529968, + "learning_rate": 0.0009285506629806875, + "loss": 2.8746, + "step": 6090 + }, + { + "epoch": 0.18061856892921746, + "grad_norm": 0.16917730867862701, + "learning_rate": 0.0009285264228815325, + "loss": 2.8639, + "step": 6091 + }, + { + "epoch": 0.18064822228152894, + "grad_norm": 0.1629980355501175, + "learning_rate": 0.0009285021789877024, + "loss": 2.8353, + "step": 6092 + }, + { + "epoch": 0.1806778756338404, + "grad_norm": 0.1595802754163742, + "learning_rate": 0.0009284779312994121, + "loss": 2.8469, + "step": 6093 + }, + { + "epoch": 0.1807075289861519, + "grad_norm": 0.1737501323223114, + "learning_rate": 0.0009284536798168762, + "loss": 2.8375, + "step": 6094 + }, + { + "epoch": 0.18073718233846336, + "grad_norm": 0.1818520575761795, + "learning_rate": 0.0009284294245403091, + "loss": 2.8627, + "step": 6095 + }, + { + "epoch": 0.18076683569077484, + "grad_norm": 0.179319366812706, + "learning_rate": 0.0009284051654699262, + "loss": 2.8572, + "step": 6096 + }, + { + "epoch": 0.18079648904308632, + "grad_norm": 0.18015670776367188, + "learning_rate": 0.0009283809026059419, + "loss": 2.8365, + "step": 6097 + }, + { + "epoch": 0.1808261423953978, + "grad_norm": 0.16822227835655212, + "learning_rate": 0.0009283566359485713, + "loss": 2.8655, + "step": 6098 + }, + { + "epoch": 0.18085579574770927, + "grad_norm": 0.1501031517982483, + "learning_rate": 0.0009283323654980291, + "loss": 2.8388, + "step": 6099 + }, + { + "epoch": 0.18088544910002075, + "grad_norm": 0.13701431453227997, + "learning_rate": 0.0009283080912545303, + "loss": 2.8502, + "step": 6100 + }, + { + "epoch": 0.18091510245233225, + "grad_norm": 0.13152065873146057, + "learning_rate": 0.0009282838132182898, + "loss": 2.8325, + "step": 6101 + }, + { + "epoch": 0.18094475580464373, + "grad_norm": 0.14391951262950897, + "learning_rate": 0.0009282595313895225, + "loss": 2.8284, + "step": 6102 + }, + { + "epoch": 0.1809744091569552, + "grad_norm": 0.13414424657821655, + "learning_rate": 0.0009282352457684438, + "loss": 2.8337, + "step": 6103 + }, + { + "epoch": 0.18100406250926668, + "grad_norm": 0.139421284198761, + "learning_rate": 0.0009282109563552683, + "loss": 2.8637, + "step": 6104 + }, + { + "epoch": 0.18103371586157815, + "grad_norm": 0.13987493515014648, + "learning_rate": 0.0009281866631502114, + "loss": 2.8452, + "step": 6105 + }, + { + "epoch": 0.18106336921388963, + "grad_norm": 0.12902699410915375, + "learning_rate": 0.0009281623661534879, + "loss": 2.8514, + "step": 6106 + }, + { + "epoch": 0.1810930225662011, + "grad_norm": 0.13313890993595123, + "learning_rate": 0.0009281380653653133, + "loss": 2.8779, + "step": 6107 + }, + { + "epoch": 0.18112267591851258, + "grad_norm": 0.1431206315755844, + "learning_rate": 0.0009281137607859028, + "loss": 2.808, + "step": 6108 + }, + { + "epoch": 0.18115232927082406, + "grad_norm": 0.1397417038679123, + "learning_rate": 0.0009280894524154713, + "loss": 2.8203, + "step": 6109 + }, + { + "epoch": 0.18118198262313553, + "grad_norm": 0.1497650146484375, + "learning_rate": 0.0009280651402542342, + "loss": 2.8644, + "step": 6110 + }, + { + "epoch": 0.181211635975447, + "grad_norm": 0.13352127373218536, + "learning_rate": 0.0009280408243024068, + "loss": 2.8136, + "step": 6111 + }, + { + "epoch": 0.18124128932775851, + "grad_norm": 0.11602398753166199, + "learning_rate": 0.0009280165045602045, + "loss": 2.8109, + "step": 6112 + }, + { + "epoch": 0.18127094268007, + "grad_norm": 0.13280753791332245, + "learning_rate": 0.0009279921810278424, + "loss": 2.8225, + "step": 6113 + }, + { + "epoch": 0.18130059603238147, + "grad_norm": 0.13754814863204956, + "learning_rate": 0.0009279678537055363, + "loss": 2.854, + "step": 6114 + }, + { + "epoch": 0.18133024938469294, + "grad_norm": 0.1540096551179886, + "learning_rate": 0.0009279435225935012, + "loss": 2.807, + "step": 6115 + }, + { + "epoch": 0.18135990273700442, + "grad_norm": 0.1570219248533249, + "learning_rate": 0.0009279191876919528, + "loss": 2.8255, + "step": 6116 + }, + { + "epoch": 0.1813895560893159, + "grad_norm": 0.16032515466213226, + "learning_rate": 0.0009278948490011068, + "loss": 2.8177, + "step": 6117 + }, + { + "epoch": 0.18141920944162737, + "grad_norm": 0.14269253611564636, + "learning_rate": 0.0009278705065211781, + "loss": 2.8028, + "step": 6118 + }, + { + "epoch": 0.18144886279393885, + "grad_norm": 0.1475793421268463, + "learning_rate": 0.0009278461602523828, + "loss": 2.8161, + "step": 6119 + }, + { + "epoch": 0.18147851614625032, + "grad_norm": 0.16512669622898102, + "learning_rate": 0.0009278218101949364, + "loss": 2.8348, + "step": 6120 + }, + { + "epoch": 0.1815081694985618, + "grad_norm": 0.15829195082187653, + "learning_rate": 0.0009277974563490543, + "loss": 2.8375, + "step": 6121 + }, + { + "epoch": 0.1815378228508733, + "grad_norm": 0.18000422418117523, + "learning_rate": 0.0009277730987149525, + "loss": 2.8383, + "step": 6122 + }, + { + "epoch": 0.18156747620318478, + "grad_norm": 0.17665119469165802, + "learning_rate": 0.0009277487372928462, + "loss": 2.8487, + "step": 6123 + }, + { + "epoch": 0.18159712955549626, + "grad_norm": 0.1303824782371521, + "learning_rate": 0.0009277243720829515, + "loss": 2.809, + "step": 6124 + }, + { + "epoch": 0.18162678290780773, + "grad_norm": 0.16632793843746185, + "learning_rate": 0.0009277000030854841, + "loss": 2.8133, + "step": 6125 + }, + { + "epoch": 0.1816564362601192, + "grad_norm": 0.14531125128269196, + "learning_rate": 0.0009276756303006597, + "loss": 2.8376, + "step": 6126 + }, + { + "epoch": 0.18168608961243068, + "grad_norm": 0.16849453747272491, + "learning_rate": 0.0009276512537286943, + "loss": 2.8466, + "step": 6127 + }, + { + "epoch": 0.18171574296474216, + "grad_norm": 0.19616389274597168, + "learning_rate": 0.0009276268733698034, + "loss": 2.8765, + "step": 6128 + }, + { + "epoch": 0.18174539631705364, + "grad_norm": 0.16741327941417694, + "learning_rate": 0.0009276024892242034, + "loss": 2.8397, + "step": 6129 + }, + { + "epoch": 0.1817750496693651, + "grad_norm": 0.1678611785173416, + "learning_rate": 0.0009275781012921099, + "loss": 2.8379, + "step": 6130 + }, + { + "epoch": 0.1818047030216766, + "grad_norm": 0.1881203055381775, + "learning_rate": 0.0009275537095737389, + "loss": 2.8653, + "step": 6131 + }, + { + "epoch": 0.1818343563739881, + "grad_norm": 0.16138510406017303, + "learning_rate": 0.0009275293140693064, + "loss": 2.8421, + "step": 6132 + }, + { + "epoch": 0.18186400972629957, + "grad_norm": 0.15519778430461884, + "learning_rate": 0.0009275049147790285, + "loss": 2.8345, + "step": 6133 + }, + { + "epoch": 0.18189366307861105, + "grad_norm": 0.14769691228866577, + "learning_rate": 0.0009274805117031211, + "loss": 2.8222, + "step": 6134 + }, + { + "epoch": 0.18192331643092252, + "grad_norm": 0.1302659958600998, + "learning_rate": 0.0009274561048418004, + "loss": 2.8428, + "step": 6135 + }, + { + "epoch": 0.181952969783234, + "grad_norm": 0.13172224164009094, + "learning_rate": 0.0009274316941952825, + "loss": 2.8476, + "step": 6136 + }, + { + "epoch": 0.18198262313554547, + "grad_norm": 0.12941160798072815, + "learning_rate": 0.0009274072797637837, + "loss": 2.8429, + "step": 6137 + }, + { + "epoch": 0.18201227648785695, + "grad_norm": 0.16174042224884033, + "learning_rate": 0.00092738286154752, + "loss": 2.8865, + "step": 6138 + }, + { + "epoch": 0.18204192984016843, + "grad_norm": 0.16053850948810577, + "learning_rate": 0.0009273584395467077, + "loss": 2.844, + "step": 6139 + }, + { + "epoch": 0.1820715831924799, + "grad_norm": 0.1500491499900818, + "learning_rate": 0.0009273340137615631, + "loss": 2.8527, + "step": 6140 + }, + { + "epoch": 0.18210123654479138, + "grad_norm": 0.13927380740642548, + "learning_rate": 0.0009273095841923025, + "loss": 2.8505, + "step": 6141 + }, + { + "epoch": 0.18213088989710285, + "grad_norm": 0.12652920186519623, + "learning_rate": 0.000927285150839142, + "loss": 2.7982, + "step": 6142 + }, + { + "epoch": 0.18216054324941436, + "grad_norm": 0.12455841153860092, + "learning_rate": 0.0009272607137022983, + "loss": 2.8199, + "step": 6143 + }, + { + "epoch": 0.18219019660172583, + "grad_norm": 0.1368364840745926, + "learning_rate": 0.0009272362727819877, + "loss": 2.8236, + "step": 6144 + }, + { + "epoch": 0.1822198499540373, + "grad_norm": 0.14596450328826904, + "learning_rate": 0.0009272118280784263, + "loss": 2.8319, + "step": 6145 + }, + { + "epoch": 0.1822495033063488, + "grad_norm": 0.14793598651885986, + "learning_rate": 0.0009271873795918311, + "loss": 2.8244, + "step": 6146 + }, + { + "epoch": 0.18227915665866026, + "grad_norm": 0.16129319369792938, + "learning_rate": 0.0009271629273224182, + "loss": 2.8437, + "step": 6147 + }, + { + "epoch": 0.18230881001097174, + "grad_norm": 0.17312993109226227, + "learning_rate": 0.0009271384712704043, + "loss": 2.8775, + "step": 6148 + }, + { + "epoch": 0.18233846336328322, + "grad_norm": 0.15177926421165466, + "learning_rate": 0.0009271140114360059, + "loss": 2.8315, + "step": 6149 + }, + { + "epoch": 0.1823681167155947, + "grad_norm": 0.14009377360343933, + "learning_rate": 0.0009270895478194396, + "loss": 2.8228, + "step": 6150 + }, + { + "epoch": 0.18239777006790617, + "grad_norm": 0.13213962316513062, + "learning_rate": 0.0009270650804209222, + "loss": 2.8249, + "step": 6151 + }, + { + "epoch": 0.18242742342021764, + "grad_norm": 0.1461343914270401, + "learning_rate": 0.00092704060924067, + "loss": 2.8455, + "step": 6152 + }, + { + "epoch": 0.18245707677252915, + "grad_norm": 0.15131543576717377, + "learning_rate": 0.0009270161342789, + "loss": 2.8418, + "step": 6153 + }, + { + "epoch": 0.18248673012484062, + "grad_norm": 0.14229325950145721, + "learning_rate": 0.0009269916555358289, + "loss": 2.8412, + "step": 6154 + }, + { + "epoch": 0.1825163834771521, + "grad_norm": 0.13233403861522675, + "learning_rate": 0.0009269671730116732, + "loss": 2.8233, + "step": 6155 + }, + { + "epoch": 0.18254603682946358, + "grad_norm": 0.14279718697071075, + "learning_rate": 0.00092694268670665, + "loss": 2.847, + "step": 6156 + }, + { + "epoch": 0.18257569018177505, + "grad_norm": 0.15512463450431824, + "learning_rate": 0.0009269181966209761, + "loss": 2.801, + "step": 6157 + }, + { + "epoch": 0.18260534353408653, + "grad_norm": 0.1707131564617157, + "learning_rate": 0.0009268937027548683, + "loss": 2.848, + "step": 6158 + }, + { + "epoch": 0.182634996886398, + "grad_norm": 0.15144844353199005, + "learning_rate": 0.0009268692051085433, + "loss": 2.8334, + "step": 6159 + }, + { + "epoch": 0.18266465023870948, + "grad_norm": 0.1607399880886078, + "learning_rate": 0.0009268447036822183, + "loss": 2.8352, + "step": 6160 + }, + { + "epoch": 0.18269430359102096, + "grad_norm": 0.14365127682685852, + "learning_rate": 0.0009268201984761102, + "loss": 2.8557, + "step": 6161 + }, + { + "epoch": 0.18272395694333243, + "grad_norm": 0.1394784301519394, + "learning_rate": 0.0009267956894904361, + "loss": 2.8058, + "step": 6162 + }, + { + "epoch": 0.1827536102956439, + "grad_norm": 0.12887683510780334, + "learning_rate": 0.0009267711767254128, + "loss": 2.8562, + "step": 6163 + }, + { + "epoch": 0.1827832636479554, + "grad_norm": 0.13591289520263672, + "learning_rate": 0.0009267466601812575, + "loss": 2.8169, + "step": 6164 + }, + { + "epoch": 0.1828129170002669, + "grad_norm": 0.13669878244400024, + "learning_rate": 0.0009267221398581873, + "loss": 2.8182, + "step": 6165 + }, + { + "epoch": 0.18284257035257837, + "grad_norm": 0.13334964215755463, + "learning_rate": 0.0009266976157564191, + "loss": 2.8616, + "step": 6166 + }, + { + "epoch": 0.18287222370488984, + "grad_norm": 0.15355995297431946, + "learning_rate": 0.0009266730878761705, + "loss": 2.8386, + "step": 6167 + }, + { + "epoch": 0.18290187705720132, + "grad_norm": 0.15716895461082458, + "learning_rate": 0.0009266485562176583, + "loss": 2.8307, + "step": 6168 + }, + { + "epoch": 0.1829315304095128, + "grad_norm": 0.16635705530643463, + "learning_rate": 0.0009266240207811001, + "loss": 2.8251, + "step": 6169 + }, + { + "epoch": 0.18296118376182427, + "grad_norm": 0.18521976470947266, + "learning_rate": 0.0009265994815667129, + "loss": 2.8626, + "step": 6170 + }, + { + "epoch": 0.18299083711413575, + "grad_norm": 0.1721188724040985, + "learning_rate": 0.0009265749385747139, + "loss": 2.852, + "step": 6171 + }, + { + "epoch": 0.18302049046644722, + "grad_norm": 0.1450502723455429, + "learning_rate": 0.0009265503918053209, + "loss": 2.8297, + "step": 6172 + }, + { + "epoch": 0.1830501438187587, + "grad_norm": 0.13872525095939636, + "learning_rate": 0.0009265258412587507, + "loss": 2.8376, + "step": 6173 + }, + { + "epoch": 0.1830797971710702, + "grad_norm": 0.15063539147377014, + "learning_rate": 0.0009265012869352212, + "loss": 2.8296, + "step": 6174 + }, + { + "epoch": 0.18310945052338168, + "grad_norm": 0.1320536583662033, + "learning_rate": 0.0009264767288349494, + "loss": 2.877, + "step": 6175 + }, + { + "epoch": 0.18313910387569315, + "grad_norm": 0.13398341834545135, + "learning_rate": 0.000926452166958153, + "loss": 2.8402, + "step": 6176 + }, + { + "epoch": 0.18316875722800463, + "grad_norm": 0.12332766503095627, + "learning_rate": 0.0009264276013050494, + "loss": 2.8494, + "step": 6177 + }, + { + "epoch": 0.1831984105803161, + "grad_norm": 0.11588135361671448, + "learning_rate": 0.0009264030318758562, + "loss": 2.8424, + "step": 6178 + }, + { + "epoch": 0.18322806393262758, + "grad_norm": 0.13023430109024048, + "learning_rate": 0.0009263784586707912, + "loss": 2.8559, + "step": 6179 + }, + { + "epoch": 0.18325771728493906, + "grad_norm": 0.15863358974456787, + "learning_rate": 0.0009263538816900716, + "loss": 2.8525, + "step": 6180 + }, + { + "epoch": 0.18328737063725054, + "grad_norm": 0.16874578595161438, + "learning_rate": 0.0009263293009339151, + "loss": 2.7938, + "step": 6181 + }, + { + "epoch": 0.183317023989562, + "grad_norm": 0.16548457741737366, + "learning_rate": 0.0009263047164025396, + "loss": 2.8713, + "step": 6182 + }, + { + "epoch": 0.1833466773418735, + "grad_norm": 0.16400404274463654, + "learning_rate": 0.0009262801280961626, + "loss": 2.8605, + "step": 6183 + }, + { + "epoch": 0.183376330694185, + "grad_norm": 0.1613805741071701, + "learning_rate": 0.000926255536015002, + "loss": 2.8675, + "step": 6184 + }, + { + "epoch": 0.18340598404649647, + "grad_norm": 0.1650325506925583, + "learning_rate": 0.0009262309401592753, + "loss": 2.8138, + "step": 6185 + }, + { + "epoch": 0.18343563739880794, + "grad_norm": 0.18439283967018127, + "learning_rate": 0.0009262063405292005, + "loss": 2.8295, + "step": 6186 + }, + { + "epoch": 0.18346529075111942, + "grad_norm": 0.1991429328918457, + "learning_rate": 0.0009261817371249955, + "loss": 2.8492, + "step": 6187 + }, + { + "epoch": 0.1834949441034309, + "grad_norm": 0.17375844717025757, + "learning_rate": 0.0009261571299468781, + "loss": 2.8065, + "step": 6188 + }, + { + "epoch": 0.18352459745574237, + "grad_norm": 0.12854136526584625, + "learning_rate": 0.000926132518995066, + "loss": 2.8444, + "step": 6189 + }, + { + "epoch": 0.18355425080805385, + "grad_norm": 0.13071326911449432, + "learning_rate": 0.0009261079042697773, + "loss": 2.8409, + "step": 6190 + }, + { + "epoch": 0.18358390416036532, + "grad_norm": 0.16139818727970123, + "learning_rate": 0.00092608328577123, + "loss": 2.8529, + "step": 6191 + }, + { + "epoch": 0.1836135575126768, + "grad_norm": 0.15237310528755188, + "learning_rate": 0.0009260586634996422, + "loss": 2.8301, + "step": 6192 + }, + { + "epoch": 0.18364321086498828, + "grad_norm": 0.1316194236278534, + "learning_rate": 0.0009260340374552316, + "loss": 2.8664, + "step": 6193 + }, + { + "epoch": 0.18367286421729975, + "grad_norm": 0.14638924598693848, + "learning_rate": 0.0009260094076382166, + "loss": 2.8449, + "step": 6194 + }, + { + "epoch": 0.18370251756961126, + "grad_norm": 0.1600823700428009, + "learning_rate": 0.0009259847740488152, + "loss": 2.8289, + "step": 6195 + }, + { + "epoch": 0.18373217092192273, + "grad_norm": 0.152399480342865, + "learning_rate": 0.0009259601366872455, + "loss": 2.8278, + "step": 6196 + }, + { + "epoch": 0.1837618242742342, + "grad_norm": 0.17591571807861328, + "learning_rate": 0.0009259354955537256, + "loss": 2.8676, + "step": 6197 + }, + { + "epoch": 0.18379147762654569, + "grad_norm": 0.1580459028482437, + "learning_rate": 0.0009259108506484738, + "loss": 2.8261, + "step": 6198 + }, + { + "epoch": 0.18382113097885716, + "grad_norm": 0.15248839557170868, + "learning_rate": 0.0009258862019717082, + "loss": 2.8426, + "step": 6199 + }, + { + "epoch": 0.18385078433116864, + "grad_norm": 0.15593287348747253, + "learning_rate": 0.0009258615495236474, + "loss": 2.7963, + "step": 6200 + }, + { + "epoch": 0.1838804376834801, + "grad_norm": 0.14737869799137115, + "learning_rate": 0.0009258368933045093, + "loss": 2.833, + "step": 6201 + }, + { + "epoch": 0.1839100910357916, + "grad_norm": 0.1508859544992447, + "learning_rate": 0.0009258122333145126, + "loss": 2.8541, + "step": 6202 + }, + { + "epoch": 0.18393974438810307, + "grad_norm": 0.1485135704278946, + "learning_rate": 0.0009257875695538754, + "loss": 2.8578, + "step": 6203 + }, + { + "epoch": 0.18396939774041454, + "grad_norm": 0.13563258945941925, + "learning_rate": 0.0009257629020228163, + "loss": 2.8486, + "step": 6204 + }, + { + "epoch": 0.18399905109272605, + "grad_norm": 0.13187192380428314, + "learning_rate": 0.0009257382307215533, + "loss": 2.8534, + "step": 6205 + }, + { + "epoch": 0.18402870444503752, + "grad_norm": 0.12690961360931396, + "learning_rate": 0.0009257135556503054, + "loss": 2.8139, + "step": 6206 + }, + { + "epoch": 0.184058357797349, + "grad_norm": 0.14947360754013062, + "learning_rate": 0.0009256888768092908, + "loss": 2.8614, + "step": 6207 + }, + { + "epoch": 0.18408801114966047, + "grad_norm": 0.15915419161319733, + "learning_rate": 0.0009256641941987283, + "loss": 2.8547, + "step": 6208 + }, + { + "epoch": 0.18411766450197195, + "grad_norm": 0.16676801443099976, + "learning_rate": 0.0009256395078188362, + "loss": 2.8101, + "step": 6209 + }, + { + "epoch": 0.18414731785428343, + "grad_norm": 0.1524372100830078, + "learning_rate": 0.0009256148176698332, + "loss": 2.8109, + "step": 6210 + }, + { + "epoch": 0.1841769712065949, + "grad_norm": 0.1575772613286972, + "learning_rate": 0.0009255901237519377, + "loss": 2.8517, + "step": 6211 + }, + { + "epoch": 0.18420662455890638, + "grad_norm": 0.15319311618804932, + "learning_rate": 0.000925565426065369, + "loss": 2.8454, + "step": 6212 + }, + { + "epoch": 0.18423627791121786, + "grad_norm": 0.1216670498251915, + "learning_rate": 0.0009255407246103451, + "loss": 2.8589, + "step": 6213 + }, + { + "epoch": 0.18426593126352933, + "grad_norm": 0.13496451079845428, + "learning_rate": 0.0009255160193870853, + "loss": 2.8568, + "step": 6214 + }, + { + "epoch": 0.1842955846158408, + "grad_norm": 0.16810165345668793, + "learning_rate": 0.0009254913103958079, + "loss": 2.8482, + "step": 6215 + }, + { + "epoch": 0.1843252379681523, + "grad_norm": 0.1619659811258316, + "learning_rate": 0.000925466597636732, + "loss": 2.839, + "step": 6216 + }, + { + "epoch": 0.1843548913204638, + "grad_norm": 0.14354880154132843, + "learning_rate": 0.0009254418811100763, + "loss": 2.8687, + "step": 6217 + }, + { + "epoch": 0.18438454467277526, + "grad_norm": 0.14388954639434814, + "learning_rate": 0.0009254171608160598, + "loss": 2.8298, + "step": 6218 + }, + { + "epoch": 0.18441419802508674, + "grad_norm": 0.13866475224494934, + "learning_rate": 0.0009253924367549013, + "loss": 2.8113, + "step": 6219 + }, + { + "epoch": 0.18444385137739822, + "grad_norm": 0.12429305911064148, + "learning_rate": 0.0009253677089268198, + "loss": 2.8258, + "step": 6220 + }, + { + "epoch": 0.1844735047297097, + "grad_norm": 0.1556948870420456, + "learning_rate": 0.0009253429773320341, + "loss": 2.8445, + "step": 6221 + }, + { + "epoch": 0.18450315808202117, + "grad_norm": 0.17042109370231628, + "learning_rate": 0.0009253182419707633, + "loss": 2.8439, + "step": 6222 + }, + { + "epoch": 0.18453281143433264, + "grad_norm": 0.16752971708774567, + "learning_rate": 0.0009252935028432266, + "loss": 2.8297, + "step": 6223 + }, + { + "epoch": 0.18456246478664412, + "grad_norm": 0.15984870493412018, + "learning_rate": 0.0009252687599496427, + "loss": 2.8358, + "step": 6224 + }, + { + "epoch": 0.1845921181389556, + "grad_norm": 0.1586025357246399, + "learning_rate": 0.0009252440132902312, + "loss": 2.8335, + "step": 6225 + }, + { + "epoch": 0.1846217714912671, + "grad_norm": 0.17627333104610443, + "learning_rate": 0.000925219262865211, + "loss": 2.8436, + "step": 6226 + }, + { + "epoch": 0.18465142484357858, + "grad_norm": 0.14288505911827087, + "learning_rate": 0.000925194508674801, + "loss": 2.8208, + "step": 6227 + }, + { + "epoch": 0.18468107819589005, + "grad_norm": 0.13052275776863098, + "learning_rate": 0.0009251697507192208, + "loss": 2.857, + "step": 6228 + }, + { + "epoch": 0.18471073154820153, + "grad_norm": 0.13252684473991394, + "learning_rate": 0.0009251449889986894, + "loss": 2.8296, + "step": 6229 + }, + { + "epoch": 0.184740384900513, + "grad_norm": 0.12132003903388977, + "learning_rate": 0.0009251202235134262, + "loss": 2.8093, + "step": 6230 + }, + { + "epoch": 0.18477003825282448, + "grad_norm": 0.12161482870578766, + "learning_rate": 0.0009250954542636505, + "loss": 2.8194, + "step": 6231 + }, + { + "epoch": 0.18479969160513596, + "grad_norm": 0.12165012955665588, + "learning_rate": 0.0009250706812495815, + "loss": 2.874, + "step": 6232 + }, + { + "epoch": 0.18482934495744743, + "grad_norm": 0.11791326850652695, + "learning_rate": 0.0009250459044714387, + "loss": 2.8267, + "step": 6233 + }, + { + "epoch": 0.1848589983097589, + "grad_norm": 0.12408977746963501, + "learning_rate": 0.0009250211239294414, + "loss": 2.8359, + "step": 6234 + }, + { + "epoch": 0.18488865166207039, + "grad_norm": 0.14006836712360382, + "learning_rate": 0.000924996339623809, + "loss": 2.8787, + "step": 6235 + }, + { + "epoch": 0.1849183050143819, + "grad_norm": 0.16803480684757233, + "learning_rate": 0.0009249715515547612, + "loss": 2.8248, + "step": 6236 + }, + { + "epoch": 0.18494795836669337, + "grad_norm": 0.19642454385757446, + "learning_rate": 0.0009249467597225174, + "loss": 2.843, + "step": 6237 + }, + { + "epoch": 0.18497761171900484, + "grad_norm": 0.1843869686126709, + "learning_rate": 0.000924921964127297, + "loss": 2.8417, + "step": 6238 + }, + { + "epoch": 0.18500726507131632, + "grad_norm": 0.14027516543865204, + "learning_rate": 0.0009248971647693199, + "loss": 2.8419, + "step": 6239 + }, + { + "epoch": 0.1850369184236278, + "grad_norm": 0.16177628934383392, + "learning_rate": 0.0009248723616488053, + "loss": 2.8143, + "step": 6240 + }, + { + "epoch": 0.18506657177593927, + "grad_norm": 0.1585441678762436, + "learning_rate": 0.000924847554765973, + "loss": 2.834, + "step": 6241 + }, + { + "epoch": 0.18509622512825075, + "grad_norm": 0.1604984849691391, + "learning_rate": 0.0009248227441210426, + "loss": 2.8428, + "step": 6242 + }, + { + "epoch": 0.18512587848056222, + "grad_norm": 0.1429407000541687, + "learning_rate": 0.000924797929714234, + "loss": 2.8302, + "step": 6243 + }, + { + "epoch": 0.1851555318328737, + "grad_norm": 0.17140157520771027, + "learning_rate": 0.0009247731115457667, + "loss": 2.8377, + "step": 6244 + }, + { + "epoch": 0.18518518518518517, + "grad_norm": 0.1706981360912323, + "learning_rate": 0.0009247482896158608, + "loss": 2.8185, + "step": 6245 + }, + { + "epoch": 0.18521483853749665, + "grad_norm": 0.1916198432445526, + "learning_rate": 0.0009247234639247357, + "loss": 2.8823, + "step": 6246 + }, + { + "epoch": 0.18524449188980815, + "grad_norm": 0.17264577746391296, + "learning_rate": 0.0009246986344726114, + "loss": 2.7982, + "step": 6247 + }, + { + "epoch": 0.18527414524211963, + "grad_norm": 0.16172581911087036, + "learning_rate": 0.000924673801259708, + "loss": 2.8057, + "step": 6248 + }, + { + "epoch": 0.1853037985944311, + "grad_norm": 0.18638469278812408, + "learning_rate": 0.000924648964286245, + "loss": 2.8714, + "step": 6249 + }, + { + "epoch": 0.18533345194674258, + "grad_norm": 0.15499356389045715, + "learning_rate": 0.0009246241235524427, + "loss": 2.8467, + "step": 6250 + }, + { + "epoch": 0.18536310529905406, + "grad_norm": 0.1427486389875412, + "learning_rate": 0.0009245992790585207, + "loss": 2.8309, + "step": 6251 + }, + { + "epoch": 0.18539275865136554, + "grad_norm": 0.1384320706129074, + "learning_rate": 0.0009245744308046993, + "loss": 2.8376, + "step": 6252 + }, + { + "epoch": 0.185422412003677, + "grad_norm": 0.13198544085025787, + "learning_rate": 0.0009245495787911985, + "loss": 2.8409, + "step": 6253 + }, + { + "epoch": 0.1854520653559885, + "grad_norm": 0.1214163675904274, + "learning_rate": 0.0009245247230182382, + "loss": 2.8167, + "step": 6254 + }, + { + "epoch": 0.18548171870829996, + "grad_norm": 0.13651171326637268, + "learning_rate": 0.0009244998634860386, + "loss": 2.846, + "step": 6255 + }, + { + "epoch": 0.18551137206061144, + "grad_norm": 0.14036798477172852, + "learning_rate": 0.00092447500019482, + "loss": 2.8571, + "step": 6256 + }, + { + "epoch": 0.18554102541292294, + "grad_norm": 0.13519886136054993, + "learning_rate": 0.0009244501331448023, + "loss": 2.8449, + "step": 6257 + }, + { + "epoch": 0.18557067876523442, + "grad_norm": 0.13234369456768036, + "learning_rate": 0.0009244252623362058, + "loss": 2.7894, + "step": 6258 + }, + { + "epoch": 0.1856003321175459, + "grad_norm": 0.13094082474708557, + "learning_rate": 0.0009244003877692509, + "loss": 2.8361, + "step": 6259 + }, + { + "epoch": 0.18562998546985737, + "grad_norm": 0.14183379709720612, + "learning_rate": 0.0009243755094441575, + "loss": 2.8314, + "step": 6260 + }, + { + "epoch": 0.18565963882216885, + "grad_norm": 0.14359025657176971, + "learning_rate": 0.0009243506273611463, + "loss": 2.819, + "step": 6261 + }, + { + "epoch": 0.18568929217448032, + "grad_norm": 0.15298356115818024, + "learning_rate": 0.0009243257415204373, + "loss": 2.8254, + "step": 6262 + }, + { + "epoch": 0.1857189455267918, + "grad_norm": 0.15458407998085022, + "learning_rate": 0.0009243008519222511, + "loss": 2.8812, + "step": 6263 + }, + { + "epoch": 0.18574859887910328, + "grad_norm": 0.14199359714984894, + "learning_rate": 0.0009242759585668081, + "loss": 2.8258, + "step": 6264 + }, + { + "epoch": 0.18577825223141475, + "grad_norm": 0.1410975605249405, + "learning_rate": 0.0009242510614543285, + "loss": 2.8357, + "step": 6265 + }, + { + "epoch": 0.18580790558372623, + "grad_norm": 0.13589046895503998, + "learning_rate": 0.000924226160585033, + "loss": 2.8301, + "step": 6266 + }, + { + "epoch": 0.1858375589360377, + "grad_norm": 0.11282163858413696, + "learning_rate": 0.0009242012559591422, + "loss": 2.8057, + "step": 6267 + }, + { + "epoch": 0.1858672122883492, + "grad_norm": 0.12174630910158157, + "learning_rate": 0.0009241763475768763, + "loss": 2.8754, + "step": 6268 + }, + { + "epoch": 0.18589686564066069, + "grad_norm": 0.14199557900428772, + "learning_rate": 0.0009241514354384559, + "loss": 2.7957, + "step": 6269 + }, + { + "epoch": 0.18592651899297216, + "grad_norm": 0.13659299910068512, + "learning_rate": 0.000924126519544102, + "loss": 2.822, + "step": 6270 + }, + { + "epoch": 0.18595617234528364, + "grad_norm": 0.16037701070308685, + "learning_rate": 0.0009241015998940347, + "loss": 2.8181, + "step": 6271 + }, + { + "epoch": 0.18598582569759511, + "grad_norm": 0.17283743619918823, + "learning_rate": 0.0009240766764884752, + "loss": 2.8551, + "step": 6272 + }, + { + "epoch": 0.1860154790499066, + "grad_norm": 0.17966459691524506, + "learning_rate": 0.0009240517493276438, + "loss": 2.8066, + "step": 6273 + }, + { + "epoch": 0.18604513240221807, + "grad_norm": 0.15190407633781433, + "learning_rate": 0.0009240268184117614, + "loss": 2.8399, + "step": 6274 + }, + { + "epoch": 0.18607478575452954, + "grad_norm": 0.18702329695224762, + "learning_rate": 0.0009240018837410488, + "loss": 2.8257, + "step": 6275 + }, + { + "epoch": 0.18610443910684102, + "grad_norm": 0.22398369014263153, + "learning_rate": 0.0009239769453157266, + "loss": 2.8474, + "step": 6276 + }, + { + "epoch": 0.1861340924591525, + "grad_norm": 0.21156659722328186, + "learning_rate": 0.0009239520031360158, + "loss": 2.8355, + "step": 6277 + }, + { + "epoch": 0.186163745811464, + "grad_norm": 0.17923881113529205, + "learning_rate": 0.0009239270572021374, + "loss": 2.8003, + "step": 6278 + }, + { + "epoch": 0.18619339916377547, + "grad_norm": 0.17168894410133362, + "learning_rate": 0.0009239021075143119, + "loss": 2.818, + "step": 6279 + }, + { + "epoch": 0.18622305251608695, + "grad_norm": 0.1702268421649933, + "learning_rate": 0.0009238771540727608, + "loss": 2.8464, + "step": 6280 + }, + { + "epoch": 0.18625270586839843, + "grad_norm": 0.15346214175224304, + "learning_rate": 0.0009238521968777045, + "loss": 2.851, + "step": 6281 + }, + { + "epoch": 0.1862823592207099, + "grad_norm": 0.1572597175836563, + "learning_rate": 0.0009238272359293643, + "loss": 2.8257, + "step": 6282 + }, + { + "epoch": 0.18631201257302138, + "grad_norm": 0.14787344634532928, + "learning_rate": 0.0009238022712279611, + "loss": 2.8199, + "step": 6283 + }, + { + "epoch": 0.18634166592533286, + "grad_norm": 0.13986365497112274, + "learning_rate": 0.0009237773027737162, + "loss": 2.82, + "step": 6284 + }, + { + "epoch": 0.18637131927764433, + "grad_norm": 0.12959323823451996, + "learning_rate": 0.0009237523305668505, + "loss": 2.8382, + "step": 6285 + }, + { + "epoch": 0.1864009726299558, + "grad_norm": 0.1305248737335205, + "learning_rate": 0.0009237273546075851, + "loss": 2.8479, + "step": 6286 + }, + { + "epoch": 0.18643062598226728, + "grad_norm": 0.1267673671245575, + "learning_rate": 0.0009237023748961412, + "loss": 2.8579, + "step": 6287 + }, + { + "epoch": 0.1864602793345788, + "grad_norm": 0.12125736474990845, + "learning_rate": 0.0009236773914327401, + "loss": 2.8394, + "step": 6288 + }, + { + "epoch": 0.18648993268689026, + "grad_norm": 0.1353049874305725, + "learning_rate": 0.0009236524042176031, + "loss": 2.8489, + "step": 6289 + }, + { + "epoch": 0.18651958603920174, + "grad_norm": 0.12431592494249344, + "learning_rate": 0.0009236274132509513, + "loss": 2.8185, + "step": 6290 + }, + { + "epoch": 0.18654923939151322, + "grad_norm": 0.11460079997777939, + "learning_rate": 0.0009236024185330058, + "loss": 2.8526, + "step": 6291 + }, + { + "epoch": 0.1865788927438247, + "grad_norm": 0.1119433268904686, + "learning_rate": 0.0009235774200639883, + "loss": 2.8191, + "step": 6292 + }, + { + "epoch": 0.18660854609613617, + "grad_norm": 0.12759077548980713, + "learning_rate": 0.0009235524178441202, + "loss": 2.8364, + "step": 6293 + }, + { + "epoch": 0.18663819944844764, + "grad_norm": 0.13011963665485382, + "learning_rate": 0.0009235274118736228, + "loss": 2.8193, + "step": 6294 + }, + { + "epoch": 0.18666785280075912, + "grad_norm": 0.14380502700805664, + "learning_rate": 0.0009235024021527171, + "loss": 2.8635, + "step": 6295 + }, + { + "epoch": 0.1866975061530706, + "grad_norm": 0.1494857221841812, + "learning_rate": 0.0009234773886816252, + "loss": 2.8214, + "step": 6296 + }, + { + "epoch": 0.18672715950538207, + "grad_norm": 0.1407213807106018, + "learning_rate": 0.0009234523714605683, + "loss": 2.8712, + "step": 6297 + }, + { + "epoch": 0.18675681285769355, + "grad_norm": 0.12910865247249603, + "learning_rate": 0.0009234273504897678, + "loss": 2.8372, + "step": 6298 + }, + { + "epoch": 0.18678646621000505, + "grad_norm": 0.13218699395656586, + "learning_rate": 0.0009234023257694457, + "loss": 2.8236, + "step": 6299 + }, + { + "epoch": 0.18681611956231653, + "grad_norm": 0.16390977799892426, + "learning_rate": 0.0009233772972998232, + "loss": 2.822, + "step": 6300 + }, + { + "epoch": 0.186845772914628, + "grad_norm": 0.20507046580314636, + "learning_rate": 0.0009233522650811221, + "loss": 2.8343, + "step": 6301 + }, + { + "epoch": 0.18687542626693948, + "grad_norm": 0.20722587406635284, + "learning_rate": 0.0009233272291135639, + "loss": 2.8541, + "step": 6302 + }, + { + "epoch": 0.18690507961925096, + "grad_norm": 0.20775003731250763, + "learning_rate": 0.0009233021893973706, + "loss": 2.8524, + "step": 6303 + }, + { + "epoch": 0.18693473297156243, + "grad_norm": 0.18860168755054474, + "learning_rate": 0.0009232771459327636, + "loss": 2.8534, + "step": 6304 + }, + { + "epoch": 0.1869643863238739, + "grad_norm": 0.15758386254310608, + "learning_rate": 0.0009232520987199649, + "loss": 2.8416, + "step": 6305 + }, + { + "epoch": 0.1869940396761854, + "grad_norm": 0.17471638321876526, + "learning_rate": 0.0009232270477591962, + "loss": 2.8375, + "step": 6306 + }, + { + "epoch": 0.18702369302849686, + "grad_norm": 0.18876846134662628, + "learning_rate": 0.0009232019930506795, + "loss": 2.8723, + "step": 6307 + }, + { + "epoch": 0.18705334638080834, + "grad_norm": 0.19478587806224823, + "learning_rate": 0.0009231769345946361, + "loss": 2.8342, + "step": 6308 + }, + { + "epoch": 0.18708299973311984, + "grad_norm": 0.17774201929569244, + "learning_rate": 0.0009231518723912886, + "loss": 2.8023, + "step": 6309 + }, + { + "epoch": 0.18711265308543132, + "grad_norm": 0.15875762701034546, + "learning_rate": 0.0009231268064408587, + "loss": 2.8413, + "step": 6310 + }, + { + "epoch": 0.1871423064377428, + "grad_norm": 0.15152595937252045, + "learning_rate": 0.0009231017367435681, + "loss": 2.827, + "step": 6311 + }, + { + "epoch": 0.18717195979005427, + "grad_norm": 0.13052375614643097, + "learning_rate": 0.0009230766632996392, + "loss": 2.8472, + "step": 6312 + }, + { + "epoch": 0.18720161314236575, + "grad_norm": 0.1239876002073288, + "learning_rate": 0.0009230515861092936, + "loss": 2.8125, + "step": 6313 + }, + { + "epoch": 0.18723126649467722, + "grad_norm": 0.13357682526111603, + "learning_rate": 0.0009230265051727537, + "loss": 2.8055, + "step": 6314 + }, + { + "epoch": 0.1872609198469887, + "grad_norm": 0.1276763677597046, + "learning_rate": 0.0009230014204902415, + "loss": 2.8118, + "step": 6315 + }, + { + "epoch": 0.18729057319930018, + "grad_norm": 0.12349557876586914, + "learning_rate": 0.0009229763320619793, + "loss": 2.8804, + "step": 6316 + }, + { + "epoch": 0.18732022655161165, + "grad_norm": 0.12412026524543762, + "learning_rate": 0.0009229512398881887, + "loss": 2.8208, + "step": 6317 + }, + { + "epoch": 0.18734987990392313, + "grad_norm": 0.12182262539863586, + "learning_rate": 0.0009229261439690925, + "loss": 2.8538, + "step": 6318 + }, + { + "epoch": 0.1873795332562346, + "grad_norm": 0.11420170962810516, + "learning_rate": 0.0009229010443049128, + "loss": 2.8028, + "step": 6319 + }, + { + "epoch": 0.1874091866085461, + "grad_norm": 0.12133531272411346, + "learning_rate": 0.0009228759408958716, + "loss": 2.8332, + "step": 6320 + }, + { + "epoch": 0.18743883996085758, + "grad_norm": 0.12872149050235748, + "learning_rate": 0.0009228508337421913, + "loss": 2.8553, + "step": 6321 + }, + { + "epoch": 0.18746849331316906, + "grad_norm": 0.1417776346206665, + "learning_rate": 0.0009228257228440944, + "loss": 2.8128, + "step": 6322 + }, + { + "epoch": 0.18749814666548054, + "grad_norm": 0.14551939070224762, + "learning_rate": 0.000922800608201803, + "loss": 2.8128, + "step": 6323 + }, + { + "epoch": 0.187527800017792, + "grad_norm": 0.1515103578567505, + "learning_rate": 0.0009227754898155398, + "loss": 2.81, + "step": 6324 + }, + { + "epoch": 0.1875574533701035, + "grad_norm": 0.15695889294147491, + "learning_rate": 0.0009227503676855272, + "loss": 2.8145, + "step": 6325 + }, + { + "epoch": 0.18758710672241496, + "grad_norm": 0.16195760667324066, + "learning_rate": 0.0009227252418119871, + "loss": 2.8605, + "step": 6326 + }, + { + "epoch": 0.18761676007472644, + "grad_norm": 0.15302953124046326, + "learning_rate": 0.0009227001121951429, + "loss": 2.8299, + "step": 6327 + }, + { + "epoch": 0.18764641342703792, + "grad_norm": 0.13793238997459412, + "learning_rate": 0.0009226749788352162, + "loss": 2.8258, + "step": 6328 + }, + { + "epoch": 0.1876760667793494, + "grad_norm": 0.14461207389831543, + "learning_rate": 0.0009226498417324304, + "loss": 2.8398, + "step": 6329 + }, + { + "epoch": 0.1877057201316609, + "grad_norm": 0.15069575607776642, + "learning_rate": 0.0009226247008870074, + "loss": 2.8519, + "step": 6330 + }, + { + "epoch": 0.18773537348397237, + "grad_norm": 0.13734690845012665, + "learning_rate": 0.0009225995562991703, + "loss": 2.8425, + "step": 6331 + }, + { + "epoch": 0.18776502683628385, + "grad_norm": 0.13488000631332397, + "learning_rate": 0.0009225744079691417, + "loss": 2.8553, + "step": 6332 + }, + { + "epoch": 0.18779468018859533, + "grad_norm": 0.14908331632614136, + "learning_rate": 0.000922549255897144, + "loss": 2.8044, + "step": 6333 + }, + { + "epoch": 0.1878243335409068, + "grad_norm": 0.15021446347236633, + "learning_rate": 0.0009225241000834002, + "loss": 2.8412, + "step": 6334 + }, + { + "epoch": 0.18785398689321828, + "grad_norm": 0.15404938161373138, + "learning_rate": 0.0009224989405281329, + "loss": 2.8196, + "step": 6335 + }, + { + "epoch": 0.18788364024552975, + "grad_norm": 0.15054655075073242, + "learning_rate": 0.000922473777231565, + "loss": 2.8471, + "step": 6336 + }, + { + "epoch": 0.18791329359784123, + "grad_norm": 0.18085674941539764, + "learning_rate": 0.0009224486101939192, + "loss": 2.8498, + "step": 6337 + }, + { + "epoch": 0.1879429469501527, + "grad_norm": 0.192499577999115, + "learning_rate": 0.0009224234394154185, + "loss": 2.8406, + "step": 6338 + }, + { + "epoch": 0.18797260030246418, + "grad_norm": 0.18164417147636414, + "learning_rate": 0.0009223982648962858, + "loss": 2.8382, + "step": 6339 + }, + { + "epoch": 0.1880022536547757, + "grad_norm": 0.16104723513126373, + "learning_rate": 0.0009223730866367439, + "loss": 2.8135, + "step": 6340 + }, + { + "epoch": 0.18803190700708716, + "grad_norm": 0.17406700551509857, + "learning_rate": 0.0009223479046370158, + "loss": 2.86, + "step": 6341 + }, + { + "epoch": 0.18806156035939864, + "grad_norm": 0.1589241623878479, + "learning_rate": 0.0009223227188973246, + "loss": 2.8595, + "step": 6342 + }, + { + "epoch": 0.18809121371171011, + "grad_norm": 0.13842010498046875, + "learning_rate": 0.0009222975294178933, + "loss": 2.8571, + "step": 6343 + }, + { + "epoch": 0.1881208670640216, + "grad_norm": 0.16150477528572083, + "learning_rate": 0.0009222723361989447, + "loss": 2.8516, + "step": 6344 + }, + { + "epoch": 0.18815052041633307, + "grad_norm": 0.16247941553592682, + "learning_rate": 0.0009222471392407021, + "loss": 2.8513, + "step": 6345 + }, + { + "epoch": 0.18818017376864454, + "grad_norm": 0.1615404337644577, + "learning_rate": 0.0009222219385433886, + "loss": 2.8432, + "step": 6346 + }, + { + "epoch": 0.18820982712095602, + "grad_norm": 0.1512834131717682, + "learning_rate": 0.0009221967341072275, + "loss": 2.8101, + "step": 6347 + }, + { + "epoch": 0.1882394804732675, + "grad_norm": 0.16430048644542694, + "learning_rate": 0.0009221715259324416, + "loss": 2.8529, + "step": 6348 + }, + { + "epoch": 0.18826913382557897, + "grad_norm": 0.148366779088974, + "learning_rate": 0.0009221463140192546, + "loss": 2.8534, + "step": 6349 + }, + { + "epoch": 0.18829878717789045, + "grad_norm": 0.14362531900405884, + "learning_rate": 0.0009221210983678895, + "loss": 2.8329, + "step": 6350 + }, + { + "epoch": 0.18832844053020195, + "grad_norm": 0.16616253554821014, + "learning_rate": 0.0009220958789785696, + "loss": 2.8342, + "step": 6351 + }, + { + "epoch": 0.18835809388251343, + "grad_norm": 0.17054927349090576, + "learning_rate": 0.0009220706558515182, + "loss": 2.8414, + "step": 6352 + }, + { + "epoch": 0.1883877472348249, + "grad_norm": 0.16964179277420044, + "learning_rate": 0.0009220454289869586, + "loss": 2.835, + "step": 6353 + }, + { + "epoch": 0.18841740058713638, + "grad_norm": 0.16545917093753815, + "learning_rate": 0.0009220201983851145, + "loss": 2.8278, + "step": 6354 + }, + { + "epoch": 0.18844705393944786, + "grad_norm": 0.15116623044013977, + "learning_rate": 0.0009219949640462091, + "loss": 2.8465, + "step": 6355 + }, + { + "epoch": 0.18847670729175933, + "grad_norm": 0.15834563970565796, + "learning_rate": 0.0009219697259704657, + "loss": 2.8281, + "step": 6356 + }, + { + "epoch": 0.1885063606440708, + "grad_norm": 0.1504206657409668, + "learning_rate": 0.000921944484158108, + "loss": 2.8053, + "step": 6357 + }, + { + "epoch": 0.18853601399638228, + "grad_norm": 0.14494943618774414, + "learning_rate": 0.0009219192386093595, + "loss": 2.822, + "step": 6358 + }, + { + "epoch": 0.18856566734869376, + "grad_norm": 0.1618465632200241, + "learning_rate": 0.0009218939893244437, + "loss": 2.8313, + "step": 6359 + }, + { + "epoch": 0.18859532070100524, + "grad_norm": 0.16420723497867584, + "learning_rate": 0.0009218687363035841, + "loss": 2.8441, + "step": 6360 + }, + { + "epoch": 0.18862497405331674, + "grad_norm": 0.15530356764793396, + "learning_rate": 0.0009218434795470045, + "loss": 2.8327, + "step": 6361 + }, + { + "epoch": 0.18865462740562822, + "grad_norm": 0.14600268006324768, + "learning_rate": 0.0009218182190549287, + "loss": 2.8541, + "step": 6362 + }, + { + "epoch": 0.1886842807579397, + "grad_norm": 0.14714285731315613, + "learning_rate": 0.0009217929548275799, + "loss": 2.8075, + "step": 6363 + }, + { + "epoch": 0.18871393411025117, + "grad_norm": 0.17361490428447723, + "learning_rate": 0.0009217676868651821, + "loss": 2.8089, + "step": 6364 + }, + { + "epoch": 0.18874358746256265, + "grad_norm": 0.13982142508029938, + "learning_rate": 0.0009217424151679592, + "loss": 2.8388, + "step": 6365 + }, + { + "epoch": 0.18877324081487412, + "grad_norm": 0.12529809772968292, + "learning_rate": 0.0009217171397361346, + "loss": 2.8361, + "step": 6366 + }, + { + "epoch": 0.1888028941671856, + "grad_norm": 0.12018541991710663, + "learning_rate": 0.0009216918605699325, + "loss": 2.86, + "step": 6367 + }, + { + "epoch": 0.18883254751949707, + "grad_norm": 0.11775682866573334, + "learning_rate": 0.0009216665776695766, + "loss": 2.8382, + "step": 6368 + }, + { + "epoch": 0.18886220087180855, + "grad_norm": 0.14518028497695923, + "learning_rate": 0.0009216412910352907, + "loss": 2.8518, + "step": 6369 + }, + { + "epoch": 0.18889185422412003, + "grad_norm": 0.15474598109722137, + "learning_rate": 0.0009216160006672989, + "loss": 2.8503, + "step": 6370 + }, + { + "epoch": 0.1889215075764315, + "grad_norm": 0.14823351800441742, + "learning_rate": 0.0009215907065658249, + "loss": 2.8347, + "step": 6371 + }, + { + "epoch": 0.188951160928743, + "grad_norm": 0.12489945441484451, + "learning_rate": 0.000921565408731093, + "loss": 2.8619, + "step": 6372 + }, + { + "epoch": 0.18898081428105448, + "grad_norm": 0.1310778558254242, + "learning_rate": 0.0009215401071633269, + "loss": 2.8616, + "step": 6373 + }, + { + "epoch": 0.18901046763336596, + "grad_norm": 0.1414814442396164, + "learning_rate": 0.0009215148018627508, + "loss": 2.8029, + "step": 6374 + }, + { + "epoch": 0.18904012098567743, + "grad_norm": 0.14211517572402954, + "learning_rate": 0.0009214894928295888, + "loss": 2.8509, + "step": 6375 + }, + { + "epoch": 0.1890697743379889, + "grad_norm": 0.13752806186676025, + "learning_rate": 0.0009214641800640651, + "loss": 2.8278, + "step": 6376 + }, + { + "epoch": 0.1890994276903004, + "grad_norm": 0.12670904397964478, + "learning_rate": 0.0009214388635664036, + "loss": 2.8292, + "step": 6377 + }, + { + "epoch": 0.18912908104261186, + "grad_norm": 0.12700305879116058, + "learning_rate": 0.0009214135433368287, + "loss": 2.8298, + "step": 6378 + }, + { + "epoch": 0.18915873439492334, + "grad_norm": 0.12924742698669434, + "learning_rate": 0.0009213882193755645, + "loss": 2.8304, + "step": 6379 + }, + { + "epoch": 0.18918838774723482, + "grad_norm": 0.15220633149147034, + "learning_rate": 0.0009213628916828353, + "loss": 2.8108, + "step": 6380 + }, + { + "epoch": 0.1892180410995463, + "grad_norm": 0.1705891638994217, + "learning_rate": 0.0009213375602588654, + "loss": 2.8483, + "step": 6381 + }, + { + "epoch": 0.1892476944518578, + "grad_norm": 0.1781487613916397, + "learning_rate": 0.000921312225103879, + "loss": 2.8189, + "step": 6382 + }, + { + "epoch": 0.18927734780416927, + "grad_norm": 0.17921800911426544, + "learning_rate": 0.0009212868862181005, + "loss": 2.8372, + "step": 6383 + }, + { + "epoch": 0.18930700115648075, + "grad_norm": 0.17215318977832794, + "learning_rate": 0.0009212615436017545, + "loss": 2.8174, + "step": 6384 + }, + { + "epoch": 0.18933665450879222, + "grad_norm": 0.15154489874839783, + "learning_rate": 0.0009212361972550651, + "loss": 2.8196, + "step": 6385 + }, + { + "epoch": 0.1893663078611037, + "grad_norm": 0.1642463207244873, + "learning_rate": 0.0009212108471782569, + "loss": 2.8145, + "step": 6386 + }, + { + "epoch": 0.18939596121341518, + "grad_norm": 0.16779617965221405, + "learning_rate": 0.0009211854933715544, + "loss": 2.8419, + "step": 6387 + }, + { + "epoch": 0.18942561456572665, + "grad_norm": 0.14289675652980804, + "learning_rate": 0.0009211601358351818, + "loss": 2.8559, + "step": 6388 + }, + { + "epoch": 0.18945526791803813, + "grad_norm": 0.15807050466537476, + "learning_rate": 0.0009211347745693642, + "loss": 2.838, + "step": 6389 + }, + { + "epoch": 0.1894849212703496, + "grad_norm": 0.13856031000614166, + "learning_rate": 0.0009211094095743258, + "loss": 2.8341, + "step": 6390 + }, + { + "epoch": 0.18951457462266108, + "grad_norm": 0.1437910944223404, + "learning_rate": 0.0009210840408502912, + "loss": 2.8075, + "step": 6391 + }, + { + "epoch": 0.18954422797497258, + "grad_norm": 0.1497538983821869, + "learning_rate": 0.0009210586683974854, + "loss": 2.8521, + "step": 6392 + }, + { + "epoch": 0.18957388132728406, + "grad_norm": 0.1349494457244873, + "learning_rate": 0.0009210332922161325, + "loss": 2.7962, + "step": 6393 + }, + { + "epoch": 0.18960353467959554, + "grad_norm": 0.12519340217113495, + "learning_rate": 0.0009210079123064576, + "loss": 2.8174, + "step": 6394 + }, + { + "epoch": 0.189633188031907, + "grad_norm": 0.13941627740859985, + "learning_rate": 0.0009209825286686855, + "loss": 2.8453, + "step": 6395 + }, + { + "epoch": 0.1896628413842185, + "grad_norm": 0.1503332555294037, + "learning_rate": 0.0009209571413030409, + "loss": 2.8186, + "step": 6396 + }, + { + "epoch": 0.18969249473652997, + "grad_norm": 0.14791034162044525, + "learning_rate": 0.0009209317502097483, + "loss": 2.8313, + "step": 6397 + }, + { + "epoch": 0.18972214808884144, + "grad_norm": 0.15421929955482483, + "learning_rate": 0.000920906355389033, + "loss": 2.856, + "step": 6398 + }, + { + "epoch": 0.18975180144115292, + "grad_norm": 0.1550600826740265, + "learning_rate": 0.0009208809568411196, + "loss": 2.8461, + "step": 6399 + }, + { + "epoch": 0.1897814547934644, + "grad_norm": 0.14540807902812958, + "learning_rate": 0.0009208555545662332, + "loss": 2.8301, + "step": 6400 + }, + { + "epoch": 0.18981110814577587, + "grad_norm": 0.1392490267753601, + "learning_rate": 0.0009208301485645984, + "loss": 2.8051, + "step": 6401 + }, + { + "epoch": 0.18984076149808735, + "grad_norm": 0.14255821704864502, + "learning_rate": 0.0009208047388364405, + "loss": 2.8354, + "step": 6402 + }, + { + "epoch": 0.18987041485039885, + "grad_norm": 0.14010238647460938, + "learning_rate": 0.0009207793253819845, + "loss": 2.796, + "step": 6403 + }, + { + "epoch": 0.18990006820271033, + "grad_norm": 0.1536063402891159, + "learning_rate": 0.0009207539082014553, + "loss": 2.8338, + "step": 6404 + }, + { + "epoch": 0.1899297215550218, + "grad_norm": 0.15772676467895508, + "learning_rate": 0.000920728487295078, + "loss": 2.8432, + "step": 6405 + }, + { + "epoch": 0.18995937490733328, + "grad_norm": 0.13875587284564972, + "learning_rate": 0.0009207030626630777, + "loss": 2.8554, + "step": 6406 + }, + { + "epoch": 0.18998902825964475, + "grad_norm": 0.1220070868730545, + "learning_rate": 0.0009206776343056795, + "loss": 2.8122, + "step": 6407 + }, + { + "epoch": 0.19001868161195623, + "grad_norm": 0.12843720614910126, + "learning_rate": 0.0009206522022231087, + "loss": 2.7985, + "step": 6408 + }, + { + "epoch": 0.1900483349642677, + "grad_norm": 0.11413293331861496, + "learning_rate": 0.0009206267664155906, + "loss": 2.8251, + "step": 6409 + }, + { + "epoch": 0.19007798831657918, + "grad_norm": 0.12498818337917328, + "learning_rate": 0.0009206013268833502, + "loss": 2.8493, + "step": 6410 + }, + { + "epoch": 0.19010764166889066, + "grad_norm": 0.13245242834091187, + "learning_rate": 0.0009205758836266128, + "loss": 2.7946, + "step": 6411 + }, + { + "epoch": 0.19013729502120214, + "grad_norm": 0.12656505405902863, + "learning_rate": 0.0009205504366456038, + "loss": 2.8682, + "step": 6412 + }, + { + "epoch": 0.19016694837351364, + "grad_norm": 0.14238037168979645, + "learning_rate": 0.0009205249859405484, + "loss": 2.8227, + "step": 6413 + }, + { + "epoch": 0.19019660172582512, + "grad_norm": 0.17343413829803467, + "learning_rate": 0.0009204995315116722, + "loss": 2.8504, + "step": 6414 + }, + { + "epoch": 0.1902262550781366, + "grad_norm": 0.20073549449443817, + "learning_rate": 0.0009204740733592005, + "loss": 2.8049, + "step": 6415 + }, + { + "epoch": 0.19025590843044807, + "grad_norm": 0.20979811251163483, + "learning_rate": 0.0009204486114833586, + "loss": 2.8389, + "step": 6416 + }, + { + "epoch": 0.19028556178275954, + "grad_norm": 0.18141956627368927, + "learning_rate": 0.000920423145884372, + "loss": 2.8326, + "step": 6417 + }, + { + "epoch": 0.19031521513507102, + "grad_norm": 0.15068839490413666, + "learning_rate": 0.0009203976765624664, + "loss": 2.8159, + "step": 6418 + }, + { + "epoch": 0.1903448684873825, + "grad_norm": 0.17059938609600067, + "learning_rate": 0.0009203722035178672, + "loss": 2.8448, + "step": 6419 + }, + { + "epoch": 0.19037452183969397, + "grad_norm": 0.16432330012321472, + "learning_rate": 0.0009203467267508, + "loss": 2.8645, + "step": 6420 + }, + { + "epoch": 0.19040417519200545, + "grad_norm": 0.16425977647304535, + "learning_rate": 0.0009203212462614902, + "loss": 2.8896, + "step": 6421 + }, + { + "epoch": 0.19043382854431692, + "grad_norm": 0.1869356781244278, + "learning_rate": 0.000920295762050164, + "loss": 2.8107, + "step": 6422 + }, + { + "epoch": 0.1904634818966284, + "grad_norm": 0.16686242818832397, + "learning_rate": 0.0009202702741170464, + "loss": 2.8508, + "step": 6423 + }, + { + "epoch": 0.1904931352489399, + "grad_norm": 0.19381381571292877, + "learning_rate": 0.0009202447824623634, + "loss": 2.833, + "step": 6424 + }, + { + "epoch": 0.19052278860125138, + "grad_norm": 0.18536315858364105, + "learning_rate": 0.0009202192870863408, + "loss": 2.8117, + "step": 6425 + }, + { + "epoch": 0.19055244195356286, + "grad_norm": 0.18041680753231049, + "learning_rate": 0.0009201937879892042, + "loss": 2.8408, + "step": 6426 + }, + { + "epoch": 0.19058209530587433, + "grad_norm": 0.205609530210495, + "learning_rate": 0.0009201682851711795, + "loss": 2.8308, + "step": 6427 + }, + { + "epoch": 0.1906117486581858, + "grad_norm": 0.17837895452976227, + "learning_rate": 0.0009201427786324926, + "loss": 2.8317, + "step": 6428 + }, + { + "epoch": 0.19064140201049729, + "grad_norm": 0.13513806462287903, + "learning_rate": 0.0009201172683733691, + "loss": 2.8326, + "step": 6429 + }, + { + "epoch": 0.19067105536280876, + "grad_norm": 0.1483893245458603, + "learning_rate": 0.0009200917543940352, + "loss": 2.8105, + "step": 6430 + }, + { + "epoch": 0.19070070871512024, + "grad_norm": 0.15301427245140076, + "learning_rate": 0.0009200662366947168, + "loss": 2.8311, + "step": 6431 + }, + { + "epoch": 0.1907303620674317, + "grad_norm": 0.12727904319763184, + "learning_rate": 0.0009200407152756396, + "loss": 2.7861, + "step": 6432 + }, + { + "epoch": 0.1907600154197432, + "grad_norm": 0.13994181156158447, + "learning_rate": 0.00092001519013703, + "loss": 2.848, + "step": 6433 + }, + { + "epoch": 0.1907896687720547, + "grad_norm": 0.14183968305587769, + "learning_rate": 0.0009199896612791135, + "loss": 2.8313, + "step": 6434 + }, + { + "epoch": 0.19081932212436617, + "grad_norm": 0.13687017560005188, + "learning_rate": 0.0009199641287021166, + "loss": 2.8616, + "step": 6435 + }, + { + "epoch": 0.19084897547667765, + "grad_norm": 0.14556145668029785, + "learning_rate": 0.0009199385924062653, + "loss": 2.8456, + "step": 6436 + }, + { + "epoch": 0.19087862882898912, + "grad_norm": 0.14242902398109436, + "learning_rate": 0.0009199130523917855, + "loss": 2.8207, + "step": 6437 + }, + { + "epoch": 0.1909082821813006, + "grad_norm": 0.1623532772064209, + "learning_rate": 0.0009198875086589038, + "loss": 2.8455, + "step": 6438 + }, + { + "epoch": 0.19093793553361207, + "grad_norm": 0.14995422959327698, + "learning_rate": 0.0009198619612078461, + "loss": 2.84, + "step": 6439 + }, + { + "epoch": 0.19096758888592355, + "grad_norm": 0.1447875201702118, + "learning_rate": 0.0009198364100388384, + "loss": 2.7968, + "step": 6440 + }, + { + "epoch": 0.19099724223823503, + "grad_norm": 0.15339192748069763, + "learning_rate": 0.0009198108551521075, + "loss": 2.8199, + "step": 6441 + }, + { + "epoch": 0.1910268955905465, + "grad_norm": 0.1499101221561432, + "learning_rate": 0.0009197852965478792, + "loss": 2.8486, + "step": 6442 + }, + { + "epoch": 0.19105654894285798, + "grad_norm": 0.12982042133808136, + "learning_rate": 0.0009197597342263802, + "loss": 2.8373, + "step": 6443 + }, + { + "epoch": 0.19108620229516948, + "grad_norm": 0.1423550844192505, + "learning_rate": 0.0009197341681878368, + "loss": 2.8197, + "step": 6444 + }, + { + "epoch": 0.19111585564748096, + "grad_norm": 0.12990763783454895, + "learning_rate": 0.0009197085984324751, + "loss": 2.786, + "step": 6445 + }, + { + "epoch": 0.19114550899979244, + "grad_norm": 0.12917190790176392, + "learning_rate": 0.0009196830249605217, + "loss": 2.8313, + "step": 6446 + }, + { + "epoch": 0.1911751623521039, + "grad_norm": 0.1287996470928192, + "learning_rate": 0.0009196574477722033, + "loss": 2.8326, + "step": 6447 + }, + { + "epoch": 0.1912048157044154, + "grad_norm": 0.1363261193037033, + "learning_rate": 0.000919631866867746, + "loss": 2.8164, + "step": 6448 + }, + { + "epoch": 0.19123446905672686, + "grad_norm": 0.13852721452713013, + "learning_rate": 0.0009196062822473765, + "loss": 2.8305, + "step": 6449 + }, + { + "epoch": 0.19126412240903834, + "grad_norm": 0.1478385627269745, + "learning_rate": 0.0009195806939113213, + "loss": 2.8515, + "step": 6450 + }, + { + "epoch": 0.19129377576134982, + "grad_norm": 0.1531141996383667, + "learning_rate": 0.0009195551018598072, + "loss": 2.8531, + "step": 6451 + }, + { + "epoch": 0.1913234291136613, + "grad_norm": 0.15698596835136414, + "learning_rate": 0.0009195295060930605, + "loss": 2.8412, + "step": 6452 + }, + { + "epoch": 0.19135308246597277, + "grad_norm": 0.15371589362621307, + "learning_rate": 0.000919503906611308, + "loss": 2.824, + "step": 6453 + }, + { + "epoch": 0.19138273581828424, + "grad_norm": 0.14591115713119507, + "learning_rate": 0.0009194783034147764, + "loss": 2.8206, + "step": 6454 + }, + { + "epoch": 0.19141238917059575, + "grad_norm": 0.14316877722740173, + "learning_rate": 0.0009194526965036927, + "loss": 2.8334, + "step": 6455 + }, + { + "epoch": 0.19144204252290722, + "grad_norm": 0.1504707932472229, + "learning_rate": 0.000919427085878283, + "loss": 2.8141, + "step": 6456 + }, + { + "epoch": 0.1914716958752187, + "grad_norm": 0.18057067692279816, + "learning_rate": 0.0009194014715387746, + "loss": 2.8572, + "step": 6457 + }, + { + "epoch": 0.19150134922753018, + "grad_norm": 0.15623775124549866, + "learning_rate": 0.0009193758534853942, + "loss": 2.7968, + "step": 6458 + }, + { + "epoch": 0.19153100257984165, + "grad_norm": 0.16736982762813568, + "learning_rate": 0.0009193502317183687, + "loss": 2.8274, + "step": 6459 + }, + { + "epoch": 0.19156065593215313, + "grad_norm": 0.1902121603488922, + "learning_rate": 0.0009193246062379248, + "loss": 2.8387, + "step": 6460 + }, + { + "epoch": 0.1915903092844646, + "grad_norm": 0.17162121832370758, + "learning_rate": 0.0009192989770442897, + "loss": 2.8307, + "step": 6461 + }, + { + "epoch": 0.19161996263677608, + "grad_norm": 0.16499900817871094, + "learning_rate": 0.0009192733441376899, + "loss": 2.851, + "step": 6462 + }, + { + "epoch": 0.19164961598908756, + "grad_norm": 0.1417447328567505, + "learning_rate": 0.0009192477075183529, + "loss": 2.8478, + "step": 6463 + }, + { + "epoch": 0.19167926934139903, + "grad_norm": 0.13158196210861206, + "learning_rate": 0.0009192220671865055, + "loss": 2.8239, + "step": 6464 + }, + { + "epoch": 0.19170892269371054, + "grad_norm": 0.122821144759655, + "learning_rate": 0.0009191964231423746, + "loss": 2.8507, + "step": 6465 + }, + { + "epoch": 0.191738576046022, + "grad_norm": 0.1269606649875641, + "learning_rate": 0.0009191707753861875, + "loss": 2.8094, + "step": 6466 + }, + { + "epoch": 0.1917682293983335, + "grad_norm": 0.13167695701122284, + "learning_rate": 0.0009191451239181712, + "loss": 2.8591, + "step": 6467 + }, + { + "epoch": 0.19179788275064497, + "grad_norm": 0.15597686171531677, + "learning_rate": 0.0009191194687385529, + "loss": 2.7979, + "step": 6468 + }, + { + "epoch": 0.19182753610295644, + "grad_norm": 0.15537242591381073, + "learning_rate": 0.0009190938098475598, + "loss": 2.8325, + "step": 6469 + }, + { + "epoch": 0.19185718945526792, + "grad_norm": 0.15835867822170258, + "learning_rate": 0.000919068147245419, + "loss": 2.8189, + "step": 6470 + }, + { + "epoch": 0.1918868428075794, + "grad_norm": 0.16060423851013184, + "learning_rate": 0.0009190424809323579, + "loss": 2.8491, + "step": 6471 + }, + { + "epoch": 0.19191649615989087, + "grad_norm": 0.1553012728691101, + "learning_rate": 0.0009190168109086037, + "loss": 2.8017, + "step": 6472 + }, + { + "epoch": 0.19194614951220235, + "grad_norm": 0.1600118726491928, + "learning_rate": 0.0009189911371743837, + "loss": 2.8111, + "step": 6473 + }, + { + "epoch": 0.19197580286451382, + "grad_norm": 0.1502424031496048, + "learning_rate": 0.0009189654597299252, + "loss": 2.8139, + "step": 6474 + }, + { + "epoch": 0.1920054562168253, + "grad_norm": 0.1547720730304718, + "learning_rate": 0.0009189397785754558, + "loss": 2.8184, + "step": 6475 + }, + { + "epoch": 0.1920351095691368, + "grad_norm": 0.15057484805583954, + "learning_rate": 0.0009189140937112026, + "loss": 2.8305, + "step": 6476 + }, + { + "epoch": 0.19206476292144828, + "grad_norm": 0.14857496321201324, + "learning_rate": 0.0009188884051373931, + "loss": 2.8546, + "step": 6477 + }, + { + "epoch": 0.19209441627375975, + "grad_norm": 0.15497151017189026, + "learning_rate": 0.000918862712854255, + "loss": 2.8023, + "step": 6478 + }, + { + "epoch": 0.19212406962607123, + "grad_norm": 0.14390550553798676, + "learning_rate": 0.0009188370168620158, + "loss": 2.8038, + "step": 6479 + }, + { + "epoch": 0.1921537229783827, + "grad_norm": 0.1456233263015747, + "learning_rate": 0.0009188113171609029, + "loss": 2.8387, + "step": 6480 + }, + { + "epoch": 0.19218337633069418, + "grad_norm": 0.13997042179107666, + "learning_rate": 0.0009187856137511436, + "loss": 2.8422, + "step": 6481 + }, + { + "epoch": 0.19221302968300566, + "grad_norm": 0.13204443454742432, + "learning_rate": 0.0009187599066329662, + "loss": 2.8469, + "step": 6482 + }, + { + "epoch": 0.19224268303531714, + "grad_norm": 0.1384567767381668, + "learning_rate": 0.0009187341958065977, + "loss": 2.8199, + "step": 6483 + }, + { + "epoch": 0.1922723363876286, + "grad_norm": 0.12928429245948792, + "learning_rate": 0.0009187084812722661, + "loss": 2.8212, + "step": 6484 + }, + { + "epoch": 0.1923019897399401, + "grad_norm": 0.13963909447193146, + "learning_rate": 0.000918682763030199, + "loss": 2.8304, + "step": 6485 + }, + { + "epoch": 0.1923316430922516, + "grad_norm": 0.14172063767910004, + "learning_rate": 0.0009186570410806241, + "loss": 2.8105, + "step": 6486 + }, + { + "epoch": 0.19236129644456307, + "grad_norm": 0.13508786261081696, + "learning_rate": 0.0009186313154237693, + "loss": 2.8334, + "step": 6487 + }, + { + "epoch": 0.19239094979687454, + "grad_norm": 0.14579181373119354, + "learning_rate": 0.0009186055860598624, + "loss": 2.8197, + "step": 6488 + }, + { + "epoch": 0.19242060314918602, + "grad_norm": 0.14926107227802277, + "learning_rate": 0.0009185798529891311, + "loss": 2.8181, + "step": 6489 + }, + { + "epoch": 0.1924502565014975, + "grad_norm": 0.14624334871768951, + "learning_rate": 0.0009185541162118036, + "loss": 2.8426, + "step": 6490 + }, + { + "epoch": 0.19247990985380897, + "grad_norm": 0.14880676567554474, + "learning_rate": 0.0009185283757281073, + "loss": 2.8313, + "step": 6491 + }, + { + "epoch": 0.19250956320612045, + "grad_norm": 0.15585440397262573, + "learning_rate": 0.0009185026315382704, + "loss": 2.8054, + "step": 6492 + }, + { + "epoch": 0.19253921655843192, + "grad_norm": 0.14955447614192963, + "learning_rate": 0.0009184768836425209, + "loss": 2.8615, + "step": 6493 + }, + { + "epoch": 0.1925688699107434, + "grad_norm": 0.17176342010498047, + "learning_rate": 0.0009184511320410868, + "loss": 2.8252, + "step": 6494 + }, + { + "epoch": 0.19259852326305488, + "grad_norm": 0.2118600457906723, + "learning_rate": 0.0009184253767341961, + "loss": 2.8276, + "step": 6495 + }, + { + "epoch": 0.19262817661536638, + "grad_norm": 0.2154221087694168, + "learning_rate": 0.0009183996177220768, + "loss": 2.8538, + "step": 6496 + }, + { + "epoch": 0.19265782996767786, + "grad_norm": 0.20014485716819763, + "learning_rate": 0.0009183738550049571, + "loss": 2.878, + "step": 6497 + }, + { + "epoch": 0.19268748331998933, + "grad_norm": 0.18705137073993683, + "learning_rate": 0.0009183480885830651, + "loss": 2.826, + "step": 6498 + }, + { + "epoch": 0.1927171366723008, + "grad_norm": 0.15335193276405334, + "learning_rate": 0.000918322318456629, + "loss": 2.8124, + "step": 6499 + }, + { + "epoch": 0.19274679002461229, + "grad_norm": 0.15804924070835114, + "learning_rate": 0.0009182965446258768, + "loss": 2.8739, + "step": 6500 + }, + { + "epoch": 0.19277644337692376, + "grad_norm": 0.13601058721542358, + "learning_rate": 0.0009182707670910372, + "loss": 2.8387, + "step": 6501 + }, + { + "epoch": 0.19280609672923524, + "grad_norm": 0.14781305193901062, + "learning_rate": 0.0009182449858523379, + "loss": 2.819, + "step": 6502 + }, + { + "epoch": 0.19283575008154671, + "grad_norm": 0.15194116532802582, + "learning_rate": 0.0009182192009100077, + "loss": 2.8597, + "step": 6503 + }, + { + "epoch": 0.1928654034338582, + "grad_norm": 0.14410634338855743, + "learning_rate": 0.0009181934122642746, + "loss": 2.8428, + "step": 6504 + }, + { + "epoch": 0.19289505678616967, + "grad_norm": 0.13130441308021545, + "learning_rate": 0.0009181676199153669, + "loss": 2.8258, + "step": 6505 + }, + { + "epoch": 0.19292471013848114, + "grad_norm": 0.12813158333301544, + "learning_rate": 0.0009181418238635134, + "loss": 2.7926, + "step": 6506 + }, + { + "epoch": 0.19295436349079265, + "grad_norm": 0.13174660503864288, + "learning_rate": 0.000918116024108942, + "loss": 2.8307, + "step": 6507 + }, + { + "epoch": 0.19298401684310412, + "grad_norm": 0.11884080618619919, + "learning_rate": 0.0009180902206518815, + "loss": 2.8542, + "step": 6508 + }, + { + "epoch": 0.1930136701954156, + "grad_norm": 0.12204288691282272, + "learning_rate": 0.0009180644134925604, + "loss": 2.8383, + "step": 6509 + }, + { + "epoch": 0.19304332354772707, + "grad_norm": 0.13358557224273682, + "learning_rate": 0.0009180386026312073, + "loss": 2.8211, + "step": 6510 + }, + { + "epoch": 0.19307297690003855, + "grad_norm": 0.15078936517238617, + "learning_rate": 0.0009180127880680504, + "loss": 2.831, + "step": 6511 + }, + { + "epoch": 0.19310263025235003, + "grad_norm": 0.15269489586353302, + "learning_rate": 0.0009179869698033186, + "loss": 2.7974, + "step": 6512 + }, + { + "epoch": 0.1931322836046615, + "grad_norm": 0.15057560801506042, + "learning_rate": 0.0009179611478372405, + "loss": 2.8177, + "step": 6513 + }, + { + "epoch": 0.19316193695697298, + "grad_norm": 0.1456524133682251, + "learning_rate": 0.0009179353221700446, + "loss": 2.853, + "step": 6514 + }, + { + "epoch": 0.19319159030928446, + "grad_norm": 0.14593136310577393, + "learning_rate": 0.0009179094928019596, + "loss": 2.8438, + "step": 6515 + }, + { + "epoch": 0.19322124366159593, + "grad_norm": 0.1456168293952942, + "learning_rate": 0.0009178836597332145, + "loss": 2.8214, + "step": 6516 + }, + { + "epoch": 0.19325089701390744, + "grad_norm": 0.14494796097278595, + "learning_rate": 0.0009178578229640377, + "loss": 2.8551, + "step": 6517 + }, + { + "epoch": 0.1932805503662189, + "grad_norm": 0.1443614363670349, + "learning_rate": 0.0009178319824946583, + "loss": 2.836, + "step": 6518 + }, + { + "epoch": 0.1933102037185304, + "grad_norm": 0.14336831867694855, + "learning_rate": 0.000917806138325305, + "loss": 2.8016, + "step": 6519 + }, + { + "epoch": 0.19333985707084186, + "grad_norm": 0.14009332656860352, + "learning_rate": 0.0009177802904562065, + "loss": 2.8045, + "step": 6520 + }, + { + "epoch": 0.19336951042315334, + "grad_norm": 0.14632916450500488, + "learning_rate": 0.0009177544388875918, + "loss": 2.8311, + "step": 6521 + }, + { + "epoch": 0.19339916377546482, + "grad_norm": 0.13691234588623047, + "learning_rate": 0.0009177285836196898, + "loss": 2.8147, + "step": 6522 + }, + { + "epoch": 0.1934288171277763, + "grad_norm": 0.15003442764282227, + "learning_rate": 0.0009177027246527296, + "loss": 2.8465, + "step": 6523 + }, + { + "epoch": 0.19345847048008777, + "grad_norm": 0.1586778312921524, + "learning_rate": 0.00091767686198694, + "loss": 2.8049, + "step": 6524 + }, + { + "epoch": 0.19348812383239924, + "grad_norm": 0.13749155402183533, + "learning_rate": 0.0009176509956225503, + "loss": 2.8215, + "step": 6525 + }, + { + "epoch": 0.19351777718471072, + "grad_norm": 0.12951438128948212, + "learning_rate": 0.0009176251255597892, + "loss": 2.8114, + "step": 6526 + }, + { + "epoch": 0.1935474305370222, + "grad_norm": 0.13339217007160187, + "learning_rate": 0.0009175992517988858, + "loss": 2.818, + "step": 6527 + }, + { + "epoch": 0.1935770838893337, + "grad_norm": 0.13292071223258972, + "learning_rate": 0.0009175733743400694, + "loss": 2.838, + "step": 6528 + }, + { + "epoch": 0.19360673724164518, + "grad_norm": 0.1529838889837265, + "learning_rate": 0.0009175474931835692, + "loss": 2.8359, + "step": 6529 + }, + { + "epoch": 0.19363639059395665, + "grad_norm": 0.16258031129837036, + "learning_rate": 0.0009175216083296142, + "loss": 2.781, + "step": 6530 + }, + { + "epoch": 0.19366604394626813, + "grad_norm": 0.1565817892551422, + "learning_rate": 0.0009174957197784338, + "loss": 2.8347, + "step": 6531 + }, + { + "epoch": 0.1936956972985796, + "grad_norm": 0.14778025448322296, + "learning_rate": 0.0009174698275302571, + "loss": 2.8528, + "step": 6532 + }, + { + "epoch": 0.19372535065089108, + "grad_norm": 0.18109886348247528, + "learning_rate": 0.0009174439315853133, + "loss": 2.7979, + "step": 6533 + }, + { + "epoch": 0.19375500400320256, + "grad_norm": 0.1843772530555725, + "learning_rate": 0.000917418031943832, + "loss": 2.8451, + "step": 6534 + }, + { + "epoch": 0.19378465735551403, + "grad_norm": 0.19518110156059265, + "learning_rate": 0.0009173921286060422, + "loss": 2.8444, + "step": 6535 + }, + { + "epoch": 0.1938143107078255, + "grad_norm": 0.1795390099287033, + "learning_rate": 0.0009173662215721737, + "loss": 2.8486, + "step": 6536 + }, + { + "epoch": 0.193843964060137, + "grad_norm": 0.15223179757595062, + "learning_rate": 0.0009173403108424554, + "loss": 2.8041, + "step": 6537 + }, + { + "epoch": 0.1938736174124485, + "grad_norm": 0.1765916496515274, + "learning_rate": 0.0009173143964171171, + "loss": 2.8049, + "step": 6538 + }, + { + "epoch": 0.19390327076475997, + "grad_norm": 0.16270217299461365, + "learning_rate": 0.0009172884782963884, + "loss": 2.8607, + "step": 6539 + }, + { + "epoch": 0.19393292411707144, + "grad_norm": 0.14291273057460785, + "learning_rate": 0.0009172625564804984, + "loss": 2.8365, + "step": 6540 + }, + { + "epoch": 0.19396257746938292, + "grad_norm": 0.15973153710365295, + "learning_rate": 0.0009172366309696768, + "loss": 2.8371, + "step": 6541 + }, + { + "epoch": 0.1939922308216944, + "grad_norm": 0.14523588120937347, + "learning_rate": 0.0009172107017641533, + "loss": 2.826, + "step": 6542 + }, + { + "epoch": 0.19402188417400587, + "grad_norm": 0.15845607221126556, + "learning_rate": 0.0009171847688641574, + "loss": 2.7814, + "step": 6543 + }, + { + "epoch": 0.19405153752631735, + "grad_norm": 0.16883929073810577, + "learning_rate": 0.0009171588322699187, + "loss": 2.8253, + "step": 6544 + }, + { + "epoch": 0.19408119087862882, + "grad_norm": 0.15936394035816193, + "learning_rate": 0.0009171328919816671, + "loss": 2.8368, + "step": 6545 + }, + { + "epoch": 0.1941108442309403, + "grad_norm": 0.13633601367473602, + "learning_rate": 0.0009171069479996319, + "loss": 2.8288, + "step": 6546 + }, + { + "epoch": 0.19414049758325178, + "grad_norm": 0.1481577754020691, + "learning_rate": 0.0009170810003240432, + "loss": 2.8509, + "step": 6547 + }, + { + "epoch": 0.19417015093556325, + "grad_norm": 0.1269376128911972, + "learning_rate": 0.0009170550489551308, + "loss": 2.8688, + "step": 6548 + }, + { + "epoch": 0.19419980428787476, + "grad_norm": 0.12344155460596085, + "learning_rate": 0.0009170290938931242, + "loss": 2.8153, + "step": 6549 + }, + { + "epoch": 0.19422945764018623, + "grad_norm": 0.14075946807861328, + "learning_rate": 0.0009170031351382535, + "loss": 2.8195, + "step": 6550 + }, + { + "epoch": 0.1942591109924977, + "grad_norm": 0.14447538554668427, + "learning_rate": 0.0009169771726907483, + "loss": 2.8238, + "step": 6551 + }, + { + "epoch": 0.19428876434480918, + "grad_norm": 0.14913439750671387, + "learning_rate": 0.0009169512065508388, + "loss": 2.8367, + "step": 6552 + }, + { + "epoch": 0.19431841769712066, + "grad_norm": 0.1601666957139969, + "learning_rate": 0.0009169252367187546, + "loss": 2.8404, + "step": 6553 + }, + { + "epoch": 0.19434807104943214, + "grad_norm": 0.14168573915958405, + "learning_rate": 0.0009168992631947261, + "loss": 2.844, + "step": 6554 + }, + { + "epoch": 0.1943777244017436, + "grad_norm": 0.15284273028373718, + "learning_rate": 0.0009168732859789829, + "loss": 2.8417, + "step": 6555 + }, + { + "epoch": 0.1944073777540551, + "grad_norm": 0.15792328119277954, + "learning_rate": 0.0009168473050717553, + "loss": 2.8386, + "step": 6556 + }, + { + "epoch": 0.19443703110636656, + "grad_norm": 0.1304796189069748, + "learning_rate": 0.0009168213204732732, + "loss": 2.8198, + "step": 6557 + }, + { + "epoch": 0.19446668445867804, + "grad_norm": 0.12913696467876434, + "learning_rate": 0.0009167953321837668, + "loss": 2.8121, + "step": 6558 + }, + { + "epoch": 0.19449633781098954, + "grad_norm": 0.12526513636112213, + "learning_rate": 0.0009167693402034662, + "loss": 2.7837, + "step": 6559 + }, + { + "epoch": 0.19452599116330102, + "grad_norm": 0.13904301822185516, + "learning_rate": 0.0009167433445326015, + "loss": 2.7993, + "step": 6560 + }, + { + "epoch": 0.1945556445156125, + "grad_norm": 0.1486036330461502, + "learning_rate": 0.000916717345171403, + "loss": 2.8505, + "step": 6561 + }, + { + "epoch": 0.19458529786792397, + "grad_norm": 0.14729805290699005, + "learning_rate": 0.0009166913421201009, + "loss": 2.8038, + "step": 6562 + }, + { + "epoch": 0.19461495122023545, + "grad_norm": 0.16228508949279785, + "learning_rate": 0.0009166653353789254, + "loss": 2.8205, + "step": 6563 + }, + { + "epoch": 0.19464460457254693, + "grad_norm": 0.18394269049167633, + "learning_rate": 0.0009166393249481069, + "loss": 2.8239, + "step": 6564 + }, + { + "epoch": 0.1946742579248584, + "grad_norm": 0.13658441603183746, + "learning_rate": 0.0009166133108278756, + "loss": 2.819, + "step": 6565 + }, + { + "epoch": 0.19470391127716988, + "grad_norm": 0.14388972520828247, + "learning_rate": 0.0009165872930184618, + "loss": 2.8137, + "step": 6566 + }, + { + "epoch": 0.19473356462948135, + "grad_norm": 0.15230587124824524, + "learning_rate": 0.0009165612715200962, + "loss": 2.7953, + "step": 6567 + }, + { + "epoch": 0.19476321798179283, + "grad_norm": 0.13953128457069397, + "learning_rate": 0.000916535246333009, + "loss": 2.8329, + "step": 6568 + }, + { + "epoch": 0.19479287133410433, + "grad_norm": 0.1263274848461151, + "learning_rate": 0.0009165092174574307, + "loss": 2.8137, + "step": 6569 + }, + { + "epoch": 0.1948225246864158, + "grad_norm": 0.13637614250183105, + "learning_rate": 0.0009164831848935917, + "loss": 2.8347, + "step": 6570 + }, + { + "epoch": 0.1948521780387273, + "grad_norm": 0.1270075887441635, + "learning_rate": 0.0009164571486417226, + "loss": 2.8098, + "step": 6571 + }, + { + "epoch": 0.19488183139103876, + "grad_norm": 0.13097545504570007, + "learning_rate": 0.0009164311087020541, + "loss": 2.8546, + "step": 6572 + }, + { + "epoch": 0.19491148474335024, + "grad_norm": 0.14042609930038452, + "learning_rate": 0.0009164050650748165, + "loss": 2.8449, + "step": 6573 + }, + { + "epoch": 0.19494113809566171, + "grad_norm": 0.15013273060321808, + "learning_rate": 0.0009163790177602407, + "loss": 2.8523, + "step": 6574 + }, + { + "epoch": 0.1949707914479732, + "grad_norm": 0.16116881370544434, + "learning_rate": 0.0009163529667585573, + "loss": 2.8345, + "step": 6575 + }, + { + "epoch": 0.19500044480028467, + "grad_norm": 0.15498870611190796, + "learning_rate": 0.0009163269120699968, + "loss": 2.813, + "step": 6576 + }, + { + "epoch": 0.19503009815259614, + "grad_norm": 0.13954021036624908, + "learning_rate": 0.00091630085369479, + "loss": 2.8353, + "step": 6577 + }, + { + "epoch": 0.19505975150490762, + "grad_norm": 0.16906681656837463, + "learning_rate": 0.0009162747916331678, + "loss": 2.8379, + "step": 6578 + }, + { + "epoch": 0.1950894048572191, + "grad_norm": 0.19688910245895386, + "learning_rate": 0.0009162487258853606, + "loss": 2.8482, + "step": 6579 + }, + { + "epoch": 0.1951190582095306, + "grad_norm": 0.18133407831192017, + "learning_rate": 0.0009162226564515997, + "loss": 2.8022, + "step": 6580 + }, + { + "epoch": 0.19514871156184208, + "grad_norm": 0.14578895270824432, + "learning_rate": 0.0009161965833321158, + "loss": 2.8259, + "step": 6581 + }, + { + "epoch": 0.19517836491415355, + "grad_norm": 0.15384049713611603, + "learning_rate": 0.0009161705065271395, + "loss": 2.8116, + "step": 6582 + }, + { + "epoch": 0.19520801826646503, + "grad_norm": 0.15569500625133514, + "learning_rate": 0.0009161444260369021, + "loss": 2.8121, + "step": 6583 + }, + { + "epoch": 0.1952376716187765, + "grad_norm": 0.13386739790439606, + "learning_rate": 0.0009161183418616343, + "loss": 2.8273, + "step": 6584 + }, + { + "epoch": 0.19526732497108798, + "grad_norm": 0.12017744034528732, + "learning_rate": 0.0009160922540015673, + "loss": 2.8343, + "step": 6585 + }, + { + "epoch": 0.19529697832339946, + "grad_norm": 0.14892986416816711, + "learning_rate": 0.0009160661624569318, + "loss": 2.8017, + "step": 6586 + }, + { + "epoch": 0.19532663167571093, + "grad_norm": 0.15214817225933075, + "learning_rate": 0.0009160400672279591, + "loss": 2.8227, + "step": 6587 + }, + { + "epoch": 0.1953562850280224, + "grad_norm": 0.1285906434059143, + "learning_rate": 0.0009160139683148801, + "loss": 2.7989, + "step": 6588 + }, + { + "epoch": 0.19538593838033388, + "grad_norm": 0.11845383793115616, + "learning_rate": 0.0009159878657179261, + "loss": 2.8456, + "step": 6589 + }, + { + "epoch": 0.1954155917326454, + "grad_norm": 0.1349756270647049, + "learning_rate": 0.0009159617594373281, + "loss": 2.8478, + "step": 6590 + }, + { + "epoch": 0.19544524508495686, + "grad_norm": 0.12177582830190659, + "learning_rate": 0.0009159356494733173, + "loss": 2.8104, + "step": 6591 + }, + { + "epoch": 0.19547489843726834, + "grad_norm": 0.11439836025238037, + "learning_rate": 0.0009159095358261249, + "loss": 2.822, + "step": 6592 + }, + { + "epoch": 0.19550455178957982, + "grad_norm": 0.12637858092784882, + "learning_rate": 0.0009158834184959824, + "loss": 2.8146, + "step": 6593 + }, + { + "epoch": 0.1955342051418913, + "grad_norm": 0.12938044965267181, + "learning_rate": 0.0009158572974831206, + "loss": 2.8152, + "step": 6594 + }, + { + "epoch": 0.19556385849420277, + "grad_norm": 0.15507182478904724, + "learning_rate": 0.000915831172787771, + "loss": 2.8195, + "step": 6595 + }, + { + "epoch": 0.19559351184651425, + "grad_norm": 0.16138236224651337, + "learning_rate": 0.0009158050444101652, + "loss": 2.8395, + "step": 6596 + }, + { + "epoch": 0.19562316519882572, + "grad_norm": 0.2013298124074936, + "learning_rate": 0.0009157789123505342, + "loss": 2.842, + "step": 6597 + }, + { + "epoch": 0.1956528185511372, + "grad_norm": 0.22627536952495575, + "learning_rate": 0.0009157527766091097, + "loss": 2.8311, + "step": 6598 + }, + { + "epoch": 0.19568247190344867, + "grad_norm": 0.19845211505889893, + "learning_rate": 0.0009157266371861229, + "loss": 2.8412, + "step": 6599 + }, + { + "epoch": 0.19571212525576015, + "grad_norm": 0.19280269742012024, + "learning_rate": 0.0009157004940818054, + "loss": 2.8286, + "step": 6600 + }, + { + "epoch": 0.19574177860807165, + "grad_norm": 0.20579758286476135, + "learning_rate": 0.0009156743472963887, + "loss": 2.8446, + "step": 6601 + }, + { + "epoch": 0.19577143196038313, + "grad_norm": 0.18626739084720612, + "learning_rate": 0.0009156481968301042, + "loss": 2.8518, + "step": 6602 + }, + { + "epoch": 0.1958010853126946, + "grad_norm": 0.15639083087444305, + "learning_rate": 0.0009156220426831839, + "loss": 2.8367, + "step": 6603 + }, + { + "epoch": 0.19583073866500608, + "grad_norm": 0.13546855747699738, + "learning_rate": 0.0009155958848558587, + "loss": 2.8101, + "step": 6604 + }, + { + "epoch": 0.19586039201731756, + "grad_norm": 0.13642264902591705, + "learning_rate": 0.0009155697233483608, + "loss": 2.8646, + "step": 6605 + }, + { + "epoch": 0.19589004536962903, + "grad_norm": 0.13272309303283691, + "learning_rate": 0.0009155435581609215, + "loss": 2.8552, + "step": 6606 + }, + { + "epoch": 0.1959196987219405, + "grad_norm": 0.15629155933856964, + "learning_rate": 0.0009155173892937727, + "loss": 2.8093, + "step": 6607 + }, + { + "epoch": 0.195949352074252, + "grad_norm": 0.15321296453475952, + "learning_rate": 0.0009154912167471463, + "loss": 2.8113, + "step": 6608 + }, + { + "epoch": 0.19597900542656346, + "grad_norm": 0.14143028855323792, + "learning_rate": 0.0009154650405212737, + "loss": 2.8086, + "step": 6609 + }, + { + "epoch": 0.19600865877887494, + "grad_norm": 0.1353796124458313, + "learning_rate": 0.0009154388606163868, + "loss": 2.8473, + "step": 6610 + }, + { + "epoch": 0.19603831213118644, + "grad_norm": 0.14852909743785858, + "learning_rate": 0.0009154126770327175, + "loss": 2.808, + "step": 6611 + }, + { + "epoch": 0.19606796548349792, + "grad_norm": 0.1353960782289505, + "learning_rate": 0.0009153864897704977, + "loss": 2.8412, + "step": 6612 + }, + { + "epoch": 0.1960976188358094, + "grad_norm": 0.12607979774475098, + "learning_rate": 0.0009153602988299592, + "loss": 2.8469, + "step": 6613 + }, + { + "epoch": 0.19612727218812087, + "grad_norm": 0.12373784184455872, + "learning_rate": 0.000915334104211334, + "loss": 2.814, + "step": 6614 + }, + { + "epoch": 0.19615692554043235, + "grad_norm": 0.13862624764442444, + "learning_rate": 0.0009153079059148541, + "loss": 2.82, + "step": 6615 + }, + { + "epoch": 0.19618657889274382, + "grad_norm": 0.13421796262264252, + "learning_rate": 0.0009152817039407513, + "loss": 2.7831, + "step": 6616 + }, + { + "epoch": 0.1962162322450553, + "grad_norm": 0.12311666458845139, + "learning_rate": 0.0009152554982892575, + "loss": 2.8307, + "step": 6617 + }, + { + "epoch": 0.19624588559736678, + "grad_norm": 0.12009917199611664, + "learning_rate": 0.0009152292889606053, + "loss": 2.8014, + "step": 6618 + }, + { + "epoch": 0.19627553894967825, + "grad_norm": 0.11355236172676086, + "learning_rate": 0.0009152030759550265, + "loss": 2.8607, + "step": 6619 + }, + { + "epoch": 0.19630519230198973, + "grad_norm": 0.1241588443517685, + "learning_rate": 0.000915176859272753, + "loss": 2.8436, + "step": 6620 + }, + { + "epoch": 0.19633484565430123, + "grad_norm": 0.1388566792011261, + "learning_rate": 0.0009151506389140173, + "loss": 2.8724, + "step": 6621 + }, + { + "epoch": 0.1963644990066127, + "grad_norm": 0.1364651918411255, + "learning_rate": 0.0009151244148790513, + "loss": 2.8036, + "step": 6622 + }, + { + "epoch": 0.19639415235892418, + "grad_norm": 0.15703541040420532, + "learning_rate": 0.0009150981871680875, + "loss": 2.8381, + "step": 6623 + }, + { + "epoch": 0.19642380571123566, + "grad_norm": 0.16103917360305786, + "learning_rate": 0.0009150719557813579, + "loss": 2.8429, + "step": 6624 + }, + { + "epoch": 0.19645345906354714, + "grad_norm": 0.1535198837518692, + "learning_rate": 0.0009150457207190947, + "loss": 2.8681, + "step": 6625 + }, + { + "epoch": 0.1964831124158586, + "grad_norm": 0.14861378073692322, + "learning_rate": 0.0009150194819815307, + "loss": 2.8468, + "step": 6626 + }, + { + "epoch": 0.1965127657681701, + "grad_norm": 0.12626594305038452, + "learning_rate": 0.0009149932395688979, + "loss": 2.8159, + "step": 6627 + }, + { + "epoch": 0.19654241912048157, + "grad_norm": 0.13327090442180634, + "learning_rate": 0.0009149669934814287, + "loss": 2.8117, + "step": 6628 + }, + { + "epoch": 0.19657207247279304, + "grad_norm": 0.14487463235855103, + "learning_rate": 0.0009149407437193556, + "loss": 2.832, + "step": 6629 + }, + { + "epoch": 0.19660172582510452, + "grad_norm": 0.14136436581611633, + "learning_rate": 0.0009149144902829107, + "loss": 2.8086, + "step": 6630 + }, + { + "epoch": 0.196631379177416, + "grad_norm": 0.148622065782547, + "learning_rate": 0.0009148882331723271, + "loss": 2.8129, + "step": 6631 + }, + { + "epoch": 0.1966610325297275, + "grad_norm": 0.1580083668231964, + "learning_rate": 0.0009148619723878369, + "loss": 2.7953, + "step": 6632 + }, + { + "epoch": 0.19669068588203897, + "grad_norm": 0.14927954971790314, + "learning_rate": 0.0009148357079296726, + "loss": 2.8201, + "step": 6633 + }, + { + "epoch": 0.19672033923435045, + "grad_norm": 0.1743207424879074, + "learning_rate": 0.000914809439798067, + "loss": 2.8503, + "step": 6634 + }, + { + "epoch": 0.19674999258666193, + "grad_norm": 0.19563409686088562, + "learning_rate": 0.0009147831679932525, + "loss": 2.8414, + "step": 6635 + }, + { + "epoch": 0.1967796459389734, + "grad_norm": 0.17058229446411133, + "learning_rate": 0.0009147568925154621, + "loss": 2.8219, + "step": 6636 + }, + { + "epoch": 0.19680929929128488, + "grad_norm": 0.18812358379364014, + "learning_rate": 0.0009147306133649281, + "loss": 2.8182, + "step": 6637 + }, + { + "epoch": 0.19683895264359635, + "grad_norm": 0.16028821468353271, + "learning_rate": 0.0009147043305418833, + "loss": 2.8403, + "step": 6638 + }, + { + "epoch": 0.19686860599590783, + "grad_norm": 0.15213367342948914, + "learning_rate": 0.0009146780440465605, + "loss": 2.806, + "step": 6639 + }, + { + "epoch": 0.1968982593482193, + "grad_norm": 0.15487292408943176, + "learning_rate": 0.0009146517538791924, + "loss": 2.8027, + "step": 6640 + }, + { + "epoch": 0.19692791270053078, + "grad_norm": 0.1515040248632431, + "learning_rate": 0.0009146254600400119, + "loss": 2.837, + "step": 6641 + }, + { + "epoch": 0.1969575660528423, + "grad_norm": 0.15310750901699066, + "learning_rate": 0.0009145991625292517, + "loss": 2.8235, + "step": 6642 + }, + { + "epoch": 0.19698721940515376, + "grad_norm": 0.15375663340091705, + "learning_rate": 0.0009145728613471448, + "loss": 2.8411, + "step": 6643 + }, + { + "epoch": 0.19701687275746524, + "grad_norm": 0.13316316902637482, + "learning_rate": 0.0009145465564939239, + "loss": 2.8209, + "step": 6644 + }, + { + "epoch": 0.19704652610977672, + "grad_norm": 0.12619808316230774, + "learning_rate": 0.0009145202479698223, + "loss": 2.8368, + "step": 6645 + }, + { + "epoch": 0.1970761794620882, + "grad_norm": 0.12405091524124146, + "learning_rate": 0.0009144939357750728, + "loss": 2.8326, + "step": 6646 + }, + { + "epoch": 0.19710583281439967, + "grad_norm": 0.12581831216812134, + "learning_rate": 0.0009144676199099082, + "loss": 2.8109, + "step": 6647 + }, + { + "epoch": 0.19713548616671114, + "grad_norm": 0.10951075702905655, + "learning_rate": 0.0009144413003745617, + "loss": 2.7674, + "step": 6648 + }, + { + "epoch": 0.19716513951902262, + "grad_norm": 0.1197105273604393, + "learning_rate": 0.0009144149771692664, + "loss": 2.828, + "step": 6649 + }, + { + "epoch": 0.1971947928713341, + "grad_norm": 0.12038338929414749, + "learning_rate": 0.0009143886502942553, + "loss": 2.8099, + "step": 6650 + }, + { + "epoch": 0.19722444622364557, + "grad_norm": 0.11315666884183884, + "learning_rate": 0.0009143623197497616, + "loss": 2.8538, + "step": 6651 + }, + { + "epoch": 0.19725409957595705, + "grad_norm": 0.13459934294223785, + "learning_rate": 0.0009143359855360184, + "loss": 2.8078, + "step": 6652 + }, + { + "epoch": 0.19728375292826855, + "grad_norm": 0.15626409649848938, + "learning_rate": 0.000914309647653259, + "loss": 2.8156, + "step": 6653 + }, + { + "epoch": 0.19731340628058003, + "grad_norm": 0.17292055487632751, + "learning_rate": 0.0009142833061017163, + "loss": 2.8243, + "step": 6654 + }, + { + "epoch": 0.1973430596328915, + "grad_norm": 0.15831463038921356, + "learning_rate": 0.0009142569608816243, + "loss": 2.8385, + "step": 6655 + }, + { + "epoch": 0.19737271298520298, + "grad_norm": 0.13787566125392914, + "learning_rate": 0.0009142306119932154, + "loss": 2.8257, + "step": 6656 + }, + { + "epoch": 0.19740236633751446, + "grad_norm": 0.13395367562770844, + "learning_rate": 0.0009142042594367235, + "loss": 2.7983, + "step": 6657 + }, + { + "epoch": 0.19743201968982593, + "grad_norm": 0.15423358976840973, + "learning_rate": 0.0009141779032123816, + "loss": 2.8392, + "step": 6658 + }, + { + "epoch": 0.1974616730421374, + "grad_norm": 0.1522010713815689, + "learning_rate": 0.0009141515433204235, + "loss": 2.8261, + "step": 6659 + }, + { + "epoch": 0.19749132639444889, + "grad_norm": 0.14659222960472107, + "learning_rate": 0.0009141251797610823, + "loss": 2.8072, + "step": 6660 + }, + { + "epoch": 0.19752097974676036, + "grad_norm": 0.15897981822490692, + "learning_rate": 0.0009140988125345915, + "loss": 2.8232, + "step": 6661 + }, + { + "epoch": 0.19755063309907184, + "grad_norm": 0.16656610369682312, + "learning_rate": 0.0009140724416411847, + "loss": 2.8349, + "step": 6662 + }, + { + "epoch": 0.19758028645138334, + "grad_norm": 0.18211469054222107, + "learning_rate": 0.0009140460670810954, + "loss": 2.8424, + "step": 6663 + }, + { + "epoch": 0.19760993980369482, + "grad_norm": 0.1467859148979187, + "learning_rate": 0.0009140196888545571, + "loss": 2.8463, + "step": 6664 + }, + { + "epoch": 0.1976395931560063, + "grad_norm": 0.1628090888261795, + "learning_rate": 0.0009139933069618033, + "loss": 2.84, + "step": 6665 + }, + { + "epoch": 0.19766924650831777, + "grad_norm": 0.16318382322788239, + "learning_rate": 0.0009139669214030677, + "loss": 2.811, + "step": 6666 + }, + { + "epoch": 0.19769889986062925, + "grad_norm": 0.16269610822200775, + "learning_rate": 0.0009139405321785841, + "loss": 2.8304, + "step": 6667 + }, + { + "epoch": 0.19772855321294072, + "grad_norm": 0.15867912769317627, + "learning_rate": 0.0009139141392885859, + "loss": 2.8539, + "step": 6668 + }, + { + "epoch": 0.1977582065652522, + "grad_norm": 0.1491471379995346, + "learning_rate": 0.000913887742733307, + "loss": 2.8082, + "step": 6669 + }, + { + "epoch": 0.19778785991756367, + "grad_norm": 0.13492725789546967, + "learning_rate": 0.0009138613425129811, + "loss": 2.8196, + "step": 6670 + }, + { + "epoch": 0.19781751326987515, + "grad_norm": 0.14089049398899078, + "learning_rate": 0.000913834938627842, + "loss": 2.8253, + "step": 6671 + }, + { + "epoch": 0.19784716662218663, + "grad_norm": 0.13077978789806366, + "learning_rate": 0.0009138085310781233, + "loss": 2.8136, + "step": 6672 + }, + { + "epoch": 0.19787681997449813, + "grad_norm": 0.14879903197288513, + "learning_rate": 0.0009137821198640592, + "loss": 2.8354, + "step": 6673 + }, + { + "epoch": 0.1979064733268096, + "grad_norm": 0.14538808166980743, + "learning_rate": 0.0009137557049858833, + "loss": 2.8419, + "step": 6674 + }, + { + "epoch": 0.19793612667912108, + "grad_norm": 0.17539215087890625, + "learning_rate": 0.0009137292864438298, + "loss": 2.8193, + "step": 6675 + }, + { + "epoch": 0.19796578003143256, + "grad_norm": 0.17167408764362335, + "learning_rate": 0.0009137028642381323, + "loss": 2.8411, + "step": 6676 + }, + { + "epoch": 0.19799543338374404, + "grad_norm": 0.1685623824596405, + "learning_rate": 0.000913676438369025, + "loss": 2.8605, + "step": 6677 + }, + { + "epoch": 0.1980250867360555, + "grad_norm": 0.15281157195568085, + "learning_rate": 0.0009136500088367418, + "loss": 2.8446, + "step": 6678 + }, + { + "epoch": 0.198054740088367, + "grad_norm": 0.1463252604007721, + "learning_rate": 0.0009136235756415168, + "loss": 2.8224, + "step": 6679 + }, + { + "epoch": 0.19808439344067846, + "grad_norm": 0.14843086898326874, + "learning_rate": 0.000913597138783584, + "loss": 2.7737, + "step": 6680 + }, + { + "epoch": 0.19811404679298994, + "grad_norm": 0.12581874430179596, + "learning_rate": 0.0009135706982631775, + "loss": 2.8406, + "step": 6681 + }, + { + "epoch": 0.19814370014530142, + "grad_norm": 0.11264852434396744, + "learning_rate": 0.0009135442540805315, + "loss": 2.817, + "step": 6682 + }, + { + "epoch": 0.1981733534976129, + "grad_norm": 0.14323021471500397, + "learning_rate": 0.0009135178062358802, + "loss": 2.8415, + "step": 6683 + }, + { + "epoch": 0.1982030068499244, + "grad_norm": 0.14455683529376984, + "learning_rate": 0.0009134913547294576, + "loss": 2.7993, + "step": 6684 + }, + { + "epoch": 0.19823266020223587, + "grad_norm": 0.1558757722377777, + "learning_rate": 0.0009134648995614982, + "loss": 2.8116, + "step": 6685 + }, + { + "epoch": 0.19826231355454735, + "grad_norm": 0.17471103370189667, + "learning_rate": 0.0009134384407322362, + "loss": 2.809, + "step": 6686 + }, + { + "epoch": 0.19829196690685882, + "grad_norm": 0.1687595099210739, + "learning_rate": 0.0009134119782419058, + "loss": 2.8136, + "step": 6687 + }, + { + "epoch": 0.1983216202591703, + "grad_norm": 0.1373337358236313, + "learning_rate": 0.0009133855120907414, + "loss": 2.8443, + "step": 6688 + }, + { + "epoch": 0.19835127361148178, + "grad_norm": 0.16682545840740204, + "learning_rate": 0.0009133590422789773, + "loss": 2.8386, + "step": 6689 + }, + { + "epoch": 0.19838092696379325, + "grad_norm": 0.1720351278781891, + "learning_rate": 0.0009133325688068479, + "loss": 2.8244, + "step": 6690 + }, + { + "epoch": 0.19841058031610473, + "grad_norm": 0.15010105073451996, + "learning_rate": 0.0009133060916745876, + "loss": 2.7971, + "step": 6691 + }, + { + "epoch": 0.1984402336684162, + "grad_norm": 0.12572930753231049, + "learning_rate": 0.000913279610882431, + "loss": 2.8038, + "step": 6692 + }, + { + "epoch": 0.19846988702072768, + "grad_norm": 0.13875997066497803, + "learning_rate": 0.0009132531264306126, + "loss": 2.8433, + "step": 6693 + }, + { + "epoch": 0.19849954037303918, + "grad_norm": 0.13179053366184235, + "learning_rate": 0.0009132266383193666, + "loss": 2.7986, + "step": 6694 + }, + { + "epoch": 0.19852919372535066, + "grad_norm": 0.12693941593170166, + "learning_rate": 0.000913200146548928, + "loss": 2.8458, + "step": 6695 + }, + { + "epoch": 0.19855884707766214, + "grad_norm": 0.13985255360603333, + "learning_rate": 0.000913173651119531, + "loss": 2.8583, + "step": 6696 + }, + { + "epoch": 0.1985885004299736, + "grad_norm": 0.138216033577919, + "learning_rate": 0.0009131471520314105, + "loss": 2.8441, + "step": 6697 + }, + { + "epoch": 0.1986181537822851, + "grad_norm": 0.12738965451717377, + "learning_rate": 0.0009131206492848012, + "loss": 2.8307, + "step": 6698 + }, + { + "epoch": 0.19864780713459657, + "grad_norm": 0.11759110540151596, + "learning_rate": 0.0009130941428799373, + "loss": 2.8112, + "step": 6699 + }, + { + "epoch": 0.19867746048690804, + "grad_norm": 0.1424088478088379, + "learning_rate": 0.0009130676328170542, + "loss": 2.8142, + "step": 6700 + }, + { + "epoch": 0.19870711383921952, + "grad_norm": 0.16101334989070892, + "learning_rate": 0.0009130411190963861, + "loss": 2.8438, + "step": 6701 + }, + { + "epoch": 0.198736767191531, + "grad_norm": 0.16083315014839172, + "learning_rate": 0.000913014601718168, + "loss": 2.7749, + "step": 6702 + }, + { + "epoch": 0.19876642054384247, + "grad_norm": 0.15272952616214752, + "learning_rate": 0.0009129880806826347, + "loss": 2.8488, + "step": 6703 + }, + { + "epoch": 0.19879607389615395, + "grad_norm": 0.13612087070941925, + "learning_rate": 0.0009129615559900213, + "loss": 2.8505, + "step": 6704 + }, + { + "epoch": 0.19882572724846545, + "grad_norm": 0.1341194361448288, + "learning_rate": 0.0009129350276405622, + "loss": 2.8615, + "step": 6705 + }, + { + "epoch": 0.19885538060077693, + "grad_norm": 0.1517026722431183, + "learning_rate": 0.0009129084956344927, + "loss": 2.8024, + "step": 6706 + }, + { + "epoch": 0.1988850339530884, + "grad_norm": 0.15864039957523346, + "learning_rate": 0.0009128819599720472, + "loss": 2.81, + "step": 6707 + }, + { + "epoch": 0.19891468730539988, + "grad_norm": 0.16484732925891876, + "learning_rate": 0.0009128554206534616, + "loss": 2.804, + "step": 6708 + }, + { + "epoch": 0.19894434065771135, + "grad_norm": 0.18182431161403656, + "learning_rate": 0.00091282887767897, + "loss": 2.8249, + "step": 6709 + }, + { + "epoch": 0.19897399401002283, + "grad_norm": 0.2080288529396057, + "learning_rate": 0.000912802331048808, + "loss": 2.861, + "step": 6710 + }, + { + "epoch": 0.1990036473623343, + "grad_norm": 0.20203062891960144, + "learning_rate": 0.0009127757807632106, + "loss": 2.8529, + "step": 6711 + }, + { + "epoch": 0.19903330071464578, + "grad_norm": 0.19154785573482513, + "learning_rate": 0.0009127492268224127, + "loss": 2.8201, + "step": 6712 + }, + { + "epoch": 0.19906295406695726, + "grad_norm": 0.17181339859962463, + "learning_rate": 0.0009127226692266495, + "loss": 2.8414, + "step": 6713 + }, + { + "epoch": 0.19909260741926874, + "grad_norm": 0.1574755162000656, + "learning_rate": 0.0009126961079761562, + "loss": 2.8444, + "step": 6714 + }, + { + "epoch": 0.19912226077158024, + "grad_norm": 0.14174248278141022, + "learning_rate": 0.0009126695430711681, + "loss": 2.848, + "step": 6715 + }, + { + "epoch": 0.19915191412389172, + "grad_norm": 0.14970114827156067, + "learning_rate": 0.0009126429745119203, + "loss": 2.8717, + "step": 6716 + }, + { + "epoch": 0.1991815674762032, + "grad_norm": 0.15718106925487518, + "learning_rate": 0.0009126164022986483, + "loss": 2.778, + "step": 6717 + }, + { + "epoch": 0.19921122082851467, + "grad_norm": 0.14651668071746826, + "learning_rate": 0.000912589826431587, + "loss": 2.8076, + "step": 6718 + }, + { + "epoch": 0.19924087418082614, + "grad_norm": 0.12465813755989075, + "learning_rate": 0.0009125632469109722, + "loss": 2.8371, + "step": 6719 + }, + { + "epoch": 0.19927052753313762, + "grad_norm": 0.13678386807441711, + "learning_rate": 0.0009125366637370389, + "loss": 2.8006, + "step": 6720 + }, + { + "epoch": 0.1993001808854491, + "grad_norm": 0.13116605579853058, + "learning_rate": 0.0009125100769100226, + "loss": 2.8254, + "step": 6721 + }, + { + "epoch": 0.19932983423776057, + "grad_norm": 0.11765478551387787, + "learning_rate": 0.0009124834864301588, + "loss": 2.7957, + "step": 6722 + }, + { + "epoch": 0.19935948759007205, + "grad_norm": 0.13713788986206055, + "learning_rate": 0.0009124568922976829, + "loss": 2.8165, + "step": 6723 + }, + { + "epoch": 0.19938914094238352, + "grad_norm": 0.15259209275245667, + "learning_rate": 0.0009124302945128305, + "loss": 2.7995, + "step": 6724 + }, + { + "epoch": 0.19941879429469503, + "grad_norm": 0.14089515805244446, + "learning_rate": 0.0009124036930758371, + "loss": 2.8358, + "step": 6725 + }, + { + "epoch": 0.1994484476470065, + "grad_norm": 0.11885819584131241, + "learning_rate": 0.000912377087986938, + "loss": 2.825, + "step": 6726 + }, + { + "epoch": 0.19947810099931798, + "grad_norm": 0.12931983172893524, + "learning_rate": 0.0009123504792463692, + "loss": 2.8586, + "step": 6727 + }, + { + "epoch": 0.19950775435162946, + "grad_norm": 0.13221189379692078, + "learning_rate": 0.000912323866854366, + "loss": 2.8239, + "step": 6728 + }, + { + "epoch": 0.19953740770394093, + "grad_norm": 0.1195969358086586, + "learning_rate": 0.0009122972508111642, + "loss": 2.805, + "step": 6729 + }, + { + "epoch": 0.1995670610562524, + "grad_norm": 0.13178405165672302, + "learning_rate": 0.0009122706311169994, + "loss": 2.8052, + "step": 6730 + }, + { + "epoch": 0.19959671440856389, + "grad_norm": 0.13762512803077698, + "learning_rate": 0.0009122440077721077, + "loss": 2.8041, + "step": 6731 + }, + { + "epoch": 0.19962636776087536, + "grad_norm": 0.13011358678340912, + "learning_rate": 0.0009122173807767243, + "loss": 2.7872, + "step": 6732 + }, + { + "epoch": 0.19965602111318684, + "grad_norm": 0.1298162341117859, + "learning_rate": 0.0009121907501310853, + "loss": 2.8355, + "step": 6733 + }, + { + "epoch": 0.19968567446549831, + "grad_norm": 0.14098455011844635, + "learning_rate": 0.0009121641158354264, + "loss": 2.8114, + "step": 6734 + }, + { + "epoch": 0.1997153278178098, + "grad_norm": 0.16281239688396454, + "learning_rate": 0.0009121374778899836, + "loss": 2.801, + "step": 6735 + }, + { + "epoch": 0.1997449811701213, + "grad_norm": 0.16788844764232635, + "learning_rate": 0.0009121108362949926, + "loss": 2.8021, + "step": 6736 + }, + { + "epoch": 0.19977463452243277, + "grad_norm": 0.1593323051929474, + "learning_rate": 0.0009120841910506894, + "loss": 2.8173, + "step": 6737 + }, + { + "epoch": 0.19980428787474425, + "grad_norm": 0.17786449193954468, + "learning_rate": 0.0009120575421573101, + "loss": 2.8277, + "step": 6738 + }, + { + "epoch": 0.19983394122705572, + "grad_norm": 0.18074704706668854, + "learning_rate": 0.0009120308896150904, + "loss": 2.8306, + "step": 6739 + }, + { + "epoch": 0.1998635945793672, + "grad_norm": 0.14277750253677368, + "learning_rate": 0.0009120042334242665, + "loss": 2.8432, + "step": 6740 + }, + { + "epoch": 0.19989324793167867, + "grad_norm": 0.17032934725284576, + "learning_rate": 0.0009119775735850744, + "loss": 2.8215, + "step": 6741 + }, + { + "epoch": 0.19992290128399015, + "grad_norm": 0.15125693380832672, + "learning_rate": 0.0009119509100977501, + "loss": 2.8296, + "step": 6742 + }, + { + "epoch": 0.19995255463630163, + "grad_norm": 0.16054819524288177, + "learning_rate": 0.0009119242429625298, + "loss": 2.8266, + "step": 6743 + }, + { + "epoch": 0.1999822079886131, + "grad_norm": 0.13764449954032898, + "learning_rate": 0.0009118975721796496, + "loss": 2.8135, + "step": 6744 + }, + { + "epoch": 0.20001186134092458, + "grad_norm": 0.1400553584098816, + "learning_rate": 0.0009118708977493457, + "loss": 2.8397, + "step": 6745 + }, + { + "epoch": 0.20004151469323608, + "grad_norm": 0.15417292714118958, + "learning_rate": 0.0009118442196718545, + "loss": 2.8161, + "step": 6746 + }, + { + "epoch": 0.20007116804554756, + "grad_norm": 0.16503262519836426, + "learning_rate": 0.0009118175379474119, + "loss": 2.7879, + "step": 6747 + }, + { + "epoch": 0.20010082139785904, + "grad_norm": 0.15998291969299316, + "learning_rate": 0.0009117908525762542, + "loss": 2.7975, + "step": 6748 + }, + { + "epoch": 0.2001304747501705, + "grad_norm": 0.1665981560945511, + "learning_rate": 0.0009117641635586181, + "loss": 2.8032, + "step": 6749 + }, + { + "epoch": 0.200160128102482, + "grad_norm": 0.16430245339870453, + "learning_rate": 0.0009117374708947394, + "loss": 2.8446, + "step": 6750 + }, + { + "epoch": 0.20018978145479346, + "grad_norm": 0.16017018258571625, + "learning_rate": 0.0009117107745848547, + "loss": 2.8238, + "step": 6751 + }, + { + "epoch": 0.20021943480710494, + "grad_norm": 0.15403960645198822, + "learning_rate": 0.0009116840746292006, + "loss": 2.8133, + "step": 6752 + }, + { + "epoch": 0.20024908815941642, + "grad_norm": 0.15549659729003906, + "learning_rate": 0.0009116573710280133, + "loss": 2.8236, + "step": 6753 + }, + { + "epoch": 0.2002787415117279, + "grad_norm": 0.14433376491069794, + "learning_rate": 0.0009116306637815293, + "loss": 2.8234, + "step": 6754 + }, + { + "epoch": 0.20030839486403937, + "grad_norm": 0.15684080123901367, + "learning_rate": 0.0009116039528899851, + "loss": 2.8236, + "step": 6755 + }, + { + "epoch": 0.20033804821635084, + "grad_norm": 0.17425301671028137, + "learning_rate": 0.0009115772383536171, + "loss": 2.8611, + "step": 6756 + }, + { + "epoch": 0.20036770156866235, + "grad_norm": 0.14678990840911865, + "learning_rate": 0.0009115505201726623, + "loss": 2.789, + "step": 6757 + }, + { + "epoch": 0.20039735492097382, + "grad_norm": 0.14292481541633606, + "learning_rate": 0.0009115237983473569, + "loss": 2.8416, + "step": 6758 + }, + { + "epoch": 0.2004270082732853, + "grad_norm": 0.1719241589307785, + "learning_rate": 0.0009114970728779376, + "loss": 2.8504, + "step": 6759 + }, + { + "epoch": 0.20045666162559678, + "grad_norm": 0.16033083200454712, + "learning_rate": 0.0009114703437646412, + "loss": 2.8375, + "step": 6760 + }, + { + "epoch": 0.20048631497790825, + "grad_norm": 0.13420844078063965, + "learning_rate": 0.000911443611007704, + "loss": 2.8477, + "step": 6761 + }, + { + "epoch": 0.20051596833021973, + "grad_norm": 0.12533244490623474, + "learning_rate": 0.0009114168746073633, + "loss": 2.8523, + "step": 6762 + }, + { + "epoch": 0.2005456216825312, + "grad_norm": 0.1421419233083725, + "learning_rate": 0.0009113901345638554, + "loss": 2.8305, + "step": 6763 + }, + { + "epoch": 0.20057527503484268, + "grad_norm": 0.14520443975925446, + "learning_rate": 0.0009113633908774171, + "loss": 2.8433, + "step": 6764 + }, + { + "epoch": 0.20060492838715416, + "grad_norm": 0.13234396278858185, + "learning_rate": 0.0009113366435482857, + "loss": 2.8313, + "step": 6765 + }, + { + "epoch": 0.20063458173946563, + "grad_norm": 0.11844821274280548, + "learning_rate": 0.0009113098925766975, + "loss": 2.8268, + "step": 6766 + }, + { + "epoch": 0.20066423509177714, + "grad_norm": 0.12648022174835205, + "learning_rate": 0.0009112831379628896, + "loss": 2.7918, + "step": 6767 + }, + { + "epoch": 0.20069388844408861, + "grad_norm": 0.13462889194488525, + "learning_rate": 0.0009112563797070989, + "loss": 2.8401, + "step": 6768 + }, + { + "epoch": 0.2007235417964001, + "grad_norm": 0.12314535677433014, + "learning_rate": 0.0009112296178095625, + "loss": 2.7909, + "step": 6769 + }, + { + "epoch": 0.20075319514871157, + "grad_norm": 0.11826517432928085, + "learning_rate": 0.0009112028522705171, + "loss": 2.7992, + "step": 6770 + }, + { + "epoch": 0.20078284850102304, + "grad_norm": 0.15898919105529785, + "learning_rate": 0.0009111760830902001, + "loss": 2.8656, + "step": 6771 + }, + { + "epoch": 0.20081250185333452, + "grad_norm": 0.1862787902355194, + "learning_rate": 0.000911149310268848, + "loss": 2.845, + "step": 6772 + }, + { + "epoch": 0.200842155205646, + "grad_norm": 0.2078879177570343, + "learning_rate": 0.0009111225338066984, + "loss": 2.8349, + "step": 6773 + }, + { + "epoch": 0.20087180855795747, + "grad_norm": 0.1739218533039093, + "learning_rate": 0.0009110957537039881, + "loss": 2.821, + "step": 6774 + }, + { + "epoch": 0.20090146191026895, + "grad_norm": 0.13177095353603363, + "learning_rate": 0.0009110689699609544, + "loss": 2.793, + "step": 6775 + }, + { + "epoch": 0.20093111526258042, + "grad_norm": 0.1496182233095169, + "learning_rate": 0.0009110421825778343, + "loss": 2.8383, + "step": 6776 + }, + { + "epoch": 0.20096076861489193, + "grad_norm": 0.1353423297405243, + "learning_rate": 0.0009110153915548654, + "loss": 2.8561, + "step": 6777 + }, + { + "epoch": 0.2009904219672034, + "grad_norm": 0.13127882778644562, + "learning_rate": 0.0009109885968922844, + "loss": 2.8071, + "step": 6778 + }, + { + "epoch": 0.20102007531951488, + "grad_norm": 0.16072791814804077, + "learning_rate": 0.0009109617985903289, + "loss": 2.8454, + "step": 6779 + }, + { + "epoch": 0.20104972867182636, + "grad_norm": 0.17473767697811127, + "learning_rate": 0.000910934996649236, + "loss": 2.843, + "step": 6780 + }, + { + "epoch": 0.20107938202413783, + "grad_norm": 0.16256675124168396, + "learning_rate": 0.0009109081910692434, + "loss": 2.8126, + "step": 6781 + }, + { + "epoch": 0.2011090353764493, + "grad_norm": 0.13535919785499573, + "learning_rate": 0.0009108813818505881, + "loss": 2.8272, + "step": 6782 + }, + { + "epoch": 0.20113868872876078, + "grad_norm": 0.1346476525068283, + "learning_rate": 0.0009108545689935076, + "loss": 2.8326, + "step": 6783 + }, + { + "epoch": 0.20116834208107226, + "grad_norm": 0.1378278285264969, + "learning_rate": 0.0009108277524982394, + "loss": 2.8323, + "step": 6784 + }, + { + "epoch": 0.20119799543338374, + "grad_norm": 0.13709381222724915, + "learning_rate": 0.000910800932365021, + "loss": 2.8102, + "step": 6785 + }, + { + "epoch": 0.2012276487856952, + "grad_norm": 0.13605250418186188, + "learning_rate": 0.0009107741085940897, + "loss": 2.8225, + "step": 6786 + }, + { + "epoch": 0.2012573021380067, + "grad_norm": 0.14102041721343994, + "learning_rate": 0.0009107472811856834, + "loss": 2.8225, + "step": 6787 + }, + { + "epoch": 0.2012869554903182, + "grad_norm": 0.13208313286304474, + "learning_rate": 0.0009107204501400392, + "loss": 2.8037, + "step": 6788 + }, + { + "epoch": 0.20131660884262967, + "grad_norm": 0.12564320862293243, + "learning_rate": 0.0009106936154573951, + "loss": 2.8451, + "step": 6789 + }, + { + "epoch": 0.20134626219494114, + "grad_norm": 0.10782328248023987, + "learning_rate": 0.0009106667771379883, + "loss": 2.8257, + "step": 6790 + }, + { + "epoch": 0.20137591554725262, + "grad_norm": 0.12559880316257477, + "learning_rate": 0.0009106399351820569, + "loss": 2.803, + "step": 6791 + }, + { + "epoch": 0.2014055688995641, + "grad_norm": 0.11457096040248871, + "learning_rate": 0.0009106130895898383, + "loss": 2.8188, + "step": 6792 + }, + { + "epoch": 0.20143522225187557, + "grad_norm": 0.10951679944992065, + "learning_rate": 0.0009105862403615703, + "loss": 2.8213, + "step": 6793 + }, + { + "epoch": 0.20146487560418705, + "grad_norm": 0.11144350469112396, + "learning_rate": 0.0009105593874974907, + "loss": 2.8002, + "step": 6794 + }, + { + "epoch": 0.20149452895649853, + "grad_norm": 0.12726178765296936, + "learning_rate": 0.0009105325309978372, + "loss": 2.8343, + "step": 6795 + }, + { + "epoch": 0.20152418230881, + "grad_norm": 0.15900976955890656, + "learning_rate": 0.0009105056708628477, + "loss": 2.8605, + "step": 6796 + }, + { + "epoch": 0.20155383566112148, + "grad_norm": 0.1669960618019104, + "learning_rate": 0.0009104788070927601, + "loss": 2.799, + "step": 6797 + }, + { + "epoch": 0.20158348901343298, + "grad_norm": 0.16269327700138092, + "learning_rate": 0.0009104519396878121, + "loss": 2.8296, + "step": 6798 + }, + { + "epoch": 0.20161314236574446, + "grad_norm": 0.18840916454792023, + "learning_rate": 0.0009104250686482418, + "loss": 2.8178, + "step": 6799 + }, + { + "epoch": 0.20164279571805593, + "grad_norm": 0.19476063549518585, + "learning_rate": 0.0009103981939742869, + "loss": 2.8041, + "step": 6800 + }, + { + "epoch": 0.2016724490703674, + "grad_norm": 0.19205304980278015, + "learning_rate": 0.0009103713156661858, + "loss": 2.8509, + "step": 6801 + }, + { + "epoch": 0.20170210242267889, + "grad_norm": 0.18241766095161438, + "learning_rate": 0.0009103444337241761, + "loss": 2.8218, + "step": 6802 + }, + { + "epoch": 0.20173175577499036, + "grad_norm": 0.1313452124595642, + "learning_rate": 0.0009103175481484961, + "loss": 2.8098, + "step": 6803 + }, + { + "epoch": 0.20176140912730184, + "grad_norm": 0.15416444838047028, + "learning_rate": 0.0009102906589393836, + "loss": 2.8017, + "step": 6804 + }, + { + "epoch": 0.20179106247961331, + "grad_norm": 0.1301732212305069, + "learning_rate": 0.0009102637660970772, + "loss": 2.828, + "step": 6805 + }, + { + "epoch": 0.2018207158319248, + "grad_norm": 0.13035215437412262, + "learning_rate": 0.0009102368696218144, + "loss": 2.8257, + "step": 6806 + }, + { + "epoch": 0.20185036918423627, + "grad_norm": 0.1281532347202301, + "learning_rate": 0.000910209969513834, + "loss": 2.7867, + "step": 6807 + }, + { + "epoch": 0.20188002253654774, + "grad_norm": 0.12232442200183868, + "learning_rate": 0.0009101830657733736, + "loss": 2.8083, + "step": 6808 + }, + { + "epoch": 0.20190967588885925, + "grad_norm": 0.13226225972175598, + "learning_rate": 0.0009101561584006719, + "loss": 2.8122, + "step": 6809 + }, + { + "epoch": 0.20193932924117072, + "grad_norm": 0.13513566553592682, + "learning_rate": 0.0009101292473959671, + "loss": 2.8035, + "step": 6810 + }, + { + "epoch": 0.2019689825934822, + "grad_norm": 0.15899991989135742, + "learning_rate": 0.0009101023327594972, + "loss": 2.8414, + "step": 6811 + }, + { + "epoch": 0.20199863594579368, + "grad_norm": 0.1961214244365692, + "learning_rate": 0.000910075414491501, + "loss": 2.798, + "step": 6812 + }, + { + "epoch": 0.20202828929810515, + "grad_norm": 0.1802399903535843, + "learning_rate": 0.0009100484925922166, + "loss": 2.8284, + "step": 6813 + }, + { + "epoch": 0.20205794265041663, + "grad_norm": 0.15522325038909912, + "learning_rate": 0.0009100215670618823, + "loss": 2.7892, + "step": 6814 + }, + { + "epoch": 0.2020875960027281, + "grad_norm": 0.16699853539466858, + "learning_rate": 0.0009099946379007367, + "loss": 2.8359, + "step": 6815 + }, + { + "epoch": 0.20211724935503958, + "grad_norm": 0.18309108912944794, + "learning_rate": 0.0009099677051090181, + "loss": 2.8257, + "step": 6816 + }, + { + "epoch": 0.20214690270735106, + "grad_norm": 0.1338694542646408, + "learning_rate": 0.0009099407686869651, + "loss": 2.8474, + "step": 6817 + }, + { + "epoch": 0.20217655605966253, + "grad_norm": 0.13751347362995148, + "learning_rate": 0.0009099138286348163, + "loss": 2.7925, + "step": 6818 + }, + { + "epoch": 0.20220620941197404, + "grad_norm": 0.1470232456922531, + "learning_rate": 0.0009098868849528101, + "loss": 2.8181, + "step": 6819 + }, + { + "epoch": 0.2022358627642855, + "grad_norm": 0.16492384672164917, + "learning_rate": 0.0009098599376411853, + "loss": 2.8114, + "step": 6820 + }, + { + "epoch": 0.202265516116597, + "grad_norm": 0.1562415361404419, + "learning_rate": 0.0009098329867001804, + "loss": 2.7814, + "step": 6821 + }, + { + "epoch": 0.20229516946890846, + "grad_norm": 0.14059390127658844, + "learning_rate": 0.0009098060321300341, + "loss": 2.8395, + "step": 6822 + }, + { + "epoch": 0.20232482282121994, + "grad_norm": 0.14643561840057373, + "learning_rate": 0.0009097790739309848, + "loss": 2.8069, + "step": 6823 + }, + { + "epoch": 0.20235447617353142, + "grad_norm": 0.1359582096338272, + "learning_rate": 0.0009097521121032717, + "loss": 2.8059, + "step": 6824 + }, + { + "epoch": 0.2023841295258429, + "grad_norm": 0.12468364089727402, + "learning_rate": 0.0009097251466471332, + "loss": 2.8269, + "step": 6825 + }, + { + "epoch": 0.20241378287815437, + "grad_norm": 0.1270361989736557, + "learning_rate": 0.0009096981775628082, + "loss": 2.8342, + "step": 6826 + }, + { + "epoch": 0.20244343623046585, + "grad_norm": 0.12684884667396545, + "learning_rate": 0.0009096712048505355, + "loss": 2.8111, + "step": 6827 + }, + { + "epoch": 0.20247308958277732, + "grad_norm": 0.13178029656410217, + "learning_rate": 0.0009096442285105542, + "loss": 2.8146, + "step": 6828 + }, + { + "epoch": 0.20250274293508883, + "grad_norm": 0.14375634491443634, + "learning_rate": 0.0009096172485431027, + "loss": 2.7906, + "step": 6829 + }, + { + "epoch": 0.2025323962874003, + "grad_norm": 0.1420823484659195, + "learning_rate": 0.0009095902649484202, + "loss": 2.8195, + "step": 6830 + }, + { + "epoch": 0.20256204963971178, + "grad_norm": 0.14577339589595795, + "learning_rate": 0.0009095632777267456, + "loss": 2.8103, + "step": 6831 + }, + { + "epoch": 0.20259170299202325, + "grad_norm": 0.14933143556118011, + "learning_rate": 0.0009095362868783179, + "loss": 2.8325, + "step": 6832 + }, + { + "epoch": 0.20262135634433473, + "grad_norm": 0.12471211701631546, + "learning_rate": 0.0009095092924033761, + "loss": 2.8097, + "step": 6833 + }, + { + "epoch": 0.2026510096966462, + "grad_norm": 0.12873169779777527, + "learning_rate": 0.0009094822943021591, + "loss": 2.8205, + "step": 6834 + }, + { + "epoch": 0.20268066304895768, + "grad_norm": 0.13488025963306427, + "learning_rate": 0.0009094552925749062, + "loss": 2.816, + "step": 6835 + }, + { + "epoch": 0.20271031640126916, + "grad_norm": 0.14211641252040863, + "learning_rate": 0.0009094282872218564, + "loss": 2.8421, + "step": 6836 + }, + { + "epoch": 0.20273996975358063, + "grad_norm": 0.13946066796779633, + "learning_rate": 0.0009094012782432488, + "loss": 2.8442, + "step": 6837 + }, + { + "epoch": 0.2027696231058921, + "grad_norm": 0.13857164978981018, + "learning_rate": 0.0009093742656393227, + "loss": 2.8403, + "step": 6838 + }, + { + "epoch": 0.2027992764582036, + "grad_norm": 0.14836132526397705, + "learning_rate": 0.0009093472494103171, + "loss": 2.8057, + "step": 6839 + }, + { + "epoch": 0.2028289298105151, + "grad_norm": 0.13550099730491638, + "learning_rate": 0.0009093202295564714, + "loss": 2.7755, + "step": 6840 + }, + { + "epoch": 0.20285858316282657, + "grad_norm": 0.16791827976703644, + "learning_rate": 0.0009092932060780248, + "loss": 2.7974, + "step": 6841 + }, + { + "epoch": 0.20288823651513804, + "grad_norm": 0.17932109534740448, + "learning_rate": 0.0009092661789752165, + "loss": 2.8194, + "step": 6842 + }, + { + "epoch": 0.20291788986744952, + "grad_norm": 0.16232453286647797, + "learning_rate": 0.0009092391482482861, + "loss": 2.8086, + "step": 6843 + }, + { + "epoch": 0.202947543219761, + "grad_norm": 0.1530010849237442, + "learning_rate": 0.0009092121138974727, + "loss": 2.8349, + "step": 6844 + }, + { + "epoch": 0.20297719657207247, + "grad_norm": 0.16567210853099823, + "learning_rate": 0.0009091850759230158, + "loss": 2.8303, + "step": 6845 + }, + { + "epoch": 0.20300684992438395, + "grad_norm": 0.15773043036460876, + "learning_rate": 0.0009091580343251549, + "loss": 2.8369, + "step": 6846 + }, + { + "epoch": 0.20303650327669542, + "grad_norm": 0.16227145493030548, + "learning_rate": 0.0009091309891041294, + "loss": 2.8123, + "step": 6847 + }, + { + "epoch": 0.2030661566290069, + "grad_norm": 0.1464497298002243, + "learning_rate": 0.0009091039402601786, + "loss": 2.8202, + "step": 6848 + }, + { + "epoch": 0.20309580998131838, + "grad_norm": 0.1335085928440094, + "learning_rate": 0.0009090768877935422, + "loss": 2.8546, + "step": 6849 + }, + { + "epoch": 0.20312546333362988, + "grad_norm": 0.1577998250722885, + "learning_rate": 0.00090904983170446, + "loss": 2.8346, + "step": 6850 + }, + { + "epoch": 0.20315511668594136, + "grad_norm": 0.14191709458827972, + "learning_rate": 0.000909022771993171, + "loss": 2.7875, + "step": 6851 + }, + { + "epoch": 0.20318477003825283, + "grad_norm": 0.12536634504795074, + "learning_rate": 0.0009089957086599154, + "loss": 2.8302, + "step": 6852 + }, + { + "epoch": 0.2032144233905643, + "grad_norm": 0.14942926168441772, + "learning_rate": 0.0009089686417049325, + "loss": 2.8012, + "step": 6853 + }, + { + "epoch": 0.20324407674287578, + "grad_norm": 0.14936044812202454, + "learning_rate": 0.000908941571128462, + "loss": 2.7999, + "step": 6854 + }, + { + "epoch": 0.20327373009518726, + "grad_norm": 0.14738896489143372, + "learning_rate": 0.0009089144969307437, + "loss": 2.8398, + "step": 6855 + }, + { + "epoch": 0.20330338344749874, + "grad_norm": 0.1360187828540802, + "learning_rate": 0.0009088874191120175, + "loss": 2.7873, + "step": 6856 + }, + { + "epoch": 0.2033330367998102, + "grad_norm": 0.13335369527339935, + "learning_rate": 0.0009088603376725228, + "loss": 2.8112, + "step": 6857 + }, + { + "epoch": 0.2033626901521217, + "grad_norm": 0.13243180513381958, + "learning_rate": 0.0009088332526124999, + "loss": 2.7919, + "step": 6858 + }, + { + "epoch": 0.20339234350443317, + "grad_norm": 0.13231447339057922, + "learning_rate": 0.0009088061639321881, + "loss": 2.8335, + "step": 6859 + }, + { + "epoch": 0.20342199685674464, + "grad_norm": 0.13512951135635376, + "learning_rate": 0.0009087790716318276, + "loss": 2.8226, + "step": 6860 + }, + { + "epoch": 0.20345165020905615, + "grad_norm": 0.1244971826672554, + "learning_rate": 0.0009087519757116585, + "loss": 2.8232, + "step": 6861 + }, + { + "epoch": 0.20348130356136762, + "grad_norm": 0.13240577280521393, + "learning_rate": 0.0009087248761719202, + "loss": 2.813, + "step": 6862 + }, + { + "epoch": 0.2035109569136791, + "grad_norm": 0.1491222083568573, + "learning_rate": 0.000908697773012853, + "loss": 2.822, + "step": 6863 + }, + { + "epoch": 0.20354061026599057, + "grad_norm": 0.19283975660800934, + "learning_rate": 0.0009086706662346971, + "loss": 2.8254, + "step": 6864 + }, + { + "epoch": 0.20357026361830205, + "grad_norm": 0.19163277745246887, + "learning_rate": 0.0009086435558376921, + "loss": 2.824, + "step": 6865 + }, + { + "epoch": 0.20359991697061353, + "grad_norm": 0.16423627734184265, + "learning_rate": 0.0009086164418220784, + "loss": 2.8152, + "step": 6866 + }, + { + "epoch": 0.203629570322925, + "grad_norm": 0.15914146602153778, + "learning_rate": 0.0009085893241880958, + "loss": 2.8072, + "step": 6867 + }, + { + "epoch": 0.20365922367523648, + "grad_norm": 0.17889104783535004, + "learning_rate": 0.0009085622029359847, + "loss": 2.8347, + "step": 6868 + }, + { + "epoch": 0.20368887702754795, + "grad_norm": 0.17194043099880219, + "learning_rate": 0.0009085350780659851, + "loss": 2.8022, + "step": 6869 + }, + { + "epoch": 0.20371853037985943, + "grad_norm": 0.15391948819160461, + "learning_rate": 0.0009085079495783374, + "loss": 2.7909, + "step": 6870 + }, + { + "epoch": 0.20374818373217093, + "grad_norm": 0.1586165577173233, + "learning_rate": 0.0009084808174732815, + "loss": 2.8351, + "step": 6871 + }, + { + "epoch": 0.2037778370844824, + "grad_norm": 0.14724643528461456, + "learning_rate": 0.000908453681751058, + "loss": 2.8204, + "step": 6872 + }, + { + "epoch": 0.2038074904367939, + "grad_norm": 0.16080735623836517, + "learning_rate": 0.0009084265424119069, + "loss": 2.856, + "step": 6873 + }, + { + "epoch": 0.20383714378910536, + "grad_norm": 0.14303797483444214, + "learning_rate": 0.0009083993994560689, + "loss": 2.8073, + "step": 6874 + }, + { + "epoch": 0.20386679714141684, + "grad_norm": 0.12656787037849426, + "learning_rate": 0.0009083722528837839, + "loss": 2.8003, + "step": 6875 + }, + { + "epoch": 0.20389645049372832, + "grad_norm": 0.12890179455280304, + "learning_rate": 0.0009083451026952926, + "loss": 2.867, + "step": 6876 + }, + { + "epoch": 0.2039261038460398, + "grad_norm": 0.13550877571105957, + "learning_rate": 0.0009083179488908353, + "loss": 2.8044, + "step": 6877 + }, + { + "epoch": 0.20395575719835127, + "grad_norm": 0.1394859403371811, + "learning_rate": 0.0009082907914706524, + "loss": 2.827, + "step": 6878 + }, + { + "epoch": 0.20398541055066274, + "grad_norm": 0.1395576447248459, + "learning_rate": 0.0009082636304349845, + "loss": 2.8084, + "step": 6879 + }, + { + "epoch": 0.20401506390297422, + "grad_norm": 0.12284260988235474, + "learning_rate": 0.0009082364657840721, + "loss": 2.8167, + "step": 6880 + }, + { + "epoch": 0.20404471725528572, + "grad_norm": 0.12483762204647064, + "learning_rate": 0.0009082092975181557, + "loss": 2.7959, + "step": 6881 + }, + { + "epoch": 0.2040743706075972, + "grad_norm": 0.11815569549798965, + "learning_rate": 0.000908182125637476, + "loss": 2.7962, + "step": 6882 + }, + { + "epoch": 0.20410402395990868, + "grad_norm": 0.13331244885921478, + "learning_rate": 0.0009081549501422734, + "loss": 2.7922, + "step": 6883 + }, + { + "epoch": 0.20413367731222015, + "grad_norm": 0.16023825109004974, + "learning_rate": 0.0009081277710327886, + "loss": 2.8016, + "step": 6884 + }, + { + "epoch": 0.20416333066453163, + "grad_norm": 0.13373684883117676, + "learning_rate": 0.0009081005883092625, + "loss": 2.7915, + "step": 6885 + }, + { + "epoch": 0.2041929840168431, + "grad_norm": 0.14277763664722443, + "learning_rate": 0.0009080734019719357, + "loss": 2.8048, + "step": 6886 + }, + { + "epoch": 0.20422263736915458, + "grad_norm": 0.13471044600009918, + "learning_rate": 0.0009080462120210486, + "loss": 2.7998, + "step": 6887 + }, + { + "epoch": 0.20425229072146606, + "grad_norm": 0.15373986959457397, + "learning_rate": 0.0009080190184568424, + "loss": 2.8087, + "step": 6888 + }, + { + "epoch": 0.20428194407377753, + "grad_norm": 0.1690552979707718, + "learning_rate": 0.000907991821279558, + "loss": 2.8467, + "step": 6889 + }, + { + "epoch": 0.204311597426089, + "grad_norm": 0.18467457592487335, + "learning_rate": 0.0009079646204894356, + "loss": 2.8072, + "step": 6890 + }, + { + "epoch": 0.20434125077840049, + "grad_norm": 0.19238628447055817, + "learning_rate": 0.0009079374160867167, + "loss": 2.8259, + "step": 6891 + }, + { + "epoch": 0.204370904130712, + "grad_norm": 0.1742044985294342, + "learning_rate": 0.0009079102080716418, + "loss": 2.7975, + "step": 6892 + }, + { + "epoch": 0.20440055748302347, + "grad_norm": 0.15933291614055634, + "learning_rate": 0.0009078829964444521, + "loss": 2.808, + "step": 6893 + }, + { + "epoch": 0.20443021083533494, + "grad_norm": 0.1616646945476532, + "learning_rate": 0.0009078557812053884, + "loss": 2.7819, + "step": 6894 + }, + { + "epoch": 0.20445986418764642, + "grad_norm": 0.162091463804245, + "learning_rate": 0.0009078285623546918, + "loss": 2.8053, + "step": 6895 + }, + { + "epoch": 0.2044895175399579, + "grad_norm": 0.17054854333400726, + "learning_rate": 0.0009078013398926032, + "loss": 2.833, + "step": 6896 + }, + { + "epoch": 0.20451917089226937, + "grad_norm": 0.1361563503742218, + "learning_rate": 0.0009077741138193638, + "loss": 2.8325, + "step": 6897 + }, + { + "epoch": 0.20454882424458085, + "grad_norm": 0.12823018431663513, + "learning_rate": 0.0009077468841352146, + "loss": 2.8353, + "step": 6898 + }, + { + "epoch": 0.20457847759689232, + "grad_norm": 0.13292846083641052, + "learning_rate": 0.0009077196508403967, + "loss": 2.8357, + "step": 6899 + }, + { + "epoch": 0.2046081309492038, + "grad_norm": 0.13989494740962982, + "learning_rate": 0.0009076924139351514, + "loss": 2.8152, + "step": 6900 + }, + { + "epoch": 0.20463778430151527, + "grad_norm": 0.16227293014526367, + "learning_rate": 0.0009076651734197198, + "loss": 2.8132, + "step": 6901 + }, + { + "epoch": 0.20466743765382678, + "grad_norm": 0.15774714946746826, + "learning_rate": 0.0009076379292943431, + "loss": 2.8128, + "step": 6902 + }, + { + "epoch": 0.20469709100613825, + "grad_norm": 0.17518530786037445, + "learning_rate": 0.0009076106815592624, + "loss": 2.825, + "step": 6903 + }, + { + "epoch": 0.20472674435844973, + "grad_norm": 0.1522204726934433, + "learning_rate": 0.0009075834302147194, + "loss": 2.7923, + "step": 6904 + }, + { + "epoch": 0.2047563977107612, + "grad_norm": 0.1372414380311966, + "learning_rate": 0.0009075561752609552, + "loss": 2.7937, + "step": 6905 + }, + { + "epoch": 0.20478605106307268, + "grad_norm": 0.1367798000574112, + "learning_rate": 0.0009075289166982108, + "loss": 2.8472, + "step": 6906 + }, + { + "epoch": 0.20481570441538416, + "grad_norm": 0.13095107674598694, + "learning_rate": 0.0009075016545267281, + "loss": 2.8278, + "step": 6907 + }, + { + "epoch": 0.20484535776769563, + "grad_norm": 0.11868097633123398, + "learning_rate": 0.0009074743887467482, + "loss": 2.8199, + "step": 6908 + }, + { + "epoch": 0.2048750111200071, + "grad_norm": 0.12601588666439056, + "learning_rate": 0.0009074471193585128, + "loss": 2.829, + "step": 6909 + }, + { + "epoch": 0.2049046644723186, + "grad_norm": 0.12369253486394882, + "learning_rate": 0.0009074198463622632, + "loss": 2.8236, + "step": 6910 + }, + { + "epoch": 0.20493431782463006, + "grad_norm": 0.13103342056274414, + "learning_rate": 0.0009073925697582408, + "loss": 2.8308, + "step": 6911 + }, + { + "epoch": 0.20496397117694154, + "grad_norm": 0.1366877406835556, + "learning_rate": 0.0009073652895466873, + "loss": 2.8158, + "step": 6912 + }, + { + "epoch": 0.20499362452925304, + "grad_norm": 0.13970622420310974, + "learning_rate": 0.0009073380057278442, + "loss": 2.8315, + "step": 6913 + }, + { + "epoch": 0.20502327788156452, + "grad_norm": 0.14215922355651855, + "learning_rate": 0.0009073107183019532, + "loss": 2.8347, + "step": 6914 + }, + { + "epoch": 0.205052931233876, + "grad_norm": 0.13941799104213715, + "learning_rate": 0.0009072834272692558, + "loss": 2.8291, + "step": 6915 + }, + { + "epoch": 0.20508258458618747, + "grad_norm": 0.14454710483551025, + "learning_rate": 0.000907256132629994, + "loss": 2.813, + "step": 6916 + }, + { + "epoch": 0.20511223793849895, + "grad_norm": 0.12562988698482513, + "learning_rate": 0.000907228834384409, + "loss": 2.8053, + "step": 6917 + }, + { + "epoch": 0.20514189129081042, + "grad_norm": 0.11985684931278229, + "learning_rate": 0.0009072015325327429, + "loss": 2.8457, + "step": 6918 + }, + { + "epoch": 0.2051715446431219, + "grad_norm": 0.12526927888393402, + "learning_rate": 0.0009071742270752373, + "loss": 2.8176, + "step": 6919 + }, + { + "epoch": 0.20520119799543338, + "grad_norm": 0.127656489610672, + "learning_rate": 0.0009071469180121339, + "loss": 2.8246, + "step": 6920 + }, + { + "epoch": 0.20523085134774485, + "grad_norm": 0.16015875339508057, + "learning_rate": 0.0009071196053436748, + "loss": 2.8598, + "step": 6921 + }, + { + "epoch": 0.20526050470005633, + "grad_norm": 0.17099495232105255, + "learning_rate": 0.0009070922890701017, + "loss": 2.8424, + "step": 6922 + }, + { + "epoch": 0.20529015805236783, + "grad_norm": 0.1835186630487442, + "learning_rate": 0.0009070649691916564, + "loss": 2.8019, + "step": 6923 + }, + { + "epoch": 0.2053198114046793, + "grad_norm": 0.18062502145767212, + "learning_rate": 0.000907037645708581, + "loss": 2.8078, + "step": 6924 + }, + { + "epoch": 0.20534946475699078, + "grad_norm": 0.16015832126140594, + "learning_rate": 0.0009070103186211174, + "loss": 2.815, + "step": 6925 + }, + { + "epoch": 0.20537911810930226, + "grad_norm": 0.1628064215183258, + "learning_rate": 0.0009069829879295075, + "loss": 2.836, + "step": 6926 + }, + { + "epoch": 0.20540877146161374, + "grad_norm": 0.16172200441360474, + "learning_rate": 0.0009069556536339935, + "loss": 2.8159, + "step": 6927 + }, + { + "epoch": 0.2054384248139252, + "grad_norm": 0.1639447659254074, + "learning_rate": 0.0009069283157348172, + "loss": 2.8186, + "step": 6928 + }, + { + "epoch": 0.2054680781662367, + "grad_norm": 0.16361993551254272, + "learning_rate": 0.0009069009742322208, + "loss": 2.8181, + "step": 6929 + }, + { + "epoch": 0.20549773151854817, + "grad_norm": 0.1862579733133316, + "learning_rate": 0.0009068736291264466, + "loss": 2.8178, + "step": 6930 + }, + { + "epoch": 0.20552738487085964, + "grad_norm": 0.17552539706230164, + "learning_rate": 0.0009068462804177364, + "loss": 2.8685, + "step": 6931 + }, + { + "epoch": 0.20555703822317112, + "grad_norm": 0.13692037761211395, + "learning_rate": 0.0009068189281063326, + "loss": 2.8555, + "step": 6932 + }, + { + "epoch": 0.20558669157548262, + "grad_norm": 0.14862941205501556, + "learning_rate": 0.0009067915721924775, + "loss": 2.8345, + "step": 6933 + }, + { + "epoch": 0.2056163449277941, + "grad_norm": 0.13209277391433716, + "learning_rate": 0.0009067642126764131, + "loss": 2.7953, + "step": 6934 + }, + { + "epoch": 0.20564599828010557, + "grad_norm": 0.13985709846019745, + "learning_rate": 0.0009067368495583818, + "loss": 2.8412, + "step": 6935 + }, + { + "epoch": 0.20567565163241705, + "grad_norm": 0.14780427515506744, + "learning_rate": 0.0009067094828386258, + "loss": 2.7986, + "step": 6936 + }, + { + "epoch": 0.20570530498472853, + "grad_norm": 0.14805030822753906, + "learning_rate": 0.0009066821125173877, + "loss": 2.8289, + "step": 6937 + }, + { + "epoch": 0.20573495833704, + "grad_norm": 0.13695459067821503, + "learning_rate": 0.0009066547385949095, + "loss": 2.8205, + "step": 6938 + }, + { + "epoch": 0.20576461168935148, + "grad_norm": 0.1410829871892929, + "learning_rate": 0.0009066273610714337, + "loss": 2.7948, + "step": 6939 + }, + { + "epoch": 0.20579426504166295, + "grad_norm": 0.12437128275632858, + "learning_rate": 0.0009065999799472031, + "loss": 2.802, + "step": 6940 + }, + { + "epoch": 0.20582391839397443, + "grad_norm": 0.12676270306110382, + "learning_rate": 0.0009065725952224597, + "loss": 2.8144, + "step": 6941 + }, + { + "epoch": 0.2058535717462859, + "grad_norm": 0.13632485270500183, + "learning_rate": 0.0009065452068974463, + "loss": 2.8212, + "step": 6942 + }, + { + "epoch": 0.20588322509859738, + "grad_norm": 0.12272214889526367, + "learning_rate": 0.0009065178149724051, + "loss": 2.8243, + "step": 6943 + }, + { + "epoch": 0.2059128784509089, + "grad_norm": 0.12165968865156174, + "learning_rate": 0.0009064904194475791, + "loss": 2.8504, + "step": 6944 + }, + { + "epoch": 0.20594253180322036, + "grad_norm": 0.14652633666992188, + "learning_rate": 0.0009064630203232107, + "loss": 2.8077, + "step": 6945 + }, + { + "epoch": 0.20597218515553184, + "grad_norm": 0.15948480367660522, + "learning_rate": 0.0009064356175995423, + "loss": 2.829, + "step": 6946 + }, + { + "epoch": 0.20600183850784332, + "grad_norm": 0.1708993911743164, + "learning_rate": 0.0009064082112768168, + "loss": 2.8193, + "step": 6947 + }, + { + "epoch": 0.2060314918601548, + "grad_norm": 0.17175519466400146, + "learning_rate": 0.0009063808013552768, + "loss": 2.8239, + "step": 6948 + }, + { + "epoch": 0.20606114521246627, + "grad_norm": 0.15951846539974213, + "learning_rate": 0.0009063533878351651, + "loss": 2.8362, + "step": 6949 + }, + { + "epoch": 0.20609079856477774, + "grad_norm": 0.15916843712329865, + "learning_rate": 0.0009063259707167244, + "loss": 2.7929, + "step": 6950 + }, + { + "epoch": 0.20612045191708922, + "grad_norm": 0.20271292328834534, + "learning_rate": 0.0009062985500001976, + "loss": 2.7967, + "step": 6951 + }, + { + "epoch": 0.2061501052694007, + "grad_norm": 0.20583476126194, + "learning_rate": 0.0009062711256858271, + "loss": 2.8074, + "step": 6952 + }, + { + "epoch": 0.20617975862171217, + "grad_norm": 0.16044345498085022, + "learning_rate": 0.0009062436977738563, + "loss": 2.8135, + "step": 6953 + }, + { + "epoch": 0.20620941197402368, + "grad_norm": 0.15798276662826538, + "learning_rate": 0.0009062162662645278, + "loss": 2.7841, + "step": 6954 + }, + { + "epoch": 0.20623906532633515, + "grad_norm": 0.16213276982307434, + "learning_rate": 0.0009061888311580844, + "loss": 2.8269, + "step": 6955 + }, + { + "epoch": 0.20626871867864663, + "grad_norm": 0.16520065069198608, + "learning_rate": 0.0009061613924547694, + "loss": 2.8217, + "step": 6956 + }, + { + "epoch": 0.2062983720309581, + "grad_norm": 0.13426734507083893, + "learning_rate": 0.0009061339501548253, + "loss": 2.8588, + "step": 6957 + }, + { + "epoch": 0.20632802538326958, + "grad_norm": 0.14108102023601532, + "learning_rate": 0.0009061065042584955, + "loss": 2.8289, + "step": 6958 + }, + { + "epoch": 0.20635767873558106, + "grad_norm": 0.1403336226940155, + "learning_rate": 0.0009060790547660229, + "loss": 2.8012, + "step": 6959 + }, + { + "epoch": 0.20638733208789253, + "grad_norm": 0.11623740196228027, + "learning_rate": 0.0009060516016776506, + "loss": 2.8398, + "step": 6960 + }, + { + "epoch": 0.206416985440204, + "grad_norm": 0.1294596940279007, + "learning_rate": 0.0009060241449936216, + "loss": 2.8116, + "step": 6961 + }, + { + "epoch": 0.20644663879251549, + "grad_norm": 0.13511934876441956, + "learning_rate": 0.0009059966847141791, + "loss": 2.7828, + "step": 6962 + }, + { + "epoch": 0.20647629214482696, + "grad_norm": 0.1270652711391449, + "learning_rate": 0.0009059692208395662, + "loss": 2.813, + "step": 6963 + }, + { + "epoch": 0.20650594549713844, + "grad_norm": 0.1274232268333435, + "learning_rate": 0.0009059417533700263, + "loss": 2.8477, + "step": 6964 + }, + { + "epoch": 0.20653559884944994, + "grad_norm": 0.13403509557247162, + "learning_rate": 0.0009059142823058024, + "loss": 2.8381, + "step": 6965 + }, + { + "epoch": 0.20656525220176142, + "grad_norm": 0.15395617485046387, + "learning_rate": 0.0009058868076471379, + "loss": 2.835, + "step": 6966 + }, + { + "epoch": 0.2065949055540729, + "grad_norm": 0.17274099588394165, + "learning_rate": 0.000905859329394276, + "loss": 2.8191, + "step": 6967 + }, + { + "epoch": 0.20662455890638437, + "grad_norm": 0.15403585135936737, + "learning_rate": 0.0009058318475474602, + "loss": 2.8163, + "step": 6968 + }, + { + "epoch": 0.20665421225869585, + "grad_norm": 0.15185320377349854, + "learning_rate": 0.0009058043621069336, + "loss": 2.8631, + "step": 6969 + }, + { + "epoch": 0.20668386561100732, + "grad_norm": 0.1665939837694168, + "learning_rate": 0.0009057768730729399, + "loss": 2.839, + "step": 6970 + }, + { + "epoch": 0.2067135189633188, + "grad_norm": 0.19065381586551666, + "learning_rate": 0.0009057493804457221, + "loss": 2.8139, + "step": 6971 + }, + { + "epoch": 0.20674317231563027, + "grad_norm": 0.18203605711460114, + "learning_rate": 0.0009057218842255239, + "loss": 2.8071, + "step": 6972 + }, + { + "epoch": 0.20677282566794175, + "grad_norm": 0.1602405458688736, + "learning_rate": 0.000905694384412589, + "loss": 2.8073, + "step": 6973 + }, + { + "epoch": 0.20680247902025323, + "grad_norm": 0.16696254909038544, + "learning_rate": 0.0009056668810071605, + "loss": 2.81, + "step": 6974 + }, + { + "epoch": 0.20683213237256473, + "grad_norm": 0.1652406007051468, + "learning_rate": 0.0009056393740094823, + "loss": 2.8053, + "step": 6975 + }, + { + "epoch": 0.2068617857248762, + "grad_norm": 0.14041200280189514, + "learning_rate": 0.0009056118634197976, + "loss": 2.8625, + "step": 6976 + }, + { + "epoch": 0.20689143907718768, + "grad_norm": 0.15348222851753235, + "learning_rate": 0.0009055843492383504, + "loss": 2.8058, + "step": 6977 + }, + { + "epoch": 0.20692109242949916, + "grad_norm": 0.16546571254730225, + "learning_rate": 0.0009055568314653841, + "loss": 2.8725, + "step": 6978 + }, + { + "epoch": 0.20695074578181064, + "grad_norm": 0.14347416162490845, + "learning_rate": 0.0009055293101011424, + "loss": 2.8063, + "step": 6979 + }, + { + "epoch": 0.2069803991341221, + "grad_norm": 0.13758036494255066, + "learning_rate": 0.0009055017851458691, + "loss": 2.8095, + "step": 6980 + }, + { + "epoch": 0.2070100524864336, + "grad_norm": 0.13517065346240997, + "learning_rate": 0.000905474256599808, + "loss": 2.8277, + "step": 6981 + }, + { + "epoch": 0.20703970583874506, + "grad_norm": 0.1277128905057907, + "learning_rate": 0.0009054467244632025, + "loss": 2.845, + "step": 6982 + }, + { + "epoch": 0.20706935919105654, + "grad_norm": 0.13662470877170563, + "learning_rate": 0.000905419188736297, + "loss": 2.8136, + "step": 6983 + }, + { + "epoch": 0.20709901254336802, + "grad_norm": 0.1392596811056137, + "learning_rate": 0.0009053916494193347, + "loss": 2.826, + "step": 6984 + }, + { + "epoch": 0.20712866589567952, + "grad_norm": 0.1285506933927536, + "learning_rate": 0.0009053641065125599, + "loss": 2.8301, + "step": 6985 + }, + { + "epoch": 0.207158319247991, + "grad_norm": 0.1374008059501648, + "learning_rate": 0.0009053365600162163, + "loss": 2.7834, + "step": 6986 + }, + { + "epoch": 0.20718797260030247, + "grad_norm": 0.1365123689174652, + "learning_rate": 0.0009053090099305479, + "loss": 2.8135, + "step": 6987 + }, + { + "epoch": 0.20721762595261395, + "grad_norm": 0.144785538315773, + "learning_rate": 0.0009052814562557987, + "loss": 2.8118, + "step": 6988 + }, + { + "epoch": 0.20724727930492542, + "grad_norm": 0.13693232834339142, + "learning_rate": 0.0009052538989922126, + "loss": 2.7873, + "step": 6989 + }, + { + "epoch": 0.2072769326572369, + "grad_norm": 0.15437792241573334, + "learning_rate": 0.0009052263381400336, + "loss": 2.8405, + "step": 6990 + }, + { + "epoch": 0.20730658600954838, + "grad_norm": 0.1710890680551529, + "learning_rate": 0.000905198773699506, + "loss": 2.8455, + "step": 6991 + }, + { + "epoch": 0.20733623936185985, + "grad_norm": 0.17711414396762848, + "learning_rate": 0.0009051712056708735, + "loss": 2.797, + "step": 6992 + }, + { + "epoch": 0.20736589271417133, + "grad_norm": 0.17784439027309418, + "learning_rate": 0.0009051436340543806, + "loss": 2.8266, + "step": 6993 + }, + { + "epoch": 0.2073955460664828, + "grad_norm": 0.16384725272655487, + "learning_rate": 0.0009051160588502712, + "loss": 2.8086, + "step": 6994 + }, + { + "epoch": 0.20742519941879428, + "grad_norm": 0.15052708983421326, + "learning_rate": 0.0009050884800587896, + "loss": 2.8235, + "step": 6995 + }, + { + "epoch": 0.20745485277110579, + "grad_norm": 0.17156627774238586, + "learning_rate": 0.0009050608976801798, + "loss": 2.8532, + "step": 6996 + }, + { + "epoch": 0.20748450612341726, + "grad_norm": 0.1845402717590332, + "learning_rate": 0.0009050333117146864, + "loss": 2.8175, + "step": 6997 + }, + { + "epoch": 0.20751415947572874, + "grad_norm": 0.16338877379894257, + "learning_rate": 0.0009050057221625533, + "loss": 2.8256, + "step": 6998 + }, + { + "epoch": 0.20754381282804021, + "grad_norm": 0.12696002423763275, + "learning_rate": 0.0009049781290240254, + "loss": 2.8288, + "step": 6999 + }, + { + "epoch": 0.2075734661803517, + "grad_norm": 0.1507212519645691, + "learning_rate": 0.0009049505322993463, + "loss": 2.8054, + "step": 7000 + }, + { + "epoch": 0.20760311953266317, + "grad_norm": 0.1461145132780075, + "learning_rate": 0.0009049229319887609, + "loss": 2.8231, + "step": 7001 + }, + { + "epoch": 0.20763277288497464, + "grad_norm": 0.14742842316627502, + "learning_rate": 0.0009048953280925134, + "loss": 2.8115, + "step": 7002 + }, + { + "epoch": 0.20766242623728612, + "grad_norm": 0.13092677295207977, + "learning_rate": 0.0009048677206108482, + "loss": 2.814, + "step": 7003 + }, + { + "epoch": 0.2076920795895976, + "grad_norm": 0.13607105612754822, + "learning_rate": 0.00090484010954401, + "loss": 2.8224, + "step": 7004 + }, + { + "epoch": 0.20772173294190907, + "grad_norm": 0.14390923082828522, + "learning_rate": 0.0009048124948922429, + "loss": 2.8358, + "step": 7005 + }, + { + "epoch": 0.20775138629422057, + "grad_norm": 0.15261989831924438, + "learning_rate": 0.0009047848766557917, + "loss": 2.8243, + "step": 7006 + }, + { + "epoch": 0.20778103964653205, + "grad_norm": 0.13539205491542816, + "learning_rate": 0.0009047572548349012, + "loss": 2.79, + "step": 7007 + }, + { + "epoch": 0.20781069299884353, + "grad_norm": 0.14404979348182678, + "learning_rate": 0.0009047296294298155, + "loss": 2.8006, + "step": 7008 + }, + { + "epoch": 0.207840346351155, + "grad_norm": 0.15226586163043976, + "learning_rate": 0.0009047020004407795, + "loss": 2.8064, + "step": 7009 + }, + { + "epoch": 0.20786999970346648, + "grad_norm": 0.174526646733284, + "learning_rate": 0.000904674367868038, + "loss": 2.8014, + "step": 7010 + }, + { + "epoch": 0.20789965305577796, + "grad_norm": 0.17531521618366241, + "learning_rate": 0.0009046467317118353, + "loss": 2.8191, + "step": 7011 + }, + { + "epoch": 0.20792930640808943, + "grad_norm": 0.15929743647575378, + "learning_rate": 0.0009046190919724164, + "loss": 2.8193, + "step": 7012 + }, + { + "epoch": 0.2079589597604009, + "grad_norm": 0.16458439826965332, + "learning_rate": 0.0009045914486500259, + "loss": 2.8084, + "step": 7013 + }, + { + "epoch": 0.20798861311271238, + "grad_norm": 0.15918946266174316, + "learning_rate": 0.0009045638017449089, + "loss": 2.839, + "step": 7014 + }, + { + "epoch": 0.20801826646502386, + "grad_norm": 0.12902076542377472, + "learning_rate": 0.0009045361512573098, + "loss": 2.8448, + "step": 7015 + }, + { + "epoch": 0.20804791981733534, + "grad_norm": 0.12710224092006683, + "learning_rate": 0.0009045084971874737, + "loss": 2.7931, + "step": 7016 + }, + { + "epoch": 0.20807757316964684, + "grad_norm": 0.11687172949314117, + "learning_rate": 0.0009044808395356455, + "loss": 2.8303, + "step": 7017 + }, + { + "epoch": 0.20810722652195832, + "grad_norm": 0.13661639392375946, + "learning_rate": 0.00090445317830207, + "loss": 2.8023, + "step": 7018 + }, + { + "epoch": 0.2081368798742698, + "grad_norm": 0.12500855326652527, + "learning_rate": 0.0009044255134869921, + "loss": 2.8122, + "step": 7019 + }, + { + "epoch": 0.20816653322658127, + "grad_norm": 0.14264929294586182, + "learning_rate": 0.0009043978450906569, + "loss": 2.8034, + "step": 7020 + }, + { + "epoch": 0.20819618657889274, + "grad_norm": 0.1487613022327423, + "learning_rate": 0.0009043701731133094, + "loss": 2.8224, + "step": 7021 + }, + { + "epoch": 0.20822583993120422, + "grad_norm": 0.15129354596138, + "learning_rate": 0.0009043424975551946, + "loss": 2.809, + "step": 7022 + }, + { + "epoch": 0.2082554932835157, + "grad_norm": 0.1596369743347168, + "learning_rate": 0.0009043148184165575, + "loss": 2.864, + "step": 7023 + }, + { + "epoch": 0.20828514663582717, + "grad_norm": 0.1555621325969696, + "learning_rate": 0.0009042871356976434, + "loss": 2.8161, + "step": 7024 + }, + { + "epoch": 0.20831479998813865, + "grad_norm": 0.15044602751731873, + "learning_rate": 0.0009042594493986972, + "loss": 2.8242, + "step": 7025 + }, + { + "epoch": 0.20834445334045013, + "grad_norm": 0.14959104359149933, + "learning_rate": 0.0009042317595199643, + "loss": 2.8346, + "step": 7026 + }, + { + "epoch": 0.20837410669276163, + "grad_norm": 0.13946713507175446, + "learning_rate": 0.0009042040660616897, + "loss": 2.8291, + "step": 7027 + }, + { + "epoch": 0.2084037600450731, + "grad_norm": 0.13680632412433624, + "learning_rate": 0.0009041763690241187, + "loss": 2.8445, + "step": 7028 + }, + { + "epoch": 0.20843341339738458, + "grad_norm": 0.14807890355587006, + "learning_rate": 0.0009041486684074967, + "loss": 2.808, + "step": 7029 + }, + { + "epoch": 0.20846306674969606, + "grad_norm": 0.1684316098690033, + "learning_rate": 0.0009041209642120687, + "loss": 2.8655, + "step": 7030 + }, + { + "epoch": 0.20849272010200753, + "grad_norm": 0.19567357003688812, + "learning_rate": 0.0009040932564380804, + "loss": 2.8168, + "step": 7031 + }, + { + "epoch": 0.208522373454319, + "grad_norm": 0.19790488481521606, + "learning_rate": 0.0009040655450857768, + "loss": 2.7944, + "step": 7032 + }, + { + "epoch": 0.20855202680663049, + "grad_norm": 0.1799730509519577, + "learning_rate": 0.0009040378301554034, + "loss": 2.813, + "step": 7033 + }, + { + "epoch": 0.20858168015894196, + "grad_norm": 0.13263936340808868, + "learning_rate": 0.0009040101116472057, + "loss": 2.7984, + "step": 7034 + }, + { + "epoch": 0.20861133351125344, + "grad_norm": 0.14758121967315674, + "learning_rate": 0.0009039823895614292, + "loss": 2.8221, + "step": 7035 + }, + { + "epoch": 0.20864098686356491, + "grad_norm": 0.14095798134803772, + "learning_rate": 0.0009039546638983192, + "loss": 2.8263, + "step": 7036 + }, + { + "epoch": 0.20867064021587642, + "grad_norm": 0.13924552500247955, + "learning_rate": 0.0009039269346581214, + "loss": 2.8163, + "step": 7037 + }, + { + "epoch": 0.2087002935681879, + "grad_norm": 0.1413441002368927, + "learning_rate": 0.0009038992018410813, + "loss": 2.791, + "step": 7038 + }, + { + "epoch": 0.20872994692049937, + "grad_norm": 0.1533185988664627, + "learning_rate": 0.0009038714654474443, + "loss": 2.8489, + "step": 7039 + }, + { + "epoch": 0.20875960027281085, + "grad_norm": 0.1494714617729187, + "learning_rate": 0.0009038437254774563, + "loss": 2.8061, + "step": 7040 + }, + { + "epoch": 0.20878925362512232, + "grad_norm": 0.14787504076957703, + "learning_rate": 0.0009038159819313627, + "loss": 2.8266, + "step": 7041 + }, + { + "epoch": 0.2088189069774338, + "grad_norm": 0.16160711646080017, + "learning_rate": 0.0009037882348094093, + "loss": 2.8507, + "step": 7042 + }, + { + "epoch": 0.20884856032974528, + "grad_norm": 0.15712551772594452, + "learning_rate": 0.0009037604841118416, + "loss": 2.7914, + "step": 7043 + }, + { + "epoch": 0.20887821368205675, + "grad_norm": 0.1522410362958908, + "learning_rate": 0.0009037327298389058, + "loss": 2.8146, + "step": 7044 + }, + { + "epoch": 0.20890786703436823, + "grad_norm": 0.142935112118721, + "learning_rate": 0.0009037049719908473, + "loss": 2.778, + "step": 7045 + }, + { + "epoch": 0.2089375203866797, + "grad_norm": 0.13453277945518494, + "learning_rate": 0.0009036772105679118, + "loss": 2.8141, + "step": 7046 + }, + { + "epoch": 0.20896717373899118, + "grad_norm": 0.1220959946513176, + "learning_rate": 0.0009036494455703455, + "loss": 2.8269, + "step": 7047 + }, + { + "epoch": 0.20899682709130268, + "grad_norm": 0.1292099803686142, + "learning_rate": 0.0009036216769983939, + "loss": 2.8163, + "step": 7048 + }, + { + "epoch": 0.20902648044361416, + "grad_norm": 0.12182233482599258, + "learning_rate": 0.0009035939048523032, + "loss": 2.8107, + "step": 7049 + }, + { + "epoch": 0.20905613379592564, + "grad_norm": 0.1215030699968338, + "learning_rate": 0.0009035661291323192, + "loss": 2.7889, + "step": 7050 + }, + { + "epoch": 0.2090857871482371, + "grad_norm": 0.12888431549072266, + "learning_rate": 0.0009035383498386878, + "loss": 2.8244, + "step": 7051 + }, + { + "epoch": 0.2091154405005486, + "grad_norm": 0.1415405422449112, + "learning_rate": 0.000903510566971655, + "loss": 2.8268, + "step": 7052 + }, + { + "epoch": 0.20914509385286006, + "grad_norm": 0.1474735140800476, + "learning_rate": 0.000903482780531467, + "loss": 2.8191, + "step": 7053 + }, + { + "epoch": 0.20917474720517154, + "grad_norm": 0.13842496275901794, + "learning_rate": 0.0009034549905183695, + "loss": 2.8034, + "step": 7054 + }, + { + "epoch": 0.20920440055748302, + "grad_norm": 0.15372635424137115, + "learning_rate": 0.0009034271969326092, + "loss": 2.837, + "step": 7055 + }, + { + "epoch": 0.2092340539097945, + "grad_norm": 0.14850816130638123, + "learning_rate": 0.0009033993997744314, + "loss": 2.8175, + "step": 7056 + }, + { + "epoch": 0.20926370726210597, + "grad_norm": 0.14153467118740082, + "learning_rate": 0.0009033715990440829, + "loss": 2.829, + "step": 7057 + }, + { + "epoch": 0.20929336061441747, + "grad_norm": 0.16339683532714844, + "learning_rate": 0.0009033437947418095, + "loss": 2.829, + "step": 7058 + }, + { + "epoch": 0.20932301396672895, + "grad_norm": 0.1625061184167862, + "learning_rate": 0.0009033159868678577, + "loss": 2.7876, + "step": 7059 + }, + { + "epoch": 0.20935266731904043, + "grad_norm": 0.15836146473884583, + "learning_rate": 0.0009032881754224737, + "loss": 2.7645, + "step": 7060 + }, + { + "epoch": 0.2093823206713519, + "grad_norm": 0.12919026613235474, + "learning_rate": 0.0009032603604059035, + "loss": 2.8045, + "step": 7061 + }, + { + "epoch": 0.20941197402366338, + "grad_norm": 0.13514356315135956, + "learning_rate": 0.0009032325418183937, + "loss": 2.8011, + "step": 7062 + }, + { + "epoch": 0.20944162737597485, + "grad_norm": 0.1371014565229416, + "learning_rate": 0.0009032047196601905, + "loss": 2.8194, + "step": 7063 + }, + { + "epoch": 0.20947128072828633, + "grad_norm": 0.15515770018100739, + "learning_rate": 0.0009031768939315402, + "loss": 2.8193, + "step": 7064 + }, + { + "epoch": 0.2095009340805978, + "grad_norm": 0.15736372768878937, + "learning_rate": 0.0009031490646326894, + "loss": 2.7524, + "step": 7065 + }, + { + "epoch": 0.20953058743290928, + "grad_norm": 0.1648901402950287, + "learning_rate": 0.0009031212317638843, + "loss": 2.8327, + "step": 7066 + }, + { + "epoch": 0.20956024078522076, + "grad_norm": 0.14886672794818878, + "learning_rate": 0.0009030933953253717, + "loss": 2.845, + "step": 7067 + }, + { + "epoch": 0.20958989413753223, + "grad_norm": 0.149927020072937, + "learning_rate": 0.0009030655553173978, + "loss": 2.8193, + "step": 7068 + }, + { + "epoch": 0.20961954748984374, + "grad_norm": 0.19100187718868256, + "learning_rate": 0.0009030377117402092, + "loss": 2.8051, + "step": 7069 + }, + { + "epoch": 0.20964920084215521, + "grad_norm": 0.19364339113235474, + "learning_rate": 0.0009030098645940526, + "loss": 2.8174, + "step": 7070 + }, + { + "epoch": 0.2096788541944667, + "grad_norm": 0.18457454442977905, + "learning_rate": 0.0009029820138791744, + "loss": 2.8358, + "step": 7071 + }, + { + "epoch": 0.20970850754677817, + "grad_norm": 0.1740489900112152, + "learning_rate": 0.0009029541595958211, + "loss": 2.8329, + "step": 7072 + }, + { + "epoch": 0.20973816089908964, + "grad_norm": 0.17348454892635345, + "learning_rate": 0.0009029263017442397, + "loss": 2.8305, + "step": 7073 + }, + { + "epoch": 0.20976781425140112, + "grad_norm": 0.16869999468326569, + "learning_rate": 0.000902898440324677, + "loss": 2.8407, + "step": 7074 + }, + { + "epoch": 0.2097974676037126, + "grad_norm": 0.15293584764003754, + "learning_rate": 0.0009028705753373791, + "loss": 2.8421, + "step": 7075 + }, + { + "epoch": 0.20982712095602407, + "grad_norm": 0.13862892985343933, + "learning_rate": 0.0009028427067825933, + "loss": 2.757, + "step": 7076 + }, + { + "epoch": 0.20985677430833555, + "grad_norm": 0.16815748810768127, + "learning_rate": 0.000902814834660566, + "loss": 2.8127, + "step": 7077 + }, + { + "epoch": 0.20988642766064702, + "grad_norm": 0.1449357569217682, + "learning_rate": 0.0009027869589715442, + "loss": 2.828, + "step": 7078 + }, + { + "epoch": 0.20991608101295853, + "grad_norm": 0.13752873241901398, + "learning_rate": 0.0009027590797157749, + "loss": 2.8126, + "step": 7079 + }, + { + "epoch": 0.20994573436527, + "grad_norm": 0.12825749814510345, + "learning_rate": 0.0009027311968935048, + "loss": 2.806, + "step": 7080 + }, + { + "epoch": 0.20997538771758148, + "grad_norm": 0.13291795551776886, + "learning_rate": 0.0009027033105049809, + "loss": 2.8145, + "step": 7081 + }, + { + "epoch": 0.21000504106989296, + "grad_norm": 0.12286065518856049, + "learning_rate": 0.00090267542055045, + "loss": 2.8192, + "step": 7082 + }, + { + "epoch": 0.21003469442220443, + "grad_norm": 0.11373299360275269, + "learning_rate": 0.000902647527030159, + "loss": 2.8427, + "step": 7083 + }, + { + "epoch": 0.2100643477745159, + "grad_norm": 0.11343652009963989, + "learning_rate": 0.000902619629944355, + "loss": 2.8257, + "step": 7084 + }, + { + "epoch": 0.21009400112682738, + "grad_norm": 0.11223365366458893, + "learning_rate": 0.0009025917292932853, + "loss": 2.8039, + "step": 7085 + }, + { + "epoch": 0.21012365447913886, + "grad_norm": 0.12416622042655945, + "learning_rate": 0.0009025638250771966, + "loss": 2.8226, + "step": 7086 + }, + { + "epoch": 0.21015330783145034, + "grad_norm": 0.12567339837551117, + "learning_rate": 0.0009025359172963361, + "loss": 2.8254, + "step": 7087 + }, + { + "epoch": 0.2101829611837618, + "grad_norm": 0.13608810305595398, + "learning_rate": 0.000902508005950951, + "loss": 2.8412, + "step": 7088 + }, + { + "epoch": 0.21021261453607332, + "grad_norm": 0.12495681643486023, + "learning_rate": 0.0009024800910412884, + "loss": 2.7681, + "step": 7089 + }, + { + "epoch": 0.2102422678883848, + "grad_norm": 0.12422282993793488, + "learning_rate": 0.0009024521725675956, + "loss": 2.8213, + "step": 7090 + }, + { + "epoch": 0.21027192124069627, + "grad_norm": 0.13611111044883728, + "learning_rate": 0.0009024242505301196, + "loss": 2.7958, + "step": 7091 + }, + { + "epoch": 0.21030157459300775, + "grad_norm": 0.14586415886878967, + "learning_rate": 0.0009023963249291078, + "loss": 2.8287, + "step": 7092 + }, + { + "epoch": 0.21033122794531922, + "grad_norm": 0.16893257200717926, + "learning_rate": 0.0009023683957648077, + "loss": 2.8042, + "step": 7093 + }, + { + "epoch": 0.2103608812976307, + "grad_norm": 0.17321360111236572, + "learning_rate": 0.0009023404630374661, + "loss": 2.8025, + "step": 7094 + }, + { + "epoch": 0.21039053464994217, + "grad_norm": 0.14861759543418884, + "learning_rate": 0.0009023125267473308, + "loss": 2.8276, + "step": 7095 + }, + { + "epoch": 0.21042018800225365, + "grad_norm": 0.15224094688892365, + "learning_rate": 0.000902284586894649, + "loss": 2.7839, + "step": 7096 + }, + { + "epoch": 0.21044984135456513, + "grad_norm": 0.13438856601715088, + "learning_rate": 0.0009022566434796679, + "loss": 2.8098, + "step": 7097 + }, + { + "epoch": 0.2104794947068766, + "grad_norm": 0.13437707722187042, + "learning_rate": 0.0009022286965026356, + "loss": 2.8055, + "step": 7098 + }, + { + "epoch": 0.21050914805918808, + "grad_norm": 0.13547466695308685, + "learning_rate": 0.0009022007459637989, + "loss": 2.8189, + "step": 7099 + }, + { + "epoch": 0.21053880141149958, + "grad_norm": 0.13338764011859894, + "learning_rate": 0.0009021727918634055, + "loss": 2.8257, + "step": 7100 + }, + { + "epoch": 0.21056845476381106, + "grad_norm": 0.14646808803081512, + "learning_rate": 0.0009021448342017032, + "loss": 2.7938, + "step": 7101 + }, + { + "epoch": 0.21059810811612253, + "grad_norm": 0.15206162631511688, + "learning_rate": 0.000902116872978939, + "loss": 2.8018, + "step": 7102 + }, + { + "epoch": 0.210627761468434, + "grad_norm": 0.12438754737377167, + "learning_rate": 0.0009020889081953611, + "loss": 2.8013, + "step": 7103 + }, + { + "epoch": 0.2106574148207455, + "grad_norm": 0.12769660353660583, + "learning_rate": 0.000902060939851217, + "loss": 2.8011, + "step": 7104 + }, + { + "epoch": 0.21068706817305696, + "grad_norm": 0.13834764063358307, + "learning_rate": 0.0009020329679467543, + "loss": 2.8117, + "step": 7105 + }, + { + "epoch": 0.21071672152536844, + "grad_norm": 0.15872406959533691, + "learning_rate": 0.0009020049924822204, + "loss": 2.8345, + "step": 7106 + }, + { + "epoch": 0.21074637487767992, + "grad_norm": 0.16828669607639313, + "learning_rate": 0.0009019770134578635, + "loss": 2.8262, + "step": 7107 + }, + { + "epoch": 0.2107760282299914, + "grad_norm": 0.1597074717283249, + "learning_rate": 0.0009019490308739311, + "loss": 2.8275, + "step": 7108 + }, + { + "epoch": 0.21080568158230287, + "grad_norm": 0.16009250283241272, + "learning_rate": 0.000901921044730671, + "loss": 2.8497, + "step": 7109 + }, + { + "epoch": 0.21083533493461437, + "grad_norm": 0.1598990112543106, + "learning_rate": 0.000901893055028331, + "loss": 2.8026, + "step": 7110 + }, + { + "epoch": 0.21086498828692585, + "grad_norm": 0.16681046783924103, + "learning_rate": 0.000901865061767159, + "loss": 2.788, + "step": 7111 + }, + { + "epoch": 0.21089464163923732, + "grad_norm": 0.15987001359462738, + "learning_rate": 0.0009018370649474031, + "loss": 2.8204, + "step": 7112 + }, + { + "epoch": 0.2109242949915488, + "grad_norm": 0.15314435958862305, + "learning_rate": 0.0009018090645693109, + "loss": 2.8438, + "step": 7113 + }, + { + "epoch": 0.21095394834386028, + "grad_norm": 0.14299891889095306, + "learning_rate": 0.0009017810606331305, + "loss": 2.835, + "step": 7114 + }, + { + "epoch": 0.21098360169617175, + "grad_norm": 0.14233951270580292, + "learning_rate": 0.0009017530531391098, + "loss": 2.8141, + "step": 7115 + }, + { + "epoch": 0.21101325504848323, + "grad_norm": 0.14096778631210327, + "learning_rate": 0.0009017250420874968, + "loss": 2.7957, + "step": 7116 + }, + { + "epoch": 0.2110429084007947, + "grad_norm": 0.15200038254261017, + "learning_rate": 0.0009016970274785396, + "loss": 2.8115, + "step": 7117 + }, + { + "epoch": 0.21107256175310618, + "grad_norm": 0.181008443236351, + "learning_rate": 0.0009016690093124865, + "loss": 2.8086, + "step": 7118 + }, + { + "epoch": 0.21110221510541766, + "grad_norm": 0.2103833258152008, + "learning_rate": 0.0009016409875895852, + "loss": 2.8247, + "step": 7119 + }, + { + "epoch": 0.21113186845772913, + "grad_norm": 0.19400840997695923, + "learning_rate": 0.0009016129623100839, + "loss": 2.8069, + "step": 7120 + }, + { + "epoch": 0.21116152181004064, + "grad_norm": 0.16345804929733276, + "learning_rate": 0.000901584933474231, + "loss": 2.848, + "step": 7121 + }, + { + "epoch": 0.2111911751623521, + "grad_norm": 0.1542389988899231, + "learning_rate": 0.0009015569010822746, + "loss": 2.832, + "step": 7122 + }, + { + "epoch": 0.2112208285146636, + "grad_norm": 0.15764109790325165, + "learning_rate": 0.000901528865134463, + "loss": 2.8481, + "step": 7123 + }, + { + "epoch": 0.21125048186697507, + "grad_norm": 0.15115611255168915, + "learning_rate": 0.0009015008256310442, + "loss": 2.8089, + "step": 7124 + }, + { + "epoch": 0.21128013521928654, + "grad_norm": 0.13817396759986877, + "learning_rate": 0.0009014727825722668, + "loss": 2.8374, + "step": 7125 + }, + { + "epoch": 0.21130978857159802, + "grad_norm": 0.1311454474925995, + "learning_rate": 0.0009014447359583789, + "loss": 2.8184, + "step": 7126 + }, + { + "epoch": 0.2113394419239095, + "grad_norm": 0.13864468038082123, + "learning_rate": 0.0009014166857896291, + "loss": 2.7955, + "step": 7127 + }, + { + "epoch": 0.21136909527622097, + "grad_norm": 0.1429157555103302, + "learning_rate": 0.0009013886320662656, + "loss": 2.7977, + "step": 7128 + }, + { + "epoch": 0.21139874862853245, + "grad_norm": 0.1348477452993393, + "learning_rate": 0.0009013605747885367, + "loss": 2.8109, + "step": 7129 + }, + { + "epoch": 0.21142840198084392, + "grad_norm": 0.14766213297843933, + "learning_rate": 0.0009013325139566911, + "loss": 2.8256, + "step": 7130 + }, + { + "epoch": 0.21145805533315543, + "grad_norm": 0.13184115290641785, + "learning_rate": 0.0009013044495709772, + "loss": 2.8426, + "step": 7131 + }, + { + "epoch": 0.2114877086854669, + "grad_norm": 0.12423963099718094, + "learning_rate": 0.0009012763816316436, + "loss": 2.8007, + "step": 7132 + }, + { + "epoch": 0.21151736203777838, + "grad_norm": 0.1228209063410759, + "learning_rate": 0.0009012483101389388, + "loss": 2.7903, + "step": 7133 + }, + { + "epoch": 0.21154701539008985, + "grad_norm": 0.14615529775619507, + "learning_rate": 0.0009012202350931112, + "loss": 2.8331, + "step": 7134 + }, + { + "epoch": 0.21157666874240133, + "grad_norm": 0.15983060002326965, + "learning_rate": 0.0009011921564944096, + "loss": 2.8125, + "step": 7135 + }, + { + "epoch": 0.2116063220947128, + "grad_norm": 0.15227247774600983, + "learning_rate": 0.0009011640743430827, + "loss": 2.8442, + "step": 7136 + }, + { + "epoch": 0.21163597544702428, + "grad_norm": 0.12950168550014496, + "learning_rate": 0.0009011359886393789, + "loss": 2.8474, + "step": 7137 + }, + { + "epoch": 0.21166562879933576, + "grad_norm": 0.1205458790063858, + "learning_rate": 0.0009011078993835471, + "loss": 2.8227, + "step": 7138 + }, + { + "epoch": 0.21169528215164723, + "grad_norm": 0.13049404323101044, + "learning_rate": 0.0009010798065758361, + "loss": 2.8139, + "step": 7139 + }, + { + "epoch": 0.2117249355039587, + "grad_norm": 0.14220665395259857, + "learning_rate": 0.0009010517102164944, + "loss": 2.8242, + "step": 7140 + }, + { + "epoch": 0.21175458885627021, + "grad_norm": 0.13338130712509155, + "learning_rate": 0.000901023610305771, + "loss": 2.812, + "step": 7141 + }, + { + "epoch": 0.2117842422085817, + "grad_norm": 0.1211945116519928, + "learning_rate": 0.0009009955068439148, + "loss": 2.8035, + "step": 7142 + }, + { + "epoch": 0.21181389556089317, + "grad_norm": 0.14226220548152924, + "learning_rate": 0.0009009673998311745, + "loss": 2.8256, + "step": 7143 + }, + { + "epoch": 0.21184354891320464, + "grad_norm": 0.15735842287540436, + "learning_rate": 0.0009009392892677991, + "loss": 2.8463, + "step": 7144 + }, + { + "epoch": 0.21187320226551612, + "grad_norm": 0.17956092953681946, + "learning_rate": 0.0009009111751540374, + "loss": 2.8279, + "step": 7145 + }, + { + "epoch": 0.2119028556178276, + "grad_norm": 0.16427728533744812, + "learning_rate": 0.0009008830574901385, + "loss": 2.8043, + "step": 7146 + }, + { + "epoch": 0.21193250897013907, + "grad_norm": 0.15866614878177643, + "learning_rate": 0.0009008549362763512, + "loss": 2.8017, + "step": 7147 + }, + { + "epoch": 0.21196216232245055, + "grad_norm": 0.21316313743591309, + "learning_rate": 0.0009008268115129248, + "loss": 2.7846, + "step": 7148 + }, + { + "epoch": 0.21199181567476202, + "grad_norm": 0.24479150772094727, + "learning_rate": 0.000900798683200108, + "loss": 2.7843, + "step": 7149 + }, + { + "epoch": 0.2120214690270735, + "grad_norm": 0.21202364563941956, + "learning_rate": 0.0009007705513381503, + "loss": 2.8089, + "step": 7150 + }, + { + "epoch": 0.21205112237938498, + "grad_norm": 0.1674509048461914, + "learning_rate": 0.0009007424159273004, + "loss": 2.8212, + "step": 7151 + }, + { + "epoch": 0.21208077573169648, + "grad_norm": 0.15432532131671906, + "learning_rate": 0.0009007142769678076, + "loss": 2.792, + "step": 7152 + }, + { + "epoch": 0.21211042908400796, + "grad_norm": 0.1664082556962967, + "learning_rate": 0.0009006861344599212, + "loss": 2.8329, + "step": 7153 + }, + { + "epoch": 0.21214008243631943, + "grad_norm": 0.14728815853595734, + "learning_rate": 0.0009006579884038902, + "loss": 2.7955, + "step": 7154 + }, + { + "epoch": 0.2121697357886309, + "grad_norm": 0.13577742874622345, + "learning_rate": 0.0009006298387999641, + "loss": 2.7946, + "step": 7155 + }, + { + "epoch": 0.21219938914094238, + "grad_norm": 0.14323867857456207, + "learning_rate": 0.0009006016856483918, + "loss": 2.824, + "step": 7156 + }, + { + "epoch": 0.21222904249325386, + "grad_norm": 0.14118319749832153, + "learning_rate": 0.000900573528949423, + "loss": 2.8102, + "step": 7157 + }, + { + "epoch": 0.21225869584556534, + "grad_norm": 0.14072026312351227, + "learning_rate": 0.0009005453687033067, + "loss": 2.7857, + "step": 7158 + }, + { + "epoch": 0.2122883491978768, + "grad_norm": 0.12077198922634125, + "learning_rate": 0.0009005172049102925, + "loss": 2.807, + "step": 7159 + }, + { + "epoch": 0.2123180025501883, + "grad_norm": 0.10677710920572281, + "learning_rate": 0.0009004890375706296, + "loss": 2.8198, + "step": 7160 + }, + { + "epoch": 0.21234765590249977, + "grad_norm": 0.1234811395406723, + "learning_rate": 0.0009004608666845677, + "loss": 2.8087, + "step": 7161 + }, + { + "epoch": 0.21237730925481127, + "grad_norm": 0.12482442706823349, + "learning_rate": 0.000900432692252356, + "loss": 2.7831, + "step": 7162 + }, + { + "epoch": 0.21240696260712275, + "grad_norm": 0.1306990534067154, + "learning_rate": 0.0009004045142742441, + "loss": 2.8075, + "step": 7163 + }, + { + "epoch": 0.21243661595943422, + "grad_norm": 0.14574961364269257, + "learning_rate": 0.0009003763327504815, + "loss": 2.8108, + "step": 7164 + }, + { + "epoch": 0.2124662693117457, + "grad_norm": 0.14187273383140564, + "learning_rate": 0.0009003481476813175, + "loss": 2.8106, + "step": 7165 + }, + { + "epoch": 0.21249592266405717, + "grad_norm": 0.16449467837810516, + "learning_rate": 0.0009003199590670023, + "loss": 2.7923, + "step": 7166 + }, + { + "epoch": 0.21252557601636865, + "grad_norm": 0.1761566549539566, + "learning_rate": 0.000900291766907785, + "loss": 2.8212, + "step": 7167 + }, + { + "epoch": 0.21255522936868013, + "grad_norm": 0.17447303235530853, + "learning_rate": 0.0009002635712039153, + "loss": 2.8386, + "step": 7168 + }, + { + "epoch": 0.2125848827209916, + "grad_norm": 0.18267539143562317, + "learning_rate": 0.0009002353719556431, + "loss": 2.8308, + "step": 7169 + }, + { + "epoch": 0.21261453607330308, + "grad_norm": 0.17537811398506165, + "learning_rate": 0.0009002071691632179, + "loss": 2.801, + "step": 7170 + }, + { + "epoch": 0.21264418942561455, + "grad_norm": 0.15004540979862213, + "learning_rate": 0.0009001789628268896, + "loss": 2.7917, + "step": 7171 + }, + { + "epoch": 0.21267384277792603, + "grad_norm": 0.14232969284057617, + "learning_rate": 0.0009001507529469079, + "loss": 2.7993, + "step": 7172 + }, + { + "epoch": 0.21270349613023753, + "grad_norm": 0.1475883573293686, + "learning_rate": 0.0009001225395235225, + "loss": 2.8466, + "step": 7173 + }, + { + "epoch": 0.212733149482549, + "grad_norm": 0.13939973711967468, + "learning_rate": 0.0009000943225569833, + "loss": 2.8276, + "step": 7174 + }, + { + "epoch": 0.2127628028348605, + "grad_norm": 0.13575242459774017, + "learning_rate": 0.0009000661020475404, + "loss": 2.8484, + "step": 7175 + }, + { + "epoch": 0.21279245618717196, + "grad_norm": 0.13547424972057343, + "learning_rate": 0.0009000378779954433, + "loss": 2.8129, + "step": 7176 + }, + { + "epoch": 0.21282210953948344, + "grad_norm": 0.13177898526191711, + "learning_rate": 0.0009000096504009423, + "loss": 2.7812, + "step": 7177 + }, + { + "epoch": 0.21285176289179492, + "grad_norm": 0.13014614582061768, + "learning_rate": 0.000899981419264287, + "loss": 2.7938, + "step": 7178 + }, + { + "epoch": 0.2128814162441064, + "grad_norm": 0.14227071404457092, + "learning_rate": 0.0008999531845857278, + "loss": 2.825, + "step": 7179 + }, + { + "epoch": 0.21291106959641787, + "grad_norm": 0.14530494809150696, + "learning_rate": 0.0008999249463655144, + "loss": 2.8427, + "step": 7180 + }, + { + "epoch": 0.21294072294872934, + "grad_norm": 0.13525943458080292, + "learning_rate": 0.0008998967046038968, + "loss": 2.7789, + "step": 7181 + }, + { + "epoch": 0.21297037630104082, + "grad_norm": 0.16209696233272552, + "learning_rate": 0.0008998684593011255, + "loss": 2.8196, + "step": 7182 + }, + { + "epoch": 0.21300002965335232, + "grad_norm": 0.15333335101604462, + "learning_rate": 0.0008998402104574501, + "loss": 2.8328, + "step": 7183 + }, + { + "epoch": 0.2130296830056638, + "grad_norm": 0.1437486708164215, + "learning_rate": 0.0008998119580731211, + "loss": 2.7993, + "step": 7184 + }, + { + "epoch": 0.21305933635797528, + "grad_norm": 0.15029320120811462, + "learning_rate": 0.0008997837021483887, + "loss": 2.7889, + "step": 7185 + }, + { + "epoch": 0.21308898971028675, + "grad_norm": 0.16290555894374847, + "learning_rate": 0.0008997554426835028, + "loss": 2.846, + "step": 7186 + }, + { + "epoch": 0.21311864306259823, + "grad_norm": 0.15892693400382996, + "learning_rate": 0.000899727179678714, + "loss": 2.8419, + "step": 7187 + }, + { + "epoch": 0.2131482964149097, + "grad_norm": 0.11925125867128372, + "learning_rate": 0.0008996989131342723, + "loss": 2.8071, + "step": 7188 + }, + { + "epoch": 0.21317794976722118, + "grad_norm": 0.1226305142045021, + "learning_rate": 0.0008996706430504282, + "loss": 2.7777, + "step": 7189 + }, + { + "epoch": 0.21320760311953266, + "grad_norm": 0.1624002903699875, + "learning_rate": 0.000899642369427432, + "loss": 2.8032, + "step": 7190 + }, + { + "epoch": 0.21323725647184413, + "grad_norm": 0.1465461105108261, + "learning_rate": 0.0008996140922655338, + "loss": 2.79, + "step": 7191 + }, + { + "epoch": 0.2132669098241556, + "grad_norm": 0.13465912640094757, + "learning_rate": 0.0008995858115649844, + "loss": 2.7827, + "step": 7192 + }, + { + "epoch": 0.2132965631764671, + "grad_norm": 0.13532628118991852, + "learning_rate": 0.0008995575273260341, + "loss": 2.822, + "step": 7193 + }, + { + "epoch": 0.2133262165287786, + "grad_norm": 0.11959359049797058, + "learning_rate": 0.0008995292395489331, + "loss": 2.8156, + "step": 7194 + }, + { + "epoch": 0.21335586988109007, + "grad_norm": 0.11193086951971054, + "learning_rate": 0.0008995009482339323, + "loss": 2.764, + "step": 7195 + }, + { + "epoch": 0.21338552323340154, + "grad_norm": 0.14714668691158295, + "learning_rate": 0.000899472653381282, + "loss": 2.838, + "step": 7196 + }, + { + "epoch": 0.21341517658571302, + "grad_norm": 0.15529251098632812, + "learning_rate": 0.0008994443549912328, + "loss": 2.8112, + "step": 7197 + }, + { + "epoch": 0.2134448299380245, + "grad_norm": 0.12426760792732239, + "learning_rate": 0.0008994160530640351, + "loss": 2.845, + "step": 7198 + }, + { + "epoch": 0.21347448329033597, + "grad_norm": 0.1261923909187317, + "learning_rate": 0.0008993877475999399, + "loss": 2.8135, + "step": 7199 + }, + { + "epoch": 0.21350413664264745, + "grad_norm": 0.13883575797080994, + "learning_rate": 0.0008993594385991974, + "loss": 2.8538, + "step": 7200 + }, + { + "epoch": 0.21353378999495892, + "grad_norm": 0.12357383966445923, + "learning_rate": 0.0008993311260620588, + "loss": 2.8195, + "step": 7201 + }, + { + "epoch": 0.2135634433472704, + "grad_norm": 0.1470268815755844, + "learning_rate": 0.0008993028099887743, + "loss": 2.7953, + "step": 7202 + }, + { + "epoch": 0.21359309669958187, + "grad_norm": 0.19493551552295685, + "learning_rate": 0.000899274490379595, + "loss": 2.8246, + "step": 7203 + }, + { + "epoch": 0.21362275005189338, + "grad_norm": 0.19643259048461914, + "learning_rate": 0.0008992461672347716, + "loss": 2.7624, + "step": 7204 + }, + { + "epoch": 0.21365240340420485, + "grad_norm": 0.1847076714038849, + "learning_rate": 0.0008992178405545548, + "loss": 2.8447, + "step": 7205 + }, + { + "epoch": 0.21368205675651633, + "grad_norm": 0.1649637222290039, + "learning_rate": 0.0008991895103391956, + "loss": 2.8106, + "step": 7206 + }, + { + "epoch": 0.2137117101088278, + "grad_norm": 0.16908709704875946, + "learning_rate": 0.0008991611765889446, + "loss": 2.8192, + "step": 7207 + }, + { + "epoch": 0.21374136346113928, + "grad_norm": 0.1792735606431961, + "learning_rate": 0.000899132839304053, + "loss": 2.8274, + "step": 7208 + }, + { + "epoch": 0.21377101681345076, + "grad_norm": 0.14551043510437012, + "learning_rate": 0.0008991044984847714, + "loss": 2.8134, + "step": 7209 + }, + { + "epoch": 0.21380067016576224, + "grad_norm": 0.1700258105993271, + "learning_rate": 0.0008990761541313511, + "loss": 2.779, + "step": 7210 + }, + { + "epoch": 0.2138303235180737, + "grad_norm": 0.1458691656589508, + "learning_rate": 0.000899047806244043, + "loss": 2.8246, + "step": 7211 + }, + { + "epoch": 0.2138599768703852, + "grad_norm": 0.16198128461837769, + "learning_rate": 0.0008990194548230979, + "loss": 2.833, + "step": 7212 + }, + { + "epoch": 0.21388963022269666, + "grad_norm": 0.15121299028396606, + "learning_rate": 0.0008989910998687673, + "loss": 2.8211, + "step": 7213 + }, + { + "epoch": 0.21391928357500817, + "grad_norm": 0.13927511870861053, + "learning_rate": 0.0008989627413813018, + "loss": 2.7789, + "step": 7214 + }, + { + "epoch": 0.21394893692731964, + "grad_norm": 0.13957791030406952, + "learning_rate": 0.0008989343793609529, + "loss": 2.8305, + "step": 7215 + }, + { + "epoch": 0.21397859027963112, + "grad_norm": 0.13012200593948364, + "learning_rate": 0.0008989060138079715, + "loss": 2.818, + "step": 7216 + }, + { + "epoch": 0.2140082436319426, + "grad_norm": 0.13318473100662231, + "learning_rate": 0.0008988776447226088, + "loss": 2.8195, + "step": 7217 + }, + { + "epoch": 0.21403789698425407, + "grad_norm": 0.1419304609298706, + "learning_rate": 0.0008988492721051163, + "loss": 2.8163, + "step": 7218 + }, + { + "epoch": 0.21406755033656555, + "grad_norm": 0.15160129964351654, + "learning_rate": 0.0008988208959557449, + "loss": 2.787, + "step": 7219 + }, + { + "epoch": 0.21409720368887702, + "grad_norm": 0.15771064162254333, + "learning_rate": 0.0008987925162747461, + "loss": 2.8132, + "step": 7220 + }, + { + "epoch": 0.2141268570411885, + "grad_norm": 0.1439315676689148, + "learning_rate": 0.000898764133062371, + "loss": 2.8243, + "step": 7221 + }, + { + "epoch": 0.21415651039349998, + "grad_norm": 0.12182950973510742, + "learning_rate": 0.0008987357463188711, + "loss": 2.7841, + "step": 7222 + }, + { + "epoch": 0.21418616374581145, + "grad_norm": 0.13967347145080566, + "learning_rate": 0.0008987073560444977, + "loss": 2.8132, + "step": 7223 + }, + { + "epoch": 0.21421581709812293, + "grad_norm": 0.13619674742221832, + "learning_rate": 0.0008986789622395021, + "loss": 2.8274, + "step": 7224 + }, + { + "epoch": 0.21424547045043443, + "grad_norm": 0.13391628861427307, + "learning_rate": 0.0008986505649041361, + "loss": 2.8039, + "step": 7225 + }, + { + "epoch": 0.2142751238027459, + "grad_norm": 0.11762291193008423, + "learning_rate": 0.0008986221640386509, + "loss": 2.8237, + "step": 7226 + }, + { + "epoch": 0.21430477715505739, + "grad_norm": 0.12193850427865982, + "learning_rate": 0.000898593759643298, + "loss": 2.8232, + "step": 7227 + }, + { + "epoch": 0.21433443050736886, + "grad_norm": 0.12574532628059387, + "learning_rate": 0.0008985653517183288, + "loss": 2.7724, + "step": 7228 + }, + { + "epoch": 0.21436408385968034, + "grad_norm": 0.14716626703739166, + "learning_rate": 0.0008985369402639952, + "loss": 2.8196, + "step": 7229 + }, + { + "epoch": 0.21439373721199181, + "grad_norm": 0.13817650079727173, + "learning_rate": 0.0008985085252805483, + "loss": 2.7833, + "step": 7230 + }, + { + "epoch": 0.2144233905643033, + "grad_norm": 0.12085914611816406, + "learning_rate": 0.0008984801067682402, + "loss": 2.7872, + "step": 7231 + }, + { + "epoch": 0.21445304391661477, + "grad_norm": 0.13596607744693756, + "learning_rate": 0.0008984516847273225, + "loss": 2.799, + "step": 7232 + }, + { + "epoch": 0.21448269726892624, + "grad_norm": 0.15207263827323914, + "learning_rate": 0.0008984232591580465, + "loss": 2.781, + "step": 7233 + }, + { + "epoch": 0.21451235062123772, + "grad_norm": 0.1400630921125412, + "learning_rate": 0.0008983948300606641, + "loss": 2.8369, + "step": 7234 + }, + { + "epoch": 0.21454200397354922, + "grad_norm": 0.12040489912033081, + "learning_rate": 0.0008983663974354273, + "loss": 2.7816, + "step": 7235 + }, + { + "epoch": 0.2145716573258607, + "grad_norm": 0.12828409671783447, + "learning_rate": 0.0008983379612825875, + "loss": 2.7773, + "step": 7236 + }, + { + "epoch": 0.21460131067817217, + "grad_norm": 0.1392485350370407, + "learning_rate": 0.0008983095216023968, + "loss": 2.8137, + "step": 7237 + }, + { + "epoch": 0.21463096403048365, + "grad_norm": 0.15833191573619843, + "learning_rate": 0.0008982810783951069, + "loss": 2.8308, + "step": 7238 + }, + { + "epoch": 0.21466061738279513, + "grad_norm": 0.15038146078586578, + "learning_rate": 0.0008982526316609697, + "loss": 2.7997, + "step": 7239 + }, + { + "epoch": 0.2146902707351066, + "grad_norm": 0.16457918286323547, + "learning_rate": 0.000898224181400237, + "loss": 2.7937, + "step": 7240 + }, + { + "epoch": 0.21471992408741808, + "grad_norm": 0.19254611432552338, + "learning_rate": 0.000898195727613161, + "loss": 2.8074, + "step": 7241 + }, + { + "epoch": 0.21474957743972956, + "grad_norm": 0.19951480627059937, + "learning_rate": 0.0008981672702999933, + "loss": 2.7645, + "step": 7242 + }, + { + "epoch": 0.21477923079204103, + "grad_norm": 0.17294885218143463, + "learning_rate": 0.0008981388094609861, + "loss": 2.8069, + "step": 7243 + }, + { + "epoch": 0.2148088841443525, + "grad_norm": 0.13593432307243347, + "learning_rate": 0.0008981103450963915, + "loss": 2.8246, + "step": 7244 + }, + { + "epoch": 0.214838537496664, + "grad_norm": 0.131544828414917, + "learning_rate": 0.0008980818772064613, + "loss": 2.8038, + "step": 7245 + }, + { + "epoch": 0.2148681908489755, + "grad_norm": 0.15535061061382294, + "learning_rate": 0.000898053405791448, + "loss": 2.8334, + "step": 7246 + }, + { + "epoch": 0.21489784420128696, + "grad_norm": 0.14279118180274963, + "learning_rate": 0.0008980249308516034, + "loss": 2.7844, + "step": 7247 + }, + { + "epoch": 0.21492749755359844, + "grad_norm": 0.1358165144920349, + "learning_rate": 0.0008979964523871796, + "loss": 2.7988, + "step": 7248 + }, + { + "epoch": 0.21495715090590992, + "grad_norm": 0.1409393548965454, + "learning_rate": 0.0008979679703984288, + "loss": 2.7999, + "step": 7249 + }, + { + "epoch": 0.2149868042582214, + "grad_norm": 0.1356310099363327, + "learning_rate": 0.0008979394848856035, + "loss": 2.7915, + "step": 7250 + }, + { + "epoch": 0.21501645761053287, + "grad_norm": 0.14520621299743652, + "learning_rate": 0.0008979109958489557, + "loss": 2.7994, + "step": 7251 + }, + { + "epoch": 0.21504611096284434, + "grad_norm": 0.13904565572738647, + "learning_rate": 0.0008978825032887376, + "loss": 2.7918, + "step": 7252 + }, + { + "epoch": 0.21507576431515582, + "grad_norm": 0.15379703044891357, + "learning_rate": 0.0008978540072052019, + "loss": 2.7811, + "step": 7253 + }, + { + "epoch": 0.2151054176674673, + "grad_norm": 0.17758315801620483, + "learning_rate": 0.0008978255075986005, + "loss": 2.7915, + "step": 7254 + }, + { + "epoch": 0.21513507101977877, + "grad_norm": 0.19441142678260803, + "learning_rate": 0.0008977970044691859, + "loss": 2.8255, + "step": 7255 + }, + { + "epoch": 0.21516472437209028, + "grad_norm": 0.16798970103263855, + "learning_rate": 0.0008977684978172107, + "loss": 2.8361, + "step": 7256 + }, + { + "epoch": 0.21519437772440175, + "grad_norm": 0.13716797530651093, + "learning_rate": 0.0008977399876429271, + "loss": 2.8351, + "step": 7257 + }, + { + "epoch": 0.21522403107671323, + "grad_norm": 0.16864177584648132, + "learning_rate": 0.0008977114739465877, + "loss": 2.7798, + "step": 7258 + }, + { + "epoch": 0.2152536844290247, + "grad_norm": 0.19895513355731964, + "learning_rate": 0.0008976829567284447, + "loss": 2.7609, + "step": 7259 + }, + { + "epoch": 0.21528333778133618, + "grad_norm": 0.19670721888542175, + "learning_rate": 0.0008976544359887512, + "loss": 2.8239, + "step": 7260 + }, + { + "epoch": 0.21531299113364766, + "grad_norm": 0.17745418846607208, + "learning_rate": 0.0008976259117277592, + "loss": 2.8101, + "step": 7261 + }, + { + "epoch": 0.21534264448595913, + "grad_norm": 0.16600672900676727, + "learning_rate": 0.0008975973839457215, + "loss": 2.7764, + "step": 7262 + }, + { + "epoch": 0.2153722978382706, + "grad_norm": 0.1573544591665268, + "learning_rate": 0.0008975688526428909, + "loss": 2.8113, + "step": 7263 + }, + { + "epoch": 0.21540195119058209, + "grad_norm": 0.1529160887002945, + "learning_rate": 0.0008975403178195197, + "loss": 2.8622, + "step": 7264 + }, + { + "epoch": 0.21543160454289356, + "grad_norm": 0.15290139615535736, + "learning_rate": 0.0008975117794758607, + "loss": 2.7923, + "step": 7265 + }, + { + "epoch": 0.21546125789520507, + "grad_norm": 0.15289306640625, + "learning_rate": 0.0008974832376121667, + "loss": 2.7773, + "step": 7266 + }, + { + "epoch": 0.21549091124751654, + "grad_norm": 0.12546080350875854, + "learning_rate": 0.0008974546922286906, + "loss": 2.8001, + "step": 7267 + }, + { + "epoch": 0.21552056459982802, + "grad_norm": 0.1274842619895935, + "learning_rate": 0.0008974261433256848, + "loss": 2.7758, + "step": 7268 + }, + { + "epoch": 0.2155502179521395, + "grad_norm": 0.12464601546525955, + "learning_rate": 0.0008973975909034022, + "loss": 2.8078, + "step": 7269 + }, + { + "epoch": 0.21557987130445097, + "grad_norm": 0.1250046044588089, + "learning_rate": 0.000897369034962096, + "loss": 2.7955, + "step": 7270 + }, + { + "epoch": 0.21560952465676245, + "grad_norm": 0.11087696999311447, + "learning_rate": 0.0008973404755020185, + "loss": 2.803, + "step": 7271 + }, + { + "epoch": 0.21563917800907392, + "grad_norm": 0.11862354725599289, + "learning_rate": 0.0008973119125234231, + "loss": 2.8157, + "step": 7272 + }, + { + "epoch": 0.2156688313613854, + "grad_norm": 0.12541168928146362, + "learning_rate": 0.0008972833460265624, + "loss": 2.8166, + "step": 7273 + }, + { + "epoch": 0.21569848471369688, + "grad_norm": 0.12923595309257507, + "learning_rate": 0.0008972547760116895, + "loss": 2.8365, + "step": 7274 + }, + { + "epoch": 0.21572813806600835, + "grad_norm": 0.12415485084056854, + "learning_rate": 0.0008972262024790574, + "loss": 2.8041, + "step": 7275 + }, + { + "epoch": 0.21575779141831983, + "grad_norm": 0.12647588551044464, + "learning_rate": 0.0008971976254289189, + "loss": 2.796, + "step": 7276 + }, + { + "epoch": 0.21578744477063133, + "grad_norm": 0.13534928858280182, + "learning_rate": 0.0008971690448615275, + "loss": 2.8333, + "step": 7277 + }, + { + "epoch": 0.2158170981229428, + "grad_norm": 0.15923507511615753, + "learning_rate": 0.000897140460777136, + "loss": 2.8264, + "step": 7278 + }, + { + "epoch": 0.21584675147525428, + "grad_norm": 0.16785268485546112, + "learning_rate": 0.0008971118731759976, + "loss": 2.8287, + "step": 7279 + }, + { + "epoch": 0.21587640482756576, + "grad_norm": 0.16024687886238098, + "learning_rate": 0.0008970832820583652, + "loss": 2.8105, + "step": 7280 + }, + { + "epoch": 0.21590605817987724, + "grad_norm": 0.1351381242275238, + "learning_rate": 0.0008970546874244922, + "loss": 2.7792, + "step": 7281 + }, + { + "epoch": 0.2159357115321887, + "grad_norm": 0.14233621954917908, + "learning_rate": 0.0008970260892746318, + "loss": 2.7314, + "step": 7282 + }, + { + "epoch": 0.2159653648845002, + "grad_norm": 0.15748363733291626, + "learning_rate": 0.0008969974876090374, + "loss": 2.7412, + "step": 7283 + }, + { + "epoch": 0.21599501823681166, + "grad_norm": 0.13912665843963623, + "learning_rate": 0.000896968882427962, + "loss": 2.852, + "step": 7284 + }, + { + "epoch": 0.21602467158912314, + "grad_norm": 0.14357183873653412, + "learning_rate": 0.000896940273731659, + "loss": 2.7692, + "step": 7285 + }, + { + "epoch": 0.21605432494143462, + "grad_norm": 0.12207671254873276, + "learning_rate": 0.0008969116615203818, + "loss": 2.797, + "step": 7286 + }, + { + "epoch": 0.21608397829374612, + "grad_norm": 0.12407161295413971, + "learning_rate": 0.0008968830457943836, + "loss": 2.7887, + "step": 7287 + }, + { + "epoch": 0.2161136316460576, + "grad_norm": 0.1220516487956047, + "learning_rate": 0.0008968544265539179, + "loss": 2.7954, + "step": 7288 + }, + { + "epoch": 0.21614328499836907, + "grad_norm": 0.11656563729047775, + "learning_rate": 0.0008968258037992383, + "loss": 2.7625, + "step": 7289 + }, + { + "epoch": 0.21617293835068055, + "grad_norm": 0.12699678540229797, + "learning_rate": 0.0008967971775305979, + "loss": 2.8191, + "step": 7290 + }, + { + "epoch": 0.21620259170299203, + "grad_norm": 0.14009793102741241, + "learning_rate": 0.0008967685477482506, + "loss": 2.8378, + "step": 7291 + }, + { + "epoch": 0.2162322450553035, + "grad_norm": 0.1522010862827301, + "learning_rate": 0.0008967399144524495, + "loss": 2.7928, + "step": 7292 + }, + { + "epoch": 0.21626189840761498, + "grad_norm": 0.1366993635892868, + "learning_rate": 0.0008967112776434485, + "loss": 2.8183, + "step": 7293 + }, + { + "epoch": 0.21629155175992645, + "grad_norm": 0.11209835857152939, + "learning_rate": 0.0008966826373215009, + "loss": 2.8051, + "step": 7294 + }, + { + "epoch": 0.21632120511223793, + "grad_norm": 0.12620005011558533, + "learning_rate": 0.0008966539934868605, + "loss": 2.7961, + "step": 7295 + }, + { + "epoch": 0.2163508584645494, + "grad_norm": 0.1557580977678299, + "learning_rate": 0.000896625346139781, + "loss": 2.7909, + "step": 7296 + }, + { + "epoch": 0.2163805118168609, + "grad_norm": 0.16233520209789276, + "learning_rate": 0.0008965966952805159, + "loss": 2.81, + "step": 7297 + }, + { + "epoch": 0.21641016516917239, + "grad_norm": 0.18720372021198273, + "learning_rate": 0.000896568040909319, + "loss": 2.787, + "step": 7298 + }, + { + "epoch": 0.21643981852148386, + "grad_norm": 0.19431786239147186, + "learning_rate": 0.0008965393830264441, + "loss": 2.7894, + "step": 7299 + }, + { + "epoch": 0.21646947187379534, + "grad_norm": 0.15978486835956573, + "learning_rate": 0.0008965107216321449, + "loss": 2.7864, + "step": 7300 + }, + { + "epoch": 0.21649912522610681, + "grad_norm": 0.14855468273162842, + "learning_rate": 0.0008964820567266749, + "loss": 2.8049, + "step": 7301 + }, + { + "epoch": 0.2165287785784183, + "grad_norm": 0.14693495631217957, + "learning_rate": 0.0008964533883102885, + "loss": 2.8206, + "step": 7302 + }, + { + "epoch": 0.21655843193072977, + "grad_norm": 0.16714169085025787, + "learning_rate": 0.0008964247163832393, + "loss": 2.8015, + "step": 7303 + }, + { + "epoch": 0.21658808528304124, + "grad_norm": 0.1659698635339737, + "learning_rate": 0.0008963960409457812, + "loss": 2.8482, + "step": 7304 + }, + { + "epoch": 0.21661773863535272, + "grad_norm": 0.1780260056257248, + "learning_rate": 0.000896367361998168, + "loss": 2.8297, + "step": 7305 + }, + { + "epoch": 0.2166473919876642, + "grad_norm": 0.17835891246795654, + "learning_rate": 0.0008963386795406539, + "loss": 2.7678, + "step": 7306 + }, + { + "epoch": 0.21667704533997567, + "grad_norm": 0.16311809420585632, + "learning_rate": 0.0008963099935734927, + "loss": 2.7979, + "step": 7307 + }, + { + "epoch": 0.21670669869228718, + "grad_norm": 0.1453235149383545, + "learning_rate": 0.0008962813040969386, + "loss": 2.8151, + "step": 7308 + }, + { + "epoch": 0.21673635204459865, + "grad_norm": 0.14113003015518188, + "learning_rate": 0.0008962526111112453, + "loss": 2.8344, + "step": 7309 + }, + { + "epoch": 0.21676600539691013, + "grad_norm": 0.14617332816123962, + "learning_rate": 0.0008962239146166673, + "loss": 2.7953, + "step": 7310 + }, + { + "epoch": 0.2167956587492216, + "grad_norm": 0.1437305063009262, + "learning_rate": 0.0008961952146134584, + "loss": 2.8513, + "step": 7311 + }, + { + "epoch": 0.21682531210153308, + "grad_norm": 0.15038836002349854, + "learning_rate": 0.0008961665111018728, + "loss": 2.8334, + "step": 7312 + }, + { + "epoch": 0.21685496545384456, + "grad_norm": 0.13732969760894775, + "learning_rate": 0.0008961378040821651, + "loss": 2.8016, + "step": 7313 + }, + { + "epoch": 0.21688461880615603, + "grad_norm": 0.13540467619895935, + "learning_rate": 0.0008961090935545888, + "loss": 2.8018, + "step": 7314 + }, + { + "epoch": 0.2169142721584675, + "grad_norm": 0.11793241649866104, + "learning_rate": 0.0008960803795193986, + "loss": 2.8246, + "step": 7315 + }, + { + "epoch": 0.21694392551077898, + "grad_norm": 0.12237415462732315, + "learning_rate": 0.0008960516619768486, + "loss": 2.7845, + "step": 7316 + }, + { + "epoch": 0.21697357886309046, + "grad_norm": 0.11455601453781128, + "learning_rate": 0.0008960229409271933, + "loss": 2.7961, + "step": 7317 + }, + { + "epoch": 0.21700323221540196, + "grad_norm": 0.11209901422262192, + "learning_rate": 0.0008959942163706867, + "loss": 2.8073, + "step": 7318 + }, + { + "epoch": 0.21703288556771344, + "grad_norm": 0.11577191948890686, + "learning_rate": 0.0008959654883075835, + "loss": 2.8006, + "step": 7319 + }, + { + "epoch": 0.21706253892002492, + "grad_norm": 0.13253559172153473, + "learning_rate": 0.0008959367567381378, + "loss": 2.8355, + "step": 7320 + }, + { + "epoch": 0.2170921922723364, + "grad_norm": 0.13458843529224396, + "learning_rate": 0.0008959080216626043, + "loss": 2.8335, + "step": 7321 + }, + { + "epoch": 0.21712184562464787, + "grad_norm": 0.12942874431610107, + "learning_rate": 0.0008958792830812372, + "loss": 2.7905, + "step": 7322 + }, + { + "epoch": 0.21715149897695935, + "grad_norm": 0.15137943625450134, + "learning_rate": 0.0008958505409942912, + "loss": 2.7852, + "step": 7323 + }, + { + "epoch": 0.21718115232927082, + "grad_norm": 0.1399507224559784, + "learning_rate": 0.0008958217954020206, + "loss": 2.7978, + "step": 7324 + }, + { + "epoch": 0.2172108056815823, + "grad_norm": 0.12375608831644058, + "learning_rate": 0.0008957930463046801, + "loss": 2.8229, + "step": 7325 + }, + { + "epoch": 0.21724045903389377, + "grad_norm": 0.14826998114585876, + "learning_rate": 0.0008957642937025242, + "loss": 2.8022, + "step": 7326 + }, + { + "epoch": 0.21727011238620525, + "grad_norm": 0.18242613971233368, + "learning_rate": 0.0008957355375958076, + "loss": 2.7736, + "step": 7327 + }, + { + "epoch": 0.21729976573851673, + "grad_norm": 0.20429039001464844, + "learning_rate": 0.0008957067779847849, + "loss": 2.8099, + "step": 7328 + }, + { + "epoch": 0.21732941909082823, + "grad_norm": 0.21256805956363678, + "learning_rate": 0.0008956780148697108, + "loss": 2.849, + "step": 7329 + }, + { + "epoch": 0.2173590724431397, + "grad_norm": 0.19307208061218262, + "learning_rate": 0.0008956492482508398, + "loss": 2.8328, + "step": 7330 + }, + { + "epoch": 0.21738872579545118, + "grad_norm": 0.17183849215507507, + "learning_rate": 0.0008956204781284269, + "loss": 2.8282, + "step": 7331 + }, + { + "epoch": 0.21741837914776266, + "grad_norm": 0.18941187858581543, + "learning_rate": 0.0008955917045027267, + "loss": 2.7999, + "step": 7332 + }, + { + "epoch": 0.21744803250007413, + "grad_norm": 0.168936088681221, + "learning_rate": 0.0008955629273739941, + "loss": 2.8096, + "step": 7333 + }, + { + "epoch": 0.2174776858523856, + "grad_norm": 0.151694193482399, + "learning_rate": 0.000895534146742484, + "loss": 2.8648, + "step": 7334 + }, + { + "epoch": 0.2175073392046971, + "grad_norm": 0.1565517634153366, + "learning_rate": 0.0008955053626084511, + "loss": 2.7776, + "step": 7335 + }, + { + "epoch": 0.21753699255700856, + "grad_norm": 0.1452300250530243, + "learning_rate": 0.00089547657497215, + "loss": 2.8004, + "step": 7336 + }, + { + "epoch": 0.21756664590932004, + "grad_norm": 0.15616421401500702, + "learning_rate": 0.0008954477838338363, + "loss": 2.8158, + "step": 7337 + }, + { + "epoch": 0.21759629926163152, + "grad_norm": 0.14135119318962097, + "learning_rate": 0.0008954189891937645, + "loss": 2.7757, + "step": 7338 + }, + { + "epoch": 0.21762595261394302, + "grad_norm": 0.1381630152463913, + "learning_rate": 0.0008953901910521896, + "loss": 2.8216, + "step": 7339 + }, + { + "epoch": 0.2176556059662545, + "grad_norm": 0.1284998208284378, + "learning_rate": 0.0008953613894093668, + "loss": 2.8066, + "step": 7340 + }, + { + "epoch": 0.21768525931856597, + "grad_norm": 0.12435079365968704, + "learning_rate": 0.0008953325842655511, + "loss": 2.7845, + "step": 7341 + }, + { + "epoch": 0.21771491267087745, + "grad_norm": 0.12673701345920563, + "learning_rate": 0.0008953037756209974, + "loss": 2.8143, + "step": 7342 + }, + { + "epoch": 0.21774456602318892, + "grad_norm": 0.14586541056632996, + "learning_rate": 0.0008952749634759608, + "loss": 2.8388, + "step": 7343 + }, + { + "epoch": 0.2177742193755004, + "grad_norm": 0.14995034039020538, + "learning_rate": 0.0008952461478306967, + "loss": 2.8041, + "step": 7344 + }, + { + "epoch": 0.21780387272781188, + "grad_norm": 0.1293056756258011, + "learning_rate": 0.0008952173286854602, + "loss": 2.8043, + "step": 7345 + }, + { + "epoch": 0.21783352608012335, + "grad_norm": 0.13058805465698242, + "learning_rate": 0.0008951885060405062, + "loss": 2.8037, + "step": 7346 + }, + { + "epoch": 0.21786317943243483, + "grad_norm": 0.1280648410320282, + "learning_rate": 0.0008951596798960901, + "loss": 2.8194, + "step": 7347 + }, + { + "epoch": 0.2178928327847463, + "grad_norm": 0.11983451247215271, + "learning_rate": 0.0008951308502524676, + "loss": 2.7724, + "step": 7348 + }, + { + "epoch": 0.2179224861370578, + "grad_norm": 0.13892503082752228, + "learning_rate": 0.0008951020171098933, + "loss": 2.8283, + "step": 7349 + }, + { + "epoch": 0.21795213948936928, + "grad_norm": 0.12635008990764618, + "learning_rate": 0.0008950731804686227, + "loss": 2.7986, + "step": 7350 + }, + { + "epoch": 0.21798179284168076, + "grad_norm": 0.1294192671775818, + "learning_rate": 0.0008950443403289114, + "loss": 2.8405, + "step": 7351 + }, + { + "epoch": 0.21801144619399224, + "grad_norm": 0.13298453390598297, + "learning_rate": 0.0008950154966910149, + "loss": 2.8074, + "step": 7352 + }, + { + "epoch": 0.2180410995463037, + "grad_norm": 0.12997904419898987, + "learning_rate": 0.0008949866495551881, + "loss": 2.7698, + "step": 7353 + }, + { + "epoch": 0.2180707528986152, + "grad_norm": 0.1429002285003662, + "learning_rate": 0.0008949577989216869, + "loss": 2.8137, + "step": 7354 + }, + { + "epoch": 0.21810040625092666, + "grad_norm": 0.1472173035144806, + "learning_rate": 0.0008949289447907665, + "loss": 2.8305, + "step": 7355 + }, + { + "epoch": 0.21813005960323814, + "grad_norm": 0.1397184133529663, + "learning_rate": 0.0008949000871626825, + "loss": 2.8003, + "step": 7356 + }, + { + "epoch": 0.21815971295554962, + "grad_norm": 0.13221366703510284, + "learning_rate": 0.0008948712260376903, + "loss": 2.8376, + "step": 7357 + }, + { + "epoch": 0.2181893663078611, + "grad_norm": 0.11996028572320938, + "learning_rate": 0.0008948423614160458, + "loss": 2.7778, + "step": 7358 + }, + { + "epoch": 0.21821901966017257, + "grad_norm": 0.12843914330005646, + "learning_rate": 0.0008948134932980043, + "loss": 2.8382, + "step": 7359 + }, + { + "epoch": 0.21824867301248407, + "grad_norm": 0.15198050439357758, + "learning_rate": 0.0008947846216838216, + "loss": 2.7933, + "step": 7360 + }, + { + "epoch": 0.21827832636479555, + "grad_norm": 0.17341844737529755, + "learning_rate": 0.0008947557465737535, + "loss": 2.8214, + "step": 7361 + }, + { + "epoch": 0.21830797971710703, + "grad_norm": 0.1716679036617279, + "learning_rate": 0.0008947268679680553, + "loss": 2.8047, + "step": 7362 + }, + { + "epoch": 0.2183376330694185, + "grad_norm": 0.1479315310716629, + "learning_rate": 0.000894697985866983, + "loss": 2.8073, + "step": 7363 + }, + { + "epoch": 0.21836728642172998, + "grad_norm": 0.15305492281913757, + "learning_rate": 0.0008946691002707922, + "loss": 2.7971, + "step": 7364 + }, + { + "epoch": 0.21839693977404145, + "grad_norm": 0.16409508883953094, + "learning_rate": 0.0008946402111797387, + "loss": 2.8068, + "step": 7365 + }, + { + "epoch": 0.21842659312635293, + "grad_norm": 0.154948890209198, + "learning_rate": 0.0008946113185940785, + "loss": 2.823, + "step": 7366 + }, + { + "epoch": 0.2184562464786644, + "grad_norm": 0.16821958124637604, + "learning_rate": 0.0008945824225140676, + "loss": 2.8162, + "step": 7367 + }, + { + "epoch": 0.21848589983097588, + "grad_norm": 0.19003723561763763, + "learning_rate": 0.0008945535229399613, + "loss": 2.7934, + "step": 7368 + }, + { + "epoch": 0.21851555318328736, + "grad_norm": 0.21762503683567047, + "learning_rate": 0.0008945246198720159, + "loss": 2.8184, + "step": 7369 + }, + { + "epoch": 0.21854520653559886, + "grad_norm": 0.1974007785320282, + "learning_rate": 0.0008944957133104872, + "loss": 2.8269, + "step": 7370 + }, + { + "epoch": 0.21857485988791034, + "grad_norm": 0.14414678514003754, + "learning_rate": 0.0008944668032556313, + "loss": 2.7906, + "step": 7371 + }, + { + "epoch": 0.21860451324022181, + "grad_norm": 0.16073393821716309, + "learning_rate": 0.0008944378897077041, + "loss": 2.8213, + "step": 7372 + }, + { + "epoch": 0.2186341665925333, + "grad_norm": 0.13943451642990112, + "learning_rate": 0.0008944089726669619, + "loss": 2.7762, + "step": 7373 + }, + { + "epoch": 0.21866381994484477, + "grad_norm": 0.13993625342845917, + "learning_rate": 0.0008943800521336604, + "loss": 2.799, + "step": 7374 + }, + { + "epoch": 0.21869347329715624, + "grad_norm": 0.14109760522842407, + "learning_rate": 0.0008943511281080558, + "loss": 2.813, + "step": 7375 + }, + { + "epoch": 0.21872312664946772, + "grad_norm": 0.1511409878730774, + "learning_rate": 0.0008943222005904043, + "loss": 2.8246, + "step": 7376 + }, + { + "epoch": 0.2187527800017792, + "grad_norm": 0.16893909871578217, + "learning_rate": 0.000894293269580962, + "loss": 2.8079, + "step": 7377 + }, + { + "epoch": 0.21878243335409067, + "grad_norm": 0.14020386338233948, + "learning_rate": 0.0008942643350799852, + "loss": 2.783, + "step": 7378 + }, + { + "epoch": 0.21881208670640215, + "grad_norm": 0.11437837779521942, + "learning_rate": 0.0008942353970877299, + "loss": 2.815, + "step": 7379 + }, + { + "epoch": 0.21884174005871362, + "grad_norm": 0.11022370308637619, + "learning_rate": 0.0008942064556044526, + "loss": 2.8031, + "step": 7380 + }, + { + "epoch": 0.21887139341102513, + "grad_norm": 0.12638020515441895, + "learning_rate": 0.0008941775106304095, + "loss": 2.7926, + "step": 7381 + }, + { + "epoch": 0.2189010467633366, + "grad_norm": 0.12713569402694702, + "learning_rate": 0.0008941485621658569, + "loss": 2.8147, + "step": 7382 + }, + { + "epoch": 0.21893070011564808, + "grad_norm": 0.13186617195606232, + "learning_rate": 0.000894119610211051, + "loss": 2.8238, + "step": 7383 + }, + { + "epoch": 0.21896035346795956, + "grad_norm": 0.15015777945518494, + "learning_rate": 0.0008940906547662484, + "loss": 2.8051, + "step": 7384 + }, + { + "epoch": 0.21899000682027103, + "grad_norm": 0.14391681551933289, + "learning_rate": 0.0008940616958317053, + "loss": 2.7984, + "step": 7385 + }, + { + "epoch": 0.2190196601725825, + "grad_norm": 0.14333871006965637, + "learning_rate": 0.0008940327334076785, + "loss": 2.7994, + "step": 7386 + }, + { + "epoch": 0.21904931352489398, + "grad_norm": 0.1584903746843338, + "learning_rate": 0.0008940037674944239, + "loss": 2.8098, + "step": 7387 + }, + { + "epoch": 0.21907896687720546, + "grad_norm": 0.13660456240177155, + "learning_rate": 0.0008939747980921985, + "loss": 2.8081, + "step": 7388 + }, + { + "epoch": 0.21910862022951694, + "grad_norm": 0.14514927566051483, + "learning_rate": 0.0008939458252012585, + "loss": 2.792, + "step": 7389 + }, + { + "epoch": 0.2191382735818284, + "grad_norm": 0.1468230038881302, + "learning_rate": 0.0008939168488218607, + "loss": 2.7982, + "step": 7390 + }, + { + "epoch": 0.21916792693413992, + "grad_norm": 0.12752364575862885, + "learning_rate": 0.0008938878689542615, + "loss": 2.8284, + "step": 7391 + }, + { + "epoch": 0.2191975802864514, + "grad_norm": 0.13312801718711853, + "learning_rate": 0.0008938588855987177, + "loss": 2.8227, + "step": 7392 + }, + { + "epoch": 0.21922723363876287, + "grad_norm": 0.14391160011291504, + "learning_rate": 0.000893829898755486, + "loss": 2.8243, + "step": 7393 + }, + { + "epoch": 0.21925688699107435, + "grad_norm": 0.15682832896709442, + "learning_rate": 0.0008938009084248226, + "loss": 2.7558, + "step": 7394 + }, + { + "epoch": 0.21928654034338582, + "grad_norm": 0.14657551050186157, + "learning_rate": 0.0008937719146069849, + "loss": 2.8491, + "step": 7395 + }, + { + "epoch": 0.2193161936956973, + "grad_norm": 0.12980015575885773, + "learning_rate": 0.0008937429173022291, + "loss": 2.8119, + "step": 7396 + }, + { + "epoch": 0.21934584704800877, + "grad_norm": 0.12525679171085358, + "learning_rate": 0.0008937139165108123, + "loss": 2.7924, + "step": 7397 + }, + { + "epoch": 0.21937550040032025, + "grad_norm": 0.12311738729476929, + "learning_rate": 0.0008936849122329911, + "loss": 2.7738, + "step": 7398 + }, + { + "epoch": 0.21940515375263173, + "grad_norm": 0.13735413551330566, + "learning_rate": 0.0008936559044690225, + "loss": 2.8415, + "step": 7399 + }, + { + "epoch": 0.2194348071049432, + "grad_norm": 0.13469742238521576, + "learning_rate": 0.0008936268932191631, + "loss": 2.802, + "step": 7400 + }, + { + "epoch": 0.2194644604572547, + "grad_norm": 0.1393771767616272, + "learning_rate": 0.0008935978784836702, + "loss": 2.827, + "step": 7401 + }, + { + "epoch": 0.21949411380956618, + "grad_norm": 0.13305231928825378, + "learning_rate": 0.0008935688602628005, + "loss": 2.8003, + "step": 7402 + }, + { + "epoch": 0.21952376716187766, + "grad_norm": 0.13904216885566711, + "learning_rate": 0.000893539838556811, + "loss": 2.7899, + "step": 7403 + }, + { + "epoch": 0.21955342051418913, + "grad_norm": 0.1390419453382492, + "learning_rate": 0.0008935108133659586, + "loss": 2.7781, + "step": 7404 + }, + { + "epoch": 0.2195830738665006, + "grad_norm": 0.12699179351329803, + "learning_rate": 0.0008934817846905004, + "loss": 2.7995, + "step": 7405 + }, + { + "epoch": 0.2196127272188121, + "grad_norm": 0.12965607643127441, + "learning_rate": 0.0008934527525306936, + "loss": 2.8073, + "step": 7406 + }, + { + "epoch": 0.21964238057112356, + "grad_norm": 0.15571537613868713, + "learning_rate": 0.0008934237168867949, + "loss": 2.8288, + "step": 7407 + }, + { + "epoch": 0.21967203392343504, + "grad_norm": 0.17427732050418854, + "learning_rate": 0.0008933946777590618, + "loss": 2.7871, + "step": 7408 + }, + { + "epoch": 0.21970168727574652, + "grad_norm": 0.20111821591854095, + "learning_rate": 0.0008933656351477514, + "loss": 2.8114, + "step": 7409 + }, + { + "epoch": 0.219731340628058, + "grad_norm": 0.2472396194934845, + "learning_rate": 0.0008933365890531206, + "loss": 2.8309, + "step": 7410 + }, + { + "epoch": 0.21976099398036947, + "grad_norm": 0.22575031220912933, + "learning_rate": 0.0008933075394754269, + "loss": 2.823, + "step": 7411 + }, + { + "epoch": 0.21979064733268097, + "grad_norm": 0.18974927067756653, + "learning_rate": 0.0008932784864149275, + "loss": 2.7905, + "step": 7412 + }, + { + "epoch": 0.21982030068499245, + "grad_norm": 0.19253766536712646, + "learning_rate": 0.0008932494298718795, + "loss": 2.8245, + "step": 7413 + }, + { + "epoch": 0.21984995403730392, + "grad_norm": 0.172361820936203, + "learning_rate": 0.0008932203698465402, + "loss": 2.8146, + "step": 7414 + }, + { + "epoch": 0.2198796073896154, + "grad_norm": 0.15680034458637238, + "learning_rate": 0.000893191306339167, + "loss": 2.7984, + "step": 7415 + }, + { + "epoch": 0.21990926074192688, + "grad_norm": 0.15325136482715607, + "learning_rate": 0.0008931622393500175, + "loss": 2.8041, + "step": 7416 + }, + { + "epoch": 0.21993891409423835, + "grad_norm": 0.1278759092092514, + "learning_rate": 0.0008931331688793488, + "loss": 2.8362, + "step": 7417 + }, + { + "epoch": 0.21996856744654983, + "grad_norm": 0.1298540085554123, + "learning_rate": 0.0008931040949274184, + "loss": 2.8086, + "step": 7418 + }, + { + "epoch": 0.2199982207988613, + "grad_norm": 0.12124647945165634, + "learning_rate": 0.0008930750174944837, + "loss": 2.8086, + "step": 7419 + }, + { + "epoch": 0.22002787415117278, + "grad_norm": 0.11474406719207764, + "learning_rate": 0.0008930459365808024, + "loss": 2.8077, + "step": 7420 + }, + { + "epoch": 0.22005752750348426, + "grad_norm": 0.12588626146316528, + "learning_rate": 0.0008930168521866318, + "loss": 2.8083, + "step": 7421 + }, + { + "epoch": 0.22008718085579576, + "grad_norm": 0.12334603816270828, + "learning_rate": 0.0008929877643122295, + "loss": 2.7824, + "step": 7422 + }, + { + "epoch": 0.22011683420810724, + "grad_norm": 0.13829536736011505, + "learning_rate": 0.0008929586729578531, + "loss": 2.8083, + "step": 7423 + }, + { + "epoch": 0.2201464875604187, + "grad_norm": 0.11063408106565475, + "learning_rate": 0.0008929295781237601, + "loss": 2.7838, + "step": 7424 + }, + { + "epoch": 0.2201761409127302, + "grad_norm": 0.10874750465154648, + "learning_rate": 0.0008929004798102083, + "loss": 2.8099, + "step": 7425 + }, + { + "epoch": 0.22020579426504167, + "grad_norm": 0.11011902242898941, + "learning_rate": 0.0008928713780174554, + "loss": 2.8033, + "step": 7426 + }, + { + "epoch": 0.22023544761735314, + "grad_norm": 0.1255459040403366, + "learning_rate": 0.000892842272745759, + "loss": 2.791, + "step": 7427 + }, + { + "epoch": 0.22026510096966462, + "grad_norm": 0.12195712327957153, + "learning_rate": 0.0008928131639953767, + "loss": 2.7834, + "step": 7428 + }, + { + "epoch": 0.2202947543219761, + "grad_norm": 0.12272105365991592, + "learning_rate": 0.0008927840517665666, + "loss": 2.763, + "step": 7429 + }, + { + "epoch": 0.22032440767428757, + "grad_norm": 0.1271151602268219, + "learning_rate": 0.0008927549360595861, + "loss": 2.8068, + "step": 7430 + }, + { + "epoch": 0.22035406102659905, + "grad_norm": 0.14691773056983948, + "learning_rate": 0.0008927258168746935, + "loss": 2.8008, + "step": 7431 + }, + { + "epoch": 0.22038371437891052, + "grad_norm": 0.15106147527694702, + "learning_rate": 0.0008926966942121462, + "loss": 2.8095, + "step": 7432 + }, + { + "epoch": 0.22041336773122203, + "grad_norm": 0.14089982211589813, + "learning_rate": 0.0008926675680722022, + "loss": 2.824, + "step": 7433 + }, + { + "epoch": 0.2204430210835335, + "grad_norm": 0.16663919389247894, + "learning_rate": 0.0008926384384551196, + "loss": 2.8188, + "step": 7434 + }, + { + "epoch": 0.22047267443584498, + "grad_norm": 0.17451800405979156, + "learning_rate": 0.0008926093053611561, + "loss": 2.7957, + "step": 7435 + }, + { + "epoch": 0.22050232778815645, + "grad_norm": 0.16182060539722443, + "learning_rate": 0.0008925801687905699, + "loss": 2.7911, + "step": 7436 + }, + { + "epoch": 0.22053198114046793, + "grad_norm": 0.15972554683685303, + "learning_rate": 0.0008925510287436189, + "loss": 2.7946, + "step": 7437 + }, + { + "epoch": 0.2205616344927794, + "grad_norm": 0.17428718507289886, + "learning_rate": 0.0008925218852205612, + "loss": 2.7926, + "step": 7438 + }, + { + "epoch": 0.22059128784509088, + "grad_norm": 0.16808658838272095, + "learning_rate": 0.0008924927382216549, + "loss": 2.8013, + "step": 7439 + }, + { + "epoch": 0.22062094119740236, + "grad_norm": 0.17211273312568665, + "learning_rate": 0.0008924635877471578, + "loss": 2.8247, + "step": 7440 + }, + { + "epoch": 0.22065059454971384, + "grad_norm": 0.16265231370925903, + "learning_rate": 0.0008924344337973285, + "loss": 2.834, + "step": 7441 + }, + { + "epoch": 0.2206802479020253, + "grad_norm": 0.16018016636371613, + "learning_rate": 0.0008924052763724248, + "loss": 2.8115, + "step": 7442 + }, + { + "epoch": 0.22070990125433682, + "grad_norm": 0.1692488193511963, + "learning_rate": 0.000892376115472705, + "loss": 2.8318, + "step": 7443 + }, + { + "epoch": 0.2207395546066483, + "grad_norm": 0.14207041263580322, + "learning_rate": 0.0008923469510984276, + "loss": 2.8001, + "step": 7444 + }, + { + "epoch": 0.22076920795895977, + "grad_norm": 0.13782648742198944, + "learning_rate": 0.0008923177832498504, + "loss": 2.7868, + "step": 7445 + }, + { + "epoch": 0.22079886131127124, + "grad_norm": 0.1240369901061058, + "learning_rate": 0.0008922886119272317, + "loss": 2.8338, + "step": 7446 + }, + { + "epoch": 0.22082851466358272, + "grad_norm": 0.13083751499652863, + "learning_rate": 0.0008922594371308304, + "loss": 2.819, + "step": 7447 + }, + { + "epoch": 0.2208581680158942, + "grad_norm": 0.11880552023649216, + "learning_rate": 0.0008922302588609042, + "loss": 2.7772, + "step": 7448 + }, + { + "epoch": 0.22088782136820567, + "grad_norm": 0.11355556547641754, + "learning_rate": 0.0008922010771177119, + "loss": 2.8194, + "step": 7449 + }, + { + "epoch": 0.22091747472051715, + "grad_norm": 0.12245067954063416, + "learning_rate": 0.0008921718919015116, + "loss": 2.7895, + "step": 7450 + }, + { + "epoch": 0.22094712807282862, + "grad_norm": 0.12479843944311142, + "learning_rate": 0.0008921427032125618, + "loss": 2.8291, + "step": 7451 + }, + { + "epoch": 0.2209767814251401, + "grad_norm": 0.1410541534423828, + "learning_rate": 0.0008921135110511213, + "loss": 2.8139, + "step": 7452 + }, + { + "epoch": 0.2210064347774516, + "grad_norm": 0.1501745581626892, + "learning_rate": 0.0008920843154174481, + "loss": 2.7947, + "step": 7453 + }, + { + "epoch": 0.22103608812976308, + "grad_norm": 0.17396105825901031, + "learning_rate": 0.0008920551163118011, + "loss": 2.7755, + "step": 7454 + }, + { + "epoch": 0.22106574148207456, + "grad_norm": 0.15414632856845856, + "learning_rate": 0.0008920259137344389, + "loss": 2.8031, + "step": 7455 + }, + { + "epoch": 0.22109539483438603, + "grad_norm": 0.133192777633667, + "learning_rate": 0.0008919967076856197, + "loss": 2.7791, + "step": 7456 + }, + { + "epoch": 0.2211250481866975, + "grad_norm": 0.12814994156360626, + "learning_rate": 0.0008919674981656025, + "loss": 2.8165, + "step": 7457 + }, + { + "epoch": 0.22115470153900899, + "grad_norm": 0.12363838404417038, + "learning_rate": 0.0008919382851746458, + "loss": 2.7963, + "step": 7458 + }, + { + "epoch": 0.22118435489132046, + "grad_norm": 0.11092685908079147, + "learning_rate": 0.0008919090687130082, + "loss": 2.7732, + "step": 7459 + }, + { + "epoch": 0.22121400824363194, + "grad_norm": 0.1255563348531723, + "learning_rate": 0.0008918798487809488, + "loss": 2.8369, + "step": 7460 + }, + { + "epoch": 0.2212436615959434, + "grad_norm": 0.13191662728786469, + "learning_rate": 0.0008918506253787258, + "loss": 2.7802, + "step": 7461 + }, + { + "epoch": 0.2212733149482549, + "grad_norm": 0.1533731073141098, + "learning_rate": 0.0008918213985065984, + "loss": 2.8051, + "step": 7462 + }, + { + "epoch": 0.22130296830056637, + "grad_norm": 0.17782151699066162, + "learning_rate": 0.0008917921681648252, + "loss": 2.8049, + "step": 7463 + }, + { + "epoch": 0.22133262165287787, + "grad_norm": 0.18573451042175293, + "learning_rate": 0.0008917629343536652, + "loss": 2.8089, + "step": 7464 + }, + { + "epoch": 0.22136227500518935, + "grad_norm": 0.1611977070569992, + "learning_rate": 0.000891733697073377, + "loss": 2.8075, + "step": 7465 + }, + { + "epoch": 0.22139192835750082, + "grad_norm": 0.1310395896434784, + "learning_rate": 0.0008917044563242198, + "loss": 2.802, + "step": 7466 + }, + { + "epoch": 0.2214215817098123, + "grad_norm": 0.1232520192861557, + "learning_rate": 0.0008916752121064524, + "loss": 2.8329, + "step": 7467 + }, + { + "epoch": 0.22145123506212377, + "grad_norm": 0.12631268799304962, + "learning_rate": 0.0008916459644203337, + "loss": 2.7936, + "step": 7468 + }, + { + "epoch": 0.22148088841443525, + "grad_norm": 0.14277370274066925, + "learning_rate": 0.0008916167132661228, + "loss": 2.7973, + "step": 7469 + }, + { + "epoch": 0.22151054176674673, + "grad_norm": 0.15354721248149872, + "learning_rate": 0.0008915874586440787, + "loss": 2.8063, + "step": 7470 + }, + { + "epoch": 0.2215401951190582, + "grad_norm": 0.17476734519004822, + "learning_rate": 0.0008915582005544604, + "loss": 2.8037, + "step": 7471 + }, + { + "epoch": 0.22156984847136968, + "grad_norm": 0.1615854799747467, + "learning_rate": 0.0008915289389975269, + "loss": 2.8014, + "step": 7472 + }, + { + "epoch": 0.22159950182368116, + "grad_norm": 0.12870649993419647, + "learning_rate": 0.0008914996739735377, + "loss": 2.8501, + "step": 7473 + }, + { + "epoch": 0.22162915517599266, + "grad_norm": 0.13010989129543304, + "learning_rate": 0.0008914704054827517, + "loss": 2.7823, + "step": 7474 + }, + { + "epoch": 0.22165880852830414, + "grad_norm": 0.1534704864025116, + "learning_rate": 0.0008914411335254278, + "loss": 2.8355, + "step": 7475 + }, + { + "epoch": 0.2216884618806156, + "grad_norm": 0.13890773057937622, + "learning_rate": 0.0008914118581018255, + "loss": 2.8124, + "step": 7476 + }, + { + "epoch": 0.2217181152329271, + "grad_norm": 0.125672847032547, + "learning_rate": 0.0008913825792122043, + "loss": 2.8069, + "step": 7477 + }, + { + "epoch": 0.22174776858523856, + "grad_norm": 0.1094081699848175, + "learning_rate": 0.0008913532968568229, + "loss": 2.8104, + "step": 7478 + }, + { + "epoch": 0.22177742193755004, + "grad_norm": 0.12799590826034546, + "learning_rate": 0.0008913240110359409, + "loss": 2.8016, + "step": 7479 + }, + { + "epoch": 0.22180707528986152, + "grad_norm": 0.13720068335533142, + "learning_rate": 0.0008912947217498177, + "loss": 2.8108, + "step": 7480 + }, + { + "epoch": 0.221836728642173, + "grad_norm": 0.1648350954055786, + "learning_rate": 0.0008912654289987127, + "loss": 2.7927, + "step": 7481 + }, + { + "epoch": 0.22186638199448447, + "grad_norm": 0.1814960539340973, + "learning_rate": 0.000891236132782885, + "loss": 2.803, + "step": 7482 + }, + { + "epoch": 0.22189603534679594, + "grad_norm": 0.17313729226589203, + "learning_rate": 0.0008912068331025943, + "loss": 2.8313, + "step": 7483 + }, + { + "epoch": 0.22192568869910742, + "grad_norm": 0.1647675782442093, + "learning_rate": 0.0008911775299580998, + "loss": 2.8569, + "step": 7484 + }, + { + "epoch": 0.22195534205141892, + "grad_norm": 0.1472061723470688, + "learning_rate": 0.0008911482233496612, + "loss": 2.8199, + "step": 7485 + }, + { + "epoch": 0.2219849954037304, + "grad_norm": 0.14689655601978302, + "learning_rate": 0.0008911189132775379, + "loss": 2.8082, + "step": 7486 + }, + { + "epoch": 0.22201464875604188, + "grad_norm": 0.14880090951919556, + "learning_rate": 0.0008910895997419894, + "loss": 2.8025, + "step": 7487 + }, + { + "epoch": 0.22204430210835335, + "grad_norm": 0.1167096346616745, + "learning_rate": 0.0008910602827432756, + "loss": 2.7828, + "step": 7488 + }, + { + "epoch": 0.22207395546066483, + "grad_norm": 0.13340021669864655, + "learning_rate": 0.0008910309622816557, + "loss": 2.8026, + "step": 7489 + }, + { + "epoch": 0.2221036088129763, + "grad_norm": 0.13690467178821564, + "learning_rate": 0.0008910016383573896, + "loss": 2.8078, + "step": 7490 + }, + { + "epoch": 0.22213326216528778, + "grad_norm": 0.1496332883834839, + "learning_rate": 0.0008909723109707369, + "loss": 2.7981, + "step": 7491 + }, + { + "epoch": 0.22216291551759926, + "grad_norm": 0.14784587919712067, + "learning_rate": 0.0008909429801219572, + "loss": 2.8128, + "step": 7492 + }, + { + "epoch": 0.22219256886991073, + "grad_norm": 0.1620386838912964, + "learning_rate": 0.0008909136458113103, + "loss": 2.7706, + "step": 7493 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.16097231209278107, + "learning_rate": 0.0008908843080390561, + "loss": 2.8009, + "step": 7494 + }, + { + "epoch": 0.2222518755745337, + "grad_norm": 0.15319275856018066, + "learning_rate": 0.000890854966805454, + "loss": 2.8041, + "step": 7495 + }, + { + "epoch": 0.2222815289268452, + "grad_norm": 0.14288605749607086, + "learning_rate": 0.0008908256221107642, + "loss": 2.7707, + "step": 7496 + }, + { + "epoch": 0.22231118227915667, + "grad_norm": 0.15101340413093567, + "learning_rate": 0.0008907962739552465, + "loss": 2.7946, + "step": 7497 + }, + { + "epoch": 0.22234083563146814, + "grad_norm": 0.14684121310710907, + "learning_rate": 0.0008907669223391606, + "loss": 2.8186, + "step": 7498 + }, + { + "epoch": 0.22237048898377962, + "grad_norm": 0.16366727650165558, + "learning_rate": 0.0008907375672627667, + "loss": 2.8221, + "step": 7499 + }, + { + "epoch": 0.2224001423360911, + "grad_norm": 0.14782670140266418, + "learning_rate": 0.0008907082087263246, + "loss": 2.8045, + "step": 7500 + }, + { + "epoch": 0.22242979568840257, + "grad_norm": 0.1303701102733612, + "learning_rate": 0.000890678846730094, + "loss": 2.8201, + "step": 7501 + }, + { + "epoch": 0.22245944904071405, + "grad_norm": 0.13867701590061188, + "learning_rate": 0.0008906494812743354, + "loss": 2.8047, + "step": 7502 + }, + { + "epoch": 0.22248910239302552, + "grad_norm": 0.14232274889945984, + "learning_rate": 0.0008906201123593084, + "loss": 2.7738, + "step": 7503 + }, + { + "epoch": 0.222518755745337, + "grad_norm": 0.1258459985256195, + "learning_rate": 0.0008905907399852733, + "loss": 2.8194, + "step": 7504 + }, + { + "epoch": 0.2225484090976485, + "grad_norm": 0.1452093869447708, + "learning_rate": 0.0008905613641524901, + "loss": 2.8294, + "step": 7505 + }, + { + "epoch": 0.22257806244995998, + "grad_norm": 0.14264042675495148, + "learning_rate": 0.0008905319848612192, + "loss": 2.7939, + "step": 7506 + }, + { + "epoch": 0.22260771580227146, + "grad_norm": 0.13311868906021118, + "learning_rate": 0.0008905026021117204, + "loss": 2.8152, + "step": 7507 + }, + { + "epoch": 0.22263736915458293, + "grad_norm": 0.11337383836507797, + "learning_rate": 0.0008904732159042539, + "loss": 2.7603, + "step": 7508 + }, + { + "epoch": 0.2226670225068944, + "grad_norm": 0.14137941598892212, + "learning_rate": 0.0008904438262390803, + "loss": 2.79, + "step": 7509 + }, + { + "epoch": 0.22269667585920588, + "grad_norm": 0.16042111814022064, + "learning_rate": 0.0008904144331164594, + "loss": 2.8138, + "step": 7510 + }, + { + "epoch": 0.22272632921151736, + "grad_norm": 0.17470631003379822, + "learning_rate": 0.0008903850365366517, + "loss": 2.7907, + "step": 7511 + }, + { + "epoch": 0.22275598256382884, + "grad_norm": 0.17133773863315582, + "learning_rate": 0.0008903556364999177, + "loss": 2.7842, + "step": 7512 + }, + { + "epoch": 0.2227856359161403, + "grad_norm": 0.13068950176239014, + "learning_rate": 0.0008903262330065174, + "loss": 2.806, + "step": 7513 + }, + { + "epoch": 0.2228152892684518, + "grad_norm": 0.12896403670310974, + "learning_rate": 0.0008902968260567113, + "loss": 2.8415, + "step": 7514 + }, + { + "epoch": 0.22284494262076326, + "grad_norm": 0.15701496601104736, + "learning_rate": 0.0008902674156507597, + "loss": 2.8356, + "step": 7515 + }, + { + "epoch": 0.22287459597307477, + "grad_norm": 0.16853682696819305, + "learning_rate": 0.0008902380017889233, + "loss": 2.7926, + "step": 7516 + }, + { + "epoch": 0.22290424932538624, + "grad_norm": 0.12961339950561523, + "learning_rate": 0.0008902085844714624, + "loss": 2.7891, + "step": 7517 + }, + { + "epoch": 0.22293390267769772, + "grad_norm": 0.15893787145614624, + "learning_rate": 0.0008901791636986374, + "loss": 2.8169, + "step": 7518 + }, + { + "epoch": 0.2229635560300092, + "grad_norm": 0.18425069749355316, + "learning_rate": 0.000890149739470709, + "loss": 2.7872, + "step": 7519 + }, + { + "epoch": 0.22299320938232067, + "grad_norm": 0.1663803905248642, + "learning_rate": 0.0008901203117879377, + "loss": 2.7977, + "step": 7520 + }, + { + "epoch": 0.22302286273463215, + "grad_norm": 0.14402449131011963, + "learning_rate": 0.0008900908806505841, + "loss": 2.8166, + "step": 7521 + }, + { + "epoch": 0.22305251608694363, + "grad_norm": 0.14482168853282928, + "learning_rate": 0.0008900614460589087, + "loss": 2.7993, + "step": 7522 + }, + { + "epoch": 0.2230821694392551, + "grad_norm": 0.15064522624015808, + "learning_rate": 0.0008900320080131724, + "loss": 2.814, + "step": 7523 + }, + { + "epoch": 0.22311182279156658, + "grad_norm": 0.1862749308347702, + "learning_rate": 0.0008900025665136356, + "loss": 2.8096, + "step": 7524 + }, + { + "epoch": 0.22314147614387805, + "grad_norm": 0.1943177878856659, + "learning_rate": 0.0008899731215605591, + "loss": 2.8122, + "step": 7525 + }, + { + "epoch": 0.22317112949618956, + "grad_norm": 0.17064018547534943, + "learning_rate": 0.0008899436731542037, + "loss": 2.8215, + "step": 7526 + }, + { + "epoch": 0.22320078284850103, + "grad_norm": 0.17569462954998016, + "learning_rate": 0.0008899142212948301, + "loss": 2.8371, + "step": 7527 + }, + { + "epoch": 0.2232304362008125, + "grad_norm": 0.1874183863401413, + "learning_rate": 0.0008898847659826993, + "loss": 2.8503, + "step": 7528 + }, + { + "epoch": 0.22326008955312399, + "grad_norm": 0.16279593110084534, + "learning_rate": 0.000889855307218072, + "loss": 2.7888, + "step": 7529 + }, + { + "epoch": 0.22328974290543546, + "grad_norm": 0.15660011768341064, + "learning_rate": 0.0008898258450012089, + "loss": 2.7854, + "step": 7530 + }, + { + "epoch": 0.22331939625774694, + "grad_norm": 0.1544140726327896, + "learning_rate": 0.0008897963793323711, + "loss": 2.7961, + "step": 7531 + }, + { + "epoch": 0.22334904961005841, + "grad_norm": 0.1475202739238739, + "learning_rate": 0.0008897669102118193, + "loss": 2.8212, + "step": 7532 + }, + { + "epoch": 0.2233787029623699, + "grad_norm": 0.14883357286453247, + "learning_rate": 0.0008897374376398146, + "loss": 2.7852, + "step": 7533 + }, + { + "epoch": 0.22340835631468137, + "grad_norm": 0.15292473137378693, + "learning_rate": 0.0008897079616166181, + "loss": 2.8121, + "step": 7534 + }, + { + "epoch": 0.22343800966699284, + "grad_norm": 0.1289450228214264, + "learning_rate": 0.0008896784821424908, + "loss": 2.8193, + "step": 7535 + }, + { + "epoch": 0.22346766301930432, + "grad_norm": 0.1068430095911026, + "learning_rate": 0.0008896489992176934, + "loss": 2.7902, + "step": 7536 + }, + { + "epoch": 0.22349731637161582, + "grad_norm": 0.1121787577867508, + "learning_rate": 0.0008896195128424876, + "loss": 2.7953, + "step": 7537 + }, + { + "epoch": 0.2235269697239273, + "grad_norm": 0.12235354632139206, + "learning_rate": 0.0008895900230171339, + "loss": 2.8508, + "step": 7538 + }, + { + "epoch": 0.22355662307623878, + "grad_norm": 0.12191417068243027, + "learning_rate": 0.0008895605297418936, + "loss": 2.8251, + "step": 7539 + }, + { + "epoch": 0.22358627642855025, + "grad_norm": 0.12644484639167786, + "learning_rate": 0.0008895310330170281, + "loss": 2.8257, + "step": 7540 + }, + { + "epoch": 0.22361592978086173, + "grad_norm": 0.1338626593351364, + "learning_rate": 0.0008895015328427984, + "loss": 2.7927, + "step": 7541 + }, + { + "epoch": 0.2236455831331732, + "grad_norm": 0.1210879310965538, + "learning_rate": 0.0008894720292194658, + "loss": 2.7805, + "step": 7542 + }, + { + "epoch": 0.22367523648548468, + "grad_norm": 0.13620547950267792, + "learning_rate": 0.0008894425221472915, + "loss": 2.8104, + "step": 7543 + }, + { + "epoch": 0.22370488983779616, + "grad_norm": 0.14453090727329254, + "learning_rate": 0.0008894130116265368, + "loss": 2.7998, + "step": 7544 + }, + { + "epoch": 0.22373454319010763, + "grad_norm": 0.13226604461669922, + "learning_rate": 0.0008893834976574631, + "loss": 2.8224, + "step": 7545 + }, + { + "epoch": 0.2237641965424191, + "grad_norm": 0.14555315673351288, + "learning_rate": 0.0008893539802403316, + "loss": 2.788, + "step": 7546 + }, + { + "epoch": 0.2237938498947306, + "grad_norm": 0.1586921215057373, + "learning_rate": 0.0008893244593754037, + "loss": 2.8295, + "step": 7547 + }, + { + "epoch": 0.2238235032470421, + "grad_norm": 0.1719018667936325, + "learning_rate": 0.0008892949350629411, + "loss": 2.8244, + "step": 7548 + }, + { + "epoch": 0.22385315659935356, + "grad_norm": 0.1385262906551361, + "learning_rate": 0.0008892654073032049, + "loss": 2.8028, + "step": 7549 + }, + { + "epoch": 0.22388280995166504, + "grad_norm": 0.12087590992450714, + "learning_rate": 0.0008892358760964567, + "loss": 2.8037, + "step": 7550 + }, + { + "epoch": 0.22391246330397652, + "grad_norm": 0.15488983690738678, + "learning_rate": 0.0008892063414429581, + "loss": 2.7978, + "step": 7551 + }, + { + "epoch": 0.223942116656288, + "grad_norm": 0.14426027238368988, + "learning_rate": 0.0008891768033429704, + "loss": 2.7885, + "step": 7552 + }, + { + "epoch": 0.22397177000859947, + "grad_norm": 0.1341153234243393, + "learning_rate": 0.0008891472617967554, + "loss": 2.8534, + "step": 7553 + }, + { + "epoch": 0.22400142336091095, + "grad_norm": 0.13642214238643646, + "learning_rate": 0.0008891177168045745, + "loss": 2.7954, + "step": 7554 + }, + { + "epoch": 0.22403107671322242, + "grad_norm": 0.1487487554550171, + "learning_rate": 0.0008890881683666896, + "loss": 2.8438, + "step": 7555 + }, + { + "epoch": 0.2240607300655339, + "grad_norm": 0.14288608729839325, + "learning_rate": 0.000889058616483362, + "loss": 2.801, + "step": 7556 + }, + { + "epoch": 0.2240903834178454, + "grad_norm": 0.14330580830574036, + "learning_rate": 0.0008890290611548537, + "loss": 2.8161, + "step": 7557 + }, + { + "epoch": 0.22412003677015688, + "grad_norm": 0.1706591248512268, + "learning_rate": 0.0008889995023814262, + "loss": 2.7806, + "step": 7558 + }, + { + "epoch": 0.22414969012246835, + "grad_norm": 0.15498177707195282, + "learning_rate": 0.0008889699401633414, + "loss": 2.8267, + "step": 7559 + }, + { + "epoch": 0.22417934347477983, + "grad_norm": 0.14182259142398834, + "learning_rate": 0.0008889403745008609, + "loss": 2.8004, + "step": 7560 + }, + { + "epoch": 0.2242089968270913, + "grad_norm": 0.1626025289297104, + "learning_rate": 0.0008889108053942469, + "loss": 2.7916, + "step": 7561 + }, + { + "epoch": 0.22423865017940278, + "grad_norm": 0.16211779415607452, + "learning_rate": 0.0008888812328437607, + "loss": 2.8344, + "step": 7562 + }, + { + "epoch": 0.22426830353171426, + "grad_norm": 0.17959724366664886, + "learning_rate": 0.0008888516568496645, + "loss": 2.7924, + "step": 7563 + }, + { + "epoch": 0.22429795688402573, + "grad_norm": 0.15756665170192719, + "learning_rate": 0.0008888220774122202, + "loss": 2.7752, + "step": 7564 + }, + { + "epoch": 0.2243276102363372, + "grad_norm": 0.1076953113079071, + "learning_rate": 0.0008887924945316895, + "loss": 2.8575, + "step": 7565 + }, + { + "epoch": 0.2243572635886487, + "grad_norm": 0.14226257801055908, + "learning_rate": 0.0008887629082083346, + "loss": 2.8075, + "step": 7566 + }, + { + "epoch": 0.22438691694096016, + "grad_norm": 0.14167913794517517, + "learning_rate": 0.0008887333184424176, + "loss": 2.7896, + "step": 7567 + }, + { + "epoch": 0.22441657029327167, + "grad_norm": 0.14755098521709442, + "learning_rate": 0.0008887037252342001, + "loss": 2.7773, + "step": 7568 + }, + { + "epoch": 0.22444622364558314, + "grad_norm": 0.1435800939798355, + "learning_rate": 0.0008886741285839446, + "loss": 2.8161, + "step": 7569 + }, + { + "epoch": 0.22447587699789462, + "grad_norm": 0.16999094188213348, + "learning_rate": 0.0008886445284919127, + "loss": 2.7894, + "step": 7570 + }, + { + "epoch": 0.2245055303502061, + "grad_norm": 0.17410270869731903, + "learning_rate": 0.0008886149249583671, + "loss": 2.7642, + "step": 7571 + }, + { + "epoch": 0.22453518370251757, + "grad_norm": 0.1476261168718338, + "learning_rate": 0.0008885853179835695, + "loss": 2.799, + "step": 7572 + }, + { + "epoch": 0.22456483705482905, + "grad_norm": 0.15491019189357758, + "learning_rate": 0.0008885557075677821, + "loss": 2.8219, + "step": 7573 + }, + { + "epoch": 0.22459449040714052, + "grad_norm": 0.1828838288784027, + "learning_rate": 0.0008885260937112673, + "loss": 2.8318, + "step": 7574 + }, + { + "epoch": 0.224624143759452, + "grad_norm": 0.1820540875196457, + "learning_rate": 0.0008884964764142874, + "loss": 2.8331, + "step": 7575 + }, + { + "epoch": 0.22465379711176348, + "grad_norm": 0.16577784717082977, + "learning_rate": 0.0008884668556771042, + "loss": 2.7556, + "step": 7576 + }, + { + "epoch": 0.22468345046407495, + "grad_norm": 0.15699103474617004, + "learning_rate": 0.0008884372314999805, + "loss": 2.7975, + "step": 7577 + }, + { + "epoch": 0.22471310381638646, + "grad_norm": 0.13020093739032745, + "learning_rate": 0.0008884076038831785, + "loss": 2.7987, + "step": 7578 + }, + { + "epoch": 0.22474275716869793, + "grad_norm": 0.14801111817359924, + "learning_rate": 0.0008883779728269604, + "loss": 2.8242, + "step": 7579 + }, + { + "epoch": 0.2247724105210094, + "grad_norm": 0.14924129843711853, + "learning_rate": 0.0008883483383315887, + "loss": 2.7948, + "step": 7580 + }, + { + "epoch": 0.22480206387332088, + "grad_norm": 0.15145443379878998, + "learning_rate": 0.0008883187003973259, + "loss": 2.7858, + "step": 7581 + }, + { + "epoch": 0.22483171722563236, + "grad_norm": 0.12829288840293884, + "learning_rate": 0.0008882890590244341, + "loss": 2.7942, + "step": 7582 + }, + { + "epoch": 0.22486137057794384, + "grad_norm": 0.13938820362091064, + "learning_rate": 0.0008882594142131763, + "loss": 2.7955, + "step": 7583 + }, + { + "epoch": 0.2248910239302553, + "grad_norm": 0.14085383713245392, + "learning_rate": 0.0008882297659638147, + "loss": 2.8117, + "step": 7584 + }, + { + "epoch": 0.2249206772825668, + "grad_norm": 0.1323343962430954, + "learning_rate": 0.0008882001142766117, + "loss": 2.7567, + "step": 7585 + }, + { + "epoch": 0.22495033063487826, + "grad_norm": 0.13191863894462585, + "learning_rate": 0.0008881704591518302, + "loss": 2.8142, + "step": 7586 + }, + { + "epoch": 0.22497998398718974, + "grad_norm": 0.1304280012845993, + "learning_rate": 0.0008881408005897327, + "loss": 2.7872, + "step": 7587 + }, + { + "epoch": 0.22500963733950122, + "grad_norm": 0.1346246749162674, + "learning_rate": 0.0008881111385905817, + "loss": 2.776, + "step": 7588 + }, + { + "epoch": 0.22503929069181272, + "grad_norm": 0.12739434838294983, + "learning_rate": 0.00088808147315464, + "loss": 2.8325, + "step": 7589 + }, + { + "epoch": 0.2250689440441242, + "grad_norm": 0.11442048847675323, + "learning_rate": 0.0008880518042821702, + "loss": 2.8097, + "step": 7590 + }, + { + "epoch": 0.22509859739643567, + "grad_norm": 0.1417255699634552, + "learning_rate": 0.0008880221319734351, + "loss": 2.8402, + "step": 7591 + }, + { + "epoch": 0.22512825074874715, + "grad_norm": 0.1548953801393509, + "learning_rate": 0.0008879924562286974, + "loss": 2.8326, + "step": 7592 + }, + { + "epoch": 0.22515790410105863, + "grad_norm": 0.1397436261177063, + "learning_rate": 0.0008879627770482199, + "loss": 2.8447, + "step": 7593 + }, + { + "epoch": 0.2251875574533701, + "grad_norm": 0.13708536326885223, + "learning_rate": 0.0008879330944322654, + "loss": 2.807, + "step": 7594 + }, + { + "epoch": 0.22521721080568158, + "grad_norm": 0.13543887436389923, + "learning_rate": 0.0008879034083810968, + "loss": 2.796, + "step": 7595 + }, + { + "epoch": 0.22524686415799305, + "grad_norm": 0.14231730997562408, + "learning_rate": 0.000887873718894977, + "loss": 2.8081, + "step": 7596 + }, + { + "epoch": 0.22527651751030453, + "grad_norm": 0.1716257631778717, + "learning_rate": 0.0008878440259741687, + "loss": 2.8115, + "step": 7597 + }, + { + "epoch": 0.225306170862616, + "grad_norm": 0.16250407695770264, + "learning_rate": 0.000887814329618935, + "loss": 2.803, + "step": 7598 + }, + { + "epoch": 0.2253358242149275, + "grad_norm": 0.15599626302719116, + "learning_rate": 0.0008877846298295389, + "loss": 2.8331, + "step": 7599 + }, + { + "epoch": 0.225365477567239, + "grad_norm": 0.1407512128353119, + "learning_rate": 0.0008877549266062435, + "loss": 2.8126, + "step": 7600 + }, + { + "epoch": 0.22539513091955046, + "grad_norm": 0.1529184728860855, + "learning_rate": 0.0008877252199493113, + "loss": 2.7746, + "step": 7601 + }, + { + "epoch": 0.22542478427186194, + "grad_norm": 0.13017402589321136, + "learning_rate": 0.000887695509859006, + "loss": 2.8143, + "step": 7602 + }, + { + "epoch": 0.22545443762417341, + "grad_norm": 0.13952253758907318, + "learning_rate": 0.0008876657963355903, + "loss": 2.8307, + "step": 7603 + }, + { + "epoch": 0.2254840909764849, + "grad_norm": 0.14193665981292725, + "learning_rate": 0.0008876360793793275, + "loss": 2.8463, + "step": 7604 + }, + { + "epoch": 0.22551374432879637, + "grad_norm": 0.14204399287700653, + "learning_rate": 0.0008876063589904806, + "loss": 2.814, + "step": 7605 + }, + { + "epoch": 0.22554339768110784, + "grad_norm": 0.16796322166919708, + "learning_rate": 0.0008875766351693128, + "loss": 2.8249, + "step": 7606 + }, + { + "epoch": 0.22557305103341932, + "grad_norm": 0.1597827523946762, + "learning_rate": 0.0008875469079160876, + "loss": 2.7986, + "step": 7607 + }, + { + "epoch": 0.2256027043857308, + "grad_norm": 0.150854229927063, + "learning_rate": 0.0008875171772310679, + "loss": 2.8129, + "step": 7608 + }, + { + "epoch": 0.2256323577380423, + "grad_norm": 0.1303805261850357, + "learning_rate": 0.000887487443114517, + "loss": 2.7944, + "step": 7609 + }, + { + "epoch": 0.22566201109035378, + "grad_norm": 0.13568587601184845, + "learning_rate": 0.0008874577055666984, + "loss": 2.8302, + "step": 7610 + }, + { + "epoch": 0.22569166444266525, + "grad_norm": 0.11727730929851532, + "learning_rate": 0.0008874279645878753, + "loss": 2.7951, + "step": 7611 + }, + { + "epoch": 0.22572131779497673, + "grad_norm": 0.13299524784088135, + "learning_rate": 0.000887398220178311, + "loss": 2.8352, + "step": 7612 + }, + { + "epoch": 0.2257509711472882, + "grad_norm": 0.11210470646619797, + "learning_rate": 0.0008873684723382689, + "loss": 2.8273, + "step": 7613 + }, + { + "epoch": 0.22578062449959968, + "grad_norm": 0.11998699605464935, + "learning_rate": 0.0008873387210680126, + "loss": 2.8019, + "step": 7614 + }, + { + "epoch": 0.22581027785191116, + "grad_norm": 0.1264331042766571, + "learning_rate": 0.0008873089663678053, + "loss": 2.7764, + "step": 7615 + }, + { + "epoch": 0.22583993120422263, + "grad_norm": 0.12039394676685333, + "learning_rate": 0.0008872792082379109, + "loss": 2.8078, + "step": 7616 + }, + { + "epoch": 0.2258695845565341, + "grad_norm": 0.130666583776474, + "learning_rate": 0.0008872494466785924, + "loss": 2.7879, + "step": 7617 + }, + { + "epoch": 0.22589923790884558, + "grad_norm": 0.15256597101688385, + "learning_rate": 0.0008872196816901137, + "loss": 2.8037, + "step": 7618 + }, + { + "epoch": 0.22592889126115706, + "grad_norm": 0.15319068729877472, + "learning_rate": 0.0008871899132727382, + "loss": 2.8071, + "step": 7619 + }, + { + "epoch": 0.22595854461346856, + "grad_norm": 0.1428494155406952, + "learning_rate": 0.0008871601414267295, + "loss": 2.7969, + "step": 7620 + }, + { + "epoch": 0.22598819796578004, + "grad_norm": 0.12834538519382477, + "learning_rate": 0.0008871303661523514, + "loss": 2.8311, + "step": 7621 + }, + { + "epoch": 0.22601785131809152, + "grad_norm": 0.1364232748746872, + "learning_rate": 0.0008871005874498674, + "loss": 2.8326, + "step": 7622 + }, + { + "epoch": 0.226047504670403, + "grad_norm": 0.1734168976545334, + "learning_rate": 0.0008870708053195413, + "loss": 2.826, + "step": 7623 + }, + { + "epoch": 0.22607715802271447, + "grad_norm": 0.1786673367023468, + "learning_rate": 0.0008870410197616368, + "loss": 2.8312, + "step": 7624 + }, + { + "epoch": 0.22610681137502595, + "grad_norm": 0.15618066489696503, + "learning_rate": 0.0008870112307764176, + "loss": 2.7944, + "step": 7625 + }, + { + "epoch": 0.22613646472733742, + "grad_norm": 0.16265735030174255, + "learning_rate": 0.0008869814383641474, + "loss": 2.8249, + "step": 7626 + }, + { + "epoch": 0.2261661180796489, + "grad_norm": 0.14795731008052826, + "learning_rate": 0.0008869516425250902, + "loss": 2.8155, + "step": 7627 + }, + { + "epoch": 0.22619577143196037, + "grad_norm": 0.1381325125694275, + "learning_rate": 0.0008869218432595099, + "loss": 2.805, + "step": 7628 + }, + { + "epoch": 0.22622542478427185, + "grad_norm": 0.1680866926908493, + "learning_rate": 0.0008868920405676701, + "loss": 2.8166, + "step": 7629 + }, + { + "epoch": 0.22625507813658335, + "grad_norm": 0.15992088615894318, + "learning_rate": 0.000886862234449835, + "loss": 2.8084, + "step": 7630 + }, + { + "epoch": 0.22628473148889483, + "grad_norm": 0.16646245121955872, + "learning_rate": 0.0008868324249062682, + "loss": 2.8, + "step": 7631 + }, + { + "epoch": 0.2263143848412063, + "grad_norm": 0.17007037997245789, + "learning_rate": 0.0008868026119372342, + "loss": 2.796, + "step": 7632 + }, + { + "epoch": 0.22634403819351778, + "grad_norm": 0.15481829643249512, + "learning_rate": 0.0008867727955429965, + "loss": 2.817, + "step": 7633 + }, + { + "epoch": 0.22637369154582926, + "grad_norm": 0.1761365532875061, + "learning_rate": 0.0008867429757238194, + "loss": 2.7928, + "step": 7634 + }, + { + "epoch": 0.22640334489814073, + "grad_norm": 0.1518225073814392, + "learning_rate": 0.0008867131524799667, + "loss": 2.7881, + "step": 7635 + }, + { + "epoch": 0.2264329982504522, + "grad_norm": 0.14012973010540009, + "learning_rate": 0.0008866833258117029, + "loss": 2.7687, + "step": 7636 + }, + { + "epoch": 0.2264626516027637, + "grad_norm": 0.11731625348329544, + "learning_rate": 0.0008866534957192915, + "loss": 2.7986, + "step": 7637 + }, + { + "epoch": 0.22649230495507516, + "grad_norm": 0.12419486790895462, + "learning_rate": 0.0008866236622029974, + "loss": 2.8126, + "step": 7638 + }, + { + "epoch": 0.22652195830738664, + "grad_norm": 0.13241340219974518, + "learning_rate": 0.0008865938252630843, + "loss": 2.8293, + "step": 7639 + }, + { + "epoch": 0.22655161165969812, + "grad_norm": 0.13790616393089294, + "learning_rate": 0.0008865639848998164, + "loss": 2.8112, + "step": 7640 + }, + { + "epoch": 0.22658126501200962, + "grad_norm": 0.14164884388446808, + "learning_rate": 0.0008865341411134582, + "loss": 2.794, + "step": 7641 + }, + { + "epoch": 0.2266109183643211, + "grad_norm": 0.15099292993545532, + "learning_rate": 0.0008865042939042738, + "loss": 2.8016, + "step": 7642 + }, + { + "epoch": 0.22664057171663257, + "grad_norm": 0.1549052596092224, + "learning_rate": 0.0008864744432725275, + "loss": 2.8107, + "step": 7643 + }, + { + "epoch": 0.22667022506894405, + "grad_norm": 0.16929948329925537, + "learning_rate": 0.0008864445892184836, + "loss": 2.8371, + "step": 7644 + }, + { + "epoch": 0.22669987842125552, + "grad_norm": 0.18473182618618011, + "learning_rate": 0.0008864147317424068, + "loss": 2.7959, + "step": 7645 + }, + { + "epoch": 0.226729531773567, + "grad_norm": 0.19882448017597198, + "learning_rate": 0.000886384870844561, + "loss": 2.763, + "step": 7646 + }, + { + "epoch": 0.22675918512587848, + "grad_norm": 0.17613226175308228, + "learning_rate": 0.0008863550065252108, + "loss": 2.8023, + "step": 7647 + }, + { + "epoch": 0.22678883847818995, + "grad_norm": 0.155543252825737, + "learning_rate": 0.0008863251387846207, + "loss": 2.8003, + "step": 7648 + }, + { + "epoch": 0.22681849183050143, + "grad_norm": 0.16256317496299744, + "learning_rate": 0.0008862952676230554, + "loss": 2.8085, + "step": 7649 + }, + { + "epoch": 0.2268481451828129, + "grad_norm": 0.14519096910953522, + "learning_rate": 0.0008862653930407789, + "loss": 2.8208, + "step": 7650 + }, + { + "epoch": 0.2268777985351244, + "grad_norm": 0.12945185601711273, + "learning_rate": 0.0008862355150380563, + "loss": 2.764, + "step": 7651 + }, + { + "epoch": 0.22690745188743588, + "grad_norm": 0.1344994604587555, + "learning_rate": 0.0008862056336151518, + "loss": 2.8253, + "step": 7652 + }, + { + "epoch": 0.22693710523974736, + "grad_norm": 0.1378159523010254, + "learning_rate": 0.0008861757487723301, + "loss": 2.8154, + "step": 7653 + }, + { + "epoch": 0.22696675859205884, + "grad_norm": 0.1460495889186859, + "learning_rate": 0.000886145860509856, + "loss": 2.837, + "step": 7654 + }, + { + "epoch": 0.2269964119443703, + "grad_norm": 0.1524331420660019, + "learning_rate": 0.000886115968827994, + "loss": 2.8054, + "step": 7655 + }, + { + "epoch": 0.2270260652966818, + "grad_norm": 0.15442857146263123, + "learning_rate": 0.0008860860737270087, + "loss": 2.7798, + "step": 7656 + }, + { + "epoch": 0.22705571864899327, + "grad_norm": 0.1400311142206192, + "learning_rate": 0.0008860561752071649, + "loss": 2.7823, + "step": 7657 + }, + { + "epoch": 0.22708537200130474, + "grad_norm": 0.1226065382361412, + "learning_rate": 0.0008860262732687276, + "loss": 2.8128, + "step": 7658 + }, + { + "epoch": 0.22711502535361622, + "grad_norm": 0.16139021515846252, + "learning_rate": 0.0008859963679119612, + "loss": 2.8145, + "step": 7659 + }, + { + "epoch": 0.2271446787059277, + "grad_norm": 0.15395280718803406, + "learning_rate": 0.0008859664591371308, + "loss": 2.7887, + "step": 7660 + }, + { + "epoch": 0.22717433205823917, + "grad_norm": 0.13746041059494019, + "learning_rate": 0.0008859365469445012, + "loss": 2.7745, + "step": 7661 + }, + { + "epoch": 0.22720398541055067, + "grad_norm": 0.137510284781456, + "learning_rate": 0.0008859066313343371, + "loss": 2.8151, + "step": 7662 + }, + { + "epoch": 0.22723363876286215, + "grad_norm": 0.1477077305316925, + "learning_rate": 0.0008858767123069037, + "loss": 2.7995, + "step": 7663 + }, + { + "epoch": 0.22726329211517363, + "grad_norm": 0.13421019911766052, + "learning_rate": 0.0008858467898624657, + "loss": 2.7818, + "step": 7664 + }, + { + "epoch": 0.2272929454674851, + "grad_norm": 0.12226615101099014, + "learning_rate": 0.0008858168640012883, + "loss": 2.7987, + "step": 7665 + }, + { + "epoch": 0.22732259881979658, + "grad_norm": 0.12986034154891968, + "learning_rate": 0.000885786934723636, + "loss": 2.7917, + "step": 7666 + }, + { + "epoch": 0.22735225217210805, + "grad_norm": 0.11778581887483597, + "learning_rate": 0.0008857570020297744, + "loss": 2.7627, + "step": 7667 + }, + { + "epoch": 0.22738190552441953, + "grad_norm": 0.13299085199832916, + "learning_rate": 0.0008857270659199684, + "loss": 2.8358, + "step": 7668 + }, + { + "epoch": 0.227411558876731, + "grad_norm": 0.13186362385749817, + "learning_rate": 0.0008856971263944828, + "loss": 2.8382, + "step": 7669 + }, + { + "epoch": 0.22744121222904248, + "grad_norm": 0.1291576474905014, + "learning_rate": 0.0008856671834535831, + "loss": 2.8197, + "step": 7670 + }, + { + "epoch": 0.22747086558135396, + "grad_norm": 0.13775059580802917, + "learning_rate": 0.0008856372370975343, + "loss": 2.822, + "step": 7671 + }, + { + "epoch": 0.22750051893366546, + "grad_norm": 0.13876394927501678, + "learning_rate": 0.0008856072873266015, + "loss": 2.7591, + "step": 7672 + }, + { + "epoch": 0.22753017228597694, + "grad_norm": 0.16376647353172302, + "learning_rate": 0.00088557733414105, + "loss": 2.7967, + "step": 7673 + }, + { + "epoch": 0.22755982563828842, + "grad_norm": 0.16148751974105835, + "learning_rate": 0.000885547377541145, + "loss": 2.8364, + "step": 7674 + }, + { + "epoch": 0.2275894789905999, + "grad_norm": 0.1478859931230545, + "learning_rate": 0.0008855174175271519, + "loss": 2.7835, + "step": 7675 + }, + { + "epoch": 0.22761913234291137, + "grad_norm": 0.13399344682693481, + "learning_rate": 0.0008854874540993357, + "loss": 2.8151, + "step": 7676 + }, + { + "epoch": 0.22764878569522284, + "grad_norm": 0.13633637130260468, + "learning_rate": 0.000885457487257962, + "loss": 2.8003, + "step": 7677 + }, + { + "epoch": 0.22767843904753432, + "grad_norm": 0.16197727620601654, + "learning_rate": 0.0008854275170032961, + "loss": 2.8016, + "step": 7678 + }, + { + "epoch": 0.2277080923998458, + "grad_norm": 0.17043505609035492, + "learning_rate": 0.0008853975433356034, + "loss": 2.8046, + "step": 7679 + }, + { + "epoch": 0.22773774575215727, + "grad_norm": 0.16668324172496796, + "learning_rate": 0.0008853675662551495, + "loss": 2.8092, + "step": 7680 + }, + { + "epoch": 0.22776739910446875, + "grad_norm": 0.16626010835170746, + "learning_rate": 0.0008853375857621993, + "loss": 2.8107, + "step": 7681 + }, + { + "epoch": 0.22779705245678025, + "grad_norm": 0.16920511424541473, + "learning_rate": 0.0008853076018570189, + "loss": 2.8102, + "step": 7682 + }, + { + "epoch": 0.22782670580909173, + "grad_norm": 0.1624193787574768, + "learning_rate": 0.0008852776145398733, + "loss": 2.803, + "step": 7683 + }, + { + "epoch": 0.2278563591614032, + "grad_norm": 0.14074000716209412, + "learning_rate": 0.0008852476238110285, + "loss": 2.8265, + "step": 7684 + }, + { + "epoch": 0.22788601251371468, + "grad_norm": 0.16430778801441193, + "learning_rate": 0.0008852176296707498, + "loss": 2.8035, + "step": 7685 + }, + { + "epoch": 0.22791566586602616, + "grad_norm": 0.16617558896541595, + "learning_rate": 0.0008851876321193028, + "loss": 2.8248, + "step": 7686 + }, + { + "epoch": 0.22794531921833763, + "grad_norm": 0.1487216204404831, + "learning_rate": 0.0008851576311569533, + "loss": 2.7822, + "step": 7687 + }, + { + "epoch": 0.2279749725706491, + "grad_norm": 0.13587792217731476, + "learning_rate": 0.0008851276267839669, + "loss": 2.8052, + "step": 7688 + }, + { + "epoch": 0.22800462592296059, + "grad_norm": 0.14376108348369598, + "learning_rate": 0.0008850976190006093, + "loss": 2.8054, + "step": 7689 + }, + { + "epoch": 0.22803427927527206, + "grad_norm": 0.15215006470680237, + "learning_rate": 0.000885067607807146, + "loss": 2.7746, + "step": 7690 + }, + { + "epoch": 0.22806393262758354, + "grad_norm": 0.14054717123508453, + "learning_rate": 0.000885037593203843, + "loss": 2.8147, + "step": 7691 + }, + { + "epoch": 0.228093585979895, + "grad_norm": 0.15283319354057312, + "learning_rate": 0.0008850075751909661, + "loss": 2.7793, + "step": 7692 + }, + { + "epoch": 0.22812323933220652, + "grad_norm": 0.13915063440799713, + "learning_rate": 0.000884977553768781, + "loss": 2.7397, + "step": 7693 + }, + { + "epoch": 0.228152892684518, + "grad_norm": 0.12244951725006104, + "learning_rate": 0.0008849475289375534, + "loss": 2.8365, + "step": 7694 + }, + { + "epoch": 0.22818254603682947, + "grad_norm": 0.1268790364265442, + "learning_rate": 0.0008849175006975496, + "loss": 2.8212, + "step": 7695 + }, + { + "epoch": 0.22821219938914095, + "grad_norm": 0.13026562333106995, + "learning_rate": 0.0008848874690490352, + "loss": 2.805, + "step": 7696 + }, + { + "epoch": 0.22824185274145242, + "grad_norm": 0.14582164585590363, + "learning_rate": 0.0008848574339922761, + "loss": 2.8157, + "step": 7697 + }, + { + "epoch": 0.2282715060937639, + "grad_norm": 0.14154811203479767, + "learning_rate": 0.0008848273955275385, + "loss": 2.781, + "step": 7698 + }, + { + "epoch": 0.22830115944607537, + "grad_norm": 0.13810467720031738, + "learning_rate": 0.0008847973536550882, + "loss": 2.7787, + "step": 7699 + }, + { + "epoch": 0.22833081279838685, + "grad_norm": 0.1505206674337387, + "learning_rate": 0.0008847673083751912, + "loss": 2.8011, + "step": 7700 + }, + { + "epoch": 0.22836046615069833, + "grad_norm": 0.14775513112545013, + "learning_rate": 0.0008847372596881137, + "loss": 2.8043, + "step": 7701 + }, + { + "epoch": 0.2283901195030098, + "grad_norm": 0.14214688539505005, + "learning_rate": 0.0008847072075941217, + "loss": 2.8337, + "step": 7702 + }, + { + "epoch": 0.2284197728553213, + "grad_norm": 0.16655223071575165, + "learning_rate": 0.0008846771520934814, + "loss": 2.8186, + "step": 7703 + }, + { + "epoch": 0.22844942620763278, + "grad_norm": 0.16117413341999054, + "learning_rate": 0.000884647093186459, + "loss": 2.819, + "step": 7704 + }, + { + "epoch": 0.22847907955994426, + "grad_norm": 0.15932881832122803, + "learning_rate": 0.0008846170308733204, + "loss": 2.8412, + "step": 7705 + }, + { + "epoch": 0.22850873291225574, + "grad_norm": 0.13732221722602844, + "learning_rate": 0.0008845869651543319, + "loss": 2.7593, + "step": 7706 + }, + { + "epoch": 0.2285383862645672, + "grad_norm": 0.14503854513168335, + "learning_rate": 0.0008845568960297598, + "loss": 2.7841, + "step": 7707 + }, + { + "epoch": 0.2285680396168787, + "grad_norm": 0.15861979126930237, + "learning_rate": 0.0008845268234998707, + "loss": 2.8254, + "step": 7708 + }, + { + "epoch": 0.22859769296919016, + "grad_norm": 0.14269010722637177, + "learning_rate": 0.0008844967475649303, + "loss": 2.7869, + "step": 7709 + }, + { + "epoch": 0.22862734632150164, + "grad_norm": 0.1382429450750351, + "learning_rate": 0.0008844666682252053, + "loss": 2.8007, + "step": 7710 + }, + { + "epoch": 0.22865699967381312, + "grad_norm": 0.13847094774246216, + "learning_rate": 0.0008844365854809619, + "loss": 2.7854, + "step": 7711 + }, + { + "epoch": 0.2286866530261246, + "grad_norm": 0.151944100856781, + "learning_rate": 0.0008844064993324666, + "loss": 2.8107, + "step": 7712 + }, + { + "epoch": 0.22871630637843607, + "grad_norm": 0.13720043003559113, + "learning_rate": 0.0008843764097799857, + "loss": 2.824, + "step": 7713 + }, + { + "epoch": 0.22874595973074757, + "grad_norm": 0.13036386668682098, + "learning_rate": 0.0008843463168237858, + "loss": 2.8033, + "step": 7714 + }, + { + "epoch": 0.22877561308305905, + "grad_norm": 0.14148011803627014, + "learning_rate": 0.0008843162204641333, + "loss": 2.7861, + "step": 7715 + }, + { + "epoch": 0.22880526643537052, + "grad_norm": 0.12449907511472702, + "learning_rate": 0.0008842861207012945, + "loss": 2.7922, + "step": 7716 + }, + { + "epoch": 0.228834919787682, + "grad_norm": 0.12560100853443146, + "learning_rate": 0.0008842560175355364, + "loss": 2.7967, + "step": 7717 + }, + { + "epoch": 0.22886457313999348, + "grad_norm": 0.12947919964790344, + "learning_rate": 0.0008842259109671252, + "loss": 2.7811, + "step": 7718 + }, + { + "epoch": 0.22889422649230495, + "grad_norm": 0.1336113065481186, + "learning_rate": 0.0008841958009963276, + "loss": 2.8112, + "step": 7719 + }, + { + "epoch": 0.22892387984461643, + "grad_norm": 0.17548303306102753, + "learning_rate": 0.0008841656876234102, + "loss": 2.7931, + "step": 7720 + }, + { + "epoch": 0.2289535331969279, + "grad_norm": 0.1960926204919815, + "learning_rate": 0.0008841355708486397, + "loss": 2.7461, + "step": 7721 + }, + { + "epoch": 0.22898318654923938, + "grad_norm": 0.13568203151226044, + "learning_rate": 0.0008841054506722829, + "loss": 2.7893, + "step": 7722 + }, + { + "epoch": 0.22901283990155086, + "grad_norm": 0.13091273605823517, + "learning_rate": 0.0008840753270946063, + "loss": 2.8123, + "step": 7723 + }, + { + "epoch": 0.22904249325386236, + "grad_norm": 0.16835126280784607, + "learning_rate": 0.0008840452001158767, + "loss": 2.8469, + "step": 7724 + }, + { + "epoch": 0.22907214660617384, + "grad_norm": 0.183608740568161, + "learning_rate": 0.0008840150697363611, + "loss": 2.8019, + "step": 7725 + }, + { + "epoch": 0.2291017999584853, + "grad_norm": 0.15549957752227783, + "learning_rate": 0.0008839849359563261, + "loss": 2.7848, + "step": 7726 + }, + { + "epoch": 0.2291314533107968, + "grad_norm": 0.15360097587108612, + "learning_rate": 0.0008839547987760384, + "loss": 2.7976, + "step": 7727 + }, + { + "epoch": 0.22916110666310827, + "grad_norm": 0.17984677851200104, + "learning_rate": 0.0008839246581957652, + "loss": 2.7786, + "step": 7728 + }, + { + "epoch": 0.22919076001541974, + "grad_norm": 0.17556631565093994, + "learning_rate": 0.0008838945142157732, + "loss": 2.8032, + "step": 7729 + }, + { + "epoch": 0.22922041336773122, + "grad_norm": 0.1995849460363388, + "learning_rate": 0.0008838643668363296, + "loss": 2.7755, + "step": 7730 + }, + { + "epoch": 0.2292500667200427, + "grad_norm": 0.18761548399925232, + "learning_rate": 0.0008838342160577008, + "loss": 2.8153, + "step": 7731 + }, + { + "epoch": 0.22927972007235417, + "grad_norm": 0.15654143691062927, + "learning_rate": 0.0008838040618801544, + "loss": 2.8113, + "step": 7732 + }, + { + "epoch": 0.22930937342466565, + "grad_norm": 0.14996862411499023, + "learning_rate": 0.0008837739043039569, + "loss": 2.8253, + "step": 7733 + }, + { + "epoch": 0.22933902677697715, + "grad_norm": 0.13786597549915314, + "learning_rate": 0.0008837437433293758, + "loss": 2.8131, + "step": 7734 + }, + { + "epoch": 0.22936868012928863, + "grad_norm": 0.13097061216831207, + "learning_rate": 0.0008837135789566779, + "loss": 2.8145, + "step": 7735 + }, + { + "epoch": 0.2293983334816001, + "grad_norm": 0.12422800809144974, + "learning_rate": 0.0008836834111861305, + "loss": 2.8044, + "step": 7736 + }, + { + "epoch": 0.22942798683391158, + "grad_norm": 0.1249515488743782, + "learning_rate": 0.0008836532400180007, + "loss": 2.7992, + "step": 7737 + }, + { + "epoch": 0.22945764018622306, + "grad_norm": 0.12144827097654343, + "learning_rate": 0.0008836230654525553, + "loss": 2.8189, + "step": 7738 + }, + { + "epoch": 0.22948729353853453, + "grad_norm": 0.1263459026813507, + "learning_rate": 0.000883592887490062, + "loss": 2.755, + "step": 7739 + }, + { + "epoch": 0.229516946890846, + "grad_norm": 0.10890202969312668, + "learning_rate": 0.0008835627061307878, + "loss": 2.7862, + "step": 7740 + }, + { + "epoch": 0.22954660024315748, + "grad_norm": 0.11714980006217957, + "learning_rate": 0.0008835325213750001, + "loss": 2.809, + "step": 7741 + }, + { + "epoch": 0.22957625359546896, + "grad_norm": 0.12681308388710022, + "learning_rate": 0.0008835023332229659, + "loss": 2.8441, + "step": 7742 + }, + { + "epoch": 0.22960590694778044, + "grad_norm": 0.13926947116851807, + "learning_rate": 0.000883472141674953, + "loss": 2.7903, + "step": 7743 + }, + { + "epoch": 0.2296355603000919, + "grad_norm": 0.16348974406719208, + "learning_rate": 0.0008834419467312282, + "loss": 2.8189, + "step": 7744 + }, + { + "epoch": 0.22966521365240342, + "grad_norm": 0.15115712583065033, + "learning_rate": 0.0008834117483920592, + "loss": 2.7945, + "step": 7745 + }, + { + "epoch": 0.2296948670047149, + "grad_norm": 0.14016501605510712, + "learning_rate": 0.0008833815466577133, + "loss": 2.787, + "step": 7746 + }, + { + "epoch": 0.22972452035702637, + "grad_norm": 0.15429271757602692, + "learning_rate": 0.0008833513415284582, + "loss": 2.8014, + "step": 7747 + }, + { + "epoch": 0.22975417370933784, + "grad_norm": 0.18669995665550232, + "learning_rate": 0.000883321133004561, + "loss": 2.7979, + "step": 7748 + }, + { + "epoch": 0.22978382706164932, + "grad_norm": 0.1936458945274353, + "learning_rate": 0.0008832909210862894, + "loss": 2.8119, + "step": 7749 + }, + { + "epoch": 0.2298134804139608, + "grad_norm": 0.17387740314006805, + "learning_rate": 0.000883260705773911, + "loss": 2.8039, + "step": 7750 + }, + { + "epoch": 0.22984313376627227, + "grad_norm": 0.1492653489112854, + "learning_rate": 0.0008832304870676932, + "loss": 2.8171, + "step": 7751 + }, + { + "epoch": 0.22987278711858375, + "grad_norm": 0.14068511128425598, + "learning_rate": 0.0008832002649679036, + "loss": 2.8288, + "step": 7752 + }, + { + "epoch": 0.22990244047089523, + "grad_norm": 0.14897720515727997, + "learning_rate": 0.0008831700394748099, + "loss": 2.8207, + "step": 7753 + }, + { + "epoch": 0.2299320938232067, + "grad_norm": 0.14806495606899261, + "learning_rate": 0.0008831398105886797, + "loss": 2.806, + "step": 7754 + }, + { + "epoch": 0.2299617471755182, + "grad_norm": 0.14420120418071747, + "learning_rate": 0.0008831095783097809, + "loss": 2.8052, + "step": 7755 + }, + { + "epoch": 0.22999140052782968, + "grad_norm": 0.13371098041534424, + "learning_rate": 0.0008830793426383807, + "loss": 2.8181, + "step": 7756 + }, + { + "epoch": 0.23002105388014116, + "grad_norm": 0.1484115719795227, + "learning_rate": 0.0008830491035747474, + "loss": 2.767, + "step": 7757 + }, + { + "epoch": 0.23005070723245263, + "grad_norm": 0.15436476469039917, + "learning_rate": 0.0008830188611191485, + "loss": 2.7945, + "step": 7758 + }, + { + "epoch": 0.2300803605847641, + "grad_norm": 0.1569371521472931, + "learning_rate": 0.0008829886152718519, + "loss": 2.8295, + "step": 7759 + }, + { + "epoch": 0.23011001393707559, + "grad_norm": 0.1662289947271347, + "learning_rate": 0.0008829583660331253, + "loss": 2.8149, + "step": 7760 + }, + { + "epoch": 0.23013966728938706, + "grad_norm": 0.13429775834083557, + "learning_rate": 0.0008829281134032366, + "loss": 2.7844, + "step": 7761 + }, + { + "epoch": 0.23016932064169854, + "grad_norm": 0.15110285580158234, + "learning_rate": 0.0008828978573824538, + "loss": 2.7807, + "step": 7762 + }, + { + "epoch": 0.23019897399401001, + "grad_norm": 0.14352339506149292, + "learning_rate": 0.0008828675979710448, + "loss": 2.8153, + "step": 7763 + }, + { + "epoch": 0.2302286273463215, + "grad_norm": 0.1349104940891266, + "learning_rate": 0.0008828373351692773, + "loss": 2.7839, + "step": 7764 + }, + { + "epoch": 0.23025828069863297, + "grad_norm": 0.1319933831691742, + "learning_rate": 0.0008828070689774197, + "loss": 2.8007, + "step": 7765 + }, + { + "epoch": 0.23028793405094447, + "grad_norm": 0.12981709837913513, + "learning_rate": 0.0008827767993957396, + "loss": 2.8123, + "step": 7766 + }, + { + "epoch": 0.23031758740325595, + "grad_norm": 0.1256444901227951, + "learning_rate": 0.0008827465264245053, + "loss": 2.8108, + "step": 7767 + }, + { + "epoch": 0.23034724075556742, + "grad_norm": 0.11885613948106766, + "learning_rate": 0.0008827162500639849, + "loss": 2.8616, + "step": 7768 + }, + { + "epoch": 0.2303768941078789, + "grad_norm": 0.11387871205806732, + "learning_rate": 0.0008826859703144464, + "loss": 2.8066, + "step": 7769 + }, + { + "epoch": 0.23040654746019038, + "grad_norm": 0.12438801676034927, + "learning_rate": 0.000882655687176158, + "loss": 2.8222, + "step": 7770 + }, + { + "epoch": 0.23043620081250185, + "grad_norm": 0.13135355710983276, + "learning_rate": 0.0008826254006493876, + "loss": 2.816, + "step": 7771 + }, + { + "epoch": 0.23046585416481333, + "grad_norm": 0.12810653448104858, + "learning_rate": 0.0008825951107344038, + "loss": 2.8078, + "step": 7772 + }, + { + "epoch": 0.2304955075171248, + "grad_norm": 0.13823628425598145, + "learning_rate": 0.0008825648174314745, + "loss": 2.7791, + "step": 7773 + }, + { + "epoch": 0.23052516086943628, + "grad_norm": 0.13008545339107513, + "learning_rate": 0.0008825345207408681, + "loss": 2.8112, + "step": 7774 + }, + { + "epoch": 0.23055481422174776, + "grad_norm": 0.1250673532485962, + "learning_rate": 0.0008825042206628529, + "loss": 2.7836, + "step": 7775 + }, + { + "epoch": 0.23058446757405926, + "grad_norm": 0.13798421621322632, + "learning_rate": 0.0008824739171976971, + "loss": 2.806, + "step": 7776 + }, + { + "epoch": 0.23061412092637074, + "grad_norm": 0.14600208401679993, + "learning_rate": 0.0008824436103456692, + "loss": 2.818, + "step": 7777 + }, + { + "epoch": 0.2306437742786822, + "grad_norm": 0.1600630134344101, + "learning_rate": 0.0008824133001070375, + "loss": 2.8097, + "step": 7778 + }, + { + "epoch": 0.2306734276309937, + "grad_norm": 0.15930506587028503, + "learning_rate": 0.0008823829864820703, + "loss": 2.8014, + "step": 7779 + }, + { + "epoch": 0.23070308098330516, + "grad_norm": 0.1498696208000183, + "learning_rate": 0.0008823526694710361, + "loss": 2.8135, + "step": 7780 + }, + { + "epoch": 0.23073273433561664, + "grad_norm": 0.17722712457180023, + "learning_rate": 0.0008823223490742034, + "loss": 2.8126, + "step": 7781 + }, + { + "epoch": 0.23076238768792812, + "grad_norm": 0.2200416624546051, + "learning_rate": 0.0008822920252918407, + "loss": 2.8053, + "step": 7782 + }, + { + "epoch": 0.2307920410402396, + "grad_norm": 0.24558451771736145, + "learning_rate": 0.0008822616981242165, + "loss": 2.7919, + "step": 7783 + }, + { + "epoch": 0.23082169439255107, + "grad_norm": 0.18070188164710999, + "learning_rate": 0.0008822313675715994, + "loss": 2.7923, + "step": 7784 + }, + { + "epoch": 0.23085134774486255, + "grad_norm": 0.14467771351337433, + "learning_rate": 0.0008822010336342578, + "loss": 2.8427, + "step": 7785 + }, + { + "epoch": 0.23088100109717405, + "grad_norm": 0.17651347815990448, + "learning_rate": 0.0008821706963124605, + "loss": 2.8523, + "step": 7786 + }, + { + "epoch": 0.23091065444948553, + "grad_norm": 0.1569419950246811, + "learning_rate": 0.0008821403556064762, + "loss": 2.7886, + "step": 7787 + }, + { + "epoch": 0.230940307801797, + "grad_norm": 0.2024591863155365, + "learning_rate": 0.0008821100115165735, + "loss": 2.8319, + "step": 7788 + }, + { + "epoch": 0.23096996115410848, + "grad_norm": 0.1547912210226059, + "learning_rate": 0.0008820796640430208, + "loss": 2.8061, + "step": 7789 + }, + { + "epoch": 0.23099961450641995, + "grad_norm": 0.14274080097675323, + "learning_rate": 0.0008820493131860872, + "loss": 2.7871, + "step": 7790 + }, + { + "epoch": 0.23102926785873143, + "grad_norm": 0.12606893479824066, + "learning_rate": 0.0008820189589460414, + "loss": 2.772, + "step": 7791 + }, + { + "epoch": 0.2310589212110429, + "grad_norm": 0.15030398964881897, + "learning_rate": 0.0008819886013231521, + "loss": 2.7617, + "step": 7792 + }, + { + "epoch": 0.23108857456335438, + "grad_norm": 0.13849039375782013, + "learning_rate": 0.0008819582403176882, + "loss": 2.834, + "step": 7793 + }, + { + "epoch": 0.23111822791566586, + "grad_norm": 0.13412471115589142, + "learning_rate": 0.0008819278759299186, + "loss": 2.7743, + "step": 7794 + }, + { + "epoch": 0.23114788126797733, + "grad_norm": 0.12420063465833664, + "learning_rate": 0.0008818975081601118, + "loss": 2.8041, + "step": 7795 + }, + { + "epoch": 0.2311775346202888, + "grad_norm": 0.11682716012001038, + "learning_rate": 0.0008818671370085374, + "loss": 2.7887, + "step": 7796 + }, + { + "epoch": 0.23120718797260031, + "grad_norm": 0.11245445162057877, + "learning_rate": 0.0008818367624754637, + "loss": 2.8335, + "step": 7797 + }, + { + "epoch": 0.2312368413249118, + "grad_norm": 0.10757580399513245, + "learning_rate": 0.0008818063845611599, + "loss": 2.7679, + "step": 7798 + }, + { + "epoch": 0.23126649467722327, + "grad_norm": 0.12607546150684357, + "learning_rate": 0.0008817760032658953, + "loss": 2.7889, + "step": 7799 + }, + { + "epoch": 0.23129614802953474, + "grad_norm": 0.16037051379680634, + "learning_rate": 0.0008817456185899384, + "loss": 2.7869, + "step": 7800 + }, + { + "epoch": 0.23132580138184622, + "grad_norm": 0.16964848339557648, + "learning_rate": 0.0008817152305335586, + "loss": 2.7929, + "step": 7801 + }, + { + "epoch": 0.2313554547341577, + "grad_norm": 0.15305161476135254, + "learning_rate": 0.0008816848390970249, + "loss": 2.7753, + "step": 7802 + }, + { + "epoch": 0.23138510808646917, + "grad_norm": 0.1238958090543747, + "learning_rate": 0.0008816544442806065, + "loss": 2.7743, + "step": 7803 + }, + { + "epoch": 0.23141476143878065, + "grad_norm": 0.13036033511161804, + "learning_rate": 0.0008816240460845725, + "loss": 2.7778, + "step": 7804 + }, + { + "epoch": 0.23144441479109212, + "grad_norm": 0.1420549601316452, + "learning_rate": 0.0008815936445091919, + "loss": 2.7804, + "step": 7805 + }, + { + "epoch": 0.2314740681434036, + "grad_norm": 0.10818328708410263, + "learning_rate": 0.0008815632395547342, + "loss": 2.8121, + "step": 7806 + }, + { + "epoch": 0.2315037214957151, + "grad_norm": 0.10872314870357513, + "learning_rate": 0.0008815328312214686, + "loss": 2.7932, + "step": 7807 + }, + { + "epoch": 0.23153337484802658, + "grad_norm": 0.11279803514480591, + "learning_rate": 0.0008815024195096641, + "loss": 2.7697, + "step": 7808 + }, + { + "epoch": 0.23156302820033806, + "grad_norm": 0.11458425223827362, + "learning_rate": 0.0008814720044195904, + "loss": 2.7645, + "step": 7809 + }, + { + "epoch": 0.23159268155264953, + "grad_norm": 0.11592752486467361, + "learning_rate": 0.0008814415859515164, + "loss": 2.8094, + "step": 7810 + }, + { + "epoch": 0.231622334904961, + "grad_norm": 0.1267743855714798, + "learning_rate": 0.0008814111641057119, + "loss": 2.7869, + "step": 7811 + }, + { + "epoch": 0.23165198825727248, + "grad_norm": 0.1229415014386177, + "learning_rate": 0.000881380738882446, + "loss": 2.7949, + "step": 7812 + }, + { + "epoch": 0.23168164160958396, + "grad_norm": 0.13549138605594635, + "learning_rate": 0.0008813503102819881, + "loss": 2.7804, + "step": 7813 + }, + { + "epoch": 0.23171129496189544, + "grad_norm": 0.1335865557193756, + "learning_rate": 0.0008813198783046078, + "loss": 2.8222, + "step": 7814 + }, + { + "epoch": 0.2317409483142069, + "grad_norm": 0.13192644715309143, + "learning_rate": 0.0008812894429505745, + "loss": 2.7637, + "step": 7815 + }, + { + "epoch": 0.2317706016665184, + "grad_norm": 0.13771982491016388, + "learning_rate": 0.0008812590042201578, + "loss": 2.8109, + "step": 7816 + }, + { + "epoch": 0.23180025501882986, + "grad_norm": 0.15390799939632416, + "learning_rate": 0.0008812285621136271, + "loss": 2.8132, + "step": 7817 + }, + { + "epoch": 0.23182990837114137, + "grad_norm": 0.1798260360956192, + "learning_rate": 0.0008811981166312521, + "loss": 2.7993, + "step": 7818 + }, + { + "epoch": 0.23185956172345284, + "grad_norm": 0.19809645414352417, + "learning_rate": 0.0008811676677733022, + "loss": 2.7937, + "step": 7819 + }, + { + "epoch": 0.23188921507576432, + "grad_norm": 0.15790779888629913, + "learning_rate": 0.0008811372155400474, + "loss": 2.7822, + "step": 7820 + }, + { + "epoch": 0.2319188684280758, + "grad_norm": 0.20512208342552185, + "learning_rate": 0.000881106759931757, + "loss": 2.8213, + "step": 7821 + }, + { + "epoch": 0.23194852178038727, + "grad_norm": 0.22846205532550812, + "learning_rate": 0.0008810763009487009, + "loss": 2.817, + "step": 7822 + }, + { + "epoch": 0.23197817513269875, + "grad_norm": 0.19182656705379486, + "learning_rate": 0.0008810458385911489, + "loss": 2.8028, + "step": 7823 + }, + { + "epoch": 0.23200782848501023, + "grad_norm": 0.16839613020420074, + "learning_rate": 0.0008810153728593703, + "loss": 2.8078, + "step": 7824 + }, + { + "epoch": 0.2320374818373217, + "grad_norm": 0.1855628788471222, + "learning_rate": 0.0008809849037536353, + "loss": 2.7543, + "step": 7825 + }, + { + "epoch": 0.23206713518963318, + "grad_norm": 0.15242339670658112, + "learning_rate": 0.0008809544312742135, + "loss": 2.8281, + "step": 7826 + }, + { + "epoch": 0.23209678854194465, + "grad_norm": 0.15941494703292847, + "learning_rate": 0.000880923955421375, + "loss": 2.8453, + "step": 7827 + }, + { + "epoch": 0.23212644189425616, + "grad_norm": 0.1369163542985916, + "learning_rate": 0.0008808934761953893, + "loss": 2.8041, + "step": 7828 + }, + { + "epoch": 0.23215609524656763, + "grad_norm": 0.14908824861049652, + "learning_rate": 0.0008808629935965265, + "loss": 2.7904, + "step": 7829 + }, + { + "epoch": 0.2321857485988791, + "grad_norm": 0.1325831115245819, + "learning_rate": 0.0008808325076250566, + "loss": 2.7923, + "step": 7830 + }, + { + "epoch": 0.2322154019511906, + "grad_norm": 0.13648006319999695, + "learning_rate": 0.0008808020182812495, + "loss": 2.8328, + "step": 7831 + }, + { + "epoch": 0.23224505530350206, + "grad_norm": 0.16110895574092865, + "learning_rate": 0.0008807715255653751, + "loss": 2.7819, + "step": 7832 + }, + { + "epoch": 0.23227470865581354, + "grad_norm": 0.1826406717300415, + "learning_rate": 0.0008807410294777035, + "loss": 2.7917, + "step": 7833 + }, + { + "epoch": 0.23230436200812501, + "grad_norm": 0.11954361200332642, + "learning_rate": 0.0008807105300185047, + "loss": 2.8119, + "step": 7834 + }, + { + "epoch": 0.2323340153604365, + "grad_norm": 0.13214196264743805, + "learning_rate": 0.0008806800271880488, + "loss": 2.8243, + "step": 7835 + }, + { + "epoch": 0.23236366871274797, + "grad_norm": 0.1383763998746872, + "learning_rate": 0.000880649520986606, + "loss": 2.8028, + "step": 7836 + }, + { + "epoch": 0.23239332206505944, + "grad_norm": 0.11500488221645355, + "learning_rate": 0.0008806190114144463, + "loss": 2.8009, + "step": 7837 + }, + { + "epoch": 0.23242297541737095, + "grad_norm": 0.12472755461931229, + "learning_rate": 0.0008805884984718399, + "loss": 2.811, + "step": 7838 + }, + { + "epoch": 0.23245262876968242, + "grad_norm": 0.13688407838344574, + "learning_rate": 0.000880557982159057, + "loss": 2.7887, + "step": 7839 + }, + { + "epoch": 0.2324822821219939, + "grad_norm": 0.12602700293064117, + "learning_rate": 0.000880527462476368, + "loss": 2.8184, + "step": 7840 + }, + { + "epoch": 0.23251193547430538, + "grad_norm": 0.1332380175590515, + "learning_rate": 0.0008804969394240429, + "loss": 2.7693, + "step": 7841 + }, + { + "epoch": 0.23254158882661685, + "grad_norm": 0.12730614840984344, + "learning_rate": 0.000880466413002352, + "loss": 2.8055, + "step": 7842 + }, + { + "epoch": 0.23257124217892833, + "grad_norm": 0.12682105600833893, + "learning_rate": 0.000880435883211566, + "loss": 2.8372, + "step": 7843 + }, + { + "epoch": 0.2326008955312398, + "grad_norm": 0.12357396632432938, + "learning_rate": 0.0008804053500519547, + "loss": 2.7856, + "step": 7844 + }, + { + "epoch": 0.23263054888355128, + "grad_norm": 0.12973298132419586, + "learning_rate": 0.0008803748135237888, + "loss": 2.7848, + "step": 7845 + }, + { + "epoch": 0.23266020223586276, + "grad_norm": 0.13996565341949463, + "learning_rate": 0.0008803442736273386, + "loss": 2.8089, + "step": 7846 + }, + { + "epoch": 0.23268985558817423, + "grad_norm": 0.1681736707687378, + "learning_rate": 0.0008803137303628745, + "loss": 2.7908, + "step": 7847 + }, + { + "epoch": 0.2327195089404857, + "grad_norm": 0.22382761538028717, + "learning_rate": 0.0008802831837306672, + "loss": 2.8058, + "step": 7848 + }, + { + "epoch": 0.2327491622927972, + "grad_norm": 0.22617408633232117, + "learning_rate": 0.0008802526337309868, + "loss": 2.8307, + "step": 7849 + }, + { + "epoch": 0.2327788156451087, + "grad_norm": 0.17715121805667877, + "learning_rate": 0.0008802220803641043, + "loss": 2.7912, + "step": 7850 + }, + { + "epoch": 0.23280846899742016, + "grad_norm": 0.20026202499866486, + "learning_rate": 0.00088019152363029, + "loss": 2.8354, + "step": 7851 + }, + { + "epoch": 0.23283812234973164, + "grad_norm": 0.1792813390493393, + "learning_rate": 0.0008801609635298145, + "loss": 2.8062, + "step": 7852 + }, + { + "epoch": 0.23286777570204312, + "grad_norm": 0.13873697817325592, + "learning_rate": 0.0008801304000629482, + "loss": 2.7744, + "step": 7853 + }, + { + "epoch": 0.2328974290543546, + "grad_norm": 0.143538698554039, + "learning_rate": 0.0008800998332299621, + "loss": 2.8253, + "step": 7854 + }, + { + "epoch": 0.23292708240666607, + "grad_norm": 0.12922853231430054, + "learning_rate": 0.0008800692630311268, + "loss": 2.8228, + "step": 7855 + }, + { + "epoch": 0.23295673575897755, + "grad_norm": 0.13966605067253113, + "learning_rate": 0.000880038689466713, + "loss": 2.7967, + "step": 7856 + }, + { + "epoch": 0.23298638911128902, + "grad_norm": 0.12891815602779388, + "learning_rate": 0.0008800081125369911, + "loss": 2.7732, + "step": 7857 + }, + { + "epoch": 0.2330160424636005, + "grad_norm": 0.11299952119588852, + "learning_rate": 0.0008799775322422323, + "loss": 2.7847, + "step": 7858 + }, + { + "epoch": 0.233045695815912, + "grad_norm": 0.12844599783420563, + "learning_rate": 0.0008799469485827072, + "loss": 2.7875, + "step": 7859 + }, + { + "epoch": 0.23307534916822348, + "grad_norm": 0.11921228468418121, + "learning_rate": 0.0008799163615586868, + "loss": 2.7822, + "step": 7860 + }, + { + "epoch": 0.23310500252053495, + "grad_norm": 0.12098424881696701, + "learning_rate": 0.0008798857711704416, + "loss": 2.7813, + "step": 7861 + }, + { + "epoch": 0.23313465587284643, + "grad_norm": 0.12175954133272171, + "learning_rate": 0.0008798551774182428, + "loss": 2.8077, + "step": 7862 + }, + { + "epoch": 0.2331643092251579, + "grad_norm": 0.12138903886079788, + "learning_rate": 0.000879824580302361, + "loss": 2.7942, + "step": 7863 + }, + { + "epoch": 0.23319396257746938, + "grad_norm": 0.11496955156326294, + "learning_rate": 0.0008797939798230676, + "loss": 2.7641, + "step": 7864 + }, + { + "epoch": 0.23322361592978086, + "grad_norm": 0.11744622141122818, + "learning_rate": 0.0008797633759806331, + "loss": 2.7805, + "step": 7865 + }, + { + "epoch": 0.23325326928209233, + "grad_norm": 0.12408945709466934, + "learning_rate": 0.0008797327687753289, + "loss": 2.8165, + "step": 7866 + }, + { + "epoch": 0.2332829226344038, + "grad_norm": 0.1221306174993515, + "learning_rate": 0.0008797021582074258, + "loss": 2.8081, + "step": 7867 + }, + { + "epoch": 0.2333125759867153, + "grad_norm": 0.14136384427547455, + "learning_rate": 0.000879671544277195, + "loss": 2.7492, + "step": 7868 + }, + { + "epoch": 0.23334222933902676, + "grad_norm": 0.16050009429454803, + "learning_rate": 0.0008796409269849073, + "loss": 2.7971, + "step": 7869 + }, + { + "epoch": 0.23337188269133827, + "grad_norm": 0.15506611764431, + "learning_rate": 0.0008796103063308343, + "loss": 2.7718, + "step": 7870 + }, + { + "epoch": 0.23340153604364974, + "grad_norm": 0.14808019995689392, + "learning_rate": 0.0008795796823152466, + "loss": 2.886, + "step": 7871 + }, + { + "epoch": 0.23343118939596122, + "grad_norm": 0.1318616420030594, + "learning_rate": 0.0008795490549384159, + "loss": 2.8241, + "step": 7872 + }, + { + "epoch": 0.2334608427482727, + "grad_norm": 0.12493560463190079, + "learning_rate": 0.0008795184242006129, + "loss": 2.791, + "step": 7873 + }, + { + "epoch": 0.23349049610058417, + "grad_norm": 0.12139425426721573, + "learning_rate": 0.0008794877901021094, + "loss": 2.8104, + "step": 7874 + }, + { + "epoch": 0.23352014945289565, + "grad_norm": 0.11547219008207321, + "learning_rate": 0.0008794571526431762, + "loss": 2.7825, + "step": 7875 + }, + { + "epoch": 0.23354980280520712, + "grad_norm": 0.13098549842834473, + "learning_rate": 0.0008794265118240847, + "loss": 2.7824, + "step": 7876 + }, + { + "epoch": 0.2335794561575186, + "grad_norm": 0.13469067215919495, + "learning_rate": 0.0008793958676451066, + "loss": 2.7759, + "step": 7877 + }, + { + "epoch": 0.23360910950983008, + "grad_norm": 0.13745461404323578, + "learning_rate": 0.0008793652201065128, + "loss": 2.7818, + "step": 7878 + }, + { + "epoch": 0.23363876286214155, + "grad_norm": 0.1416725367307663, + "learning_rate": 0.0008793345692085748, + "loss": 2.794, + "step": 7879 + }, + { + "epoch": 0.23366841621445306, + "grad_norm": 0.14753814041614532, + "learning_rate": 0.0008793039149515643, + "loss": 2.7876, + "step": 7880 + }, + { + "epoch": 0.23369806956676453, + "grad_norm": 0.14378045499324799, + "learning_rate": 0.0008792732573357523, + "loss": 2.8034, + "step": 7881 + }, + { + "epoch": 0.233727722919076, + "grad_norm": 0.1254470944404602, + "learning_rate": 0.0008792425963614105, + "loss": 2.7686, + "step": 7882 + }, + { + "epoch": 0.23375737627138748, + "grad_norm": 0.15886645019054413, + "learning_rate": 0.0008792119320288105, + "loss": 2.8016, + "step": 7883 + }, + { + "epoch": 0.23378702962369896, + "grad_norm": 0.17991137504577637, + "learning_rate": 0.0008791812643382238, + "loss": 2.7685, + "step": 7884 + }, + { + "epoch": 0.23381668297601044, + "grad_norm": 0.18476679921150208, + "learning_rate": 0.0008791505932899217, + "loss": 2.805, + "step": 7885 + }, + { + "epoch": 0.2338463363283219, + "grad_norm": 0.18539179861545563, + "learning_rate": 0.0008791199188841764, + "loss": 2.7792, + "step": 7886 + }, + { + "epoch": 0.2338759896806334, + "grad_norm": 0.17779013514518738, + "learning_rate": 0.0008790892411212588, + "loss": 2.8082, + "step": 7887 + }, + { + "epoch": 0.23390564303294487, + "grad_norm": 0.15294112265110016, + "learning_rate": 0.0008790585600014409, + "loss": 2.82, + "step": 7888 + }, + { + "epoch": 0.23393529638525634, + "grad_norm": 0.15380752086639404, + "learning_rate": 0.0008790278755249945, + "loss": 2.8431, + "step": 7889 + }, + { + "epoch": 0.23396494973756785, + "grad_norm": 0.18689262866973877, + "learning_rate": 0.0008789971876921913, + "loss": 2.8078, + "step": 7890 + }, + { + "epoch": 0.23399460308987932, + "grad_norm": 0.16183798015117645, + "learning_rate": 0.0008789664965033029, + "loss": 2.8229, + "step": 7891 + }, + { + "epoch": 0.2340242564421908, + "grad_norm": 0.1492653787136078, + "learning_rate": 0.000878935801958601, + "loss": 2.7459, + "step": 7892 + }, + { + "epoch": 0.23405390979450227, + "grad_norm": 0.13396087288856506, + "learning_rate": 0.0008789051040583576, + "loss": 2.787, + "step": 7893 + }, + { + "epoch": 0.23408356314681375, + "grad_norm": 0.12941958010196686, + "learning_rate": 0.0008788744028028445, + "loss": 2.7825, + "step": 7894 + }, + { + "epoch": 0.23411321649912523, + "grad_norm": 0.13656187057495117, + "learning_rate": 0.0008788436981923335, + "loss": 2.8168, + "step": 7895 + }, + { + "epoch": 0.2341428698514367, + "grad_norm": 0.1378975659608841, + "learning_rate": 0.0008788129902270965, + "loss": 2.7893, + "step": 7896 + }, + { + "epoch": 0.23417252320374818, + "grad_norm": 0.1328393518924713, + "learning_rate": 0.0008787822789074056, + "loss": 2.8173, + "step": 7897 + }, + { + "epoch": 0.23420217655605965, + "grad_norm": 0.13878022134304047, + "learning_rate": 0.0008787515642335324, + "loss": 2.8057, + "step": 7898 + }, + { + "epoch": 0.23423182990837113, + "grad_norm": 0.14577411115169525, + "learning_rate": 0.0008787208462057492, + "loss": 2.8073, + "step": 7899 + }, + { + "epoch": 0.2342614832606826, + "grad_norm": 0.15097330510616302, + "learning_rate": 0.0008786901248243277, + "loss": 2.8023, + "step": 7900 + }, + { + "epoch": 0.2342911366129941, + "grad_norm": 0.14041487872600555, + "learning_rate": 0.0008786594000895404, + "loss": 2.8057, + "step": 7901 + }, + { + "epoch": 0.2343207899653056, + "grad_norm": 0.14183495938777924, + "learning_rate": 0.0008786286720016591, + "loss": 2.792, + "step": 7902 + }, + { + "epoch": 0.23435044331761706, + "grad_norm": 0.13925543427467346, + "learning_rate": 0.0008785979405609559, + "loss": 2.7947, + "step": 7903 + }, + { + "epoch": 0.23438009666992854, + "grad_norm": 0.1293344795703888, + "learning_rate": 0.0008785672057677028, + "loss": 2.7838, + "step": 7904 + }, + { + "epoch": 0.23440975002224002, + "grad_norm": 0.11806228756904602, + "learning_rate": 0.0008785364676221722, + "loss": 2.7624, + "step": 7905 + }, + { + "epoch": 0.2344394033745515, + "grad_norm": 0.13016068935394287, + "learning_rate": 0.0008785057261246363, + "loss": 2.7876, + "step": 7906 + }, + { + "epoch": 0.23446905672686297, + "grad_norm": 0.1272325962781906, + "learning_rate": 0.000878474981275367, + "loss": 2.8137, + "step": 7907 + }, + { + "epoch": 0.23449871007917444, + "grad_norm": 0.14067330956459045, + "learning_rate": 0.000878444233074637, + "loss": 2.7796, + "step": 7908 + }, + { + "epoch": 0.23452836343148592, + "grad_norm": 0.14926451444625854, + "learning_rate": 0.0008784134815227183, + "loss": 2.8248, + "step": 7909 + }, + { + "epoch": 0.2345580167837974, + "grad_norm": 0.17669031023979187, + "learning_rate": 0.0008783827266198831, + "loss": 2.8413, + "step": 7910 + }, + { + "epoch": 0.2345876701361089, + "grad_norm": 0.16681675612926483, + "learning_rate": 0.0008783519683664042, + "loss": 2.8416, + "step": 7911 + }, + { + "epoch": 0.23461732348842038, + "grad_norm": 0.12073490023612976, + "learning_rate": 0.0008783212067625534, + "loss": 2.7891, + "step": 7912 + }, + { + "epoch": 0.23464697684073185, + "grad_norm": 0.15143553912639618, + "learning_rate": 0.0008782904418086035, + "loss": 2.8019, + "step": 7913 + }, + { + "epoch": 0.23467663019304333, + "grad_norm": 0.16167980432510376, + "learning_rate": 0.0008782596735048269, + "loss": 2.8023, + "step": 7914 + }, + { + "epoch": 0.2347062835453548, + "grad_norm": 0.15255165100097656, + "learning_rate": 0.0008782289018514958, + "loss": 2.827, + "step": 7915 + }, + { + "epoch": 0.23473593689766628, + "grad_norm": 0.1573595553636551, + "learning_rate": 0.000878198126848883, + "loss": 2.7997, + "step": 7916 + }, + { + "epoch": 0.23476559024997776, + "grad_norm": 0.1596556156873703, + "learning_rate": 0.0008781673484972608, + "loss": 2.8217, + "step": 7917 + }, + { + "epoch": 0.23479524360228923, + "grad_norm": 0.15087006986141205, + "learning_rate": 0.0008781365667969018, + "loss": 2.7374, + "step": 7918 + }, + { + "epoch": 0.2348248969546007, + "grad_norm": 0.14048458635807037, + "learning_rate": 0.0008781057817480786, + "loss": 2.8019, + "step": 7919 + }, + { + "epoch": 0.23485455030691219, + "grad_norm": 0.14143376052379608, + "learning_rate": 0.0008780749933510638, + "loss": 2.798, + "step": 7920 + }, + { + "epoch": 0.23488420365922366, + "grad_norm": 0.16943258047103882, + "learning_rate": 0.00087804420160613, + "loss": 2.8118, + "step": 7921 + }, + { + "epoch": 0.23491385701153517, + "grad_norm": 0.16674627363681793, + "learning_rate": 0.0008780134065135499, + "loss": 2.753, + "step": 7922 + }, + { + "epoch": 0.23494351036384664, + "grad_norm": 0.17112618684768677, + "learning_rate": 0.0008779826080735963, + "loss": 2.8161, + "step": 7923 + }, + { + "epoch": 0.23497316371615812, + "grad_norm": 0.17585386335849762, + "learning_rate": 0.0008779518062865418, + "loss": 2.7695, + "step": 7924 + }, + { + "epoch": 0.2350028170684696, + "grad_norm": 0.1522456556558609, + "learning_rate": 0.0008779210011526591, + "loss": 2.7904, + "step": 7925 + }, + { + "epoch": 0.23503247042078107, + "grad_norm": 0.13599757850170135, + "learning_rate": 0.0008778901926722212, + "loss": 2.8168, + "step": 7926 + }, + { + "epoch": 0.23506212377309255, + "grad_norm": 0.12318168580532074, + "learning_rate": 0.0008778593808455007, + "loss": 2.7794, + "step": 7927 + }, + { + "epoch": 0.23509177712540402, + "grad_norm": 0.11240462958812714, + "learning_rate": 0.0008778285656727704, + "loss": 2.7937, + "step": 7928 + }, + { + "epoch": 0.2351214304777155, + "grad_norm": 0.11854726076126099, + "learning_rate": 0.0008777977471543035, + "loss": 2.7854, + "step": 7929 + }, + { + "epoch": 0.23515108383002697, + "grad_norm": 0.13971775770187378, + "learning_rate": 0.0008777669252903726, + "loss": 2.8088, + "step": 7930 + }, + { + "epoch": 0.23518073718233845, + "grad_norm": 0.14934015274047852, + "learning_rate": 0.0008777361000812507, + "loss": 2.8116, + "step": 7931 + }, + { + "epoch": 0.23521039053464995, + "grad_norm": 0.1623595654964447, + "learning_rate": 0.0008777052715272109, + "loss": 2.7574, + "step": 7932 + }, + { + "epoch": 0.23524004388696143, + "grad_norm": 0.21505603194236755, + "learning_rate": 0.0008776744396285261, + "loss": 2.7832, + "step": 7933 + }, + { + "epoch": 0.2352696972392729, + "grad_norm": 0.1449887752532959, + "learning_rate": 0.0008776436043854692, + "loss": 2.8208, + "step": 7934 + }, + { + "epoch": 0.23529935059158438, + "grad_norm": 0.1285308599472046, + "learning_rate": 0.0008776127657983135, + "loss": 2.7823, + "step": 7935 + }, + { + "epoch": 0.23532900394389586, + "grad_norm": 0.13240444660186768, + "learning_rate": 0.0008775819238673317, + "loss": 2.8037, + "step": 7936 + }, + { + "epoch": 0.23535865729620734, + "grad_norm": 0.1438729166984558, + "learning_rate": 0.0008775510785927974, + "loss": 2.8114, + "step": 7937 + }, + { + "epoch": 0.2353883106485188, + "grad_norm": 0.17180103063583374, + "learning_rate": 0.0008775202299749834, + "loss": 2.802, + "step": 7938 + }, + { + "epoch": 0.2354179640008303, + "grad_norm": 0.1412719190120697, + "learning_rate": 0.0008774893780141629, + "loss": 2.7881, + "step": 7939 + }, + { + "epoch": 0.23544761735314176, + "grad_norm": 0.1573866754770279, + "learning_rate": 0.0008774585227106093, + "loss": 2.7854, + "step": 7940 + }, + { + "epoch": 0.23547727070545324, + "grad_norm": 0.16162210702896118, + "learning_rate": 0.0008774276640645955, + "loss": 2.835, + "step": 7941 + }, + { + "epoch": 0.23550692405776474, + "grad_norm": 0.15932534635066986, + "learning_rate": 0.0008773968020763951, + "loss": 2.8261, + "step": 7942 + }, + { + "epoch": 0.23553657741007622, + "grad_norm": 0.15208962559700012, + "learning_rate": 0.0008773659367462813, + "loss": 2.7996, + "step": 7943 + }, + { + "epoch": 0.2355662307623877, + "grad_norm": 0.13300330936908722, + "learning_rate": 0.0008773350680745273, + "loss": 2.7967, + "step": 7944 + }, + { + "epoch": 0.23559588411469917, + "grad_norm": 0.13093765079975128, + "learning_rate": 0.0008773041960614063, + "loss": 2.8077, + "step": 7945 + }, + { + "epoch": 0.23562553746701065, + "grad_norm": 0.11362027376890182, + "learning_rate": 0.0008772733207071922, + "loss": 2.7712, + "step": 7946 + }, + { + "epoch": 0.23565519081932212, + "grad_norm": 0.12113136053085327, + "learning_rate": 0.0008772424420121579, + "loss": 2.7648, + "step": 7947 + }, + { + "epoch": 0.2356848441716336, + "grad_norm": 0.1210782453417778, + "learning_rate": 0.0008772115599765771, + "loss": 2.8248, + "step": 7948 + }, + { + "epoch": 0.23571449752394508, + "grad_norm": 0.12686507403850555, + "learning_rate": 0.0008771806746007231, + "loss": 2.7571, + "step": 7949 + }, + { + "epoch": 0.23574415087625655, + "grad_norm": 0.1852547824382782, + "learning_rate": 0.0008771497858848695, + "loss": 2.823, + "step": 7950 + }, + { + "epoch": 0.23577380422856803, + "grad_norm": 0.15019778907299042, + "learning_rate": 0.0008771188938292897, + "loss": 2.8186, + "step": 7951 + }, + { + "epoch": 0.2358034575808795, + "grad_norm": 0.153650164604187, + "learning_rate": 0.0008770879984342577, + "loss": 2.7967, + "step": 7952 + }, + { + "epoch": 0.235833110933191, + "grad_norm": 0.13761919736862183, + "learning_rate": 0.0008770570997000464, + "loss": 2.8164, + "step": 7953 + }, + { + "epoch": 0.23586276428550249, + "grad_norm": 0.1200401782989502, + "learning_rate": 0.0008770261976269301, + "loss": 2.799, + "step": 7954 + }, + { + "epoch": 0.23589241763781396, + "grad_norm": 0.1260775923728943, + "learning_rate": 0.0008769952922151818, + "loss": 2.8001, + "step": 7955 + }, + { + "epoch": 0.23592207099012544, + "grad_norm": 0.11907364428043365, + "learning_rate": 0.0008769643834650756, + "loss": 2.7939, + "step": 7956 + }, + { + "epoch": 0.2359517243424369, + "grad_norm": 0.1239737942814827, + "learning_rate": 0.0008769334713768851, + "loss": 2.7896, + "step": 7957 + }, + { + "epoch": 0.2359813776947484, + "grad_norm": 0.11754898726940155, + "learning_rate": 0.000876902555950884, + "loss": 2.7834, + "step": 7958 + }, + { + "epoch": 0.23601103104705987, + "grad_norm": 0.11119630187749863, + "learning_rate": 0.000876871637187346, + "loss": 2.779, + "step": 7959 + }, + { + "epoch": 0.23604068439937134, + "grad_norm": 0.1302194595336914, + "learning_rate": 0.0008768407150865449, + "loss": 2.8328, + "step": 7960 + }, + { + "epoch": 0.23607033775168282, + "grad_norm": 0.13036227226257324, + "learning_rate": 0.0008768097896487548, + "loss": 2.8052, + "step": 7961 + }, + { + "epoch": 0.2360999911039943, + "grad_norm": 0.12981756031513214, + "learning_rate": 0.0008767788608742493, + "loss": 2.7908, + "step": 7962 + }, + { + "epoch": 0.2361296444563058, + "grad_norm": 0.14563323557376862, + "learning_rate": 0.0008767479287633023, + "loss": 2.7913, + "step": 7963 + }, + { + "epoch": 0.23615929780861727, + "grad_norm": 0.17654454708099365, + "learning_rate": 0.0008767169933161876, + "loss": 2.7996, + "step": 7964 + }, + { + "epoch": 0.23618895116092875, + "grad_norm": 0.17079485952854156, + "learning_rate": 0.0008766860545331794, + "loss": 2.7898, + "step": 7965 + }, + { + "epoch": 0.23621860451324023, + "grad_norm": 0.1506548523902893, + "learning_rate": 0.0008766551124145515, + "loss": 2.7859, + "step": 7966 + }, + { + "epoch": 0.2362482578655517, + "grad_norm": 0.12532949447631836, + "learning_rate": 0.0008766241669605777, + "loss": 2.8347, + "step": 7967 + }, + { + "epoch": 0.23627791121786318, + "grad_norm": 0.15233029425144196, + "learning_rate": 0.0008765932181715325, + "loss": 2.8151, + "step": 7968 + }, + { + "epoch": 0.23630756457017466, + "grad_norm": 0.15466363728046417, + "learning_rate": 0.0008765622660476897, + "loss": 2.7816, + "step": 7969 + }, + { + "epoch": 0.23633721792248613, + "grad_norm": 0.1480768769979477, + "learning_rate": 0.0008765313105893233, + "loss": 2.8097, + "step": 7970 + }, + { + "epoch": 0.2363668712747976, + "grad_norm": 0.14584851264953613, + "learning_rate": 0.0008765003517967077, + "loss": 2.7715, + "step": 7971 + }, + { + "epoch": 0.23639652462710908, + "grad_norm": 0.15810610353946686, + "learning_rate": 0.0008764693896701165, + "loss": 2.8068, + "step": 7972 + }, + { + "epoch": 0.23642617797942056, + "grad_norm": 0.144727885723114, + "learning_rate": 0.0008764384242098247, + "loss": 2.7997, + "step": 7973 + }, + { + "epoch": 0.23645583133173206, + "grad_norm": 0.1411256045103073, + "learning_rate": 0.0008764074554161057, + "loss": 2.8169, + "step": 7974 + }, + { + "epoch": 0.23648548468404354, + "grad_norm": 0.1490587443113327, + "learning_rate": 0.0008763764832892343, + "loss": 2.8205, + "step": 7975 + }, + { + "epoch": 0.23651513803635502, + "grad_norm": 0.1532294601202011, + "learning_rate": 0.0008763455078294842, + "loss": 2.814, + "step": 7976 + }, + { + "epoch": 0.2365447913886665, + "grad_norm": 0.15761132538318634, + "learning_rate": 0.0008763145290371304, + "loss": 2.8045, + "step": 7977 + }, + { + "epoch": 0.23657444474097797, + "grad_norm": 0.183177649974823, + "learning_rate": 0.0008762835469124466, + "loss": 2.7968, + "step": 7978 + }, + { + "epoch": 0.23660409809328944, + "grad_norm": 0.16891124844551086, + "learning_rate": 0.0008762525614557076, + "loss": 2.801, + "step": 7979 + }, + { + "epoch": 0.23663375144560092, + "grad_norm": 0.15897846221923828, + "learning_rate": 0.0008762215726671874, + "loss": 2.7989, + "step": 7980 + }, + { + "epoch": 0.2366634047979124, + "grad_norm": 0.15851405262947083, + "learning_rate": 0.0008761905805471607, + "loss": 2.7862, + "step": 7981 + }, + { + "epoch": 0.23669305815022387, + "grad_norm": 0.15035757422447205, + "learning_rate": 0.0008761595850959019, + "loss": 2.7848, + "step": 7982 + }, + { + "epoch": 0.23672271150253535, + "grad_norm": 0.16162654757499695, + "learning_rate": 0.0008761285863136852, + "loss": 2.7685, + "step": 7983 + }, + { + "epoch": 0.23675236485484685, + "grad_norm": 0.1544663906097412, + "learning_rate": 0.0008760975842007855, + "loss": 2.8096, + "step": 7984 + }, + { + "epoch": 0.23678201820715833, + "grad_norm": 0.161558136343956, + "learning_rate": 0.000876066578757477, + "loss": 2.819, + "step": 7985 + }, + { + "epoch": 0.2368116715594698, + "grad_norm": 0.17442971467971802, + "learning_rate": 0.0008760355699840345, + "loss": 2.7864, + "step": 7986 + }, + { + "epoch": 0.23684132491178128, + "grad_norm": 0.16463680565357208, + "learning_rate": 0.0008760045578807324, + "loss": 2.7942, + "step": 7987 + }, + { + "epoch": 0.23687097826409276, + "grad_norm": 0.14855574071407318, + "learning_rate": 0.0008759735424478455, + "loss": 2.7899, + "step": 7988 + }, + { + "epoch": 0.23690063161640423, + "grad_norm": 0.14164622128009796, + "learning_rate": 0.0008759425236856482, + "loss": 2.8275, + "step": 7989 + }, + { + "epoch": 0.2369302849687157, + "grad_norm": 0.14713001251220703, + "learning_rate": 0.0008759115015944155, + "loss": 2.8358, + "step": 7990 + }, + { + "epoch": 0.23695993832102719, + "grad_norm": 0.1454373449087143, + "learning_rate": 0.0008758804761744218, + "loss": 2.8155, + "step": 7991 + }, + { + "epoch": 0.23698959167333866, + "grad_norm": 0.13043396174907684, + "learning_rate": 0.0008758494474259419, + "loss": 2.8215, + "step": 7992 + }, + { + "epoch": 0.23701924502565014, + "grad_norm": 0.13380083441734314, + "learning_rate": 0.0008758184153492508, + "loss": 2.7783, + "step": 7993 + }, + { + "epoch": 0.23704889837796164, + "grad_norm": 0.12724754214286804, + "learning_rate": 0.0008757873799446231, + "loss": 2.7577, + "step": 7994 + }, + { + "epoch": 0.23707855173027312, + "grad_norm": 0.1476719230413437, + "learning_rate": 0.0008757563412123336, + "loss": 2.7937, + "step": 7995 + }, + { + "epoch": 0.2371082050825846, + "grad_norm": 0.14125917851924896, + "learning_rate": 0.0008757252991526572, + "loss": 2.7944, + "step": 7996 + }, + { + "epoch": 0.23713785843489607, + "grad_norm": 0.13849030435085297, + "learning_rate": 0.0008756942537658688, + "loss": 2.7823, + "step": 7997 + }, + { + "epoch": 0.23716751178720755, + "grad_norm": 0.14019571244716644, + "learning_rate": 0.0008756632050522432, + "loss": 2.8119, + "step": 7998 + }, + { + "epoch": 0.23719716513951902, + "grad_norm": 0.11989409476518631, + "learning_rate": 0.0008756321530120556, + "loss": 2.7905, + "step": 7999 + }, + { + "epoch": 0.2372268184918305, + "grad_norm": 0.11576066166162491, + "learning_rate": 0.0008756010976455807, + "loss": 2.7993, + "step": 8000 + }, + { + "epoch": 0.23725647184414198, + "grad_norm": 0.11470810323953629, + "learning_rate": 0.0008755700389530936, + "loss": 2.7676, + "step": 8001 + }, + { + "epoch": 0.23728612519645345, + "grad_norm": 0.11617736518383026, + "learning_rate": 0.0008755389769348694, + "loss": 2.7993, + "step": 8002 + }, + { + "epoch": 0.23731577854876493, + "grad_norm": 0.11529573053121567, + "learning_rate": 0.000875507911591183, + "loss": 2.7697, + "step": 8003 + }, + { + "epoch": 0.2373454319010764, + "grad_norm": 0.1259763389825821, + "learning_rate": 0.0008754768429223098, + "loss": 2.8016, + "step": 8004 + }, + { + "epoch": 0.2373750852533879, + "grad_norm": 0.15286752581596375, + "learning_rate": 0.0008754457709285247, + "loss": 2.7837, + "step": 8005 + }, + { + "epoch": 0.23740473860569938, + "grad_norm": 0.17205575108528137, + "learning_rate": 0.0008754146956101025, + "loss": 2.7553, + "step": 8006 + }, + { + "epoch": 0.23743439195801086, + "grad_norm": 0.1688910722732544, + "learning_rate": 0.000875383616967319, + "loss": 2.8121, + "step": 8007 + }, + { + "epoch": 0.23746404531032234, + "grad_norm": 0.140954852104187, + "learning_rate": 0.0008753525350004492, + "loss": 2.777, + "step": 8008 + }, + { + "epoch": 0.2374936986626338, + "grad_norm": 0.12708528339862823, + "learning_rate": 0.0008753214497097681, + "loss": 2.7958, + "step": 8009 + }, + { + "epoch": 0.2375233520149453, + "grad_norm": 0.12565436959266663, + "learning_rate": 0.0008752903610955512, + "loss": 2.8036, + "step": 8010 + }, + { + "epoch": 0.23755300536725676, + "grad_norm": 0.14095810055732727, + "learning_rate": 0.0008752592691580738, + "loss": 2.8357, + "step": 8011 + }, + { + "epoch": 0.23758265871956824, + "grad_norm": 0.16118162870407104, + "learning_rate": 0.0008752281738976111, + "loss": 2.7701, + "step": 8012 + }, + { + "epoch": 0.23761231207187972, + "grad_norm": 0.16378651559352875, + "learning_rate": 0.0008751970753144385, + "loss": 2.8272, + "step": 8013 + }, + { + "epoch": 0.2376419654241912, + "grad_norm": 0.18810886144638062, + "learning_rate": 0.0008751659734088314, + "loss": 2.8111, + "step": 8014 + }, + { + "epoch": 0.2376716187765027, + "grad_norm": 0.19002053141593933, + "learning_rate": 0.0008751348681810651, + "loss": 2.8022, + "step": 8015 + }, + { + "epoch": 0.23770127212881417, + "grad_norm": 0.17145198583602905, + "learning_rate": 0.0008751037596314153, + "loss": 2.7984, + "step": 8016 + }, + { + "epoch": 0.23773092548112565, + "grad_norm": 0.12599314749240875, + "learning_rate": 0.0008750726477601574, + "loss": 2.788, + "step": 8017 + }, + { + "epoch": 0.23776057883343713, + "grad_norm": 0.1535862237215042, + "learning_rate": 0.0008750415325675667, + "loss": 2.8081, + "step": 8018 + }, + { + "epoch": 0.2377902321857486, + "grad_norm": 0.14841026067733765, + "learning_rate": 0.0008750104140539189, + "loss": 2.8311, + "step": 8019 + }, + { + "epoch": 0.23781988553806008, + "grad_norm": 0.1418352872133255, + "learning_rate": 0.0008749792922194895, + "loss": 2.8509, + "step": 8020 + }, + { + "epoch": 0.23784953889037155, + "grad_norm": 0.1499103158712387, + "learning_rate": 0.0008749481670645541, + "loss": 2.7987, + "step": 8021 + }, + { + "epoch": 0.23787919224268303, + "grad_norm": 0.15096737444400787, + "learning_rate": 0.0008749170385893883, + "loss": 2.7912, + "step": 8022 + }, + { + "epoch": 0.2379088455949945, + "grad_norm": 0.15003393590450287, + "learning_rate": 0.0008748859067942678, + "loss": 2.8023, + "step": 8023 + }, + { + "epoch": 0.23793849894730598, + "grad_norm": 0.15406952798366547, + "learning_rate": 0.0008748547716794682, + "loss": 2.8398, + "step": 8024 + }, + { + "epoch": 0.23796815229961746, + "grad_norm": 0.13950523734092712, + "learning_rate": 0.0008748236332452653, + "loss": 2.7889, + "step": 8025 + }, + { + "epoch": 0.23799780565192896, + "grad_norm": 0.1289445012807846, + "learning_rate": 0.0008747924914919347, + "loss": 2.7702, + "step": 8026 + }, + { + "epoch": 0.23802745900424044, + "grad_norm": 0.15058982372283936, + "learning_rate": 0.0008747613464197523, + "loss": 2.8001, + "step": 8027 + }, + { + "epoch": 0.23805711235655191, + "grad_norm": 0.13906267285346985, + "learning_rate": 0.0008747301980289939, + "loss": 2.803, + "step": 8028 + }, + { + "epoch": 0.2380867657088634, + "grad_norm": 0.15167894959449768, + "learning_rate": 0.0008746990463199352, + "loss": 2.8043, + "step": 8029 + }, + { + "epoch": 0.23811641906117487, + "grad_norm": 0.1628732979297638, + "learning_rate": 0.0008746678912928523, + "loss": 2.8217, + "step": 8030 + }, + { + "epoch": 0.23814607241348634, + "grad_norm": 0.16633492708206177, + "learning_rate": 0.0008746367329480207, + "loss": 2.7802, + "step": 8031 + }, + { + "epoch": 0.23817572576579782, + "grad_norm": 0.15415607392787933, + "learning_rate": 0.0008746055712857166, + "loss": 2.7832, + "step": 8032 + }, + { + "epoch": 0.2382053791181093, + "grad_norm": 0.13627251982688904, + "learning_rate": 0.000874574406306216, + "loss": 2.8186, + "step": 8033 + }, + { + "epoch": 0.23823503247042077, + "grad_norm": 0.14181095361709595, + "learning_rate": 0.0008745432380097946, + "loss": 2.7799, + "step": 8034 + }, + { + "epoch": 0.23826468582273225, + "grad_norm": 0.1480764001607895, + "learning_rate": 0.0008745120663967285, + "loss": 2.8121, + "step": 8035 + }, + { + "epoch": 0.23829433917504375, + "grad_norm": 0.13853153586387634, + "learning_rate": 0.0008744808914672939, + "loss": 2.7512, + "step": 8036 + }, + { + "epoch": 0.23832399252735523, + "grad_norm": 0.11521206796169281, + "learning_rate": 0.0008744497132217666, + "loss": 2.78, + "step": 8037 + }, + { + "epoch": 0.2383536458796667, + "grad_norm": 0.13354220986366272, + "learning_rate": 0.000874418531660423, + "loss": 2.7733, + "step": 8038 + }, + { + "epoch": 0.23838329923197818, + "grad_norm": 0.13837479054927826, + "learning_rate": 0.0008743873467835389, + "loss": 2.7763, + "step": 8039 + }, + { + "epoch": 0.23841295258428966, + "grad_norm": 0.14808520674705505, + "learning_rate": 0.0008743561585913904, + "loss": 2.7743, + "step": 8040 + }, + { + "epoch": 0.23844260593660113, + "grad_norm": 0.13257093727588654, + "learning_rate": 0.0008743249670842541, + "loss": 2.8268, + "step": 8041 + }, + { + "epoch": 0.2384722592889126, + "grad_norm": 0.14792552590370178, + "learning_rate": 0.0008742937722624059, + "loss": 2.801, + "step": 8042 + }, + { + "epoch": 0.23850191264122408, + "grad_norm": 0.16130106151103973, + "learning_rate": 0.0008742625741261221, + "loss": 2.7984, + "step": 8043 + }, + { + "epoch": 0.23853156599353556, + "grad_norm": 0.15843841433525085, + "learning_rate": 0.000874231372675679, + "loss": 2.7493, + "step": 8044 + }, + { + "epoch": 0.23856121934584704, + "grad_norm": 0.13454864919185638, + "learning_rate": 0.0008742001679113528, + "loss": 2.7949, + "step": 8045 + }, + { + "epoch": 0.23859087269815854, + "grad_norm": 0.1544310599565506, + "learning_rate": 0.0008741689598334199, + "loss": 2.7724, + "step": 8046 + }, + { + "epoch": 0.23862052605047002, + "grad_norm": 0.15806624293327332, + "learning_rate": 0.0008741377484421566, + "loss": 2.8129, + "step": 8047 + }, + { + "epoch": 0.2386501794027815, + "grad_norm": 0.15742619335651398, + "learning_rate": 0.0008741065337378394, + "loss": 2.8091, + "step": 8048 + }, + { + "epoch": 0.23867983275509297, + "grad_norm": 0.12617789208889008, + "learning_rate": 0.0008740753157207446, + "loss": 2.7943, + "step": 8049 + }, + { + "epoch": 0.23870948610740444, + "grad_norm": 0.13056409358978271, + "learning_rate": 0.0008740440943911487, + "loss": 2.7883, + "step": 8050 + }, + { + "epoch": 0.23873913945971592, + "grad_norm": 0.1311434805393219, + "learning_rate": 0.0008740128697493282, + "loss": 2.7934, + "step": 8051 + }, + { + "epoch": 0.2387687928120274, + "grad_norm": 0.13378848135471344, + "learning_rate": 0.0008739816417955594, + "loss": 2.7614, + "step": 8052 + }, + { + "epoch": 0.23879844616433887, + "grad_norm": 0.12425190210342407, + "learning_rate": 0.000873950410530119, + "loss": 2.762, + "step": 8053 + }, + { + "epoch": 0.23882809951665035, + "grad_norm": 0.13610218465328217, + "learning_rate": 0.0008739191759532835, + "loss": 2.7975, + "step": 8054 + }, + { + "epoch": 0.23885775286896183, + "grad_norm": 0.14374122023582458, + "learning_rate": 0.0008738879380653296, + "loss": 2.802, + "step": 8055 + }, + { + "epoch": 0.2388874062212733, + "grad_norm": 0.1362280249595642, + "learning_rate": 0.0008738566968665338, + "loss": 2.8082, + "step": 8056 + }, + { + "epoch": 0.2389170595735848, + "grad_norm": 0.13807523250579834, + "learning_rate": 0.0008738254523571727, + "loss": 2.8012, + "step": 8057 + }, + { + "epoch": 0.23894671292589628, + "grad_norm": 0.14228418469429016, + "learning_rate": 0.0008737942045375231, + "loss": 2.7556, + "step": 8058 + }, + { + "epoch": 0.23897636627820776, + "grad_norm": 0.1311016082763672, + "learning_rate": 0.0008737629534078617, + "loss": 2.8145, + "step": 8059 + }, + { + "epoch": 0.23900601963051923, + "grad_norm": 0.13600602746009827, + "learning_rate": 0.0008737316989684651, + "loss": 2.791, + "step": 8060 + }, + { + "epoch": 0.2390356729828307, + "grad_norm": 0.14884833991527557, + "learning_rate": 0.0008737004412196104, + "loss": 2.7504, + "step": 8061 + }, + { + "epoch": 0.2390653263351422, + "grad_norm": 0.16054239869117737, + "learning_rate": 0.0008736691801615739, + "loss": 2.7837, + "step": 8062 + }, + { + "epoch": 0.23909497968745366, + "grad_norm": 0.13199059665203094, + "learning_rate": 0.0008736379157946329, + "loss": 2.8045, + "step": 8063 + }, + { + "epoch": 0.23912463303976514, + "grad_norm": 0.11543899029493332, + "learning_rate": 0.0008736066481190637, + "loss": 2.7985, + "step": 8064 + }, + { + "epoch": 0.23915428639207661, + "grad_norm": 0.1280985325574875, + "learning_rate": 0.0008735753771351437, + "loss": 2.7761, + "step": 8065 + }, + { + "epoch": 0.2391839397443881, + "grad_norm": 0.11924076825380325, + "learning_rate": 0.0008735441028431497, + "loss": 2.7906, + "step": 8066 + }, + { + "epoch": 0.2392135930966996, + "grad_norm": 0.12463124096393585, + "learning_rate": 0.0008735128252433582, + "loss": 2.7793, + "step": 8067 + }, + { + "epoch": 0.23924324644901107, + "grad_norm": 0.12710975110530853, + "learning_rate": 0.0008734815443360469, + "loss": 2.8196, + "step": 8068 + }, + { + "epoch": 0.23927289980132255, + "grad_norm": 0.14285483956336975, + "learning_rate": 0.0008734502601214922, + "loss": 2.7966, + "step": 8069 + }, + { + "epoch": 0.23930255315363402, + "grad_norm": 0.17569151520729065, + "learning_rate": 0.0008734189725999714, + "loss": 2.7559, + "step": 8070 + }, + { + "epoch": 0.2393322065059455, + "grad_norm": 0.19871357083320618, + "learning_rate": 0.0008733876817717615, + "loss": 2.8136, + "step": 8071 + }, + { + "epoch": 0.23936185985825698, + "grad_norm": 0.16534796357154846, + "learning_rate": 0.0008733563876371397, + "loss": 2.8097, + "step": 8072 + }, + { + "epoch": 0.23939151321056845, + "grad_norm": 0.14420466125011444, + "learning_rate": 0.0008733250901963827, + "loss": 2.8234, + "step": 8073 + }, + { + "epoch": 0.23942116656287993, + "grad_norm": 0.1530352085828781, + "learning_rate": 0.0008732937894497684, + "loss": 2.7921, + "step": 8074 + }, + { + "epoch": 0.2394508199151914, + "grad_norm": 0.16238513588905334, + "learning_rate": 0.0008732624853975731, + "loss": 2.8272, + "step": 8075 + }, + { + "epoch": 0.23948047326750288, + "grad_norm": 0.15468372404575348, + "learning_rate": 0.0008732311780400746, + "loss": 2.836, + "step": 8076 + }, + { + "epoch": 0.23951012661981436, + "grad_norm": 0.1684509515762329, + "learning_rate": 0.00087319986737755, + "loss": 2.7906, + "step": 8077 + }, + { + "epoch": 0.23953977997212586, + "grad_norm": 0.17988678812980652, + "learning_rate": 0.0008731685534102765, + "loss": 2.804, + "step": 8078 + }, + { + "epoch": 0.23956943332443734, + "grad_norm": 0.1794554591178894, + "learning_rate": 0.0008731372361385312, + "loss": 2.8128, + "step": 8079 + }, + { + "epoch": 0.2395990866767488, + "grad_norm": 0.18985292315483093, + "learning_rate": 0.0008731059155625919, + "loss": 2.818, + "step": 8080 + }, + { + "epoch": 0.2396287400290603, + "grad_norm": 0.1811877191066742, + "learning_rate": 0.0008730745916827356, + "loss": 2.7699, + "step": 8081 + }, + { + "epoch": 0.23965839338137176, + "grad_norm": 0.17036347091197968, + "learning_rate": 0.0008730432644992397, + "loss": 2.7938, + "step": 8082 + }, + { + "epoch": 0.23968804673368324, + "grad_norm": 0.1683531403541565, + "learning_rate": 0.0008730119340123817, + "loss": 2.7988, + "step": 8083 + }, + { + "epoch": 0.23971770008599472, + "grad_norm": 0.13803048431873322, + "learning_rate": 0.000872980600222439, + "loss": 2.7562, + "step": 8084 + }, + { + "epoch": 0.2397473534383062, + "grad_norm": 0.143435999751091, + "learning_rate": 0.000872949263129689, + "loss": 2.7936, + "step": 8085 + }, + { + "epoch": 0.23977700679061767, + "grad_norm": 0.14737248420715332, + "learning_rate": 0.0008729179227344092, + "loss": 2.783, + "step": 8086 + }, + { + "epoch": 0.23980666014292915, + "grad_norm": 0.1470089703798294, + "learning_rate": 0.0008728865790368774, + "loss": 2.7846, + "step": 8087 + }, + { + "epoch": 0.23983631349524065, + "grad_norm": 0.13256610929965973, + "learning_rate": 0.0008728552320373708, + "loss": 2.8031, + "step": 8088 + }, + { + "epoch": 0.23986596684755213, + "grad_norm": 0.12882396578788757, + "learning_rate": 0.0008728238817361672, + "loss": 2.7861, + "step": 8089 + }, + { + "epoch": 0.2398956201998636, + "grad_norm": 0.1355191171169281, + "learning_rate": 0.000872792528133544, + "loss": 2.8165, + "step": 8090 + }, + { + "epoch": 0.23992527355217508, + "grad_norm": 0.1143929660320282, + "learning_rate": 0.0008727611712297791, + "loss": 2.8187, + "step": 8091 + }, + { + "epoch": 0.23995492690448655, + "grad_norm": 0.11365598440170288, + "learning_rate": 0.00087272981102515, + "loss": 2.7804, + "step": 8092 + }, + { + "epoch": 0.23998458025679803, + "grad_norm": 0.1401781290769577, + "learning_rate": 0.0008726984475199344, + "loss": 2.7916, + "step": 8093 + }, + { + "epoch": 0.2400142336091095, + "grad_norm": 0.14384901523590088, + "learning_rate": 0.0008726670807144101, + "loss": 2.806, + "step": 8094 + }, + { + "epoch": 0.24004388696142098, + "grad_norm": 0.16752131283283234, + "learning_rate": 0.0008726357106088548, + "loss": 2.7761, + "step": 8095 + }, + { + "epoch": 0.24007354031373246, + "grad_norm": 0.19677986204624176, + "learning_rate": 0.0008726043372035464, + "loss": 2.806, + "step": 8096 + }, + { + "epoch": 0.24010319366604393, + "grad_norm": 0.197696715593338, + "learning_rate": 0.0008725729604987626, + "loss": 2.7987, + "step": 8097 + }, + { + "epoch": 0.24013284701835544, + "grad_norm": 0.15884345769882202, + "learning_rate": 0.0008725415804947813, + "loss": 2.7944, + "step": 8098 + }, + { + "epoch": 0.24016250037066691, + "grad_norm": 0.1678711622953415, + "learning_rate": 0.0008725101971918803, + "loss": 2.7674, + "step": 8099 + }, + { + "epoch": 0.2401921537229784, + "grad_norm": 0.14729043841362, + "learning_rate": 0.0008724788105903376, + "loss": 2.7588, + "step": 8100 + }, + { + "epoch": 0.24022180707528987, + "grad_norm": 0.13850565254688263, + "learning_rate": 0.0008724474206904311, + "loss": 2.7995, + "step": 8101 + }, + { + "epoch": 0.24025146042760134, + "grad_norm": 0.15620380640029907, + "learning_rate": 0.0008724160274924389, + "loss": 2.8006, + "step": 8102 + }, + { + "epoch": 0.24028111377991282, + "grad_norm": 0.1534796804189682, + "learning_rate": 0.0008723846309966385, + "loss": 2.7747, + "step": 8103 + }, + { + "epoch": 0.2403107671322243, + "grad_norm": 0.15316346287727356, + "learning_rate": 0.0008723532312033086, + "loss": 2.7701, + "step": 8104 + }, + { + "epoch": 0.24034042048453577, + "grad_norm": 0.1395414024591446, + "learning_rate": 0.0008723218281127268, + "loss": 2.7512, + "step": 8105 + }, + { + "epoch": 0.24037007383684725, + "grad_norm": 0.14220955967903137, + "learning_rate": 0.0008722904217251713, + "loss": 2.8146, + "step": 8106 + }, + { + "epoch": 0.24039972718915872, + "grad_norm": 0.12994906306266785, + "learning_rate": 0.0008722590120409204, + "loss": 2.8296, + "step": 8107 + }, + { + "epoch": 0.2404293805414702, + "grad_norm": 0.12001664936542511, + "learning_rate": 0.0008722275990602518, + "loss": 2.7941, + "step": 8108 + }, + { + "epoch": 0.2404590338937817, + "grad_norm": 0.12093400210142136, + "learning_rate": 0.000872196182783444, + "loss": 2.7869, + "step": 8109 + }, + { + "epoch": 0.24048868724609318, + "grad_norm": 0.12145961076021194, + "learning_rate": 0.0008721647632107751, + "loss": 2.7662, + "step": 8110 + }, + { + "epoch": 0.24051834059840466, + "grad_norm": 0.12068179249763489, + "learning_rate": 0.0008721333403425233, + "loss": 2.8086, + "step": 8111 + }, + { + "epoch": 0.24054799395071613, + "grad_norm": 0.1304541975259781, + "learning_rate": 0.000872101914178967, + "loss": 2.7877, + "step": 8112 + }, + { + "epoch": 0.2405776473030276, + "grad_norm": 0.1308797150850296, + "learning_rate": 0.0008720704847203845, + "loss": 2.7747, + "step": 8113 + }, + { + "epoch": 0.24060730065533908, + "grad_norm": 0.13392408192157745, + "learning_rate": 0.0008720390519670537, + "loss": 2.801, + "step": 8114 + }, + { + "epoch": 0.24063695400765056, + "grad_norm": 0.1308574378490448, + "learning_rate": 0.0008720076159192534, + "loss": 2.7587, + "step": 8115 + }, + { + "epoch": 0.24066660735996204, + "grad_norm": 0.13365133106708527, + "learning_rate": 0.0008719761765772617, + "loss": 2.8231, + "step": 8116 + }, + { + "epoch": 0.2406962607122735, + "grad_norm": 0.1517636924982071, + "learning_rate": 0.0008719447339413571, + "loss": 2.805, + "step": 8117 + }, + { + "epoch": 0.240725914064585, + "grad_norm": 0.1408899575471878, + "learning_rate": 0.0008719132880118182, + "loss": 2.813, + "step": 8118 + }, + { + "epoch": 0.2407555674168965, + "grad_norm": 0.13079571723937988, + "learning_rate": 0.0008718818387889231, + "loss": 2.7803, + "step": 8119 + }, + { + "epoch": 0.24078522076920797, + "grad_norm": 0.1256406009197235, + "learning_rate": 0.0008718503862729508, + "loss": 2.7809, + "step": 8120 + }, + { + "epoch": 0.24081487412151945, + "grad_norm": 0.11738388240337372, + "learning_rate": 0.0008718189304641792, + "loss": 2.8092, + "step": 8121 + }, + { + "epoch": 0.24084452747383092, + "grad_norm": 0.1296347677707672, + "learning_rate": 0.0008717874713628873, + "loss": 2.825, + "step": 8122 + }, + { + "epoch": 0.2408741808261424, + "grad_norm": 0.13108320534229279, + "learning_rate": 0.0008717560089693535, + "loss": 2.8131, + "step": 8123 + }, + { + "epoch": 0.24090383417845387, + "grad_norm": 0.12876957654953003, + "learning_rate": 0.0008717245432838563, + "loss": 2.7751, + "step": 8124 + }, + { + "epoch": 0.24093348753076535, + "grad_norm": 0.12790411710739136, + "learning_rate": 0.0008716930743066746, + "loss": 2.7984, + "step": 8125 + }, + { + "epoch": 0.24096314088307683, + "grad_norm": 0.15578174591064453, + "learning_rate": 0.0008716616020380868, + "loss": 2.7557, + "step": 8126 + }, + { + "epoch": 0.2409927942353883, + "grad_norm": 0.16906499862670898, + "learning_rate": 0.0008716301264783719, + "loss": 2.7798, + "step": 8127 + }, + { + "epoch": 0.24102244758769978, + "grad_norm": 0.14081256091594696, + "learning_rate": 0.0008715986476278084, + "loss": 2.7877, + "step": 8128 + }, + { + "epoch": 0.24105210094001125, + "grad_norm": 0.1245531514286995, + "learning_rate": 0.000871567165486675, + "loss": 2.7987, + "step": 8129 + }, + { + "epoch": 0.24108175429232276, + "grad_norm": 0.1372310370206833, + "learning_rate": 0.0008715356800552505, + "loss": 2.794, + "step": 8130 + }, + { + "epoch": 0.24111140764463423, + "grad_norm": 0.16376501321792603, + "learning_rate": 0.000871504191333814, + "loss": 2.8031, + "step": 8131 + }, + { + "epoch": 0.2411410609969457, + "grad_norm": 0.16213133931159973, + "learning_rate": 0.0008714726993226439, + "loss": 2.7945, + "step": 8132 + }, + { + "epoch": 0.2411707143492572, + "grad_norm": 0.1929619461297989, + "learning_rate": 0.0008714412040220195, + "loss": 2.7577, + "step": 8133 + }, + { + "epoch": 0.24120036770156866, + "grad_norm": 0.22717976570129395, + "learning_rate": 0.0008714097054322194, + "loss": 2.7742, + "step": 8134 + }, + { + "epoch": 0.24123002105388014, + "grad_norm": 0.1962900012731552, + "learning_rate": 0.0008713782035535225, + "loss": 2.8173, + "step": 8135 + }, + { + "epoch": 0.24125967440619162, + "grad_norm": 0.17567962408065796, + "learning_rate": 0.000871346698386208, + "loss": 2.7848, + "step": 8136 + }, + { + "epoch": 0.2412893277585031, + "grad_norm": 0.18140871822834015, + "learning_rate": 0.0008713151899305547, + "loss": 2.81, + "step": 8137 + }, + { + "epoch": 0.24131898111081457, + "grad_norm": 0.17217160761356354, + "learning_rate": 0.0008712836781868416, + "loss": 2.8069, + "step": 8138 + }, + { + "epoch": 0.24134863446312604, + "grad_norm": 0.1475120633840561, + "learning_rate": 0.0008712521631553478, + "loss": 2.8257, + "step": 8139 + }, + { + "epoch": 0.24137828781543755, + "grad_norm": 0.12497827410697937, + "learning_rate": 0.0008712206448363524, + "loss": 2.8048, + "step": 8140 + }, + { + "epoch": 0.24140794116774902, + "grad_norm": 0.1446578949689865, + "learning_rate": 0.0008711891232301345, + "loss": 2.809, + "step": 8141 + }, + { + "epoch": 0.2414375945200605, + "grad_norm": 0.1557302474975586, + "learning_rate": 0.0008711575983369733, + "loss": 2.7856, + "step": 8142 + }, + { + "epoch": 0.24146724787237198, + "grad_norm": 0.14951835572719574, + "learning_rate": 0.0008711260701571477, + "loss": 2.7897, + "step": 8143 + }, + { + "epoch": 0.24149690122468345, + "grad_norm": 0.14605584740638733, + "learning_rate": 0.000871094538690937, + "loss": 2.7769, + "step": 8144 + }, + { + "epoch": 0.24152655457699493, + "grad_norm": 0.1278938204050064, + "learning_rate": 0.0008710630039386207, + "loss": 2.8103, + "step": 8145 + }, + { + "epoch": 0.2415562079293064, + "grad_norm": 0.1322643905878067, + "learning_rate": 0.0008710314659004777, + "loss": 2.8039, + "step": 8146 + }, + { + "epoch": 0.24158586128161788, + "grad_norm": 0.12003028392791748, + "learning_rate": 0.0008709999245767872, + "loss": 2.7695, + "step": 8147 + }, + { + "epoch": 0.24161551463392936, + "grad_norm": 0.11236822605133057, + "learning_rate": 0.0008709683799678289, + "loss": 2.8018, + "step": 8148 + }, + { + "epoch": 0.24164516798624083, + "grad_norm": 0.12870045006275177, + "learning_rate": 0.0008709368320738818, + "loss": 2.8216, + "step": 8149 + }, + { + "epoch": 0.24167482133855234, + "grad_norm": 0.14589302241802216, + "learning_rate": 0.0008709052808952254, + "loss": 2.791, + "step": 8150 + }, + { + "epoch": 0.2417044746908638, + "grad_norm": 0.16296444833278656, + "learning_rate": 0.0008708737264321391, + "loss": 2.782, + "step": 8151 + }, + { + "epoch": 0.2417341280431753, + "grad_norm": 0.14292696118354797, + "learning_rate": 0.0008708421686849025, + "loss": 2.7579, + "step": 8152 + }, + { + "epoch": 0.24176378139548677, + "grad_norm": 0.14101144671440125, + "learning_rate": 0.0008708106076537945, + "loss": 2.7894, + "step": 8153 + }, + { + "epoch": 0.24179343474779824, + "grad_norm": 0.15883110463619232, + "learning_rate": 0.0008707790433390949, + "loss": 2.7973, + "step": 8154 + }, + { + "epoch": 0.24182308810010972, + "grad_norm": 0.1480601578950882, + "learning_rate": 0.0008707474757410835, + "loss": 2.8044, + "step": 8155 + }, + { + "epoch": 0.2418527414524212, + "grad_norm": 0.11223137378692627, + "learning_rate": 0.0008707159048600395, + "loss": 2.8145, + "step": 8156 + }, + { + "epoch": 0.24188239480473267, + "grad_norm": 0.1338323950767517, + "learning_rate": 0.0008706843306962425, + "loss": 2.805, + "step": 8157 + }, + { + "epoch": 0.24191204815704415, + "grad_norm": 0.14003542065620422, + "learning_rate": 0.000870652753249972, + "loss": 2.7582, + "step": 8158 + }, + { + "epoch": 0.24194170150935562, + "grad_norm": 0.1571861207485199, + "learning_rate": 0.0008706211725215078, + "loss": 2.8113, + "step": 8159 + }, + { + "epoch": 0.2419713548616671, + "grad_norm": 0.14434003829956055, + "learning_rate": 0.0008705895885111296, + "loss": 2.8127, + "step": 8160 + }, + { + "epoch": 0.2420010082139786, + "grad_norm": 0.136665478348732, + "learning_rate": 0.0008705580012191169, + "loss": 2.777, + "step": 8161 + }, + { + "epoch": 0.24203066156629008, + "grad_norm": 0.12962447106838226, + "learning_rate": 0.0008705264106457497, + "loss": 2.7855, + "step": 8162 + }, + { + "epoch": 0.24206031491860155, + "grad_norm": 0.12507511675357819, + "learning_rate": 0.0008704948167913074, + "loss": 2.7735, + "step": 8163 + }, + { + "epoch": 0.24208996827091303, + "grad_norm": 0.13578781485557556, + "learning_rate": 0.0008704632196560697, + "loss": 2.8243, + "step": 8164 + }, + { + "epoch": 0.2421196216232245, + "grad_norm": 0.11928360164165497, + "learning_rate": 0.0008704316192403168, + "loss": 2.754, + "step": 8165 + }, + { + "epoch": 0.24214927497553598, + "grad_norm": 0.11450424790382385, + "learning_rate": 0.0008704000155443283, + "loss": 2.8072, + "step": 8166 + }, + { + "epoch": 0.24217892832784746, + "grad_norm": 0.11065659672021866, + "learning_rate": 0.000870368408568384, + "loss": 2.7729, + "step": 8167 + }, + { + "epoch": 0.24220858168015894, + "grad_norm": 0.12485508620738983, + "learning_rate": 0.0008703367983127642, + "loss": 2.7966, + "step": 8168 + }, + { + "epoch": 0.2422382350324704, + "grad_norm": 0.16427570581436157, + "learning_rate": 0.0008703051847777482, + "loss": 2.7872, + "step": 8169 + }, + { + "epoch": 0.2422678883847819, + "grad_norm": 0.15969908237457275, + "learning_rate": 0.0008702735679636162, + "loss": 2.7814, + "step": 8170 + }, + { + "epoch": 0.2422975417370934, + "grad_norm": 0.161490336060524, + "learning_rate": 0.0008702419478706483, + "loss": 2.8484, + "step": 8171 + }, + { + "epoch": 0.24232719508940487, + "grad_norm": 0.15238922834396362, + "learning_rate": 0.0008702103244991242, + "loss": 2.8265, + "step": 8172 + }, + { + "epoch": 0.24235684844171634, + "grad_norm": 0.15656408667564392, + "learning_rate": 0.0008701786978493243, + "loss": 2.7857, + "step": 8173 + }, + { + "epoch": 0.24238650179402782, + "grad_norm": 0.15508605539798737, + "learning_rate": 0.0008701470679215286, + "loss": 2.7658, + "step": 8174 + }, + { + "epoch": 0.2424161551463393, + "grad_norm": 0.14197978377342224, + "learning_rate": 0.000870115434716017, + "loss": 2.732, + "step": 8175 + }, + { + "epoch": 0.24244580849865077, + "grad_norm": 0.14034263789653778, + "learning_rate": 0.0008700837982330696, + "loss": 2.8012, + "step": 8176 + }, + { + "epoch": 0.24247546185096225, + "grad_norm": 0.16324834525585175, + "learning_rate": 0.0008700521584729667, + "loss": 2.8182, + "step": 8177 + }, + { + "epoch": 0.24250511520327372, + "grad_norm": 0.17616020143032074, + "learning_rate": 0.0008700205154359884, + "loss": 2.8078, + "step": 8178 + }, + { + "epoch": 0.2425347685555852, + "grad_norm": 0.18278558552265167, + "learning_rate": 0.0008699888691224149, + "loss": 2.7662, + "step": 8179 + }, + { + "epoch": 0.24256442190789668, + "grad_norm": 0.17199431359767914, + "learning_rate": 0.0008699572195325265, + "loss": 2.7899, + "step": 8180 + }, + { + "epoch": 0.24259407526020815, + "grad_norm": 0.1454460769891739, + "learning_rate": 0.0008699255666666035, + "loss": 2.8135, + "step": 8181 + }, + { + "epoch": 0.24262372861251966, + "grad_norm": 0.12101629376411438, + "learning_rate": 0.0008698939105249259, + "loss": 2.7733, + "step": 8182 + }, + { + "epoch": 0.24265338196483113, + "grad_norm": 0.13980132341384888, + "learning_rate": 0.0008698622511077744, + "loss": 2.7881, + "step": 8183 + }, + { + "epoch": 0.2426830353171426, + "grad_norm": 0.12490391731262207, + "learning_rate": 0.0008698305884154292, + "loss": 2.7601, + "step": 8184 + }, + { + "epoch": 0.24271268866945409, + "grad_norm": 0.13657264411449432, + "learning_rate": 0.0008697989224481706, + "loss": 2.7941, + "step": 8185 + }, + { + "epoch": 0.24274234202176556, + "grad_norm": 0.1369922012090683, + "learning_rate": 0.000869767253206279, + "loss": 2.7942, + "step": 8186 + }, + { + "epoch": 0.24277199537407704, + "grad_norm": 0.12189114838838577, + "learning_rate": 0.0008697355806900349, + "loss": 2.7683, + "step": 8187 + }, + { + "epoch": 0.2428016487263885, + "grad_norm": 0.127400740981102, + "learning_rate": 0.0008697039048997188, + "loss": 2.8175, + "step": 8188 + }, + { + "epoch": 0.2428313020787, + "grad_norm": 0.14820732176303864, + "learning_rate": 0.0008696722258356113, + "loss": 2.8037, + "step": 8189 + }, + { + "epoch": 0.24286095543101147, + "grad_norm": 0.1505413055419922, + "learning_rate": 0.0008696405434979926, + "loss": 2.7891, + "step": 8190 + }, + { + "epoch": 0.24289060878332294, + "grad_norm": 0.14515282213687897, + "learning_rate": 0.0008696088578871436, + "loss": 2.7908, + "step": 8191 + }, + { + "epoch": 0.24292026213563445, + "grad_norm": 0.12615510821342468, + "learning_rate": 0.0008695771690033447, + "loss": 2.8017, + "step": 8192 + }, + { + "epoch": 0.24294991548794592, + "grad_norm": 0.1121220588684082, + "learning_rate": 0.0008695454768468764, + "loss": 2.777, + "step": 8193 + }, + { + "epoch": 0.2429795688402574, + "grad_norm": 0.13229487836360931, + "learning_rate": 0.0008695137814180196, + "loss": 2.7801, + "step": 8194 + }, + { + "epoch": 0.24300922219256887, + "grad_norm": 0.14873138070106506, + "learning_rate": 0.0008694820827170548, + "loss": 2.8097, + "step": 8195 + }, + { + "epoch": 0.24303887554488035, + "grad_norm": 0.1546662449836731, + "learning_rate": 0.0008694503807442626, + "loss": 2.7613, + "step": 8196 + }, + { + "epoch": 0.24306852889719183, + "grad_norm": 0.15857376158237457, + "learning_rate": 0.0008694186754999241, + "loss": 2.7666, + "step": 8197 + }, + { + "epoch": 0.2430981822495033, + "grad_norm": 0.14979049563407898, + "learning_rate": 0.0008693869669843198, + "loss": 2.799, + "step": 8198 + }, + { + "epoch": 0.24312783560181478, + "grad_norm": 0.15790657699108124, + "learning_rate": 0.0008693552551977302, + "loss": 2.7585, + "step": 8199 + }, + { + "epoch": 0.24315748895412626, + "grad_norm": 0.16167309880256653, + "learning_rate": 0.0008693235401404367, + "loss": 2.7908, + "step": 8200 + }, + { + "epoch": 0.24318714230643773, + "grad_norm": 0.16517822444438934, + "learning_rate": 0.0008692918218127197, + "loss": 2.792, + "step": 8201 + }, + { + "epoch": 0.24321679565874924, + "grad_norm": 0.2090936154127121, + "learning_rate": 0.0008692601002148603, + "loss": 2.7764, + "step": 8202 + }, + { + "epoch": 0.2432464490110607, + "grad_norm": 0.16101451218128204, + "learning_rate": 0.0008692283753471394, + "loss": 2.7939, + "step": 8203 + }, + { + "epoch": 0.2432761023633722, + "grad_norm": 0.14507928490638733, + "learning_rate": 0.0008691966472098378, + "loss": 2.8072, + "step": 8204 + }, + { + "epoch": 0.24330575571568366, + "grad_norm": 0.15019141137599945, + "learning_rate": 0.0008691649158032365, + "loss": 2.7961, + "step": 8205 + }, + { + "epoch": 0.24333540906799514, + "grad_norm": 0.15729741752147675, + "learning_rate": 0.0008691331811276165, + "loss": 2.7943, + "step": 8206 + }, + { + "epoch": 0.24336506242030662, + "grad_norm": 0.15158405900001526, + "learning_rate": 0.0008691014431832589, + "loss": 2.7616, + "step": 8207 + }, + { + "epoch": 0.2433947157726181, + "grad_norm": 0.14002975821495056, + "learning_rate": 0.0008690697019704445, + "loss": 2.7486, + "step": 8208 + }, + { + "epoch": 0.24342436912492957, + "grad_norm": 0.1399354189634323, + "learning_rate": 0.0008690379574894547, + "loss": 2.8033, + "step": 8209 + }, + { + "epoch": 0.24345402247724104, + "grad_norm": 0.1313304305076599, + "learning_rate": 0.0008690062097405705, + "loss": 2.7779, + "step": 8210 + }, + { + "epoch": 0.24348367582955252, + "grad_norm": 0.12413600832223892, + "learning_rate": 0.0008689744587240728, + "loss": 2.7949, + "step": 8211 + }, + { + "epoch": 0.243513329181864, + "grad_norm": 0.12354089319705963, + "learning_rate": 0.0008689427044402429, + "loss": 2.7357, + "step": 8212 + }, + { + "epoch": 0.2435429825341755, + "grad_norm": 0.11918431520462036, + "learning_rate": 0.0008689109468893622, + "loss": 2.7938, + "step": 8213 + }, + { + "epoch": 0.24357263588648698, + "grad_norm": 0.12036676704883575, + "learning_rate": 0.0008688791860717117, + "loss": 2.7953, + "step": 8214 + }, + { + "epoch": 0.24360228923879845, + "grad_norm": 0.1255940943956375, + "learning_rate": 0.0008688474219875726, + "loss": 2.8133, + "step": 8215 + }, + { + "epoch": 0.24363194259110993, + "grad_norm": 0.13045352697372437, + "learning_rate": 0.0008688156546372264, + "loss": 2.8407, + "step": 8216 + }, + { + "epoch": 0.2436615959434214, + "grad_norm": 0.13363024592399597, + "learning_rate": 0.0008687838840209541, + "loss": 2.8187, + "step": 8217 + }, + { + "epoch": 0.24369124929573288, + "grad_norm": 0.13054732978343964, + "learning_rate": 0.0008687521101390373, + "loss": 2.8156, + "step": 8218 + }, + { + "epoch": 0.24372090264804436, + "grad_norm": 0.13756775856018066, + "learning_rate": 0.0008687203329917572, + "loss": 2.7918, + "step": 8219 + }, + { + "epoch": 0.24375055600035583, + "grad_norm": 0.1529286652803421, + "learning_rate": 0.0008686885525793954, + "loss": 2.8129, + "step": 8220 + }, + { + "epoch": 0.2437802093526673, + "grad_norm": 0.15134261548519135, + "learning_rate": 0.0008686567689022331, + "loss": 2.766, + "step": 8221 + }, + { + "epoch": 0.24380986270497879, + "grad_norm": 0.13746806979179382, + "learning_rate": 0.0008686249819605518, + "loss": 2.8027, + "step": 8222 + }, + { + "epoch": 0.2438395160572903, + "grad_norm": 0.12685613334178925, + "learning_rate": 0.000868593191754633, + "loss": 2.7982, + "step": 8223 + }, + { + "epoch": 0.24386916940960177, + "grad_norm": 0.13248521089553833, + "learning_rate": 0.0008685613982847585, + "loss": 2.7751, + "step": 8224 + }, + { + "epoch": 0.24389882276191324, + "grad_norm": 0.1342095285654068, + "learning_rate": 0.0008685296015512092, + "loss": 2.8078, + "step": 8225 + }, + { + "epoch": 0.24392847611422472, + "grad_norm": 0.13638459146022797, + "learning_rate": 0.0008684978015542672, + "loss": 2.8069, + "step": 8226 + }, + { + "epoch": 0.2439581294665362, + "grad_norm": 0.1354011744260788, + "learning_rate": 0.0008684659982942138, + "loss": 2.802, + "step": 8227 + }, + { + "epoch": 0.24398778281884767, + "grad_norm": 0.13094773888587952, + "learning_rate": 0.0008684341917713308, + "loss": 2.7896, + "step": 8228 + }, + { + "epoch": 0.24401743617115915, + "grad_norm": 0.1463044285774231, + "learning_rate": 0.0008684023819858998, + "loss": 2.7792, + "step": 8229 + }, + { + "epoch": 0.24404708952347062, + "grad_norm": 0.16319642961025238, + "learning_rate": 0.0008683705689382025, + "loss": 2.7935, + "step": 8230 + }, + { + "epoch": 0.2440767428757821, + "grad_norm": 0.15060941874980927, + "learning_rate": 0.0008683387526285205, + "loss": 2.7836, + "step": 8231 + }, + { + "epoch": 0.24410639622809358, + "grad_norm": 0.15000665187835693, + "learning_rate": 0.0008683069330571357, + "loss": 2.8151, + "step": 8232 + }, + { + "epoch": 0.24413604958040505, + "grad_norm": 0.14635564386844635, + "learning_rate": 0.0008682751102243298, + "loss": 2.7979, + "step": 8233 + }, + { + "epoch": 0.24416570293271656, + "grad_norm": 0.15930037200450897, + "learning_rate": 0.0008682432841303845, + "loss": 2.8073, + "step": 8234 + }, + { + "epoch": 0.24419535628502803, + "grad_norm": 0.1629866063594818, + "learning_rate": 0.0008682114547755817, + "loss": 2.7926, + "step": 8235 + }, + { + "epoch": 0.2442250096373395, + "grad_norm": 0.14141197502613068, + "learning_rate": 0.0008681796221602034, + "loss": 2.8236, + "step": 8236 + }, + { + "epoch": 0.24425466298965098, + "grad_norm": 0.1463969349861145, + "learning_rate": 0.0008681477862845313, + "loss": 2.7909, + "step": 8237 + }, + { + "epoch": 0.24428431634196246, + "grad_norm": 0.14972245693206787, + "learning_rate": 0.0008681159471488472, + "loss": 2.8072, + "step": 8238 + }, + { + "epoch": 0.24431396969427394, + "grad_norm": 0.1540534496307373, + "learning_rate": 0.0008680841047534333, + "loss": 2.7754, + "step": 8239 + }, + { + "epoch": 0.2443436230465854, + "grad_norm": 0.16306781768798828, + "learning_rate": 0.0008680522590985715, + "loss": 2.8013, + "step": 8240 + }, + { + "epoch": 0.2443732763988969, + "grad_norm": 0.15392234921455383, + "learning_rate": 0.0008680204101845439, + "loss": 2.79, + "step": 8241 + }, + { + "epoch": 0.24440292975120836, + "grad_norm": 0.1444409340620041, + "learning_rate": 0.0008679885580116322, + "loss": 2.8262, + "step": 8242 + }, + { + "epoch": 0.24443258310351984, + "grad_norm": 0.13508471846580505, + "learning_rate": 0.0008679567025801187, + "loss": 2.7784, + "step": 8243 + }, + { + "epoch": 0.24446223645583134, + "grad_norm": 0.16250041127204895, + "learning_rate": 0.0008679248438902856, + "loss": 2.7851, + "step": 8244 + }, + { + "epoch": 0.24449188980814282, + "grad_norm": 0.17743606865406036, + "learning_rate": 0.0008678929819424146, + "loss": 2.7953, + "step": 8245 + }, + { + "epoch": 0.2445215431604543, + "grad_norm": 0.18086828291416168, + "learning_rate": 0.0008678611167367882, + "loss": 2.7545, + "step": 8246 + }, + { + "epoch": 0.24455119651276577, + "grad_norm": 0.14441275596618652, + "learning_rate": 0.0008678292482736885, + "loss": 2.8265, + "step": 8247 + }, + { + "epoch": 0.24458084986507725, + "grad_norm": 0.12256110459566116, + "learning_rate": 0.0008677973765533977, + "loss": 2.7877, + "step": 8248 + }, + { + "epoch": 0.24461050321738873, + "grad_norm": 0.12498769909143448, + "learning_rate": 0.0008677655015761979, + "loss": 2.7811, + "step": 8249 + }, + { + "epoch": 0.2446401565697002, + "grad_norm": 0.12017659842967987, + "learning_rate": 0.0008677336233423716, + "loss": 2.7877, + "step": 8250 + }, + { + "epoch": 0.24466980992201168, + "grad_norm": 0.12841777503490448, + "learning_rate": 0.0008677017418522009, + "loss": 2.8203, + "step": 8251 + }, + { + "epoch": 0.24469946327432315, + "grad_norm": 0.11589980870485306, + "learning_rate": 0.0008676698571059681, + "loss": 2.7957, + "step": 8252 + }, + { + "epoch": 0.24472911662663463, + "grad_norm": 0.12823207676410675, + "learning_rate": 0.0008676379691039555, + "loss": 2.7914, + "step": 8253 + }, + { + "epoch": 0.24475876997894613, + "grad_norm": 0.14013327658176422, + "learning_rate": 0.0008676060778464457, + "loss": 2.7755, + "step": 8254 + }, + { + "epoch": 0.2447884233312576, + "grad_norm": 0.15037797391414642, + "learning_rate": 0.000867574183333721, + "loss": 2.7622, + "step": 8255 + }, + { + "epoch": 0.24481807668356909, + "grad_norm": 0.1302134245634079, + "learning_rate": 0.0008675422855660638, + "loss": 2.7858, + "step": 8256 + }, + { + "epoch": 0.24484773003588056, + "grad_norm": 0.11888839304447174, + "learning_rate": 0.0008675103845437565, + "loss": 2.8118, + "step": 8257 + }, + { + "epoch": 0.24487738338819204, + "grad_norm": 0.12444274127483368, + "learning_rate": 0.0008674784802670817, + "loss": 2.7642, + "step": 8258 + }, + { + "epoch": 0.24490703674050351, + "grad_norm": 0.14593183994293213, + "learning_rate": 0.0008674465727363221, + "loss": 2.7756, + "step": 8259 + }, + { + "epoch": 0.244936690092815, + "grad_norm": 0.12473293393850327, + "learning_rate": 0.0008674146619517597, + "loss": 2.8057, + "step": 8260 + }, + { + "epoch": 0.24496634344512647, + "grad_norm": 0.12470465153455734, + "learning_rate": 0.0008673827479136776, + "loss": 2.7676, + "step": 8261 + }, + { + "epoch": 0.24499599679743794, + "grad_norm": 0.1384652704000473, + "learning_rate": 0.0008673508306223581, + "loss": 2.7705, + "step": 8262 + }, + { + "epoch": 0.24502565014974942, + "grad_norm": 0.13917869329452515, + "learning_rate": 0.000867318910078084, + "loss": 2.8115, + "step": 8263 + }, + { + "epoch": 0.2450553035020609, + "grad_norm": 0.13839305937290192, + "learning_rate": 0.0008672869862811379, + "loss": 2.7873, + "step": 8264 + }, + { + "epoch": 0.2450849568543724, + "grad_norm": 0.1336911916732788, + "learning_rate": 0.0008672550592318024, + "loss": 2.76, + "step": 8265 + }, + { + "epoch": 0.24511461020668388, + "grad_norm": 0.11361753195524216, + "learning_rate": 0.0008672231289303605, + "loss": 2.8047, + "step": 8266 + }, + { + "epoch": 0.24514426355899535, + "grad_norm": 0.11745880544185638, + "learning_rate": 0.0008671911953770946, + "loss": 2.7387, + "step": 8267 + }, + { + "epoch": 0.24517391691130683, + "grad_norm": 0.11509998887777328, + "learning_rate": 0.0008671592585722878, + "loss": 2.7641, + "step": 8268 + }, + { + "epoch": 0.2452035702636183, + "grad_norm": 0.12357252091169357, + "learning_rate": 0.0008671273185162225, + "loss": 2.7874, + "step": 8269 + }, + { + "epoch": 0.24523322361592978, + "grad_norm": 0.123631551861763, + "learning_rate": 0.0008670953752091819, + "loss": 2.7621, + "step": 8270 + }, + { + "epoch": 0.24526287696824126, + "grad_norm": 0.1336003839969635, + "learning_rate": 0.0008670634286514488, + "loss": 2.7818, + "step": 8271 + }, + { + "epoch": 0.24529253032055273, + "grad_norm": 0.1547340452671051, + "learning_rate": 0.000867031478843306, + "loss": 2.7513, + "step": 8272 + }, + { + "epoch": 0.2453221836728642, + "grad_norm": 0.1655394583940506, + "learning_rate": 0.0008669995257850365, + "loss": 2.7819, + "step": 8273 + }, + { + "epoch": 0.24535183702517568, + "grad_norm": 0.1567152738571167, + "learning_rate": 0.000866967569476923, + "loss": 2.7515, + "step": 8274 + }, + { + "epoch": 0.2453814903774872, + "grad_norm": 0.16070303320884705, + "learning_rate": 0.0008669356099192489, + "loss": 2.7987, + "step": 8275 + }, + { + "epoch": 0.24541114372979866, + "grad_norm": 0.1625920683145523, + "learning_rate": 0.0008669036471122969, + "loss": 2.7739, + "step": 8276 + }, + { + "epoch": 0.24544079708211014, + "grad_norm": 0.15535889565944672, + "learning_rate": 0.0008668716810563502, + "loss": 2.7771, + "step": 8277 + }, + { + "epoch": 0.24547045043442162, + "grad_norm": 0.1432812213897705, + "learning_rate": 0.0008668397117516918, + "loss": 2.8, + "step": 8278 + }, + { + "epoch": 0.2455001037867331, + "grad_norm": 0.14544901251792908, + "learning_rate": 0.0008668077391986047, + "loss": 2.78, + "step": 8279 + }, + { + "epoch": 0.24552975713904457, + "grad_norm": 0.1541498303413391, + "learning_rate": 0.0008667757633973721, + "loss": 2.7898, + "step": 8280 + }, + { + "epoch": 0.24555941049135604, + "grad_norm": 0.16968974471092224, + "learning_rate": 0.0008667437843482772, + "loss": 2.8372, + "step": 8281 + }, + { + "epoch": 0.24558906384366752, + "grad_norm": 0.15542763471603394, + "learning_rate": 0.0008667118020516031, + "loss": 2.8087, + "step": 8282 + }, + { + "epoch": 0.245618717195979, + "grad_norm": 0.1554822474718094, + "learning_rate": 0.0008666798165076331, + "loss": 2.8236, + "step": 8283 + }, + { + "epoch": 0.24564837054829047, + "grad_norm": 0.15683576464653015, + "learning_rate": 0.0008666478277166503, + "loss": 2.7772, + "step": 8284 + }, + { + "epoch": 0.24567802390060195, + "grad_norm": 0.1502191722393036, + "learning_rate": 0.0008666158356789382, + "loss": 2.8094, + "step": 8285 + }, + { + "epoch": 0.24570767725291345, + "grad_norm": 0.15973906219005585, + "learning_rate": 0.0008665838403947799, + "loss": 2.8032, + "step": 8286 + }, + { + "epoch": 0.24573733060522493, + "grad_norm": 0.14636775851249695, + "learning_rate": 0.0008665518418644587, + "loss": 2.7975, + "step": 8287 + }, + { + "epoch": 0.2457669839575364, + "grad_norm": 0.13559649884700775, + "learning_rate": 0.0008665198400882579, + "loss": 2.8046, + "step": 8288 + }, + { + "epoch": 0.24579663730984788, + "grad_norm": 0.13309112191200256, + "learning_rate": 0.0008664878350664614, + "loss": 2.7863, + "step": 8289 + }, + { + "epoch": 0.24582629066215936, + "grad_norm": 0.14557021856307983, + "learning_rate": 0.0008664558267993519, + "loss": 2.7804, + "step": 8290 + }, + { + "epoch": 0.24585594401447083, + "grad_norm": 0.14691810309886932, + "learning_rate": 0.0008664238152872131, + "loss": 2.8083, + "step": 8291 + }, + { + "epoch": 0.2458855973667823, + "grad_norm": 0.14204856753349304, + "learning_rate": 0.0008663918005303287, + "loss": 2.8047, + "step": 8292 + }, + { + "epoch": 0.2459152507190938, + "grad_norm": 0.14181271195411682, + "learning_rate": 0.000866359782528982, + "loss": 2.7949, + "step": 8293 + }, + { + "epoch": 0.24594490407140526, + "grad_norm": 0.13449084758758545, + "learning_rate": 0.0008663277612834564, + "loss": 2.8105, + "step": 8294 + }, + { + "epoch": 0.24597455742371674, + "grad_norm": 0.13791140913963318, + "learning_rate": 0.0008662957367940357, + "loss": 2.7921, + "step": 8295 + }, + { + "epoch": 0.24600421077602824, + "grad_norm": 0.13478592038154602, + "learning_rate": 0.0008662637090610034, + "loss": 2.8086, + "step": 8296 + }, + { + "epoch": 0.24603386412833972, + "grad_norm": 0.12751063704490662, + "learning_rate": 0.0008662316780846431, + "loss": 2.7892, + "step": 8297 + }, + { + "epoch": 0.2460635174806512, + "grad_norm": 0.14197289943695068, + "learning_rate": 0.0008661996438652384, + "loss": 2.8125, + "step": 8298 + }, + { + "epoch": 0.24609317083296267, + "grad_norm": 0.1359615921974182, + "learning_rate": 0.0008661676064030729, + "loss": 2.7755, + "step": 8299 + }, + { + "epoch": 0.24612282418527415, + "grad_norm": 0.13461489975452423, + "learning_rate": 0.0008661355656984305, + "loss": 2.7801, + "step": 8300 + }, + { + "epoch": 0.24615247753758562, + "grad_norm": 0.1313364952802658, + "learning_rate": 0.0008661035217515947, + "loss": 2.8191, + "step": 8301 + }, + { + "epoch": 0.2461821308898971, + "grad_norm": 0.12676183879375458, + "learning_rate": 0.0008660714745628495, + "loss": 2.7705, + "step": 8302 + }, + { + "epoch": 0.24621178424220858, + "grad_norm": 0.1358608603477478, + "learning_rate": 0.0008660394241324785, + "loss": 2.7999, + "step": 8303 + }, + { + "epoch": 0.24624143759452005, + "grad_norm": 0.15589015185832977, + "learning_rate": 0.0008660073704607656, + "loss": 2.8009, + "step": 8304 + }, + { + "epoch": 0.24627109094683153, + "grad_norm": 0.13677453994750977, + "learning_rate": 0.0008659753135479946, + "loss": 2.7796, + "step": 8305 + }, + { + "epoch": 0.24630074429914303, + "grad_norm": 0.12180113047361374, + "learning_rate": 0.0008659432533944495, + "loss": 2.7945, + "step": 8306 + }, + { + "epoch": 0.2463303976514545, + "grad_norm": 0.1339431256055832, + "learning_rate": 0.000865911190000414, + "loss": 2.7895, + "step": 8307 + }, + { + "epoch": 0.24636005100376598, + "grad_norm": 0.15198254585266113, + "learning_rate": 0.000865879123366172, + "loss": 2.7934, + "step": 8308 + }, + { + "epoch": 0.24638970435607746, + "grad_norm": 0.14472784101963043, + "learning_rate": 0.0008658470534920076, + "loss": 2.7387, + "step": 8309 + }, + { + "epoch": 0.24641935770838894, + "grad_norm": 0.15225863456726074, + "learning_rate": 0.0008658149803782047, + "loss": 2.7993, + "step": 8310 + }, + { + "epoch": 0.2464490110607004, + "grad_norm": 0.17636670172214508, + "learning_rate": 0.0008657829040250476, + "loss": 2.7683, + "step": 8311 + }, + { + "epoch": 0.2464786644130119, + "grad_norm": 0.14385566115379333, + "learning_rate": 0.0008657508244328198, + "loss": 2.7908, + "step": 8312 + }, + { + "epoch": 0.24650831776532336, + "grad_norm": 0.12895649671554565, + "learning_rate": 0.0008657187416018057, + "loss": 2.7984, + "step": 8313 + }, + { + "epoch": 0.24653797111763484, + "grad_norm": 0.13193440437316895, + "learning_rate": 0.0008656866555322895, + "loss": 2.8414, + "step": 8314 + }, + { + "epoch": 0.24656762446994632, + "grad_norm": 0.1422477811574936, + "learning_rate": 0.0008656545662245553, + "loss": 2.7743, + "step": 8315 + }, + { + "epoch": 0.2465972778222578, + "grad_norm": 0.15545280277729034, + "learning_rate": 0.0008656224736788869, + "loss": 2.7758, + "step": 8316 + }, + { + "epoch": 0.2466269311745693, + "grad_norm": 0.17429357767105103, + "learning_rate": 0.000865590377895569, + "loss": 2.7995, + "step": 8317 + }, + { + "epoch": 0.24665658452688077, + "grad_norm": 0.17097340524196625, + "learning_rate": 0.0008655582788748852, + "loss": 2.7808, + "step": 8318 + }, + { + "epoch": 0.24668623787919225, + "grad_norm": 0.18691560626029968, + "learning_rate": 0.0008655261766171204, + "loss": 2.8114, + "step": 8319 + }, + { + "epoch": 0.24671589123150373, + "grad_norm": 0.20663610100746155, + "learning_rate": 0.0008654940711225585, + "loss": 2.7803, + "step": 8320 + }, + { + "epoch": 0.2467455445838152, + "grad_norm": 0.18365086615085602, + "learning_rate": 0.0008654619623914838, + "loss": 2.794, + "step": 8321 + }, + { + "epoch": 0.24677519793612668, + "grad_norm": 0.14856217801570892, + "learning_rate": 0.0008654298504241806, + "loss": 2.7963, + "step": 8322 + }, + { + "epoch": 0.24680485128843815, + "grad_norm": 0.17462025582790375, + "learning_rate": 0.0008653977352209336, + "loss": 2.8202, + "step": 8323 + }, + { + "epoch": 0.24683450464074963, + "grad_norm": 0.1720193773508072, + "learning_rate": 0.0008653656167820267, + "loss": 2.7813, + "step": 8324 + }, + { + "epoch": 0.2468641579930611, + "grad_norm": 0.1514241099357605, + "learning_rate": 0.0008653334951077448, + "loss": 2.7832, + "step": 8325 + }, + { + "epoch": 0.24689381134537258, + "grad_norm": 0.1537393480539322, + "learning_rate": 0.0008653013701983718, + "loss": 2.8, + "step": 8326 + }, + { + "epoch": 0.2469234646976841, + "grad_norm": 0.14267520606517792, + "learning_rate": 0.0008652692420541928, + "loss": 2.7515, + "step": 8327 + }, + { + "epoch": 0.24695311804999556, + "grad_norm": 0.1338217407464981, + "learning_rate": 0.0008652371106754917, + "loss": 2.7622, + "step": 8328 + }, + { + "epoch": 0.24698277140230704, + "grad_norm": 0.11668730527162552, + "learning_rate": 0.0008652049760625533, + "loss": 2.7944, + "step": 8329 + }, + { + "epoch": 0.24701242475461851, + "grad_norm": 0.14639821648597717, + "learning_rate": 0.0008651728382156622, + "loss": 2.7453, + "step": 8330 + }, + { + "epoch": 0.24704207810693, + "grad_norm": 0.13388654589653015, + "learning_rate": 0.000865140697135103, + "loss": 2.7948, + "step": 8331 + }, + { + "epoch": 0.24707173145924147, + "grad_norm": 0.12558667361736298, + "learning_rate": 0.0008651085528211602, + "loss": 2.7744, + "step": 8332 + }, + { + "epoch": 0.24710138481155294, + "grad_norm": 0.12817756831645966, + "learning_rate": 0.0008650764052741185, + "loss": 2.7789, + "step": 8333 + }, + { + "epoch": 0.24713103816386442, + "grad_norm": 0.1187325045466423, + "learning_rate": 0.0008650442544942625, + "loss": 2.7558, + "step": 8334 + }, + { + "epoch": 0.2471606915161759, + "grad_norm": 0.12428798526525497, + "learning_rate": 0.000865012100481877, + "loss": 2.7613, + "step": 8335 + }, + { + "epoch": 0.24719034486848737, + "grad_norm": 0.11621434986591339, + "learning_rate": 0.0008649799432372468, + "loss": 2.7485, + "step": 8336 + }, + { + "epoch": 0.24721999822079885, + "grad_norm": 0.11566699296236038, + "learning_rate": 0.0008649477827606564, + "loss": 2.7657, + "step": 8337 + }, + { + "epoch": 0.24724965157311035, + "grad_norm": 0.1338142454624176, + "learning_rate": 0.0008649156190523909, + "loss": 2.8013, + "step": 8338 + }, + { + "epoch": 0.24727930492542183, + "grad_norm": 0.13169817626476288, + "learning_rate": 0.0008648834521127349, + "loss": 2.7943, + "step": 8339 + }, + { + "epoch": 0.2473089582777333, + "grad_norm": 0.1522853523492813, + "learning_rate": 0.0008648512819419733, + "loss": 2.7592, + "step": 8340 + }, + { + "epoch": 0.24733861163004478, + "grad_norm": 0.16574741899967194, + "learning_rate": 0.0008648191085403909, + "loss": 2.8182, + "step": 8341 + }, + { + "epoch": 0.24736826498235626, + "grad_norm": 0.16033689677715302, + "learning_rate": 0.0008647869319082728, + "loss": 2.7741, + "step": 8342 + }, + { + "epoch": 0.24739791833466773, + "grad_norm": 0.16223183274269104, + "learning_rate": 0.0008647547520459035, + "loss": 2.8104, + "step": 8343 + }, + { + "epoch": 0.2474275716869792, + "grad_norm": 0.15322904288768768, + "learning_rate": 0.0008647225689535687, + "loss": 2.8264, + "step": 8344 + }, + { + "epoch": 0.24745722503929068, + "grad_norm": 0.15129579603672028, + "learning_rate": 0.0008646903826315526, + "loss": 2.8001, + "step": 8345 + }, + { + "epoch": 0.24748687839160216, + "grad_norm": 0.16006429493427277, + "learning_rate": 0.0008646581930801408, + "loss": 2.7802, + "step": 8346 + }, + { + "epoch": 0.24751653174391364, + "grad_norm": 0.1712319701910019, + "learning_rate": 0.000864626000299618, + "loss": 2.7847, + "step": 8347 + }, + { + "epoch": 0.24754618509622514, + "grad_norm": 0.1748867630958557, + "learning_rate": 0.0008645938042902693, + "loss": 2.7684, + "step": 8348 + }, + { + "epoch": 0.24757583844853662, + "grad_norm": 0.13533039391040802, + "learning_rate": 0.0008645616050523802, + "loss": 2.7578, + "step": 8349 + }, + { + "epoch": 0.2476054918008481, + "grad_norm": 0.1733233481645584, + "learning_rate": 0.0008645294025862351, + "loss": 2.7649, + "step": 8350 + }, + { + "epoch": 0.24763514515315957, + "grad_norm": 0.1608061045408249, + "learning_rate": 0.0008644971968921198, + "loss": 2.7887, + "step": 8351 + }, + { + "epoch": 0.24766479850547105, + "grad_norm": 0.13920211791992188, + "learning_rate": 0.0008644649879703193, + "loss": 2.7951, + "step": 8352 + }, + { + "epoch": 0.24769445185778252, + "grad_norm": 0.1577703058719635, + "learning_rate": 0.0008644327758211186, + "loss": 2.8, + "step": 8353 + }, + { + "epoch": 0.247724105210094, + "grad_norm": 0.13577987253665924, + "learning_rate": 0.0008644005604448031, + "loss": 2.7873, + "step": 8354 + }, + { + "epoch": 0.24775375856240547, + "grad_norm": 0.14635108411312103, + "learning_rate": 0.0008643683418416583, + "loss": 2.7779, + "step": 8355 + }, + { + "epoch": 0.24778341191471695, + "grad_norm": 0.12934984266757965, + "learning_rate": 0.0008643361200119691, + "loss": 2.7806, + "step": 8356 + }, + { + "epoch": 0.24781306526702843, + "grad_norm": 0.1214018240571022, + "learning_rate": 0.0008643038949560212, + "loss": 2.773, + "step": 8357 + }, + { + "epoch": 0.24784271861933993, + "grad_norm": 0.12097355723381042, + "learning_rate": 0.0008642716666740995, + "loss": 2.7938, + "step": 8358 + }, + { + "epoch": 0.2478723719716514, + "grad_norm": 0.1258600950241089, + "learning_rate": 0.0008642394351664899, + "loss": 2.7905, + "step": 8359 + }, + { + "epoch": 0.24790202532396288, + "grad_norm": 0.13945572078227997, + "learning_rate": 0.0008642072004334775, + "loss": 2.7404, + "step": 8360 + }, + { + "epoch": 0.24793167867627436, + "grad_norm": 0.11364469677209854, + "learning_rate": 0.0008641749624753479, + "loss": 2.7844, + "step": 8361 + }, + { + "epoch": 0.24796133202858583, + "grad_norm": 0.1218424066901207, + "learning_rate": 0.0008641427212923863, + "loss": 2.7842, + "step": 8362 + }, + { + "epoch": 0.2479909853808973, + "grad_norm": 0.11183632910251617, + "learning_rate": 0.0008641104768848787, + "loss": 2.7948, + "step": 8363 + }, + { + "epoch": 0.2480206387332088, + "grad_norm": 0.10608268529176712, + "learning_rate": 0.0008640782292531101, + "loss": 2.8011, + "step": 8364 + }, + { + "epoch": 0.24805029208552026, + "grad_norm": 0.12218568474054337, + "learning_rate": 0.0008640459783973664, + "loss": 2.7903, + "step": 8365 + }, + { + "epoch": 0.24807994543783174, + "grad_norm": 0.13447579741477966, + "learning_rate": 0.0008640137243179331, + "loss": 2.7813, + "step": 8366 + }, + { + "epoch": 0.24810959879014322, + "grad_norm": 0.1594809740781784, + "learning_rate": 0.0008639814670150956, + "loss": 2.7826, + "step": 8367 + }, + { + "epoch": 0.2481392521424547, + "grad_norm": 0.16593098640441895, + "learning_rate": 0.0008639492064891398, + "loss": 2.7897, + "step": 8368 + }, + { + "epoch": 0.2481689054947662, + "grad_norm": 0.18743076920509338, + "learning_rate": 0.0008639169427403514, + "loss": 2.7787, + "step": 8369 + }, + { + "epoch": 0.24819855884707767, + "grad_norm": 0.20089289546012878, + "learning_rate": 0.0008638846757690159, + "loss": 2.8356, + "step": 8370 + }, + { + "epoch": 0.24822821219938915, + "grad_norm": 0.1825621873140335, + "learning_rate": 0.0008638524055754193, + "loss": 2.7884, + "step": 8371 + }, + { + "epoch": 0.24825786555170062, + "grad_norm": 0.16959118843078613, + "learning_rate": 0.0008638201321598471, + "loss": 2.8097, + "step": 8372 + }, + { + "epoch": 0.2482875189040121, + "grad_norm": 0.15850432217121124, + "learning_rate": 0.0008637878555225851, + "loss": 2.7925, + "step": 8373 + }, + { + "epoch": 0.24831717225632358, + "grad_norm": 0.16017407178878784, + "learning_rate": 0.0008637555756639192, + "loss": 2.7902, + "step": 8374 + }, + { + "epoch": 0.24834682560863505, + "grad_norm": 0.14084722101688385, + "learning_rate": 0.0008637232925841354, + "loss": 2.8127, + "step": 8375 + }, + { + "epoch": 0.24837647896094653, + "grad_norm": 0.15353339910507202, + "learning_rate": 0.0008636910062835193, + "loss": 2.8122, + "step": 8376 + }, + { + "epoch": 0.248406132313258, + "grad_norm": 0.15795987844467163, + "learning_rate": 0.0008636587167623568, + "loss": 2.7892, + "step": 8377 + }, + { + "epoch": 0.24843578566556948, + "grad_norm": 0.13539519906044006, + "learning_rate": 0.0008636264240209342, + "loss": 2.8162, + "step": 8378 + }, + { + "epoch": 0.24846543901788098, + "grad_norm": 0.15766829252243042, + "learning_rate": 0.0008635941280595372, + "loss": 2.8357, + "step": 8379 + }, + { + "epoch": 0.24849509237019246, + "grad_norm": 0.14971713721752167, + "learning_rate": 0.0008635618288784514, + "loss": 2.8085, + "step": 8380 + }, + { + "epoch": 0.24852474572250394, + "grad_norm": 0.11997509747743607, + "learning_rate": 0.0008635295264779636, + "loss": 2.8206, + "step": 8381 + }, + { + "epoch": 0.2485543990748154, + "grad_norm": 0.12401245534420013, + "learning_rate": 0.0008634972208583593, + "loss": 2.7742, + "step": 8382 + }, + { + "epoch": 0.2485840524271269, + "grad_norm": 0.1409117728471756, + "learning_rate": 0.0008634649120199247, + "loss": 2.7775, + "step": 8383 + }, + { + "epoch": 0.24861370577943837, + "grad_norm": 0.13072088360786438, + "learning_rate": 0.000863432599962946, + "loss": 2.7925, + "step": 8384 + }, + { + "epoch": 0.24864335913174984, + "grad_norm": 0.15683110058307648, + "learning_rate": 0.0008634002846877091, + "loss": 2.7882, + "step": 8385 + }, + { + "epoch": 0.24867301248406132, + "grad_norm": 0.18183715641498566, + "learning_rate": 0.0008633679661945005, + "loss": 2.7801, + "step": 8386 + }, + { + "epoch": 0.2487026658363728, + "grad_norm": 0.20135346055030823, + "learning_rate": 0.000863335644483606, + "loss": 2.7784, + "step": 8387 + }, + { + "epoch": 0.24873231918868427, + "grad_norm": 0.20231902599334717, + "learning_rate": 0.0008633033195553121, + "loss": 2.7453, + "step": 8388 + }, + { + "epoch": 0.24876197254099575, + "grad_norm": 0.1527831256389618, + "learning_rate": 0.0008632709914099049, + "loss": 2.7377, + "step": 8389 + }, + { + "epoch": 0.24879162589330725, + "grad_norm": 0.15466271340847015, + "learning_rate": 0.0008632386600476707, + "loss": 2.7859, + "step": 8390 + }, + { + "epoch": 0.24882127924561873, + "grad_norm": 0.15891456604003906, + "learning_rate": 0.0008632063254688959, + "loss": 2.7718, + "step": 8391 + }, + { + "epoch": 0.2488509325979302, + "grad_norm": 0.13724708557128906, + "learning_rate": 0.0008631739876738667, + "loss": 2.7763, + "step": 8392 + }, + { + "epoch": 0.24888058595024168, + "grad_norm": 0.1268923580646515, + "learning_rate": 0.0008631416466628694, + "loss": 2.7941, + "step": 8393 + }, + { + "epoch": 0.24891023930255315, + "grad_norm": 0.1615525782108307, + "learning_rate": 0.0008631093024361907, + "loss": 2.801, + "step": 8394 + }, + { + "epoch": 0.24893989265486463, + "grad_norm": 0.18731072545051575, + "learning_rate": 0.0008630769549941166, + "loss": 2.7742, + "step": 8395 + }, + { + "epoch": 0.2489695460071761, + "grad_norm": 0.1896175891160965, + "learning_rate": 0.0008630446043369338, + "loss": 2.797, + "step": 8396 + }, + { + "epoch": 0.24899919935948758, + "grad_norm": 0.16021104156970978, + "learning_rate": 0.0008630122504649287, + "loss": 2.8091, + "step": 8397 + }, + { + "epoch": 0.24902885271179906, + "grad_norm": 0.14315195381641388, + "learning_rate": 0.0008629798933783879, + "loss": 2.7787, + "step": 8398 + }, + { + "epoch": 0.24905850606411054, + "grad_norm": 0.1400795727968216, + "learning_rate": 0.0008629475330775978, + "loss": 2.7842, + "step": 8399 + }, + { + "epoch": 0.24908815941642204, + "grad_norm": 0.13385750353336334, + "learning_rate": 0.000862915169562845, + "loss": 2.7684, + "step": 8400 + }, + { + "epoch": 0.24911781276873352, + "grad_norm": 0.16099123656749725, + "learning_rate": 0.0008628828028344161, + "loss": 2.7862, + "step": 8401 + }, + { + "epoch": 0.249147466121045, + "grad_norm": 0.1447356939315796, + "learning_rate": 0.0008628504328925977, + "loss": 2.7831, + "step": 8402 + }, + { + "epoch": 0.24917711947335647, + "grad_norm": 0.11559248715639114, + "learning_rate": 0.0008628180597376764, + "loss": 2.791, + "step": 8403 + }, + { + "epoch": 0.24920677282566794, + "grad_norm": 0.144659623503685, + "learning_rate": 0.0008627856833699388, + "loss": 2.8204, + "step": 8404 + }, + { + "epoch": 0.24923642617797942, + "grad_norm": 0.14250120520591736, + "learning_rate": 0.0008627533037896718, + "loss": 2.7994, + "step": 8405 + }, + { + "epoch": 0.2492660795302909, + "grad_norm": 0.12513473629951477, + "learning_rate": 0.0008627209209971621, + "loss": 2.8153, + "step": 8406 + }, + { + "epoch": 0.24929573288260237, + "grad_norm": 0.1297774463891983, + "learning_rate": 0.0008626885349926963, + "loss": 2.7722, + "step": 8407 + }, + { + "epoch": 0.24932538623491385, + "grad_norm": 0.1268116980791092, + "learning_rate": 0.0008626561457765612, + "loss": 2.8057, + "step": 8408 + }, + { + "epoch": 0.24935503958722532, + "grad_norm": 0.13114649057388306, + "learning_rate": 0.0008626237533490437, + "loss": 2.8045, + "step": 8409 + }, + { + "epoch": 0.24938469293953683, + "grad_norm": 0.1287745088338852, + "learning_rate": 0.0008625913577104307, + "loss": 2.8054, + "step": 8410 + }, + { + "epoch": 0.2494143462918483, + "grad_norm": 0.12896646559238434, + "learning_rate": 0.000862558958861009, + "loss": 2.8115, + "step": 8411 + }, + { + "epoch": 0.24944399964415978, + "grad_norm": 0.13006098568439484, + "learning_rate": 0.0008625265568010655, + "loss": 2.7662, + "step": 8412 + }, + { + "epoch": 0.24947365299647126, + "grad_norm": 0.12002822011709213, + "learning_rate": 0.000862494151530887, + "loss": 2.7677, + "step": 8413 + }, + { + "epoch": 0.24950330634878273, + "grad_norm": 0.12338073551654816, + "learning_rate": 0.0008624617430507606, + "loss": 2.777, + "step": 8414 + }, + { + "epoch": 0.2495329597010942, + "grad_norm": 0.12017887830734253, + "learning_rate": 0.0008624293313609734, + "loss": 2.8012, + "step": 8415 + }, + { + "epoch": 0.24956261305340569, + "grad_norm": 0.12891030311584473, + "learning_rate": 0.0008623969164618122, + "loss": 2.7763, + "step": 8416 + }, + { + "epoch": 0.24959226640571716, + "grad_norm": 0.13425885140895844, + "learning_rate": 0.000862364498353564, + "loss": 2.808, + "step": 8417 + }, + { + "epoch": 0.24962191975802864, + "grad_norm": 0.15119123458862305, + "learning_rate": 0.000862332077036516, + "loss": 2.7505, + "step": 8418 + }, + { + "epoch": 0.2496515731103401, + "grad_norm": 0.12743063271045685, + "learning_rate": 0.0008622996525109552, + "loss": 2.7804, + "step": 8419 + }, + { + "epoch": 0.2496812264626516, + "grad_norm": 0.125810444355011, + "learning_rate": 0.000862267224777169, + "loss": 2.8314, + "step": 8420 + }, + { + "epoch": 0.2497108798149631, + "grad_norm": 0.13884392380714417, + "learning_rate": 0.0008622347938354442, + "loss": 2.8023, + "step": 8421 + }, + { + "epoch": 0.24974053316727457, + "grad_norm": 0.14644810557365417, + "learning_rate": 0.0008622023596860681, + "loss": 2.7851, + "step": 8422 + }, + { + "epoch": 0.24977018651958605, + "grad_norm": 0.15736132860183716, + "learning_rate": 0.000862169922329328, + "loss": 2.8058, + "step": 8423 + }, + { + "epoch": 0.24979983987189752, + "grad_norm": 0.13521859049797058, + "learning_rate": 0.000862137481765511, + "loss": 2.8177, + "step": 8424 + }, + { + "epoch": 0.249829493224209, + "grad_norm": 0.1427278071641922, + "learning_rate": 0.0008621050379949045, + "loss": 2.7745, + "step": 8425 + }, + { + "epoch": 0.24985914657652047, + "grad_norm": 0.14474909007549286, + "learning_rate": 0.0008620725910177957, + "loss": 2.7885, + "step": 8426 + }, + { + "epoch": 0.24988879992883195, + "grad_norm": 0.15986542403697968, + "learning_rate": 0.000862040140834472, + "loss": 2.7471, + "step": 8427 + }, + { + "epoch": 0.24991845328114343, + "grad_norm": 0.1601233333349228, + "learning_rate": 0.0008620076874452208, + "loss": 2.7947, + "step": 8428 + }, + { + "epoch": 0.2499481066334549, + "grad_norm": 0.14978177845478058, + "learning_rate": 0.000861975230850329, + "loss": 2.7638, + "step": 8429 + }, + { + "epoch": 0.24997775998576638, + "grad_norm": 0.17062680423259735, + "learning_rate": 0.0008619427710500848, + "loss": 2.7719, + "step": 8430 + }, + { + "epoch": 0.25000741333807786, + "grad_norm": 0.21014508605003357, + "learning_rate": 0.0008619103080447751, + "loss": 2.7791, + "step": 8431 + }, + { + "epoch": 0.25003706669038933, + "grad_norm": 0.1772191971540451, + "learning_rate": 0.0008618778418346875, + "loss": 2.8133, + "step": 8432 + }, + { + "epoch": 0.2500667200427008, + "grad_norm": 0.12956295907497406, + "learning_rate": 0.0008618453724201094, + "loss": 2.7849, + "step": 8433 + }, + { + "epoch": 0.2500963733950123, + "grad_norm": 0.13257944583892822, + "learning_rate": 0.0008618128998013286, + "loss": 2.8499, + "step": 8434 + }, + { + "epoch": 0.25012602674732376, + "grad_norm": 0.12235172837972641, + "learning_rate": 0.0008617804239786324, + "loss": 2.782, + "step": 8435 + }, + { + "epoch": 0.2501556800996353, + "grad_norm": 0.13276873528957367, + "learning_rate": 0.0008617479449523085, + "loss": 2.8086, + "step": 8436 + }, + { + "epoch": 0.25018533345194677, + "grad_norm": 0.14375832676887512, + "learning_rate": 0.0008617154627226444, + "loss": 2.7907, + "step": 8437 + }, + { + "epoch": 0.25021498680425824, + "grad_norm": 0.1293201446533203, + "learning_rate": 0.0008616829772899277, + "loss": 2.793, + "step": 8438 + }, + { + "epoch": 0.2502446401565697, + "grad_norm": 0.11415025591850281, + "learning_rate": 0.0008616504886544463, + "loss": 2.79, + "step": 8439 + }, + { + "epoch": 0.2502742935088812, + "grad_norm": 0.1344168335199356, + "learning_rate": 0.0008616179968164877, + "loss": 2.8037, + "step": 8440 + }, + { + "epoch": 0.25030394686119267, + "grad_norm": 0.1357915848493576, + "learning_rate": 0.0008615855017763396, + "loss": 2.772, + "step": 8441 + }, + { + "epoch": 0.25033360021350415, + "grad_norm": 0.11285184323787689, + "learning_rate": 0.0008615530035342898, + "loss": 2.8082, + "step": 8442 + }, + { + "epoch": 0.2503632535658156, + "grad_norm": 0.11679194122552872, + "learning_rate": 0.0008615205020906262, + "loss": 2.7738, + "step": 8443 + }, + { + "epoch": 0.2503929069181271, + "grad_norm": 0.12548266351222992, + "learning_rate": 0.0008614879974456365, + "loss": 2.8062, + "step": 8444 + }, + { + "epoch": 0.2504225602704386, + "grad_norm": 0.12083505839109421, + "learning_rate": 0.0008614554895996084, + "loss": 2.8013, + "step": 8445 + }, + { + "epoch": 0.25045221362275005, + "grad_norm": 0.1317746788263321, + "learning_rate": 0.0008614229785528301, + "loss": 2.8159, + "step": 8446 + }, + { + "epoch": 0.25048186697506153, + "grad_norm": 0.163314551115036, + "learning_rate": 0.0008613904643055891, + "loss": 2.8166, + "step": 8447 + }, + { + "epoch": 0.250511520327373, + "grad_norm": 0.18634571135044098, + "learning_rate": 0.0008613579468581736, + "loss": 2.7889, + "step": 8448 + }, + { + "epoch": 0.2505411736796845, + "grad_norm": 0.16331297159194946, + "learning_rate": 0.0008613254262108714, + "loss": 2.7835, + "step": 8449 + }, + { + "epoch": 0.25057082703199596, + "grad_norm": 0.14858444035053253, + "learning_rate": 0.0008612929023639706, + "loss": 2.7547, + "step": 8450 + }, + { + "epoch": 0.25060048038430743, + "grad_norm": 0.15071456134319305, + "learning_rate": 0.000861260375317759, + "loss": 2.8289, + "step": 8451 + }, + { + "epoch": 0.2506301337366189, + "grad_norm": 0.1538042426109314, + "learning_rate": 0.0008612278450725249, + "loss": 2.8011, + "step": 8452 + }, + { + "epoch": 0.2506597870889304, + "grad_norm": 0.14471203088760376, + "learning_rate": 0.0008611953116285562, + "loss": 2.7565, + "step": 8453 + }, + { + "epoch": 0.25068944044124186, + "grad_norm": 0.15706247091293335, + "learning_rate": 0.0008611627749861411, + "loss": 2.7722, + "step": 8454 + }, + { + "epoch": 0.25071909379355334, + "grad_norm": 0.14608082175254822, + "learning_rate": 0.0008611302351455674, + "loss": 2.7741, + "step": 8455 + }, + { + "epoch": 0.2507487471458648, + "grad_norm": 0.14775651693344116, + "learning_rate": 0.0008610976921071236, + "loss": 2.8129, + "step": 8456 + }, + { + "epoch": 0.25077840049817635, + "grad_norm": 0.1357220858335495, + "learning_rate": 0.0008610651458710978, + "loss": 2.7843, + "step": 8457 + }, + { + "epoch": 0.2508080538504878, + "grad_norm": 0.13838335871696472, + "learning_rate": 0.0008610325964377781, + "loss": 2.8168, + "step": 8458 + }, + { + "epoch": 0.2508377072027993, + "grad_norm": 0.13241952657699585, + "learning_rate": 0.0008610000438074529, + "loss": 2.7772, + "step": 8459 + }, + { + "epoch": 0.2508673605551108, + "grad_norm": 0.1438939869403839, + "learning_rate": 0.0008609674879804102, + "loss": 2.7649, + "step": 8460 + }, + { + "epoch": 0.25089701390742225, + "grad_norm": 0.1449441760778427, + "learning_rate": 0.0008609349289569385, + "loss": 2.7602, + "step": 8461 + }, + { + "epoch": 0.2509266672597337, + "grad_norm": 0.13438405096530914, + "learning_rate": 0.0008609023667373261, + "loss": 2.8187, + "step": 8462 + }, + { + "epoch": 0.2509563206120452, + "grad_norm": 0.13772720098495483, + "learning_rate": 0.0008608698013218612, + "loss": 2.7762, + "step": 8463 + }, + { + "epoch": 0.2509859739643567, + "grad_norm": 0.18323656916618347, + "learning_rate": 0.0008608372327108325, + "loss": 2.7734, + "step": 8464 + }, + { + "epoch": 0.25101562731666816, + "grad_norm": 0.19827871024608612, + "learning_rate": 0.0008608046609045279, + "loss": 2.7821, + "step": 8465 + }, + { + "epoch": 0.25104528066897963, + "grad_norm": 0.20510849356651306, + "learning_rate": 0.0008607720859032362, + "loss": 2.8165, + "step": 8466 + }, + { + "epoch": 0.2510749340212911, + "grad_norm": 0.18313796818256378, + "learning_rate": 0.0008607395077072457, + "loss": 2.7967, + "step": 8467 + }, + { + "epoch": 0.2511045873736026, + "grad_norm": 0.15251874923706055, + "learning_rate": 0.000860706926316845, + "loss": 2.7533, + "step": 8468 + }, + { + "epoch": 0.25113424072591406, + "grad_norm": 0.15894708037376404, + "learning_rate": 0.0008606743417323225, + "loss": 2.8069, + "step": 8469 + }, + { + "epoch": 0.25116389407822554, + "grad_norm": 0.1475122570991516, + "learning_rate": 0.0008606417539539668, + "loss": 2.7652, + "step": 8470 + }, + { + "epoch": 0.251193547430537, + "grad_norm": 0.1452035754919052, + "learning_rate": 0.0008606091629820665, + "loss": 2.8038, + "step": 8471 + }, + { + "epoch": 0.2512232007828485, + "grad_norm": 0.1696380078792572, + "learning_rate": 0.0008605765688169103, + "loss": 2.8171, + "step": 8472 + }, + { + "epoch": 0.25125285413515996, + "grad_norm": 0.1659744381904602, + "learning_rate": 0.0008605439714587864, + "loss": 2.7851, + "step": 8473 + }, + { + "epoch": 0.25128250748747144, + "grad_norm": 0.13743126392364502, + "learning_rate": 0.0008605113709079839, + "loss": 2.7482, + "step": 8474 + }, + { + "epoch": 0.2513121608397829, + "grad_norm": 0.133019357919693, + "learning_rate": 0.0008604787671647914, + "loss": 2.8012, + "step": 8475 + }, + { + "epoch": 0.2513418141920944, + "grad_norm": 0.13928385078907013, + "learning_rate": 0.0008604461602294974, + "loss": 2.8075, + "step": 8476 + }, + { + "epoch": 0.25137146754440587, + "grad_norm": 0.1421947330236435, + "learning_rate": 0.0008604135501023909, + "loss": 2.7635, + "step": 8477 + }, + { + "epoch": 0.2514011208967174, + "grad_norm": 0.14938436448574066, + "learning_rate": 0.0008603809367837605, + "loss": 2.8178, + "step": 8478 + }, + { + "epoch": 0.2514307742490289, + "grad_norm": 0.14362145960330963, + "learning_rate": 0.000860348320273895, + "loss": 2.774, + "step": 8479 + }, + { + "epoch": 0.25146042760134035, + "grad_norm": 0.13025683164596558, + "learning_rate": 0.0008603157005730833, + "loss": 2.7969, + "step": 8480 + }, + { + "epoch": 0.25149008095365183, + "grad_norm": 0.15076076984405518, + "learning_rate": 0.0008602830776816142, + "loss": 2.7799, + "step": 8481 + }, + { + "epoch": 0.2515197343059633, + "grad_norm": 0.137784942984581, + "learning_rate": 0.0008602504515997767, + "loss": 2.7873, + "step": 8482 + }, + { + "epoch": 0.2515493876582748, + "grad_norm": 0.1337922066450119, + "learning_rate": 0.0008602178223278595, + "loss": 2.7428, + "step": 8483 + }, + { + "epoch": 0.25157904101058626, + "grad_norm": 0.13451683521270752, + "learning_rate": 0.0008601851898661517, + "loss": 2.7302, + "step": 8484 + }, + { + "epoch": 0.25160869436289773, + "grad_norm": 0.1435292661190033, + "learning_rate": 0.0008601525542149422, + "loss": 2.812, + "step": 8485 + }, + { + "epoch": 0.2516383477152092, + "grad_norm": 0.13965323567390442, + "learning_rate": 0.00086011991537452, + "loss": 2.7315, + "step": 8486 + }, + { + "epoch": 0.2516680010675207, + "grad_norm": 0.13382399082183838, + "learning_rate": 0.0008600872733451742, + "loss": 2.7954, + "step": 8487 + }, + { + "epoch": 0.25169765441983216, + "grad_norm": 0.14316654205322266, + "learning_rate": 0.0008600546281271938, + "loss": 2.7951, + "step": 8488 + }, + { + "epoch": 0.25172730777214364, + "grad_norm": 0.1446070373058319, + "learning_rate": 0.0008600219797208678, + "loss": 2.7904, + "step": 8489 + }, + { + "epoch": 0.2517569611244551, + "grad_norm": 0.1392969936132431, + "learning_rate": 0.0008599893281264854, + "loss": 2.781, + "step": 8490 + }, + { + "epoch": 0.2517866144767666, + "grad_norm": 0.12629248201847076, + "learning_rate": 0.0008599566733443358, + "loss": 2.7859, + "step": 8491 + }, + { + "epoch": 0.25181626782907807, + "grad_norm": 0.11857721209526062, + "learning_rate": 0.0008599240153747079, + "loss": 2.8211, + "step": 8492 + }, + { + "epoch": 0.25184592118138954, + "grad_norm": 0.12541858851909637, + "learning_rate": 0.0008598913542178912, + "loss": 2.7914, + "step": 8493 + }, + { + "epoch": 0.251875574533701, + "grad_norm": 0.12426881492137909, + "learning_rate": 0.0008598586898741747, + "loss": 2.7689, + "step": 8494 + }, + { + "epoch": 0.2519052278860125, + "grad_norm": 0.1319418102502823, + "learning_rate": 0.0008598260223438476, + "loss": 2.7969, + "step": 8495 + }, + { + "epoch": 0.25193488123832397, + "grad_norm": 0.1430584192276001, + "learning_rate": 0.0008597933516271997, + "loss": 2.7966, + "step": 8496 + }, + { + "epoch": 0.25196453459063545, + "grad_norm": 0.1681525856256485, + "learning_rate": 0.0008597606777245195, + "loss": 2.7848, + "step": 8497 + }, + { + "epoch": 0.2519941879429469, + "grad_norm": 0.1829146146774292, + "learning_rate": 0.000859728000636097, + "loss": 2.7824, + "step": 8498 + }, + { + "epoch": 0.25202384129525846, + "grad_norm": 0.1329571157693863, + "learning_rate": 0.0008596953203622213, + "loss": 2.7602, + "step": 8499 + }, + { + "epoch": 0.25205349464756993, + "grad_norm": 0.149760439991951, + "learning_rate": 0.0008596626369031817, + "loss": 2.7905, + "step": 8500 + }, + { + "epoch": 0.2520831479998814, + "grad_norm": 0.15783874690532684, + "learning_rate": 0.0008596299502592677, + "loss": 2.787, + "step": 8501 + }, + { + "epoch": 0.2521128013521929, + "grad_norm": 0.12848274409770966, + "learning_rate": 0.0008595972604307689, + "loss": 2.772, + "step": 8502 + }, + { + "epoch": 0.25214245470450436, + "grad_norm": 0.12542720139026642, + "learning_rate": 0.0008595645674179744, + "loss": 2.814, + "step": 8503 + }, + { + "epoch": 0.25217210805681584, + "grad_norm": 0.11513957381248474, + "learning_rate": 0.0008595318712211742, + "loss": 2.7856, + "step": 8504 + }, + { + "epoch": 0.2522017614091273, + "grad_norm": 0.1153762936592102, + "learning_rate": 0.0008594991718406574, + "loss": 2.8023, + "step": 8505 + }, + { + "epoch": 0.2522314147614388, + "grad_norm": 0.11292006075382233, + "learning_rate": 0.0008594664692767138, + "loss": 2.7863, + "step": 8506 + }, + { + "epoch": 0.25226106811375026, + "grad_norm": 0.1364138275384903, + "learning_rate": 0.0008594337635296329, + "loss": 2.7445, + "step": 8507 + }, + { + "epoch": 0.25229072146606174, + "grad_norm": 0.13153372704982758, + "learning_rate": 0.0008594010545997042, + "loss": 2.7825, + "step": 8508 + }, + { + "epoch": 0.2523203748183732, + "grad_norm": 0.12758517265319824, + "learning_rate": 0.0008593683424872176, + "loss": 2.803, + "step": 8509 + }, + { + "epoch": 0.2523500281706847, + "grad_norm": 0.15593348443508148, + "learning_rate": 0.0008593356271924626, + "loss": 2.7827, + "step": 8510 + }, + { + "epoch": 0.25237968152299617, + "grad_norm": 0.17953388392925262, + "learning_rate": 0.000859302908715729, + "loss": 2.7936, + "step": 8511 + }, + { + "epoch": 0.25240933487530764, + "grad_norm": 0.16634969413280487, + "learning_rate": 0.0008592701870573066, + "loss": 2.8035, + "step": 8512 + }, + { + "epoch": 0.2524389882276191, + "grad_norm": 0.13200348615646362, + "learning_rate": 0.0008592374622174848, + "loss": 2.806, + "step": 8513 + }, + { + "epoch": 0.2524686415799306, + "grad_norm": 0.12936057150363922, + "learning_rate": 0.0008592047341965536, + "loss": 2.7537, + "step": 8514 + }, + { + "epoch": 0.2524982949322421, + "grad_norm": 0.13011987507343292, + "learning_rate": 0.0008591720029948029, + "loss": 2.7973, + "step": 8515 + }, + { + "epoch": 0.25252794828455355, + "grad_norm": 0.14273494482040405, + "learning_rate": 0.0008591392686125225, + "loss": 2.8135, + "step": 8516 + }, + { + "epoch": 0.252557601636865, + "grad_norm": 0.14319279789924622, + "learning_rate": 0.0008591065310500021, + "loss": 2.819, + "step": 8517 + }, + { + "epoch": 0.2525872549891765, + "grad_norm": 0.12651991844177246, + "learning_rate": 0.0008590737903075319, + "loss": 2.7693, + "step": 8518 + }, + { + "epoch": 0.252616908341488, + "grad_norm": 0.126276895403862, + "learning_rate": 0.0008590410463854014, + "loss": 2.7635, + "step": 8519 + }, + { + "epoch": 0.2526465616937995, + "grad_norm": 0.12144540995359421, + "learning_rate": 0.0008590082992839011, + "loss": 2.7634, + "step": 8520 + }, + { + "epoch": 0.252676215046111, + "grad_norm": 0.13055983185768127, + "learning_rate": 0.0008589755490033207, + "loss": 2.7767, + "step": 8521 + }, + { + "epoch": 0.25270586839842246, + "grad_norm": 0.12799032032489777, + "learning_rate": 0.00085894279554395, + "loss": 2.815, + "step": 8522 + }, + { + "epoch": 0.25273552175073394, + "grad_norm": 0.1236441507935524, + "learning_rate": 0.0008589100389060794, + "loss": 2.789, + "step": 8523 + }, + { + "epoch": 0.2527651751030454, + "grad_norm": 0.13058871030807495, + "learning_rate": 0.0008588772790899986, + "loss": 2.8179, + "step": 8524 + }, + { + "epoch": 0.2527948284553569, + "grad_norm": 0.1518031507730484, + "learning_rate": 0.0008588445160959979, + "loss": 2.8029, + "step": 8525 + }, + { + "epoch": 0.25282448180766837, + "grad_norm": 0.17436590790748596, + "learning_rate": 0.0008588117499243675, + "loss": 2.7966, + "step": 8526 + }, + { + "epoch": 0.25285413515997984, + "grad_norm": 0.1665755659341812, + "learning_rate": 0.0008587789805753975, + "loss": 2.7421, + "step": 8527 + }, + { + "epoch": 0.2528837885122913, + "grad_norm": 0.12991595268249512, + "learning_rate": 0.0008587462080493779, + "loss": 2.752, + "step": 8528 + }, + { + "epoch": 0.2529134418646028, + "grad_norm": 0.11899667978286743, + "learning_rate": 0.0008587134323465993, + "loss": 2.8175, + "step": 8529 + }, + { + "epoch": 0.25294309521691427, + "grad_norm": 0.12953966856002808, + "learning_rate": 0.0008586806534673514, + "loss": 2.7907, + "step": 8530 + }, + { + "epoch": 0.25297274856922575, + "grad_norm": 0.11615154892206192, + "learning_rate": 0.000858647871411925, + "loss": 2.7572, + "step": 8531 + }, + { + "epoch": 0.2530024019215372, + "grad_norm": 0.11750166118144989, + "learning_rate": 0.00085861508618061, + "loss": 2.7716, + "step": 8532 + }, + { + "epoch": 0.2530320552738487, + "grad_norm": 0.12760643661022186, + "learning_rate": 0.0008585822977736969, + "loss": 2.7859, + "step": 8533 + }, + { + "epoch": 0.2530617086261602, + "grad_norm": 0.14689844846725464, + "learning_rate": 0.000858549506191476, + "loss": 2.7432, + "step": 8534 + }, + { + "epoch": 0.25309136197847165, + "grad_norm": 0.17536751925945282, + "learning_rate": 0.0008585167114342376, + "loss": 2.7929, + "step": 8535 + }, + { + "epoch": 0.25312101533078313, + "grad_norm": 0.18986070156097412, + "learning_rate": 0.0008584839135022723, + "loss": 2.7931, + "step": 8536 + }, + { + "epoch": 0.2531506686830946, + "grad_norm": 0.16690437495708466, + "learning_rate": 0.0008584511123958704, + "loss": 2.7671, + "step": 8537 + }, + { + "epoch": 0.2531803220354061, + "grad_norm": 0.14189301431179047, + "learning_rate": 0.0008584183081153223, + "loss": 2.7739, + "step": 8538 + }, + { + "epoch": 0.25320997538771756, + "grad_norm": 0.16495272517204285, + "learning_rate": 0.0008583855006609186, + "loss": 2.7906, + "step": 8539 + }, + { + "epoch": 0.2532396287400291, + "grad_norm": 0.16013438999652863, + "learning_rate": 0.0008583526900329497, + "loss": 2.7508, + "step": 8540 + }, + { + "epoch": 0.25326928209234056, + "grad_norm": 0.12453947961330414, + "learning_rate": 0.0008583198762317064, + "loss": 2.7788, + "step": 8541 + }, + { + "epoch": 0.25329893544465204, + "grad_norm": 0.1556973159313202, + "learning_rate": 0.0008582870592574789, + "loss": 2.7874, + "step": 8542 + }, + { + "epoch": 0.2533285887969635, + "grad_norm": 0.15037299692630768, + "learning_rate": 0.0008582542391105581, + "loss": 2.7913, + "step": 8543 + }, + { + "epoch": 0.253358242149275, + "grad_norm": 0.15273715555667877, + "learning_rate": 0.0008582214157912345, + "loss": 2.8063, + "step": 8544 + }, + { + "epoch": 0.25338789550158647, + "grad_norm": 0.13261395692825317, + "learning_rate": 0.0008581885892997987, + "loss": 2.8201, + "step": 8545 + }, + { + "epoch": 0.25341754885389794, + "grad_norm": 0.14939278364181519, + "learning_rate": 0.0008581557596365416, + "loss": 2.7784, + "step": 8546 + }, + { + "epoch": 0.2534472022062094, + "grad_norm": 0.12249982357025146, + "learning_rate": 0.0008581229268017536, + "loss": 2.7625, + "step": 8547 + }, + { + "epoch": 0.2534768555585209, + "grad_norm": 0.1281004250049591, + "learning_rate": 0.0008580900907957258, + "loss": 2.7816, + "step": 8548 + }, + { + "epoch": 0.2535065089108324, + "grad_norm": 0.1207703948020935, + "learning_rate": 0.0008580572516187486, + "loss": 2.7517, + "step": 8549 + }, + { + "epoch": 0.25353616226314385, + "grad_norm": 0.11248201131820679, + "learning_rate": 0.0008580244092711132, + "loss": 2.7915, + "step": 8550 + }, + { + "epoch": 0.2535658156154553, + "grad_norm": 0.1231386661529541, + "learning_rate": 0.00085799156375311, + "loss": 2.7954, + "step": 8551 + }, + { + "epoch": 0.2535954689677668, + "grad_norm": 0.12950734794139862, + "learning_rate": 0.0008579587150650301, + "loss": 2.8044, + "step": 8552 + }, + { + "epoch": 0.2536251223200783, + "grad_norm": 0.15882182121276855, + "learning_rate": 0.0008579258632071643, + "loss": 2.7969, + "step": 8553 + }, + { + "epoch": 0.25365477567238975, + "grad_norm": 0.16929912567138672, + "learning_rate": 0.0008578930081798037, + "loss": 2.7859, + "step": 8554 + }, + { + "epoch": 0.25368442902470123, + "grad_norm": 0.17012585699558258, + "learning_rate": 0.0008578601499832389, + "loss": 2.7928, + "step": 8555 + }, + { + "epoch": 0.2537140823770127, + "grad_norm": 0.16555960476398468, + "learning_rate": 0.0008578272886177611, + "loss": 2.7837, + "step": 8556 + }, + { + "epoch": 0.2537437357293242, + "grad_norm": 0.15113338828086853, + "learning_rate": 0.0008577944240836614, + "loss": 2.7909, + "step": 8557 + }, + { + "epoch": 0.25377338908163566, + "grad_norm": 0.17102846503257751, + "learning_rate": 0.0008577615563812304, + "loss": 2.7851, + "step": 8558 + }, + { + "epoch": 0.25380304243394713, + "grad_norm": 0.15728814899921417, + "learning_rate": 0.0008577286855107596, + "loss": 2.7916, + "step": 8559 + }, + { + "epoch": 0.2538326957862586, + "grad_norm": 0.14633524417877197, + "learning_rate": 0.0008576958114725399, + "loss": 2.8509, + "step": 8560 + }, + { + "epoch": 0.25386234913857014, + "grad_norm": 0.1598093956708908, + "learning_rate": 0.0008576629342668623, + "loss": 2.8109, + "step": 8561 + }, + { + "epoch": 0.2538920024908816, + "grad_norm": 0.15916210412979126, + "learning_rate": 0.000857630053894018, + "loss": 2.797, + "step": 8562 + }, + { + "epoch": 0.2539216558431931, + "grad_norm": 0.14311885833740234, + "learning_rate": 0.0008575971703542981, + "loss": 2.7893, + "step": 8563 + }, + { + "epoch": 0.25395130919550457, + "grad_norm": 0.15495041012763977, + "learning_rate": 0.0008575642836479941, + "loss": 2.7874, + "step": 8564 + }, + { + "epoch": 0.25398096254781605, + "grad_norm": 0.13924168050289154, + "learning_rate": 0.0008575313937753969, + "loss": 2.7999, + "step": 8565 + }, + { + "epoch": 0.2540106159001275, + "grad_norm": 0.14315515756607056, + "learning_rate": 0.0008574985007367979, + "loss": 2.8005, + "step": 8566 + }, + { + "epoch": 0.254040269252439, + "grad_norm": 0.14684276282787323, + "learning_rate": 0.0008574656045324883, + "loss": 2.7732, + "step": 8567 + }, + { + "epoch": 0.2540699226047505, + "grad_norm": 0.13969098031520844, + "learning_rate": 0.0008574327051627593, + "loss": 2.7985, + "step": 8568 + }, + { + "epoch": 0.25409957595706195, + "grad_norm": 0.15480181574821472, + "learning_rate": 0.0008573998026279024, + "loss": 2.7888, + "step": 8569 + }, + { + "epoch": 0.25412922930937343, + "grad_norm": 0.14867079257965088, + "learning_rate": 0.0008573668969282089, + "loss": 2.7941, + "step": 8570 + }, + { + "epoch": 0.2541588826616849, + "grad_norm": 0.1325322687625885, + "learning_rate": 0.0008573339880639701, + "loss": 2.7936, + "step": 8571 + }, + { + "epoch": 0.2541885360139964, + "grad_norm": 0.12953604757785797, + "learning_rate": 0.0008573010760354776, + "loss": 2.8141, + "step": 8572 + }, + { + "epoch": 0.25421818936630786, + "grad_norm": 0.12884031236171722, + "learning_rate": 0.0008572681608430228, + "loss": 2.7879, + "step": 8573 + }, + { + "epoch": 0.25424784271861933, + "grad_norm": 0.138696551322937, + "learning_rate": 0.000857235242486897, + "loss": 2.7562, + "step": 8574 + }, + { + "epoch": 0.2542774960709308, + "grad_norm": 0.13464367389678955, + "learning_rate": 0.0008572023209673918, + "loss": 2.803, + "step": 8575 + }, + { + "epoch": 0.2543071494232423, + "grad_norm": 0.1450984925031662, + "learning_rate": 0.0008571693962847989, + "loss": 2.7488, + "step": 8576 + }, + { + "epoch": 0.25433680277555376, + "grad_norm": 0.13566502928733826, + "learning_rate": 0.0008571364684394095, + "loss": 2.8007, + "step": 8577 + }, + { + "epoch": 0.25436645612786524, + "grad_norm": 0.12783567607402802, + "learning_rate": 0.0008571035374315155, + "loss": 2.7724, + "step": 8578 + }, + { + "epoch": 0.2543961094801767, + "grad_norm": 0.13429702818393707, + "learning_rate": 0.0008570706032614083, + "loss": 2.7591, + "step": 8579 + }, + { + "epoch": 0.2544257628324882, + "grad_norm": 0.1378946453332901, + "learning_rate": 0.0008570376659293797, + "loss": 2.7729, + "step": 8580 + }, + { + "epoch": 0.25445541618479967, + "grad_norm": 0.14181400835514069, + "learning_rate": 0.0008570047254357211, + "loss": 2.8187, + "step": 8581 + }, + { + "epoch": 0.2544850695371112, + "grad_norm": 0.1541011929512024, + "learning_rate": 0.0008569717817807246, + "loss": 2.7785, + "step": 8582 + }, + { + "epoch": 0.2545147228894227, + "grad_norm": 0.15898047387599945, + "learning_rate": 0.0008569388349646816, + "loss": 2.7839, + "step": 8583 + }, + { + "epoch": 0.25454437624173415, + "grad_norm": 0.16495664417743683, + "learning_rate": 0.0008569058849878839, + "loss": 2.8279, + "step": 8584 + }, + { + "epoch": 0.2545740295940456, + "grad_norm": 0.15284420549869537, + "learning_rate": 0.0008568729318506234, + "loss": 2.7617, + "step": 8585 + }, + { + "epoch": 0.2546036829463571, + "grad_norm": 0.1536354422569275, + "learning_rate": 0.0008568399755531919, + "loss": 2.7623, + "step": 8586 + }, + { + "epoch": 0.2546333362986686, + "grad_norm": 0.14912615716457367, + "learning_rate": 0.0008568070160958809, + "loss": 2.8157, + "step": 8587 + }, + { + "epoch": 0.25466298965098005, + "grad_norm": 0.14573442935943604, + "learning_rate": 0.0008567740534789828, + "loss": 2.8112, + "step": 8588 + }, + { + "epoch": 0.25469264300329153, + "grad_norm": 0.15719860792160034, + "learning_rate": 0.0008567410877027891, + "loss": 2.7883, + "step": 8589 + }, + { + "epoch": 0.254722296355603, + "grad_norm": 0.14643539488315582, + "learning_rate": 0.0008567081187675918, + "loss": 2.7599, + "step": 8590 + }, + { + "epoch": 0.2547519497079145, + "grad_norm": 0.13746044039726257, + "learning_rate": 0.0008566751466736831, + "loss": 2.804, + "step": 8591 + }, + { + "epoch": 0.25478160306022596, + "grad_norm": 0.15375950932502747, + "learning_rate": 0.0008566421714213546, + "loss": 2.7388, + "step": 8592 + }, + { + "epoch": 0.25481125641253743, + "grad_norm": 0.12793578207492828, + "learning_rate": 0.0008566091930108983, + "loss": 2.7997, + "step": 8593 + }, + { + "epoch": 0.2548409097648489, + "grad_norm": 0.12280232459306717, + "learning_rate": 0.0008565762114426065, + "loss": 2.7841, + "step": 8594 + }, + { + "epoch": 0.2548705631171604, + "grad_norm": 0.112392857670784, + "learning_rate": 0.0008565432267167712, + "loss": 2.8102, + "step": 8595 + }, + { + "epoch": 0.25490021646947186, + "grad_norm": 0.13989262282848358, + "learning_rate": 0.0008565102388336845, + "loss": 2.765, + "step": 8596 + }, + { + "epoch": 0.25492986982178334, + "grad_norm": 0.1379697620868683, + "learning_rate": 0.0008564772477936383, + "loss": 2.8349, + "step": 8597 + }, + { + "epoch": 0.2549595231740948, + "grad_norm": 0.15369781851768494, + "learning_rate": 0.0008564442535969249, + "loss": 2.7828, + "step": 8598 + }, + { + "epoch": 0.2549891765264063, + "grad_norm": 0.1481625735759735, + "learning_rate": 0.0008564112562438364, + "loss": 2.8042, + "step": 8599 + }, + { + "epoch": 0.25501882987871777, + "grad_norm": 0.1604447215795517, + "learning_rate": 0.0008563782557346652, + "loss": 2.8071, + "step": 8600 + }, + { + "epoch": 0.25504848323102924, + "grad_norm": 0.16150586307048798, + "learning_rate": 0.0008563452520697032, + "loss": 2.7618, + "step": 8601 + }, + { + "epoch": 0.2550781365833407, + "grad_norm": 0.16373907029628754, + "learning_rate": 0.000856312245249243, + "loss": 2.812, + "step": 8602 + }, + { + "epoch": 0.25510778993565225, + "grad_norm": 0.16203998029232025, + "learning_rate": 0.0008562792352735766, + "loss": 2.8084, + "step": 8603 + }, + { + "epoch": 0.2551374432879637, + "grad_norm": 0.12904730439186096, + "learning_rate": 0.0008562462221429966, + "loss": 2.7923, + "step": 8604 + }, + { + "epoch": 0.2551670966402752, + "grad_norm": 0.1443416029214859, + "learning_rate": 0.000856213205857795, + "loss": 2.802, + "step": 8605 + }, + { + "epoch": 0.2551967499925867, + "grad_norm": 0.13798169791698456, + "learning_rate": 0.0008561801864182642, + "loss": 2.7785, + "step": 8606 + }, + { + "epoch": 0.25522640334489816, + "grad_norm": 0.12387076765298843, + "learning_rate": 0.0008561471638246968, + "loss": 2.7851, + "step": 8607 + }, + { + "epoch": 0.25525605669720963, + "grad_norm": 0.1370672732591629, + "learning_rate": 0.0008561141380773853, + "loss": 2.8003, + "step": 8608 + }, + { + "epoch": 0.2552857100495211, + "grad_norm": 0.14649197459220886, + "learning_rate": 0.0008560811091766218, + "loss": 2.7922, + "step": 8609 + }, + { + "epoch": 0.2553153634018326, + "grad_norm": 0.129961296916008, + "learning_rate": 0.000856048077122699, + "loss": 2.7805, + "step": 8610 + }, + { + "epoch": 0.25534501675414406, + "grad_norm": 0.11378464102745056, + "learning_rate": 0.0008560150419159094, + "loss": 2.753, + "step": 8611 + }, + { + "epoch": 0.25537467010645554, + "grad_norm": 0.1230548843741417, + "learning_rate": 0.0008559820035565454, + "loss": 2.781, + "step": 8612 + }, + { + "epoch": 0.255404323458767, + "grad_norm": 0.13369470834732056, + "learning_rate": 0.0008559489620448997, + "loss": 2.7857, + "step": 8613 + }, + { + "epoch": 0.2554339768110785, + "grad_norm": 0.11676029115915298, + "learning_rate": 0.000855915917381265, + "loss": 2.7859, + "step": 8614 + }, + { + "epoch": 0.25546363016338997, + "grad_norm": 0.12021756172180176, + "learning_rate": 0.0008558828695659336, + "loss": 2.7901, + "step": 8615 + }, + { + "epoch": 0.25549328351570144, + "grad_norm": 0.13149623572826385, + "learning_rate": 0.0008558498185991983, + "loss": 2.7968, + "step": 8616 + }, + { + "epoch": 0.2555229368680129, + "grad_norm": 0.15466921031475067, + "learning_rate": 0.0008558167644813517, + "loss": 2.7927, + "step": 8617 + }, + { + "epoch": 0.2555525902203244, + "grad_norm": 0.14696337282657623, + "learning_rate": 0.0008557837072126866, + "loss": 2.7597, + "step": 8618 + }, + { + "epoch": 0.25558224357263587, + "grad_norm": 0.12193167209625244, + "learning_rate": 0.0008557506467934959, + "loss": 2.7799, + "step": 8619 + }, + { + "epoch": 0.25561189692494735, + "grad_norm": 0.1386062502861023, + "learning_rate": 0.000855717583224072, + "loss": 2.7816, + "step": 8620 + }, + { + "epoch": 0.2556415502772588, + "grad_norm": 0.15278424322605133, + "learning_rate": 0.0008556845165047078, + "loss": 2.7445, + "step": 8621 + }, + { + "epoch": 0.2556712036295703, + "grad_norm": 0.15960773825645447, + "learning_rate": 0.0008556514466356963, + "loss": 2.7693, + "step": 8622 + }, + { + "epoch": 0.2557008569818818, + "grad_norm": 0.2061634510755539, + "learning_rate": 0.0008556183736173302, + "loss": 2.8153, + "step": 8623 + }, + { + "epoch": 0.2557305103341933, + "grad_norm": 0.21471531689167023, + "learning_rate": 0.0008555852974499023, + "loss": 2.7986, + "step": 8624 + }, + { + "epoch": 0.2557601636865048, + "grad_norm": 0.1896587461233139, + "learning_rate": 0.0008555522181337054, + "loss": 2.8038, + "step": 8625 + }, + { + "epoch": 0.25578981703881626, + "grad_norm": 0.15547651052474976, + "learning_rate": 0.0008555191356690329, + "loss": 2.8051, + "step": 8626 + }, + { + "epoch": 0.25581947039112773, + "grad_norm": 0.15051816403865814, + "learning_rate": 0.0008554860500561772, + "loss": 2.793, + "step": 8627 + }, + { + "epoch": 0.2558491237434392, + "grad_norm": 0.14819303154945374, + "learning_rate": 0.0008554529612954315, + "loss": 2.7878, + "step": 8628 + }, + { + "epoch": 0.2558787770957507, + "grad_norm": 0.13399310410022736, + "learning_rate": 0.0008554198693870889, + "loss": 2.818, + "step": 8629 + }, + { + "epoch": 0.25590843044806216, + "grad_norm": 0.12305665761232376, + "learning_rate": 0.0008553867743314423, + "loss": 2.7787, + "step": 8630 + }, + { + "epoch": 0.25593808380037364, + "grad_norm": 0.14356738328933716, + "learning_rate": 0.0008553536761287848, + "loss": 2.7993, + "step": 8631 + }, + { + "epoch": 0.2559677371526851, + "grad_norm": 0.14259015023708344, + "learning_rate": 0.0008553205747794095, + "loss": 2.7474, + "step": 8632 + }, + { + "epoch": 0.2559973905049966, + "grad_norm": 0.14897486567497253, + "learning_rate": 0.0008552874702836096, + "loss": 2.7796, + "step": 8633 + }, + { + "epoch": 0.25602704385730807, + "grad_norm": 0.13972778618335724, + "learning_rate": 0.0008552543626416783, + "loss": 2.7345, + "step": 8634 + }, + { + "epoch": 0.25605669720961954, + "grad_norm": 0.13548222184181213, + "learning_rate": 0.0008552212518539084, + "loss": 2.7936, + "step": 8635 + }, + { + "epoch": 0.256086350561931, + "grad_norm": 0.13103942573070526, + "learning_rate": 0.0008551881379205932, + "loss": 2.7795, + "step": 8636 + }, + { + "epoch": 0.2561160039142425, + "grad_norm": 0.11791559308767319, + "learning_rate": 0.0008551550208420265, + "loss": 2.7809, + "step": 8637 + }, + { + "epoch": 0.25614565726655397, + "grad_norm": 0.12970151007175446, + "learning_rate": 0.0008551219006185008, + "loss": 2.7971, + "step": 8638 + }, + { + "epoch": 0.25617531061886545, + "grad_norm": 0.13270241022109985, + "learning_rate": 0.0008550887772503097, + "loss": 2.7577, + "step": 8639 + }, + { + "epoch": 0.2562049639711769, + "grad_norm": 0.14638492465019226, + "learning_rate": 0.0008550556507377467, + "loss": 2.7968, + "step": 8640 + }, + { + "epoch": 0.2562346173234884, + "grad_norm": 0.12218699604272842, + "learning_rate": 0.000855022521081105, + "loss": 2.769, + "step": 8641 + }, + { + "epoch": 0.2562642706757999, + "grad_norm": 0.1295144259929657, + "learning_rate": 0.0008549893882806778, + "loss": 2.7672, + "step": 8642 + }, + { + "epoch": 0.25629392402811135, + "grad_norm": 0.13562418520450592, + "learning_rate": 0.0008549562523367586, + "loss": 2.7736, + "step": 8643 + }, + { + "epoch": 0.2563235773804229, + "grad_norm": 0.13872240483760834, + "learning_rate": 0.0008549231132496412, + "loss": 2.753, + "step": 8644 + }, + { + "epoch": 0.25635323073273436, + "grad_norm": 0.14597269892692566, + "learning_rate": 0.0008548899710196183, + "loss": 2.8005, + "step": 8645 + }, + { + "epoch": 0.25638288408504584, + "grad_norm": 0.14548395574092865, + "learning_rate": 0.000854856825646984, + "loss": 2.8166, + "step": 8646 + }, + { + "epoch": 0.2564125374373573, + "grad_norm": 0.12962740659713745, + "learning_rate": 0.0008548236771320314, + "loss": 2.7793, + "step": 8647 + }, + { + "epoch": 0.2564421907896688, + "grad_norm": 0.124795101583004, + "learning_rate": 0.0008547905254750545, + "loss": 2.8093, + "step": 8648 + }, + { + "epoch": 0.25647184414198027, + "grad_norm": 0.14469869434833527, + "learning_rate": 0.0008547573706763465, + "loss": 2.8003, + "step": 8649 + }, + { + "epoch": 0.25650149749429174, + "grad_norm": 0.1582089215517044, + "learning_rate": 0.000854724212736201, + "loss": 2.7282, + "step": 8650 + }, + { + "epoch": 0.2565311508466032, + "grad_norm": 0.14092375338077545, + "learning_rate": 0.0008546910516549118, + "loss": 2.7554, + "step": 8651 + }, + { + "epoch": 0.2565608041989147, + "grad_norm": 0.1407010704278946, + "learning_rate": 0.0008546578874327724, + "loss": 2.7999, + "step": 8652 + }, + { + "epoch": 0.25659045755122617, + "grad_norm": 0.14191482961177826, + "learning_rate": 0.0008546247200700765, + "loss": 2.7938, + "step": 8653 + }, + { + "epoch": 0.25662011090353765, + "grad_norm": 0.1447896957397461, + "learning_rate": 0.000854591549567118, + "loss": 2.7866, + "step": 8654 + }, + { + "epoch": 0.2566497642558491, + "grad_norm": 0.11977377533912659, + "learning_rate": 0.0008545583759241904, + "loss": 2.8027, + "step": 8655 + }, + { + "epoch": 0.2566794176081606, + "grad_norm": 0.13661420345306396, + "learning_rate": 0.0008545251991415874, + "loss": 2.7895, + "step": 8656 + }, + { + "epoch": 0.2567090709604721, + "grad_norm": 0.14085504412651062, + "learning_rate": 0.0008544920192196031, + "loss": 2.8039, + "step": 8657 + }, + { + "epoch": 0.25673872431278355, + "grad_norm": 0.14930161833763123, + "learning_rate": 0.0008544588361585309, + "loss": 2.7739, + "step": 8658 + }, + { + "epoch": 0.256768377665095, + "grad_norm": 0.15002664923667908, + "learning_rate": 0.0008544256499586649, + "loss": 2.7833, + "step": 8659 + }, + { + "epoch": 0.2567980310174065, + "grad_norm": 0.1406976729631424, + "learning_rate": 0.0008543924606202991, + "loss": 2.8113, + "step": 8660 + }, + { + "epoch": 0.256827684369718, + "grad_norm": 0.13524439930915833, + "learning_rate": 0.0008543592681437271, + "loss": 2.7827, + "step": 8661 + }, + { + "epoch": 0.25685733772202946, + "grad_norm": 0.13258983194828033, + "learning_rate": 0.000854326072529243, + "loss": 2.7898, + "step": 8662 + }, + { + "epoch": 0.25688699107434093, + "grad_norm": 0.15899567306041718, + "learning_rate": 0.0008542928737771407, + "loss": 2.8021, + "step": 8663 + }, + { + "epoch": 0.2569166444266524, + "grad_norm": 0.1647299975156784, + "learning_rate": 0.0008542596718877142, + "loss": 2.7973, + "step": 8664 + }, + { + "epoch": 0.25694629777896394, + "grad_norm": 0.15737813711166382, + "learning_rate": 0.0008542264668612575, + "loss": 2.7927, + "step": 8665 + }, + { + "epoch": 0.2569759511312754, + "grad_norm": 0.16002824902534485, + "learning_rate": 0.0008541932586980647, + "loss": 2.7782, + "step": 8666 + }, + { + "epoch": 0.2570056044835869, + "grad_norm": 0.13953882455825806, + "learning_rate": 0.0008541600473984297, + "loss": 2.7773, + "step": 8667 + }, + { + "epoch": 0.25703525783589837, + "grad_norm": 0.12847964465618134, + "learning_rate": 0.0008541268329626466, + "loss": 2.7864, + "step": 8668 + }, + { + "epoch": 0.25706491118820984, + "grad_norm": 0.11237989366054535, + "learning_rate": 0.0008540936153910097, + "loss": 2.7725, + "step": 8669 + }, + { + "epoch": 0.2570945645405213, + "grad_norm": 0.11955778300762177, + "learning_rate": 0.0008540603946838131, + "loss": 2.7638, + "step": 8670 + }, + { + "epoch": 0.2571242178928328, + "grad_norm": 0.13520725071430206, + "learning_rate": 0.000854027170841351, + "loss": 2.7668, + "step": 8671 + }, + { + "epoch": 0.25715387124514427, + "grad_norm": 0.14704564213752747, + "learning_rate": 0.0008539939438639174, + "loss": 2.7905, + "step": 8672 + }, + { + "epoch": 0.25718352459745575, + "grad_norm": 0.1262756586074829, + "learning_rate": 0.0008539607137518066, + "loss": 2.8178, + "step": 8673 + }, + { + "epoch": 0.2572131779497672, + "grad_norm": 0.1390070766210556, + "learning_rate": 0.0008539274805053131, + "loss": 2.7819, + "step": 8674 + }, + { + "epoch": 0.2572428313020787, + "grad_norm": 0.1543717384338379, + "learning_rate": 0.000853894244124731, + "loss": 2.7799, + "step": 8675 + }, + { + "epoch": 0.2572724846543902, + "grad_norm": 0.13546988368034363, + "learning_rate": 0.0008538610046103546, + "loss": 2.7931, + "step": 8676 + }, + { + "epoch": 0.25730213800670165, + "grad_norm": 0.14221970736980438, + "learning_rate": 0.0008538277619624782, + "loss": 2.7714, + "step": 8677 + }, + { + "epoch": 0.25733179135901313, + "grad_norm": 0.15188436210155487, + "learning_rate": 0.0008537945161813963, + "loss": 2.7608, + "step": 8678 + }, + { + "epoch": 0.2573614447113246, + "grad_norm": 0.14567930996418, + "learning_rate": 0.0008537612672674031, + "loss": 2.7789, + "step": 8679 + }, + { + "epoch": 0.2573910980636361, + "grad_norm": 0.13792632520198822, + "learning_rate": 0.0008537280152207933, + "loss": 2.8047, + "step": 8680 + }, + { + "epoch": 0.25742075141594756, + "grad_norm": 0.12869678437709808, + "learning_rate": 0.000853694760041861, + "loss": 2.8099, + "step": 8681 + }, + { + "epoch": 0.25745040476825903, + "grad_norm": 0.14922645688056946, + "learning_rate": 0.0008536615017309011, + "loss": 2.764, + "step": 8682 + }, + { + "epoch": 0.2574800581205705, + "grad_norm": 0.17384517192840576, + "learning_rate": 0.0008536282402882079, + "loss": 2.8178, + "step": 8683 + }, + { + "epoch": 0.257509711472882, + "grad_norm": 0.1632114201784134, + "learning_rate": 0.0008535949757140759, + "loss": 2.7935, + "step": 8684 + }, + { + "epoch": 0.25753936482519346, + "grad_norm": 0.1431022733449936, + "learning_rate": 0.0008535617080087997, + "loss": 2.779, + "step": 8685 + }, + { + "epoch": 0.257569018177505, + "grad_norm": 0.13939499855041504, + "learning_rate": 0.0008535284371726737, + "loss": 2.7692, + "step": 8686 + }, + { + "epoch": 0.25759867152981647, + "grad_norm": 0.1313496083021164, + "learning_rate": 0.000853495163205993, + "loss": 2.7639, + "step": 8687 + }, + { + "epoch": 0.25762832488212795, + "grad_norm": 0.14287105202674866, + "learning_rate": 0.0008534618861090517, + "loss": 2.7936, + "step": 8688 + }, + { + "epoch": 0.2576579782344394, + "grad_norm": 0.13775178790092468, + "learning_rate": 0.0008534286058821448, + "loss": 2.7512, + "step": 8689 + }, + { + "epoch": 0.2576876315867509, + "grad_norm": 0.1471312791109085, + "learning_rate": 0.0008533953225255671, + "loss": 2.7653, + "step": 8690 + }, + { + "epoch": 0.2577172849390624, + "grad_norm": 0.14007194340229034, + "learning_rate": 0.000853362036039613, + "loss": 2.7916, + "step": 8691 + }, + { + "epoch": 0.25774693829137385, + "grad_norm": 0.13144880533218384, + "learning_rate": 0.0008533287464245774, + "loss": 2.7897, + "step": 8692 + }, + { + "epoch": 0.2577765916436853, + "grad_norm": 0.13670045137405396, + "learning_rate": 0.0008532954536807552, + "loss": 2.7849, + "step": 8693 + }, + { + "epoch": 0.2578062449959968, + "grad_norm": 0.12806852161884308, + "learning_rate": 0.000853262157808441, + "loss": 2.794, + "step": 8694 + }, + { + "epoch": 0.2578358983483083, + "grad_norm": 0.12194091826677322, + "learning_rate": 0.0008532288588079299, + "loss": 2.7944, + "step": 8695 + }, + { + "epoch": 0.25786555170061976, + "grad_norm": 0.13042470812797546, + "learning_rate": 0.0008531955566795166, + "loss": 2.7518, + "step": 8696 + }, + { + "epoch": 0.25789520505293123, + "grad_norm": 0.14223122596740723, + "learning_rate": 0.0008531622514234959, + "loss": 2.7633, + "step": 8697 + }, + { + "epoch": 0.2579248584052427, + "grad_norm": 0.13980348408222198, + "learning_rate": 0.000853128943040163, + "loss": 2.7817, + "step": 8698 + }, + { + "epoch": 0.2579545117575542, + "grad_norm": 0.1332385390996933, + "learning_rate": 0.0008530956315298125, + "loss": 2.7811, + "step": 8699 + }, + { + "epoch": 0.25798416510986566, + "grad_norm": 0.14177601039409637, + "learning_rate": 0.0008530623168927397, + "loss": 2.7774, + "step": 8700 + }, + { + "epoch": 0.25801381846217714, + "grad_norm": 0.14145494997501373, + "learning_rate": 0.0008530289991292394, + "loss": 2.7939, + "step": 8701 + }, + { + "epoch": 0.2580434718144886, + "grad_norm": 0.14989936351776123, + "learning_rate": 0.0008529956782396069, + "loss": 2.826, + "step": 8702 + }, + { + "epoch": 0.2580731251668001, + "grad_norm": 0.13238917291164398, + "learning_rate": 0.000852962354224137, + "loss": 2.7843, + "step": 8703 + }, + { + "epoch": 0.25810277851911156, + "grad_norm": 0.1268533617258072, + "learning_rate": 0.0008529290270831247, + "loss": 2.7514, + "step": 8704 + }, + { + "epoch": 0.25813243187142304, + "grad_norm": 0.12530049681663513, + "learning_rate": 0.0008528956968168655, + "loss": 2.7639, + "step": 8705 + }, + { + "epoch": 0.2581620852237345, + "grad_norm": 0.14050395786762238, + "learning_rate": 0.0008528623634256543, + "loss": 2.7676, + "step": 8706 + }, + { + "epoch": 0.25819173857604605, + "grad_norm": 0.14114682376384735, + "learning_rate": 0.0008528290269097863, + "loss": 2.7473, + "step": 8707 + }, + { + "epoch": 0.2582213919283575, + "grad_norm": 0.1398865431547165, + "learning_rate": 0.0008527956872695565, + "loss": 2.7546, + "step": 8708 + }, + { + "epoch": 0.258251045280669, + "grad_norm": 0.14397667348384857, + "learning_rate": 0.0008527623445052604, + "loss": 2.8051, + "step": 8709 + }, + { + "epoch": 0.2582806986329805, + "grad_norm": 0.17371754348278046, + "learning_rate": 0.0008527289986171934, + "loss": 2.8215, + "step": 8710 + }, + { + "epoch": 0.25831035198529195, + "grad_norm": 0.154056116938591, + "learning_rate": 0.0008526956496056504, + "loss": 2.8015, + "step": 8711 + }, + { + "epoch": 0.25834000533760343, + "grad_norm": 0.13370957970619202, + "learning_rate": 0.0008526622974709269, + "loss": 2.7811, + "step": 8712 + }, + { + "epoch": 0.2583696586899149, + "grad_norm": 0.1425231695175171, + "learning_rate": 0.0008526289422133182, + "loss": 2.7921, + "step": 8713 + }, + { + "epoch": 0.2583993120422264, + "grad_norm": 0.14774224162101746, + "learning_rate": 0.0008525955838331198, + "loss": 2.7956, + "step": 8714 + }, + { + "epoch": 0.25842896539453786, + "grad_norm": 0.17224889993667603, + "learning_rate": 0.0008525622223306269, + "loss": 2.7702, + "step": 8715 + }, + { + "epoch": 0.25845861874684933, + "grad_norm": 0.13807255029678345, + "learning_rate": 0.0008525288577061349, + "loss": 2.8025, + "step": 8716 + }, + { + "epoch": 0.2584882720991608, + "grad_norm": 0.12805737555027008, + "learning_rate": 0.0008524954899599397, + "loss": 2.7949, + "step": 8717 + }, + { + "epoch": 0.2585179254514723, + "grad_norm": 0.13012602925300598, + "learning_rate": 0.0008524621190923362, + "loss": 2.7751, + "step": 8718 + }, + { + "epoch": 0.25854757880378376, + "grad_norm": 0.128680020570755, + "learning_rate": 0.0008524287451036201, + "loss": 2.8126, + "step": 8719 + }, + { + "epoch": 0.25857723215609524, + "grad_norm": 0.13167241215705872, + "learning_rate": 0.000852395367994087, + "loss": 2.7576, + "step": 8720 + }, + { + "epoch": 0.2586068855084067, + "grad_norm": 0.13772252202033997, + "learning_rate": 0.0008523619877640325, + "loss": 2.729, + "step": 8721 + }, + { + "epoch": 0.2586365388607182, + "grad_norm": 0.13084253668785095, + "learning_rate": 0.0008523286044137521, + "loss": 2.805, + "step": 8722 + }, + { + "epoch": 0.25866619221302967, + "grad_norm": 0.12930621206760406, + "learning_rate": 0.0008522952179435412, + "loss": 2.77, + "step": 8723 + }, + { + "epoch": 0.25869584556534114, + "grad_norm": 0.13670513033866882, + "learning_rate": 0.0008522618283536961, + "loss": 2.7542, + "step": 8724 + }, + { + "epoch": 0.2587254989176526, + "grad_norm": 0.14381609857082367, + "learning_rate": 0.0008522284356445118, + "loss": 2.7873, + "step": 8725 + }, + { + "epoch": 0.2587551522699641, + "grad_norm": 0.15103158354759216, + "learning_rate": 0.0008521950398162842, + "loss": 2.7253, + "step": 8726 + }, + { + "epoch": 0.25878480562227557, + "grad_norm": 0.1470465064048767, + "learning_rate": 0.0008521616408693092, + "loss": 2.7984, + "step": 8727 + }, + { + "epoch": 0.2588144589745871, + "grad_norm": 0.14673668146133423, + "learning_rate": 0.0008521282388038822, + "loss": 2.7676, + "step": 8728 + }, + { + "epoch": 0.2588441123268986, + "grad_norm": 0.15344028174877167, + "learning_rate": 0.0008520948336202994, + "loss": 2.783, + "step": 8729 + }, + { + "epoch": 0.25887376567921005, + "grad_norm": 0.13498739898204803, + "learning_rate": 0.0008520614253188563, + "loss": 2.7812, + "step": 8730 + }, + { + "epoch": 0.25890341903152153, + "grad_norm": 0.13626728951931, + "learning_rate": 0.0008520280138998489, + "loss": 2.7687, + "step": 8731 + }, + { + "epoch": 0.258933072383833, + "grad_norm": 0.15512984991073608, + "learning_rate": 0.0008519945993635731, + "loss": 2.8142, + "step": 8732 + }, + { + "epoch": 0.2589627257361445, + "grad_norm": 0.1318402886390686, + "learning_rate": 0.0008519611817103246, + "loss": 2.7588, + "step": 8733 + }, + { + "epoch": 0.25899237908845596, + "grad_norm": 0.1370951384305954, + "learning_rate": 0.0008519277609403995, + "loss": 2.7925, + "step": 8734 + }, + { + "epoch": 0.25902203244076744, + "grad_norm": 0.1432701051235199, + "learning_rate": 0.0008518943370540935, + "loss": 2.7931, + "step": 8735 + }, + { + "epoch": 0.2590516857930789, + "grad_norm": 0.12498553842306137, + "learning_rate": 0.000851860910051703, + "loss": 2.814, + "step": 8736 + }, + { + "epoch": 0.2590813391453904, + "grad_norm": 0.13423089683055878, + "learning_rate": 0.0008518274799335235, + "loss": 2.7483, + "step": 8737 + }, + { + "epoch": 0.25911099249770186, + "grad_norm": 0.14560386538505554, + "learning_rate": 0.0008517940466998515, + "loss": 2.7771, + "step": 8738 + }, + { + "epoch": 0.25914064585001334, + "grad_norm": 0.1632617861032486, + "learning_rate": 0.0008517606103509828, + "loss": 2.803, + "step": 8739 + }, + { + "epoch": 0.2591702992023248, + "grad_norm": 0.16352200508117676, + "learning_rate": 0.0008517271708872133, + "loss": 2.7848, + "step": 8740 + }, + { + "epoch": 0.2591999525546363, + "grad_norm": 0.15569241344928741, + "learning_rate": 0.0008516937283088394, + "loss": 2.8018, + "step": 8741 + }, + { + "epoch": 0.25922960590694777, + "grad_norm": 0.12385226041078568, + "learning_rate": 0.0008516602826161572, + "loss": 2.7846, + "step": 8742 + }, + { + "epoch": 0.25925925925925924, + "grad_norm": 0.13691431283950806, + "learning_rate": 0.000851626833809463, + "loss": 2.756, + "step": 8743 + }, + { + "epoch": 0.2592889126115707, + "grad_norm": 0.1378019005060196, + "learning_rate": 0.0008515933818890527, + "loss": 2.7755, + "step": 8744 + }, + { + "epoch": 0.2593185659638822, + "grad_norm": 0.12806113064289093, + "learning_rate": 0.0008515599268552226, + "loss": 2.7801, + "step": 8745 + }, + { + "epoch": 0.2593482193161937, + "grad_norm": 0.13215990364551544, + "learning_rate": 0.0008515264687082692, + "loss": 2.8082, + "step": 8746 + }, + { + "epoch": 0.25937787266850515, + "grad_norm": 0.1544182151556015, + "learning_rate": 0.0008514930074484883, + "loss": 2.7837, + "step": 8747 + }, + { + "epoch": 0.2594075260208166, + "grad_norm": 0.15969260036945343, + "learning_rate": 0.0008514595430761764, + "loss": 2.8269, + "step": 8748 + }, + { + "epoch": 0.25943717937312816, + "grad_norm": 0.14543063938617706, + "learning_rate": 0.0008514260755916304, + "loss": 2.7929, + "step": 8749 + }, + { + "epoch": 0.25946683272543963, + "grad_norm": 0.12347400933504105, + "learning_rate": 0.0008513926049951459, + "loss": 2.7676, + "step": 8750 + }, + { + "epoch": 0.2594964860777511, + "grad_norm": 0.15750806033611298, + "learning_rate": 0.0008513591312870194, + "loss": 2.7998, + "step": 8751 + }, + { + "epoch": 0.2595261394300626, + "grad_norm": 0.17793512344360352, + "learning_rate": 0.0008513256544675479, + "loss": 2.8027, + "step": 8752 + }, + { + "epoch": 0.25955579278237406, + "grad_norm": 0.16217133402824402, + "learning_rate": 0.000851292174537027, + "loss": 2.8125, + "step": 8753 + }, + { + "epoch": 0.25958544613468554, + "grad_norm": 0.12354885786771774, + "learning_rate": 0.0008512586914957538, + "loss": 2.8066, + "step": 8754 + }, + { + "epoch": 0.259615099486997, + "grad_norm": 0.13156484067440033, + "learning_rate": 0.0008512252053440246, + "loss": 2.7744, + "step": 8755 + }, + { + "epoch": 0.2596447528393085, + "grad_norm": 0.1374097615480423, + "learning_rate": 0.0008511917160821358, + "loss": 2.8044, + "step": 8756 + }, + { + "epoch": 0.25967440619161997, + "grad_norm": 0.11487317085266113, + "learning_rate": 0.0008511582237103843, + "loss": 2.7916, + "step": 8757 + }, + { + "epoch": 0.25970405954393144, + "grad_norm": 0.13301026821136475, + "learning_rate": 0.0008511247282290664, + "loss": 2.8065, + "step": 8758 + }, + { + "epoch": 0.2597337128962429, + "grad_norm": 0.14213348925113678, + "learning_rate": 0.0008510912296384786, + "loss": 2.8117, + "step": 8759 + }, + { + "epoch": 0.2597633662485544, + "grad_norm": 0.10907461494207382, + "learning_rate": 0.0008510577279389178, + "loss": 2.7903, + "step": 8760 + }, + { + "epoch": 0.25979301960086587, + "grad_norm": 0.11444790661334991, + "learning_rate": 0.0008510242231306805, + "loss": 2.7716, + "step": 8761 + }, + { + "epoch": 0.25982267295317735, + "grad_norm": 0.11125043034553528, + "learning_rate": 0.0008509907152140635, + "loss": 2.7646, + "step": 8762 + }, + { + "epoch": 0.2598523263054888, + "grad_norm": 0.11433060467243195, + "learning_rate": 0.0008509572041893635, + "loss": 2.799, + "step": 8763 + }, + { + "epoch": 0.2598819796578003, + "grad_norm": 0.12543946504592896, + "learning_rate": 0.0008509236900568772, + "loss": 2.7615, + "step": 8764 + }, + { + "epoch": 0.2599116330101118, + "grad_norm": 0.13639618456363678, + "learning_rate": 0.0008508901728169014, + "loss": 2.762, + "step": 8765 + }, + { + "epoch": 0.25994128636242325, + "grad_norm": 0.12118500471115112, + "learning_rate": 0.0008508566524697327, + "loss": 2.7275, + "step": 8766 + }, + { + "epoch": 0.25997093971473473, + "grad_norm": 0.11716512590646744, + "learning_rate": 0.0008508231290156684, + "loss": 2.8111, + "step": 8767 + }, + { + "epoch": 0.2600005930670462, + "grad_norm": 0.13769222795963287, + "learning_rate": 0.000850789602455005, + "loss": 2.7729, + "step": 8768 + }, + { + "epoch": 0.26003024641935774, + "grad_norm": 0.13275425136089325, + "learning_rate": 0.0008507560727880393, + "loss": 2.8012, + "step": 8769 + }, + { + "epoch": 0.2600598997716692, + "grad_norm": 0.13769900798797607, + "learning_rate": 0.0008507225400150685, + "loss": 2.7685, + "step": 8770 + }, + { + "epoch": 0.2600895531239807, + "grad_norm": 0.15650750696659088, + "learning_rate": 0.0008506890041363895, + "loss": 2.7566, + "step": 8771 + }, + { + "epoch": 0.26011920647629216, + "grad_norm": 0.18212027847766876, + "learning_rate": 0.0008506554651522991, + "loss": 2.7818, + "step": 8772 + }, + { + "epoch": 0.26014885982860364, + "grad_norm": 0.20391811430454254, + "learning_rate": 0.0008506219230630941, + "loss": 2.7838, + "step": 8773 + }, + { + "epoch": 0.2601785131809151, + "grad_norm": 0.19480523467063904, + "learning_rate": 0.0008505883778690722, + "loss": 2.7921, + "step": 8774 + }, + { + "epoch": 0.2602081665332266, + "grad_norm": 0.20723280310630798, + "learning_rate": 0.0008505548295705298, + "loss": 2.7989, + "step": 8775 + }, + { + "epoch": 0.26023781988553807, + "grad_norm": 0.23071785271167755, + "learning_rate": 0.0008505212781677642, + "loss": 2.7969, + "step": 8776 + }, + { + "epoch": 0.26026747323784954, + "grad_norm": 0.23268942534923553, + "learning_rate": 0.0008504877236610726, + "loss": 2.76, + "step": 8777 + }, + { + "epoch": 0.260297126590161, + "grad_norm": 0.19378286600112915, + "learning_rate": 0.0008504541660507521, + "loss": 2.7687, + "step": 8778 + }, + { + "epoch": 0.2603267799424725, + "grad_norm": 0.16068612039089203, + "learning_rate": 0.0008504206053370997, + "loss": 2.7818, + "step": 8779 + }, + { + "epoch": 0.260356433294784, + "grad_norm": 0.18968132138252258, + "learning_rate": 0.0008503870415204127, + "loss": 2.7886, + "step": 8780 + }, + { + "epoch": 0.26038608664709545, + "grad_norm": 0.15634594857692719, + "learning_rate": 0.0008503534746009884, + "loss": 2.7769, + "step": 8781 + }, + { + "epoch": 0.2604157399994069, + "grad_norm": 0.13360220193862915, + "learning_rate": 0.0008503199045791239, + "loss": 2.7343, + "step": 8782 + }, + { + "epoch": 0.2604453933517184, + "grad_norm": 0.1504296213388443, + "learning_rate": 0.0008502863314551164, + "loss": 2.768, + "step": 8783 + }, + { + "epoch": 0.2604750467040299, + "grad_norm": 0.1296158730983734, + "learning_rate": 0.0008502527552292634, + "loss": 2.7781, + "step": 8784 + }, + { + "epoch": 0.26050470005634135, + "grad_norm": 0.1344740092754364, + "learning_rate": 0.0008502191759018621, + "loss": 2.7808, + "step": 8785 + }, + { + "epoch": 0.26053435340865283, + "grad_norm": 0.1193300187587738, + "learning_rate": 0.0008501855934732099, + "loss": 2.7988, + "step": 8786 + }, + { + "epoch": 0.2605640067609643, + "grad_norm": 0.1109638586640358, + "learning_rate": 0.0008501520079436043, + "loss": 2.794, + "step": 8787 + }, + { + "epoch": 0.2605936601132758, + "grad_norm": 0.1159617081284523, + "learning_rate": 0.0008501184193133425, + "loss": 2.7169, + "step": 8788 + }, + { + "epoch": 0.26062331346558726, + "grad_norm": 0.12492773681879044, + "learning_rate": 0.0008500848275827217, + "loss": 2.8115, + "step": 8789 + }, + { + "epoch": 0.2606529668178988, + "grad_norm": 0.11913501471281052, + "learning_rate": 0.00085005123275204, + "loss": 2.8042, + "step": 8790 + }, + { + "epoch": 0.26068262017021027, + "grad_norm": 0.12696649134159088, + "learning_rate": 0.0008500176348215945, + "loss": 2.815, + "step": 8791 + }, + { + "epoch": 0.26071227352252174, + "grad_norm": 0.1355697363615036, + "learning_rate": 0.0008499840337916827, + "loss": 2.7909, + "step": 8792 + }, + { + "epoch": 0.2607419268748332, + "grad_norm": 0.1265234351158142, + "learning_rate": 0.0008499504296626022, + "loss": 2.7817, + "step": 8793 + }, + { + "epoch": 0.2607715802271447, + "grad_norm": 0.12858207523822784, + "learning_rate": 0.0008499168224346505, + "loss": 2.7746, + "step": 8794 + }, + { + "epoch": 0.26080123357945617, + "grad_norm": 0.12596328556537628, + "learning_rate": 0.0008498832121081255, + "loss": 2.8114, + "step": 8795 + }, + { + "epoch": 0.26083088693176765, + "grad_norm": 0.142664834856987, + "learning_rate": 0.0008498495986833244, + "loss": 2.7676, + "step": 8796 + }, + { + "epoch": 0.2608605402840791, + "grad_norm": 0.12653984129428864, + "learning_rate": 0.0008498159821605451, + "loss": 2.7818, + "step": 8797 + }, + { + "epoch": 0.2608901936363906, + "grad_norm": 0.11409281939268112, + "learning_rate": 0.0008497823625400853, + "loss": 2.7708, + "step": 8798 + }, + { + "epoch": 0.2609198469887021, + "grad_norm": 0.11981035768985748, + "learning_rate": 0.0008497487398222425, + "loss": 2.8213, + "step": 8799 + }, + { + "epoch": 0.26094950034101355, + "grad_norm": 0.13217061758041382, + "learning_rate": 0.0008497151140073147, + "loss": 2.7941, + "step": 8800 + }, + { + "epoch": 0.26097915369332503, + "grad_norm": 0.14312821626663208, + "learning_rate": 0.0008496814850955996, + "loss": 2.7957, + "step": 8801 + }, + { + "epoch": 0.2610088070456365, + "grad_norm": 0.13218393921852112, + "learning_rate": 0.0008496478530873948, + "loss": 2.7447, + "step": 8802 + }, + { + "epoch": 0.261038460397948, + "grad_norm": 0.14886638522148132, + "learning_rate": 0.0008496142179829984, + "loss": 2.7914, + "step": 8803 + }, + { + "epoch": 0.26106811375025946, + "grad_norm": 0.1459992527961731, + "learning_rate": 0.0008495805797827079, + "loss": 2.7424, + "step": 8804 + }, + { + "epoch": 0.26109776710257093, + "grad_norm": 0.195912167429924, + "learning_rate": 0.0008495469384868214, + "loss": 2.8104, + "step": 8805 + }, + { + "epoch": 0.2611274204548824, + "grad_norm": 0.17645291984081268, + "learning_rate": 0.0008495132940956367, + "loss": 2.8507, + "step": 8806 + }, + { + "epoch": 0.2611570738071939, + "grad_norm": 0.1649557650089264, + "learning_rate": 0.000849479646609452, + "loss": 2.7871, + "step": 8807 + }, + { + "epoch": 0.26118672715950536, + "grad_norm": 0.16282692551612854, + "learning_rate": 0.000849445996028565, + "loss": 2.769, + "step": 8808 + }, + { + "epoch": 0.26121638051181684, + "grad_norm": 0.16717487573623657, + "learning_rate": 0.0008494123423532736, + "loss": 2.7968, + "step": 8809 + }, + { + "epoch": 0.2612460338641283, + "grad_norm": 0.1740255206823349, + "learning_rate": 0.0008493786855838759, + "loss": 2.7701, + "step": 8810 + }, + { + "epoch": 0.26127568721643984, + "grad_norm": 0.15812283754348755, + "learning_rate": 0.0008493450257206701, + "loss": 2.8091, + "step": 8811 + }, + { + "epoch": 0.2613053405687513, + "grad_norm": 0.14760860800743103, + "learning_rate": 0.000849311362763954, + "loss": 2.8192, + "step": 8812 + }, + { + "epoch": 0.2613349939210628, + "grad_norm": 0.16258513927459717, + "learning_rate": 0.0008492776967140259, + "loss": 2.7798, + "step": 8813 + }, + { + "epoch": 0.2613646472733743, + "grad_norm": 0.11730722337961197, + "learning_rate": 0.0008492440275711839, + "loss": 2.7978, + "step": 8814 + }, + { + "epoch": 0.26139430062568575, + "grad_norm": 0.12746776640415192, + "learning_rate": 0.0008492103553357261, + "loss": 2.8021, + "step": 8815 + }, + { + "epoch": 0.2614239539779972, + "grad_norm": 0.13817796111106873, + "learning_rate": 0.0008491766800079505, + "loss": 2.7621, + "step": 8816 + }, + { + "epoch": 0.2614536073303087, + "grad_norm": 0.12902237474918365, + "learning_rate": 0.0008491430015881556, + "loss": 2.8183, + "step": 8817 + }, + { + "epoch": 0.2614832606826202, + "grad_norm": 0.1417708843946457, + "learning_rate": 0.0008491093200766395, + "loss": 2.8099, + "step": 8818 + }, + { + "epoch": 0.26151291403493165, + "grad_norm": 0.14025835692882538, + "learning_rate": 0.0008490756354737004, + "loss": 2.7601, + "step": 8819 + }, + { + "epoch": 0.26154256738724313, + "grad_norm": 0.11408248543739319, + "learning_rate": 0.0008490419477796366, + "loss": 2.771, + "step": 8820 + }, + { + "epoch": 0.2615722207395546, + "grad_norm": 0.1259460151195526, + "learning_rate": 0.0008490082569947465, + "loss": 2.799, + "step": 8821 + }, + { + "epoch": 0.2616018740918661, + "grad_norm": 0.12915928661823273, + "learning_rate": 0.0008489745631193285, + "loss": 2.8117, + "step": 8822 + }, + { + "epoch": 0.26163152744417756, + "grad_norm": 0.11984114348888397, + "learning_rate": 0.0008489408661536806, + "loss": 2.8106, + "step": 8823 + }, + { + "epoch": 0.26166118079648903, + "grad_norm": 0.11452449858188629, + "learning_rate": 0.0008489071660981015, + "loss": 2.7756, + "step": 8824 + }, + { + "epoch": 0.2616908341488005, + "grad_norm": 0.10833875089883804, + "learning_rate": 0.0008488734629528894, + "loss": 2.7748, + "step": 8825 + }, + { + "epoch": 0.261720487501112, + "grad_norm": 0.11852331459522247, + "learning_rate": 0.0008488397567183433, + "loss": 2.7847, + "step": 8826 + }, + { + "epoch": 0.26175014085342346, + "grad_norm": 0.12685300409793854, + "learning_rate": 0.000848806047394761, + "loss": 2.7499, + "step": 8827 + }, + { + "epoch": 0.26177979420573494, + "grad_norm": 0.1197337880730629, + "learning_rate": 0.0008487723349824413, + "loss": 2.7694, + "step": 8828 + }, + { + "epoch": 0.2618094475580464, + "grad_norm": 0.1218583807349205, + "learning_rate": 0.0008487386194816829, + "loss": 2.7566, + "step": 8829 + }, + { + "epoch": 0.2618391009103579, + "grad_norm": 0.13412950932979584, + "learning_rate": 0.0008487049008927838, + "loss": 2.7745, + "step": 8830 + }, + { + "epoch": 0.26186875426266937, + "grad_norm": 0.1598295420408249, + "learning_rate": 0.0008486711792160432, + "loss": 2.7485, + "step": 8831 + }, + { + "epoch": 0.2618984076149809, + "grad_norm": 0.177462637424469, + "learning_rate": 0.0008486374544517594, + "loss": 2.7943, + "step": 8832 + }, + { + "epoch": 0.2619280609672924, + "grad_norm": 0.1652199625968933, + "learning_rate": 0.0008486037266002311, + "loss": 2.7829, + "step": 8833 + }, + { + "epoch": 0.26195771431960385, + "grad_norm": 0.17454977333545685, + "learning_rate": 0.0008485699956617571, + "loss": 2.7984, + "step": 8834 + }, + { + "epoch": 0.2619873676719153, + "grad_norm": 0.1725534051656723, + "learning_rate": 0.0008485362616366359, + "loss": 2.7451, + "step": 8835 + }, + { + "epoch": 0.2620170210242268, + "grad_norm": 0.14805710315704346, + "learning_rate": 0.000848502524525166, + "loss": 2.7647, + "step": 8836 + }, + { + "epoch": 0.2620466743765383, + "grad_norm": 0.13131746649742126, + "learning_rate": 0.0008484687843276469, + "loss": 2.7783, + "step": 8837 + }, + { + "epoch": 0.26207632772884976, + "grad_norm": 0.15299727022647858, + "learning_rate": 0.0008484350410443764, + "loss": 2.7989, + "step": 8838 + }, + { + "epoch": 0.26210598108116123, + "grad_norm": 0.15761211514472961, + "learning_rate": 0.000848401294675654, + "loss": 2.7546, + "step": 8839 + }, + { + "epoch": 0.2621356344334727, + "grad_norm": 0.1392894983291626, + "learning_rate": 0.0008483675452217785, + "loss": 2.7735, + "step": 8840 + }, + { + "epoch": 0.2621652877857842, + "grad_norm": 0.1640154868364334, + "learning_rate": 0.0008483337926830486, + "loss": 2.725, + "step": 8841 + }, + { + "epoch": 0.26219494113809566, + "grad_norm": 0.174744114279747, + "learning_rate": 0.0008483000370597629, + "loss": 2.7704, + "step": 8842 + }, + { + "epoch": 0.26222459449040714, + "grad_norm": 0.15357817709445953, + "learning_rate": 0.0008482662783522208, + "loss": 2.7518, + "step": 8843 + }, + { + "epoch": 0.2622542478427186, + "grad_norm": 0.1340361386537552, + "learning_rate": 0.0008482325165607208, + "loss": 2.7964, + "step": 8844 + }, + { + "epoch": 0.2622839011950301, + "grad_norm": 0.13762232661247253, + "learning_rate": 0.0008481987516855624, + "loss": 2.7731, + "step": 8845 + }, + { + "epoch": 0.26231355454734157, + "grad_norm": 0.13333363831043243, + "learning_rate": 0.000848164983727044, + "loss": 2.7705, + "step": 8846 + }, + { + "epoch": 0.26234320789965304, + "grad_norm": 0.12514320015907288, + "learning_rate": 0.0008481312126854652, + "loss": 2.7489, + "step": 8847 + }, + { + "epoch": 0.2623728612519645, + "grad_norm": 0.13618186116218567, + "learning_rate": 0.0008480974385611246, + "loss": 2.7579, + "step": 8848 + }, + { + "epoch": 0.262402514604276, + "grad_norm": 0.14104460179805756, + "learning_rate": 0.0008480636613543214, + "loss": 2.7804, + "step": 8849 + }, + { + "epoch": 0.26243216795658747, + "grad_norm": 0.12762001156806946, + "learning_rate": 0.0008480298810653548, + "loss": 2.7873, + "step": 8850 + }, + { + "epoch": 0.26246182130889895, + "grad_norm": 0.12161774933338165, + "learning_rate": 0.0008479960976945238, + "loss": 2.7811, + "step": 8851 + }, + { + "epoch": 0.2624914746612104, + "grad_norm": 0.1330765187740326, + "learning_rate": 0.0008479623112421276, + "loss": 2.7618, + "step": 8852 + }, + { + "epoch": 0.26252112801352195, + "grad_norm": 0.11869457364082336, + "learning_rate": 0.0008479285217084656, + "loss": 2.7917, + "step": 8853 + }, + { + "epoch": 0.26255078136583343, + "grad_norm": 0.13148579001426697, + "learning_rate": 0.0008478947290938366, + "loss": 2.7743, + "step": 8854 + }, + { + "epoch": 0.2625804347181449, + "grad_norm": 0.14245493710041046, + "learning_rate": 0.0008478609333985401, + "loss": 2.7727, + "step": 8855 + }, + { + "epoch": 0.2626100880704564, + "grad_norm": 0.12065812945365906, + "learning_rate": 0.0008478271346228755, + "loss": 2.7596, + "step": 8856 + }, + { + "epoch": 0.26263974142276786, + "grad_norm": 0.11843719333410263, + "learning_rate": 0.0008477933327671416, + "loss": 2.7716, + "step": 8857 + }, + { + "epoch": 0.26266939477507933, + "grad_norm": 0.1439712941646576, + "learning_rate": 0.0008477595278316382, + "loss": 2.7553, + "step": 8858 + }, + { + "epoch": 0.2626990481273908, + "grad_norm": 0.14848197996616364, + "learning_rate": 0.0008477257198166646, + "loss": 2.7741, + "step": 8859 + }, + { + "epoch": 0.2627287014797023, + "grad_norm": 0.16275857388973236, + "learning_rate": 0.0008476919087225199, + "loss": 2.8105, + "step": 8860 + }, + { + "epoch": 0.26275835483201376, + "grad_norm": 0.16724197566509247, + "learning_rate": 0.0008476580945495038, + "loss": 2.8183, + "step": 8861 + }, + { + "epoch": 0.26278800818432524, + "grad_norm": 0.1368768960237503, + "learning_rate": 0.0008476242772979156, + "loss": 2.7973, + "step": 8862 + }, + { + "epoch": 0.2628176615366367, + "grad_norm": 0.13849927484989166, + "learning_rate": 0.0008475904569680547, + "loss": 2.7926, + "step": 8863 + }, + { + "epoch": 0.2628473148889482, + "grad_norm": 0.14633731544017792, + "learning_rate": 0.0008475566335602205, + "loss": 2.7845, + "step": 8864 + }, + { + "epoch": 0.26287696824125967, + "grad_norm": 0.15131579339504242, + "learning_rate": 0.0008475228070747128, + "loss": 2.7467, + "step": 8865 + }, + { + "epoch": 0.26290662159357114, + "grad_norm": 0.16374480724334717, + "learning_rate": 0.0008474889775118311, + "loss": 2.8066, + "step": 8866 + }, + { + "epoch": 0.2629362749458826, + "grad_norm": 0.15086354315280914, + "learning_rate": 0.0008474551448718747, + "loss": 2.7752, + "step": 8867 + }, + { + "epoch": 0.2629659282981941, + "grad_norm": 0.13733136653900146, + "learning_rate": 0.0008474213091551434, + "loss": 2.787, + "step": 8868 + }, + { + "epoch": 0.26299558165050557, + "grad_norm": 0.1326538771390915, + "learning_rate": 0.0008473874703619368, + "loss": 2.7838, + "step": 8869 + }, + { + "epoch": 0.26302523500281705, + "grad_norm": 0.14083297550678253, + "learning_rate": 0.0008473536284925545, + "loss": 2.7633, + "step": 8870 + }, + { + "epoch": 0.2630548883551285, + "grad_norm": 0.12430891394615173, + "learning_rate": 0.0008473197835472961, + "loss": 2.7662, + "step": 8871 + }, + { + "epoch": 0.26308454170744, + "grad_norm": 0.12577824294567108, + "learning_rate": 0.0008472859355264615, + "loss": 2.7661, + "step": 8872 + }, + { + "epoch": 0.26311419505975153, + "grad_norm": 0.14525535702705383, + "learning_rate": 0.0008472520844303504, + "loss": 2.7859, + "step": 8873 + }, + { + "epoch": 0.263143848412063, + "grad_norm": 0.11936133354902267, + "learning_rate": 0.0008472182302592623, + "loss": 2.7902, + "step": 8874 + }, + { + "epoch": 0.2631735017643745, + "grad_norm": 0.13509897887706757, + "learning_rate": 0.0008471843730134973, + "loss": 2.8118, + "step": 8875 + }, + { + "epoch": 0.26320315511668596, + "grad_norm": 0.12635834515094757, + "learning_rate": 0.000847150512693355, + "loss": 2.78, + "step": 8876 + }, + { + "epoch": 0.26323280846899744, + "grad_norm": 0.12656626105308533, + "learning_rate": 0.0008471166492991354, + "loss": 2.7593, + "step": 8877 + }, + { + "epoch": 0.2632624618213089, + "grad_norm": 0.1391913741827011, + "learning_rate": 0.0008470827828311382, + "loss": 2.7889, + "step": 8878 + }, + { + "epoch": 0.2632921151736204, + "grad_norm": 0.13875167071819305, + "learning_rate": 0.0008470489132896635, + "loss": 2.7496, + "step": 8879 + }, + { + "epoch": 0.26332176852593187, + "grad_norm": 0.12768390774726868, + "learning_rate": 0.0008470150406750111, + "loss": 2.7772, + "step": 8880 + }, + { + "epoch": 0.26335142187824334, + "grad_norm": 0.14529737830162048, + "learning_rate": 0.000846981164987481, + "loss": 2.783, + "step": 8881 + }, + { + "epoch": 0.2633810752305548, + "grad_norm": 0.1601833999156952, + "learning_rate": 0.000846947286227373, + "loss": 2.7342, + "step": 8882 + }, + { + "epoch": 0.2634107285828663, + "grad_norm": 0.15306545794010162, + "learning_rate": 0.0008469134043949871, + "loss": 2.7755, + "step": 8883 + }, + { + "epoch": 0.26344038193517777, + "grad_norm": 0.13326354324817657, + "learning_rate": 0.0008468795194906237, + "loss": 2.7419, + "step": 8884 + }, + { + "epoch": 0.26347003528748925, + "grad_norm": 0.15819355845451355, + "learning_rate": 0.0008468456315145825, + "loss": 2.7872, + "step": 8885 + }, + { + "epoch": 0.2634996886398007, + "grad_norm": 0.16865943372249603, + "learning_rate": 0.0008468117404671638, + "loss": 2.7559, + "step": 8886 + }, + { + "epoch": 0.2635293419921122, + "grad_norm": 0.13783368468284607, + "learning_rate": 0.0008467778463486675, + "loss": 2.8065, + "step": 8887 + }, + { + "epoch": 0.2635589953444237, + "grad_norm": 0.13187377154827118, + "learning_rate": 0.0008467439491593939, + "loss": 2.7642, + "step": 8888 + }, + { + "epoch": 0.26358864869673515, + "grad_norm": 0.12529776990413666, + "learning_rate": 0.0008467100488996431, + "loss": 2.7668, + "step": 8889 + }, + { + "epoch": 0.2636183020490466, + "grad_norm": 0.1353110373020172, + "learning_rate": 0.0008466761455697151, + "loss": 2.7495, + "step": 8890 + }, + { + "epoch": 0.2636479554013581, + "grad_norm": 0.13687780499458313, + "learning_rate": 0.0008466422391699106, + "loss": 2.7685, + "step": 8891 + }, + { + "epoch": 0.2636776087536696, + "grad_norm": 0.1449865698814392, + "learning_rate": 0.0008466083297005296, + "loss": 2.8248, + "step": 8892 + }, + { + "epoch": 0.26370726210598106, + "grad_norm": 0.1507202833890915, + "learning_rate": 0.0008465744171618722, + "loss": 2.7902, + "step": 8893 + }, + { + "epoch": 0.2637369154582926, + "grad_norm": 0.15003834664821625, + "learning_rate": 0.0008465405015542389, + "loss": 2.7526, + "step": 8894 + }, + { + "epoch": 0.26376656881060406, + "grad_norm": 0.12682893872261047, + "learning_rate": 0.0008465065828779301, + "loss": 2.7717, + "step": 8895 + }, + { + "epoch": 0.26379622216291554, + "grad_norm": 0.15446342527866364, + "learning_rate": 0.0008464726611332457, + "loss": 2.7575, + "step": 8896 + }, + { + "epoch": 0.263825875515227, + "grad_norm": 0.15622934699058533, + "learning_rate": 0.0008464387363204866, + "loss": 2.8053, + "step": 8897 + }, + { + "epoch": 0.2638555288675385, + "grad_norm": 0.1503647118806839, + "learning_rate": 0.0008464048084399531, + "loss": 2.7643, + "step": 8898 + }, + { + "epoch": 0.26388518221984997, + "grad_norm": 0.15024082362651825, + "learning_rate": 0.0008463708774919456, + "loss": 2.7052, + "step": 8899 + }, + { + "epoch": 0.26391483557216144, + "grad_norm": 0.1662369817495346, + "learning_rate": 0.0008463369434767644, + "loss": 2.7692, + "step": 8900 + }, + { + "epoch": 0.2639444889244729, + "grad_norm": 0.1787038892507553, + "learning_rate": 0.0008463030063947101, + "loss": 2.7971, + "step": 8901 + }, + { + "epoch": 0.2639741422767844, + "grad_norm": 0.14226725697517395, + "learning_rate": 0.0008462690662460832, + "loss": 2.7775, + "step": 8902 + }, + { + "epoch": 0.26400379562909587, + "grad_norm": 0.13014201819896698, + "learning_rate": 0.0008462351230311844, + "loss": 2.7755, + "step": 8903 + }, + { + "epoch": 0.26403344898140735, + "grad_norm": 0.15039968490600586, + "learning_rate": 0.0008462011767503141, + "loss": 2.7764, + "step": 8904 + }, + { + "epoch": 0.2640631023337188, + "grad_norm": 0.1481785923242569, + "learning_rate": 0.0008461672274037731, + "loss": 2.7894, + "step": 8905 + }, + { + "epoch": 0.2640927556860303, + "grad_norm": 0.15734711289405823, + "learning_rate": 0.0008461332749918616, + "loss": 2.803, + "step": 8906 + }, + { + "epoch": 0.2641224090383418, + "grad_norm": 0.16150322556495667, + "learning_rate": 0.0008460993195148807, + "loss": 2.7876, + "step": 8907 + }, + { + "epoch": 0.26415206239065325, + "grad_norm": 0.14710435271263123, + "learning_rate": 0.0008460653609731311, + "loss": 2.7791, + "step": 8908 + }, + { + "epoch": 0.26418171574296473, + "grad_norm": 0.14793363213539124, + "learning_rate": 0.0008460313993669128, + "loss": 2.7872, + "step": 8909 + }, + { + "epoch": 0.2642113690952762, + "grad_norm": 0.12961864471435547, + "learning_rate": 0.0008459974346965276, + "loss": 2.748, + "step": 8910 + }, + { + "epoch": 0.2642410224475877, + "grad_norm": 0.1237587258219719, + "learning_rate": 0.0008459634669622755, + "loss": 2.7883, + "step": 8911 + }, + { + "epoch": 0.26427067579989916, + "grad_norm": 0.14182502031326294, + "learning_rate": 0.0008459294961644574, + "loss": 2.8012, + "step": 8912 + }, + { + "epoch": 0.26430032915221063, + "grad_norm": 0.16457192599773407, + "learning_rate": 0.0008458955223033744, + "loss": 2.8096, + "step": 8913 + }, + { + "epoch": 0.2643299825045221, + "grad_norm": 0.1595306247472763, + "learning_rate": 0.0008458615453793273, + "loss": 2.812, + "step": 8914 + }, + { + "epoch": 0.26435963585683364, + "grad_norm": 0.1674431562423706, + "learning_rate": 0.0008458275653926166, + "loss": 2.7997, + "step": 8915 + }, + { + "epoch": 0.2643892892091451, + "grad_norm": 0.15707223117351532, + "learning_rate": 0.0008457935823435437, + "loss": 2.8072, + "step": 8916 + }, + { + "epoch": 0.2644189425614566, + "grad_norm": 0.14321517944335938, + "learning_rate": 0.0008457595962324089, + "loss": 2.7925, + "step": 8917 + }, + { + "epoch": 0.26444859591376807, + "grad_norm": 0.13535667955875397, + "learning_rate": 0.0008457256070595138, + "loss": 2.7602, + "step": 8918 + }, + { + "epoch": 0.26447824926607955, + "grad_norm": 0.13289609551429749, + "learning_rate": 0.0008456916148251592, + "loss": 2.7719, + "step": 8919 + }, + { + "epoch": 0.264507902618391, + "grad_norm": 0.12979920208454132, + "learning_rate": 0.0008456576195296458, + "loss": 2.7971, + "step": 8920 + }, + { + "epoch": 0.2645375559707025, + "grad_norm": 0.14671140909194946, + "learning_rate": 0.0008456236211732748, + "loss": 2.7951, + "step": 8921 + }, + { + "epoch": 0.264567209323014, + "grad_norm": 0.13625596463680267, + "learning_rate": 0.0008455896197563475, + "loss": 2.7898, + "step": 8922 + }, + { + "epoch": 0.26459686267532545, + "grad_norm": 0.15457257628440857, + "learning_rate": 0.0008455556152791646, + "loss": 2.7495, + "step": 8923 + }, + { + "epoch": 0.2646265160276369, + "grad_norm": 0.13676366209983826, + "learning_rate": 0.0008455216077420277, + "loss": 2.7697, + "step": 8924 + }, + { + "epoch": 0.2646561693799484, + "grad_norm": 0.1418592929840088, + "learning_rate": 0.0008454875971452375, + "loss": 2.7559, + "step": 8925 + }, + { + "epoch": 0.2646858227322599, + "grad_norm": 0.1360597312450409, + "learning_rate": 0.0008454535834890953, + "loss": 2.7858, + "step": 8926 + }, + { + "epoch": 0.26471547608457136, + "grad_norm": 0.1455513834953308, + "learning_rate": 0.0008454195667739024, + "loss": 2.7308, + "step": 8927 + }, + { + "epoch": 0.26474512943688283, + "grad_norm": 0.15232150256633759, + "learning_rate": 0.0008453855469999597, + "loss": 2.7622, + "step": 8928 + }, + { + "epoch": 0.2647747827891943, + "grad_norm": 0.1530189961194992, + "learning_rate": 0.000845351524167569, + "loss": 2.8213, + "step": 8929 + }, + { + "epoch": 0.2648044361415058, + "grad_norm": 0.1560228317975998, + "learning_rate": 0.0008453174982770311, + "loss": 2.7762, + "step": 8930 + }, + { + "epoch": 0.26483408949381726, + "grad_norm": 0.14574530720710754, + "learning_rate": 0.0008452834693286475, + "loss": 2.7779, + "step": 8931 + }, + { + "epoch": 0.26486374284612874, + "grad_norm": 0.15380382537841797, + "learning_rate": 0.0008452494373227196, + "loss": 2.7671, + "step": 8932 + }, + { + "epoch": 0.2648933961984402, + "grad_norm": 0.14500252902507782, + "learning_rate": 0.0008452154022595487, + "loss": 2.7477, + "step": 8933 + }, + { + "epoch": 0.2649230495507517, + "grad_norm": 0.14826856553554535, + "learning_rate": 0.000845181364139436, + "loss": 2.7722, + "step": 8934 + }, + { + "epoch": 0.26495270290306316, + "grad_norm": 0.1714881807565689, + "learning_rate": 0.0008451473229626832, + "loss": 2.7634, + "step": 8935 + }, + { + "epoch": 0.2649823562553747, + "grad_norm": 0.1707826852798462, + "learning_rate": 0.0008451132787295915, + "loss": 2.7575, + "step": 8936 + }, + { + "epoch": 0.26501200960768617, + "grad_norm": 0.1645769625902176, + "learning_rate": 0.0008450792314404625, + "loss": 2.7674, + "step": 8937 + }, + { + "epoch": 0.26504166295999765, + "grad_norm": 0.1706206351518631, + "learning_rate": 0.0008450451810955977, + "loss": 2.7923, + "step": 8938 + }, + { + "epoch": 0.2650713163123091, + "grad_norm": 0.1587025374174118, + "learning_rate": 0.0008450111276952987, + "loss": 2.7566, + "step": 8939 + }, + { + "epoch": 0.2651009696646206, + "grad_norm": 0.14364206790924072, + "learning_rate": 0.0008449770712398668, + "loss": 2.7636, + "step": 8940 + }, + { + "epoch": 0.2651306230169321, + "grad_norm": 0.137899249792099, + "learning_rate": 0.0008449430117296037, + "loss": 2.7812, + "step": 8941 + }, + { + "epoch": 0.26516027636924355, + "grad_norm": 0.1377434879541397, + "learning_rate": 0.0008449089491648112, + "loss": 2.7595, + "step": 8942 + }, + { + "epoch": 0.26518992972155503, + "grad_norm": 0.13632197678089142, + "learning_rate": 0.0008448748835457907, + "loss": 2.7781, + "step": 8943 + }, + { + "epoch": 0.2652195830738665, + "grad_norm": 0.1198844462633133, + "learning_rate": 0.0008448408148728437, + "loss": 2.7648, + "step": 8944 + }, + { + "epoch": 0.265249236426178, + "grad_norm": 0.12646034359931946, + "learning_rate": 0.0008448067431462723, + "loss": 2.7876, + "step": 8945 + }, + { + "epoch": 0.26527888977848946, + "grad_norm": 0.11812802404165268, + "learning_rate": 0.000844772668366378, + "loss": 2.7747, + "step": 8946 + }, + { + "epoch": 0.26530854313080093, + "grad_norm": 0.14098022878170013, + "learning_rate": 0.0008447385905334625, + "loss": 2.7852, + "step": 8947 + }, + { + "epoch": 0.2653381964831124, + "grad_norm": 0.13691452145576477, + "learning_rate": 0.0008447045096478276, + "loss": 2.7861, + "step": 8948 + }, + { + "epoch": 0.2653678498354239, + "grad_norm": 0.1244095116853714, + "learning_rate": 0.000844670425709775, + "loss": 2.7819, + "step": 8949 + }, + { + "epoch": 0.26539750318773536, + "grad_norm": 0.11774428188800812, + "learning_rate": 0.0008446363387196068, + "loss": 2.7908, + "step": 8950 + }, + { + "epoch": 0.26542715654004684, + "grad_norm": 0.11883487552404404, + "learning_rate": 0.0008446022486776246, + "loss": 2.7474, + "step": 8951 + }, + { + "epoch": 0.2654568098923583, + "grad_norm": 0.12469670921564102, + "learning_rate": 0.0008445681555841303, + "loss": 2.7294, + "step": 8952 + }, + { + "epoch": 0.2654864632446698, + "grad_norm": 0.1528322398662567, + "learning_rate": 0.0008445340594394259, + "loss": 2.7752, + "step": 8953 + }, + { + "epoch": 0.26551611659698127, + "grad_norm": 0.17436270415782928, + "learning_rate": 0.000844499960243813, + "loss": 2.7527, + "step": 8954 + }, + { + "epoch": 0.26554576994929274, + "grad_norm": 0.17270049452781677, + "learning_rate": 0.0008444658579975942, + "loss": 2.7795, + "step": 8955 + }, + { + "epoch": 0.2655754233016042, + "grad_norm": 0.1679147332906723, + "learning_rate": 0.0008444317527010708, + "loss": 2.7623, + "step": 8956 + }, + { + "epoch": 0.26560507665391575, + "grad_norm": 0.13647322356700897, + "learning_rate": 0.0008443976443545454, + "loss": 2.7535, + "step": 8957 + }, + { + "epoch": 0.2656347300062272, + "grad_norm": 0.12150602042675018, + "learning_rate": 0.0008443635329583196, + "loss": 2.7768, + "step": 8958 + }, + { + "epoch": 0.2656643833585387, + "grad_norm": 0.13218972086906433, + "learning_rate": 0.0008443294185126955, + "loss": 2.8019, + "step": 8959 + }, + { + "epoch": 0.2656940367108502, + "grad_norm": 0.1279900223016739, + "learning_rate": 0.0008442953010179754, + "loss": 2.8146, + "step": 8960 + }, + { + "epoch": 0.26572369006316165, + "grad_norm": 0.14518068730831146, + "learning_rate": 0.0008442611804744613, + "loss": 2.7861, + "step": 8961 + }, + { + "epoch": 0.26575334341547313, + "grad_norm": 0.1346747726202011, + "learning_rate": 0.0008442270568824555, + "loss": 2.7759, + "step": 8962 + }, + { + "epoch": 0.2657829967677846, + "grad_norm": 0.13611765205860138, + "learning_rate": 0.0008441929302422598, + "loss": 2.7728, + "step": 8963 + }, + { + "epoch": 0.2658126501200961, + "grad_norm": 0.15268170833587646, + "learning_rate": 0.0008441588005541767, + "loss": 2.8025, + "step": 8964 + }, + { + "epoch": 0.26584230347240756, + "grad_norm": 0.16452905535697937, + "learning_rate": 0.0008441246678185084, + "loss": 2.7492, + "step": 8965 + }, + { + "epoch": 0.26587195682471904, + "grad_norm": 0.14905838668346405, + "learning_rate": 0.000844090532035557, + "loss": 2.7589, + "step": 8966 + }, + { + "epoch": 0.2659016101770305, + "grad_norm": 0.1269199103116989, + "learning_rate": 0.0008440563932056249, + "loss": 2.7867, + "step": 8967 + }, + { + "epoch": 0.265931263529342, + "grad_norm": 0.11158603429794312, + "learning_rate": 0.0008440222513290145, + "loss": 2.7794, + "step": 8968 + }, + { + "epoch": 0.26596091688165346, + "grad_norm": 0.11844808608293533, + "learning_rate": 0.0008439881064060279, + "loss": 2.7771, + "step": 8969 + }, + { + "epoch": 0.26599057023396494, + "grad_norm": 0.11735185980796814, + "learning_rate": 0.0008439539584369675, + "loss": 2.7932, + "step": 8970 + }, + { + "epoch": 0.2660202235862764, + "grad_norm": 0.13578583300113678, + "learning_rate": 0.0008439198074221359, + "loss": 2.7798, + "step": 8971 + }, + { + "epoch": 0.2660498769385879, + "grad_norm": 0.14769242703914642, + "learning_rate": 0.0008438856533618354, + "loss": 2.7673, + "step": 8972 + }, + { + "epoch": 0.26607953029089937, + "grad_norm": 0.14868015050888062, + "learning_rate": 0.0008438514962563684, + "loss": 2.7471, + "step": 8973 + }, + { + "epoch": 0.26610918364321084, + "grad_norm": 0.16319645941257477, + "learning_rate": 0.0008438173361060373, + "loss": 2.7965, + "step": 8974 + }, + { + "epoch": 0.2661388369955223, + "grad_norm": 0.15002015233039856, + "learning_rate": 0.0008437831729111447, + "loss": 2.7807, + "step": 8975 + }, + { + "epoch": 0.2661684903478338, + "grad_norm": 0.14598587155342102, + "learning_rate": 0.000843749006671993, + "loss": 2.7671, + "step": 8976 + }, + { + "epoch": 0.26619814370014533, + "grad_norm": 0.14718839526176453, + "learning_rate": 0.000843714837388885, + "loss": 2.7683, + "step": 8977 + }, + { + "epoch": 0.2662277970524568, + "grad_norm": 0.13877858221530914, + "learning_rate": 0.0008436806650621231, + "loss": 2.7756, + "step": 8978 + }, + { + "epoch": 0.2662574504047683, + "grad_norm": 0.16080114245414734, + "learning_rate": 0.0008436464896920099, + "loss": 2.7884, + "step": 8979 + }, + { + "epoch": 0.26628710375707976, + "grad_norm": 0.15323401987552643, + "learning_rate": 0.0008436123112788478, + "loss": 2.8208, + "step": 8980 + }, + { + "epoch": 0.26631675710939123, + "grad_norm": 0.15867912769317627, + "learning_rate": 0.0008435781298229402, + "loss": 2.7847, + "step": 8981 + }, + { + "epoch": 0.2663464104617027, + "grad_norm": 0.16912433505058289, + "learning_rate": 0.0008435439453245889, + "loss": 2.8151, + "step": 8982 + }, + { + "epoch": 0.2663760638140142, + "grad_norm": 0.1280265897512436, + "learning_rate": 0.0008435097577840971, + "loss": 2.7939, + "step": 8983 + }, + { + "epoch": 0.26640571716632566, + "grad_norm": 0.13301241397857666, + "learning_rate": 0.0008434755672017674, + "loss": 2.7755, + "step": 8984 + }, + { + "epoch": 0.26643537051863714, + "grad_norm": 0.15096545219421387, + "learning_rate": 0.0008434413735779028, + "loss": 2.8196, + "step": 8985 + }, + { + "epoch": 0.2664650238709486, + "grad_norm": 0.15393689274787903, + "learning_rate": 0.0008434071769128056, + "loss": 2.7531, + "step": 8986 + }, + { + "epoch": 0.2664946772232601, + "grad_norm": 0.14821553230285645, + "learning_rate": 0.0008433729772067789, + "loss": 2.7447, + "step": 8987 + }, + { + "epoch": 0.26652433057557157, + "grad_norm": 0.14487452805042267, + "learning_rate": 0.0008433387744601257, + "loss": 2.798, + "step": 8988 + }, + { + "epoch": 0.26655398392788304, + "grad_norm": 0.15208077430725098, + "learning_rate": 0.0008433045686731486, + "loss": 2.7679, + "step": 8989 + }, + { + "epoch": 0.2665836372801945, + "grad_norm": 0.15586735308170319, + "learning_rate": 0.0008432703598461508, + "loss": 2.7898, + "step": 8990 + }, + { + "epoch": 0.266613290632506, + "grad_norm": 0.12609151005744934, + "learning_rate": 0.000843236147979435, + "loss": 2.7696, + "step": 8991 + }, + { + "epoch": 0.26664294398481747, + "grad_norm": 0.14061671495437622, + "learning_rate": 0.0008432019330733041, + "loss": 2.7917, + "step": 8992 + }, + { + "epoch": 0.26667259733712895, + "grad_norm": 0.14726583659648895, + "learning_rate": 0.0008431677151280612, + "loss": 2.7534, + "step": 8993 + }, + { + "epoch": 0.2667022506894404, + "grad_norm": 0.14303311705589294, + "learning_rate": 0.0008431334941440093, + "loss": 2.7446, + "step": 8994 + }, + { + "epoch": 0.2667319040417519, + "grad_norm": 0.13876314461231232, + "learning_rate": 0.0008430992701214515, + "loss": 2.7482, + "step": 8995 + }, + { + "epoch": 0.2667615573940634, + "grad_norm": 0.1470765322446823, + "learning_rate": 0.0008430650430606906, + "loss": 2.7537, + "step": 8996 + }, + { + "epoch": 0.26679121074637485, + "grad_norm": 0.1668805181980133, + "learning_rate": 0.0008430308129620299, + "loss": 2.7621, + "step": 8997 + }, + { + "epoch": 0.2668208640986864, + "grad_norm": 0.1503361463546753, + "learning_rate": 0.0008429965798257726, + "loss": 2.7664, + "step": 8998 + }, + { + "epoch": 0.26685051745099786, + "grad_norm": 0.14168988168239594, + "learning_rate": 0.0008429623436522215, + "loss": 2.7712, + "step": 8999 + }, + { + "epoch": 0.26688017080330934, + "grad_norm": 0.1434018909931183, + "learning_rate": 0.0008429281044416801, + "loss": 2.7904, + "step": 9000 + }, + { + "epoch": 0.2669098241556208, + "grad_norm": 0.14009089767932892, + "learning_rate": 0.0008428938621944515, + "loss": 2.7752, + "step": 9001 + }, + { + "epoch": 0.2669394775079323, + "grad_norm": 0.1847701072692871, + "learning_rate": 0.0008428596169108389, + "loss": 2.7568, + "step": 9002 + }, + { + "epoch": 0.26696913086024376, + "grad_norm": 0.18197894096374512, + "learning_rate": 0.0008428253685911455, + "loss": 2.7539, + "step": 9003 + }, + { + "epoch": 0.26699878421255524, + "grad_norm": 0.1499144732952118, + "learning_rate": 0.0008427911172356746, + "loss": 2.8126, + "step": 9004 + }, + { + "epoch": 0.2670284375648667, + "grad_norm": 0.1415359526872635, + "learning_rate": 0.0008427568628447295, + "loss": 2.7474, + "step": 9005 + }, + { + "epoch": 0.2670580909171782, + "grad_norm": 0.12501150369644165, + "learning_rate": 0.0008427226054186135, + "loss": 2.749, + "step": 9006 + }, + { + "epoch": 0.26708774426948967, + "grad_norm": 0.13520817458629608, + "learning_rate": 0.0008426883449576301, + "loss": 2.7537, + "step": 9007 + }, + { + "epoch": 0.26711739762180114, + "grad_norm": 0.11367355287075043, + "learning_rate": 0.0008426540814620827, + "loss": 2.7669, + "step": 9008 + }, + { + "epoch": 0.2671470509741126, + "grad_norm": 0.13126695156097412, + "learning_rate": 0.0008426198149322744, + "loss": 2.8162, + "step": 9009 + }, + { + "epoch": 0.2671767043264241, + "grad_norm": 0.12071048468351364, + "learning_rate": 0.0008425855453685089, + "loss": 2.7674, + "step": 9010 + }, + { + "epoch": 0.2672063576787356, + "grad_norm": 0.12588930130004883, + "learning_rate": 0.0008425512727710895, + "loss": 2.7451, + "step": 9011 + }, + { + "epoch": 0.26723601103104705, + "grad_norm": 0.11922314763069153, + "learning_rate": 0.0008425169971403199, + "loss": 2.7991, + "step": 9012 + }, + { + "epoch": 0.2672656643833585, + "grad_norm": 0.12772363424301147, + "learning_rate": 0.0008424827184765034, + "loss": 2.7791, + "step": 9013 + }, + { + "epoch": 0.26729531773567, + "grad_norm": 0.12722381949424744, + "learning_rate": 0.0008424484367799438, + "loss": 2.7763, + "step": 9014 + }, + { + "epoch": 0.2673249710879815, + "grad_norm": 0.1250678151845932, + "learning_rate": 0.0008424141520509443, + "loss": 2.7849, + "step": 9015 + }, + { + "epoch": 0.26735462444029295, + "grad_norm": 0.13906995952129364, + "learning_rate": 0.0008423798642898089, + "loss": 2.8015, + "step": 9016 + }, + { + "epoch": 0.26738427779260443, + "grad_norm": 0.12497256696224213, + "learning_rate": 0.0008423455734968409, + "loss": 2.7673, + "step": 9017 + }, + { + "epoch": 0.2674139311449159, + "grad_norm": 0.13414984941482544, + "learning_rate": 0.0008423112796723442, + "loss": 2.8182, + "step": 9018 + }, + { + "epoch": 0.26744358449722744, + "grad_norm": 0.14909633994102478, + "learning_rate": 0.0008422769828166222, + "loss": 2.7884, + "step": 9019 + }, + { + "epoch": 0.2674732378495389, + "grad_norm": 0.14267568290233612, + "learning_rate": 0.0008422426829299789, + "loss": 2.8013, + "step": 9020 + }, + { + "epoch": 0.2675028912018504, + "grad_norm": 0.13542540371418, + "learning_rate": 0.0008422083800127178, + "loss": 2.8034, + "step": 9021 + }, + { + "epoch": 0.26753254455416187, + "grad_norm": 0.14829105138778687, + "learning_rate": 0.0008421740740651429, + "loss": 2.7671, + "step": 9022 + }, + { + "epoch": 0.26756219790647334, + "grad_norm": 0.1306878626346588, + "learning_rate": 0.0008421397650875578, + "loss": 2.733, + "step": 9023 + }, + { + "epoch": 0.2675918512587848, + "grad_norm": 0.1234232559800148, + "learning_rate": 0.0008421054530802663, + "loss": 2.7896, + "step": 9024 + }, + { + "epoch": 0.2676215046110963, + "grad_norm": 0.1239718422293663, + "learning_rate": 0.0008420711380435721, + "loss": 2.7908, + "step": 9025 + }, + { + "epoch": 0.26765115796340777, + "grad_norm": 0.13307525217533112, + "learning_rate": 0.0008420368199777796, + "loss": 2.7826, + "step": 9026 + }, + { + "epoch": 0.26768081131571925, + "grad_norm": 0.1436236947774887, + "learning_rate": 0.0008420024988831923, + "loss": 2.761, + "step": 9027 + }, + { + "epoch": 0.2677104646680307, + "grad_norm": 0.14027021825313568, + "learning_rate": 0.000841968174760114, + "loss": 2.7682, + "step": 9028 + }, + { + "epoch": 0.2677401180203422, + "grad_norm": 0.12946194410324097, + "learning_rate": 0.000841933847608849, + "loss": 2.8105, + "step": 9029 + }, + { + "epoch": 0.2677697713726537, + "grad_norm": 0.13618913292884827, + "learning_rate": 0.0008418995174297009, + "loss": 2.7972, + "step": 9030 + }, + { + "epoch": 0.26779942472496515, + "grad_norm": 0.14570985734462738, + "learning_rate": 0.000841865184222974, + "loss": 2.7707, + "step": 9031 + }, + { + "epoch": 0.2678290780772766, + "grad_norm": 0.13839305937290192, + "learning_rate": 0.000841830847988972, + "loss": 2.7603, + "step": 9032 + }, + { + "epoch": 0.2678587314295881, + "grad_norm": 0.13307468593120575, + "learning_rate": 0.0008417965087279994, + "loss": 2.7669, + "step": 9033 + }, + { + "epoch": 0.2678883847818996, + "grad_norm": 0.1456473469734192, + "learning_rate": 0.0008417621664403601, + "loss": 2.7743, + "step": 9034 + }, + { + "epoch": 0.26791803813421106, + "grad_norm": 0.1457943618297577, + "learning_rate": 0.0008417278211263579, + "loss": 2.811, + "step": 9035 + }, + { + "epoch": 0.26794769148652253, + "grad_norm": 0.1230805516242981, + "learning_rate": 0.0008416934727862974, + "loss": 2.7628, + "step": 9036 + }, + { + "epoch": 0.267977344838834, + "grad_norm": 0.15547356009483337, + "learning_rate": 0.0008416591214204825, + "loss": 2.7702, + "step": 9037 + }, + { + "epoch": 0.2680069981911455, + "grad_norm": 0.1604980230331421, + "learning_rate": 0.0008416247670292174, + "loss": 2.7407, + "step": 9038 + }, + { + "epoch": 0.26803665154345696, + "grad_norm": 0.17681340873241425, + "learning_rate": 0.0008415904096128063, + "loss": 2.7832, + "step": 9039 + }, + { + "epoch": 0.2680663048957685, + "grad_norm": 0.16372740268707275, + "learning_rate": 0.0008415560491715536, + "loss": 2.8115, + "step": 9040 + }, + { + "epoch": 0.26809595824807997, + "grad_norm": 0.12098126113414764, + "learning_rate": 0.0008415216857057635, + "loss": 2.7885, + "step": 9041 + }, + { + "epoch": 0.26812561160039144, + "grad_norm": 0.13514438271522522, + "learning_rate": 0.00084148731921574, + "loss": 2.7683, + "step": 9042 + }, + { + "epoch": 0.2681552649527029, + "grad_norm": 0.13926513493061066, + "learning_rate": 0.0008414529497017879, + "loss": 2.7655, + "step": 9043 + }, + { + "epoch": 0.2681849183050144, + "grad_norm": 0.12777096033096313, + "learning_rate": 0.0008414185771642113, + "loss": 2.7605, + "step": 9044 + }, + { + "epoch": 0.2682145716573259, + "grad_norm": 0.11646442115306854, + "learning_rate": 0.0008413842016033144, + "loss": 2.7914, + "step": 9045 + }, + { + "epoch": 0.26824422500963735, + "grad_norm": 0.11739915609359741, + "learning_rate": 0.000841349823019402, + "loss": 2.7769, + "step": 9046 + }, + { + "epoch": 0.2682738783619488, + "grad_norm": 0.12702500820159912, + "learning_rate": 0.0008413154414127784, + "loss": 2.7572, + "step": 9047 + }, + { + "epoch": 0.2683035317142603, + "grad_norm": 0.12812897562980652, + "learning_rate": 0.0008412810567837478, + "loss": 2.7824, + "step": 9048 + }, + { + "epoch": 0.2683331850665718, + "grad_norm": 0.14144517481327057, + "learning_rate": 0.0008412466691326148, + "loss": 2.7753, + "step": 9049 + }, + { + "epoch": 0.26836283841888325, + "grad_norm": 0.14718584716320038, + "learning_rate": 0.0008412122784596842, + "loss": 2.7551, + "step": 9050 + }, + { + "epoch": 0.26839249177119473, + "grad_norm": 0.16646717488765717, + "learning_rate": 0.00084117788476526, + "loss": 2.7991, + "step": 9051 + }, + { + "epoch": 0.2684221451235062, + "grad_norm": 0.1941792517900467, + "learning_rate": 0.0008411434880496474, + "loss": 2.7829, + "step": 9052 + }, + { + "epoch": 0.2684517984758177, + "grad_norm": 0.19071155786514282, + "learning_rate": 0.0008411090883131505, + "loss": 2.7679, + "step": 9053 + }, + { + "epoch": 0.26848145182812916, + "grad_norm": 0.16251836717128754, + "learning_rate": 0.0008410746855560741, + "loss": 2.7963, + "step": 9054 + }, + { + "epoch": 0.26851110518044063, + "grad_norm": 0.15687379240989685, + "learning_rate": 0.0008410402797787229, + "loss": 2.7996, + "step": 9055 + }, + { + "epoch": 0.2685407585327521, + "grad_norm": 0.15040771663188934, + "learning_rate": 0.0008410058709814013, + "loss": 2.783, + "step": 9056 + }, + { + "epoch": 0.2685704118850636, + "grad_norm": 0.14585483074188232, + "learning_rate": 0.0008409714591644142, + "loss": 2.7719, + "step": 9057 + }, + { + "epoch": 0.26860006523737506, + "grad_norm": 0.14251577854156494, + "learning_rate": 0.0008409370443280664, + "loss": 2.7556, + "step": 9058 + }, + { + "epoch": 0.26862971858968654, + "grad_norm": 0.1448523849248886, + "learning_rate": 0.0008409026264726625, + "loss": 2.8177, + "step": 9059 + }, + { + "epoch": 0.268659371941998, + "grad_norm": 0.16181603074073792, + "learning_rate": 0.0008408682055985073, + "loss": 2.7863, + "step": 9060 + }, + { + "epoch": 0.26868902529430955, + "grad_norm": 0.1326775699853897, + "learning_rate": 0.0008408337817059058, + "loss": 2.736, + "step": 9061 + }, + { + "epoch": 0.268718678646621, + "grad_norm": 0.12718895077705383, + "learning_rate": 0.0008407993547951625, + "loss": 2.756, + "step": 9062 + }, + { + "epoch": 0.2687483319989325, + "grad_norm": 0.11492367833852768, + "learning_rate": 0.0008407649248665825, + "loss": 2.7608, + "step": 9063 + }, + { + "epoch": 0.268777985351244, + "grad_norm": 0.13456164300441742, + "learning_rate": 0.0008407304919204704, + "loss": 2.7819, + "step": 9064 + }, + { + "epoch": 0.26880763870355545, + "grad_norm": 0.12786827981472015, + "learning_rate": 0.0008406960559571315, + "loss": 2.7583, + "step": 9065 + }, + { + "epoch": 0.2688372920558669, + "grad_norm": 0.14849935472011566, + "learning_rate": 0.0008406616169768706, + "loss": 2.7915, + "step": 9066 + }, + { + "epoch": 0.2688669454081784, + "grad_norm": 0.1646202802658081, + "learning_rate": 0.0008406271749799926, + "loss": 2.7639, + "step": 9067 + }, + { + "epoch": 0.2688965987604899, + "grad_norm": 0.16041430830955505, + "learning_rate": 0.0008405927299668022, + "loss": 2.766, + "step": 9068 + }, + { + "epoch": 0.26892625211280136, + "grad_norm": 0.13038349151611328, + "learning_rate": 0.0008405582819376049, + "loss": 2.7815, + "step": 9069 + }, + { + "epoch": 0.26895590546511283, + "grad_norm": 0.14697180688381195, + "learning_rate": 0.0008405238308927057, + "loss": 2.7686, + "step": 9070 + }, + { + "epoch": 0.2689855588174243, + "grad_norm": 0.1648445874452591, + "learning_rate": 0.0008404893768324094, + "loss": 2.7772, + "step": 9071 + }, + { + "epoch": 0.2690152121697358, + "grad_norm": 0.14510701596736908, + "learning_rate": 0.0008404549197570212, + "loss": 2.7964, + "step": 9072 + }, + { + "epoch": 0.26904486552204726, + "grad_norm": 0.14025963842868805, + "learning_rate": 0.0008404204596668463, + "loss": 2.7669, + "step": 9073 + }, + { + "epoch": 0.26907451887435874, + "grad_norm": 0.15937970578670502, + "learning_rate": 0.0008403859965621898, + "loss": 2.7762, + "step": 9074 + }, + { + "epoch": 0.2691041722266702, + "grad_norm": 0.13772894442081451, + "learning_rate": 0.0008403515304433569, + "loss": 2.8063, + "step": 9075 + }, + { + "epoch": 0.2691338255789817, + "grad_norm": 0.12017370015382767, + "learning_rate": 0.0008403170613106527, + "loss": 2.7878, + "step": 9076 + }, + { + "epoch": 0.26916347893129317, + "grad_norm": 0.1520499289035797, + "learning_rate": 0.0008402825891643823, + "loss": 2.7391, + "step": 9077 + }, + { + "epoch": 0.26919313228360464, + "grad_norm": 0.1428544521331787, + "learning_rate": 0.0008402481140048516, + "loss": 2.7642, + "step": 9078 + }, + { + "epoch": 0.2692227856359161, + "grad_norm": 0.14927121996879578, + "learning_rate": 0.0008402136358323652, + "loss": 2.7815, + "step": 9079 + }, + { + "epoch": 0.2692524389882276, + "grad_norm": 0.15465646982192993, + "learning_rate": 0.0008401791546472288, + "loss": 2.7256, + "step": 9080 + }, + { + "epoch": 0.2692820923405391, + "grad_norm": 0.13994532823562622, + "learning_rate": 0.0008401446704497475, + "loss": 2.7764, + "step": 9081 + }, + { + "epoch": 0.2693117456928506, + "grad_norm": 0.14648978412151337, + "learning_rate": 0.0008401101832402267, + "loss": 2.7831, + "step": 9082 + }, + { + "epoch": 0.2693413990451621, + "grad_norm": 0.16499124467372894, + "learning_rate": 0.0008400756930189719, + "loss": 2.7894, + "step": 9083 + }, + { + "epoch": 0.26937105239747355, + "grad_norm": 0.15096457302570343, + "learning_rate": 0.0008400411997862885, + "loss": 2.7642, + "step": 9084 + }, + { + "epoch": 0.26940070574978503, + "grad_norm": 0.12300770729780197, + "learning_rate": 0.0008400067035424819, + "loss": 2.7687, + "step": 9085 + }, + { + "epoch": 0.2694303591020965, + "grad_norm": 0.11032005399465561, + "learning_rate": 0.0008399722042878575, + "loss": 2.7513, + "step": 9086 + }, + { + "epoch": 0.269460012454408, + "grad_norm": 0.12259726971387863, + "learning_rate": 0.0008399377020227209, + "loss": 2.775, + "step": 9087 + }, + { + "epoch": 0.26948966580671946, + "grad_norm": 0.13203388452529907, + "learning_rate": 0.0008399031967473777, + "loss": 2.7775, + "step": 9088 + }, + { + "epoch": 0.26951931915903093, + "grad_norm": 0.1379891335964203, + "learning_rate": 0.0008398686884621332, + "loss": 2.7795, + "step": 9089 + }, + { + "epoch": 0.2695489725113424, + "grad_norm": 0.13915923237800598, + "learning_rate": 0.0008398341771672932, + "loss": 2.7843, + "step": 9090 + }, + { + "epoch": 0.2695786258636539, + "grad_norm": 0.13292807340621948, + "learning_rate": 0.0008397996628631632, + "loss": 2.8092, + "step": 9091 + }, + { + "epoch": 0.26960827921596536, + "grad_norm": 0.12598124146461487, + "learning_rate": 0.000839765145550049, + "loss": 2.7631, + "step": 9092 + }, + { + "epoch": 0.26963793256827684, + "grad_norm": 0.13034100830554962, + "learning_rate": 0.0008397306252282559, + "loss": 2.7864, + "step": 9093 + }, + { + "epoch": 0.2696675859205883, + "grad_norm": 0.12934903800487518, + "learning_rate": 0.00083969610189809, + "loss": 2.7613, + "step": 9094 + }, + { + "epoch": 0.2696972392728998, + "grad_norm": 0.10973375290632248, + "learning_rate": 0.0008396615755598566, + "loss": 2.8016, + "step": 9095 + }, + { + "epoch": 0.26972689262521127, + "grad_norm": 0.12603512406349182, + "learning_rate": 0.0008396270462138619, + "loss": 2.7755, + "step": 9096 + }, + { + "epoch": 0.26975654597752274, + "grad_norm": 0.12918682396411896, + "learning_rate": 0.0008395925138604113, + "loss": 2.7649, + "step": 9097 + }, + { + "epoch": 0.2697861993298342, + "grad_norm": 0.14650772511959076, + "learning_rate": 0.0008395579784998107, + "loss": 2.7676, + "step": 9098 + }, + { + "epoch": 0.2698158526821457, + "grad_norm": 0.161643385887146, + "learning_rate": 0.0008395234401323659, + "loss": 2.7648, + "step": 9099 + }, + { + "epoch": 0.26984550603445717, + "grad_norm": 0.15149618685245514, + "learning_rate": 0.0008394888987583826, + "loss": 2.7663, + "step": 9100 + }, + { + "epoch": 0.26987515938676865, + "grad_norm": 0.1704406589269638, + "learning_rate": 0.0008394543543781671, + "loss": 2.7721, + "step": 9101 + }, + { + "epoch": 0.2699048127390802, + "grad_norm": 0.1812477856874466, + "learning_rate": 0.000839419806992025, + "loss": 2.7497, + "step": 9102 + }, + { + "epoch": 0.26993446609139166, + "grad_norm": 0.1415240466594696, + "learning_rate": 0.000839385256600262, + "loss": 2.732, + "step": 9103 + }, + { + "epoch": 0.26996411944370313, + "grad_norm": 0.1351587325334549, + "learning_rate": 0.0008393507032031844, + "loss": 2.7533, + "step": 9104 + }, + { + "epoch": 0.2699937727960146, + "grad_norm": 0.1418009102344513, + "learning_rate": 0.0008393161468010982, + "loss": 2.8137, + "step": 9105 + }, + { + "epoch": 0.2700234261483261, + "grad_norm": 0.16117791831493378, + "learning_rate": 0.0008392815873943092, + "loss": 2.7861, + "step": 9106 + }, + { + "epoch": 0.27005307950063756, + "grad_norm": 0.16129444539546967, + "learning_rate": 0.0008392470249831235, + "loss": 2.7562, + "step": 9107 + }, + { + "epoch": 0.27008273285294904, + "grad_norm": 0.15813055634498596, + "learning_rate": 0.0008392124595678472, + "loss": 2.7735, + "step": 9108 + }, + { + "epoch": 0.2701123862052605, + "grad_norm": 0.15991921722888947, + "learning_rate": 0.0008391778911487863, + "loss": 2.7149, + "step": 9109 + }, + { + "epoch": 0.270142039557572, + "grad_norm": 0.1437235325574875, + "learning_rate": 0.000839143319726247, + "loss": 2.7779, + "step": 9110 + }, + { + "epoch": 0.27017169290988347, + "grad_norm": 0.13980507850646973, + "learning_rate": 0.0008391087453005352, + "loss": 2.7337, + "step": 9111 + }, + { + "epoch": 0.27020134626219494, + "grad_norm": 0.16320551931858063, + "learning_rate": 0.0008390741678719575, + "loss": 2.782, + "step": 9112 + }, + { + "epoch": 0.2702309996145064, + "grad_norm": 0.17090703547000885, + "learning_rate": 0.0008390395874408199, + "loss": 2.7383, + "step": 9113 + }, + { + "epoch": 0.2702606529668179, + "grad_norm": 0.15570685267448425, + "learning_rate": 0.0008390050040074284, + "loss": 2.7626, + "step": 9114 + }, + { + "epoch": 0.27029030631912937, + "grad_norm": 0.13940446078777313, + "learning_rate": 0.0008389704175720894, + "loss": 2.7642, + "step": 9115 + }, + { + "epoch": 0.27031995967144085, + "grad_norm": 0.12876537442207336, + "learning_rate": 0.0008389358281351092, + "loss": 2.7374, + "step": 9116 + }, + { + "epoch": 0.2703496130237523, + "grad_norm": 0.12709933519363403, + "learning_rate": 0.000838901235696794, + "loss": 2.7698, + "step": 9117 + }, + { + "epoch": 0.2703792663760638, + "grad_norm": 0.13469550013542175, + "learning_rate": 0.0008388666402574503, + "loss": 2.7428, + "step": 9118 + }, + { + "epoch": 0.2704089197283753, + "grad_norm": 0.14522017538547516, + "learning_rate": 0.0008388320418173843, + "loss": 2.7943, + "step": 9119 + }, + { + "epoch": 0.27043857308068675, + "grad_norm": 0.1593433916568756, + "learning_rate": 0.0008387974403769023, + "loss": 2.7565, + "step": 9120 + }, + { + "epoch": 0.2704682264329982, + "grad_norm": 0.16060468554496765, + "learning_rate": 0.0008387628359363109, + "loss": 2.7782, + "step": 9121 + }, + { + "epoch": 0.2704978797853097, + "grad_norm": 0.15952810645103455, + "learning_rate": 0.0008387282284959164, + "loss": 2.8122, + "step": 9122 + }, + { + "epoch": 0.27052753313762123, + "grad_norm": 0.14165915548801422, + "learning_rate": 0.0008386936180560254, + "loss": 2.7877, + "step": 9123 + }, + { + "epoch": 0.2705571864899327, + "grad_norm": 0.1356590986251831, + "learning_rate": 0.0008386590046169443, + "loss": 2.7953, + "step": 9124 + }, + { + "epoch": 0.2705868398422442, + "grad_norm": 0.12813277542591095, + "learning_rate": 0.0008386243881789794, + "loss": 2.7351, + "step": 9125 + }, + { + "epoch": 0.27061649319455566, + "grad_norm": 0.1220630630850792, + "learning_rate": 0.0008385897687424375, + "loss": 2.7738, + "step": 9126 + }, + { + "epoch": 0.27064614654686714, + "grad_norm": 0.11958354711532593, + "learning_rate": 0.000838555146307625, + "loss": 2.7737, + "step": 9127 + }, + { + "epoch": 0.2706757998991786, + "grad_norm": 0.12631754577159882, + "learning_rate": 0.0008385205208748487, + "loss": 2.7901, + "step": 9128 + }, + { + "epoch": 0.2707054532514901, + "grad_norm": 0.11718044430017471, + "learning_rate": 0.000838485892444415, + "loss": 2.758, + "step": 9129 + }, + { + "epoch": 0.27073510660380157, + "grad_norm": 0.13071493804454803, + "learning_rate": 0.0008384512610166307, + "loss": 2.7627, + "step": 9130 + }, + { + "epoch": 0.27076475995611304, + "grad_norm": 0.14679744839668274, + "learning_rate": 0.0008384166265918022, + "loss": 2.7348, + "step": 9131 + }, + { + "epoch": 0.2707944133084245, + "grad_norm": 0.14497309923171997, + "learning_rate": 0.0008383819891702366, + "loss": 2.7458, + "step": 9132 + }, + { + "epoch": 0.270824066660736, + "grad_norm": 0.14035135507583618, + "learning_rate": 0.0008383473487522404, + "loss": 2.7965, + "step": 9133 + }, + { + "epoch": 0.27085372001304747, + "grad_norm": 0.12543541193008423, + "learning_rate": 0.0008383127053381203, + "loss": 2.7664, + "step": 9134 + }, + { + "epoch": 0.27088337336535895, + "grad_norm": 0.12872593104839325, + "learning_rate": 0.0008382780589281831, + "loss": 2.7755, + "step": 9135 + }, + { + "epoch": 0.2709130267176704, + "grad_norm": 0.13637395203113556, + "learning_rate": 0.0008382434095227356, + "loss": 2.778, + "step": 9136 + }, + { + "epoch": 0.2709426800699819, + "grad_norm": 0.15074953436851501, + "learning_rate": 0.0008382087571220847, + "loss": 2.7891, + "step": 9137 + }, + { + "epoch": 0.2709723334222934, + "grad_norm": 0.16904489696025848, + "learning_rate": 0.0008381741017265371, + "loss": 2.7564, + "step": 9138 + }, + { + "epoch": 0.27100198677460485, + "grad_norm": 0.17212888598442078, + "learning_rate": 0.0008381394433364, + "loss": 2.7852, + "step": 9139 + }, + { + "epoch": 0.27103164012691633, + "grad_norm": 0.13280801475048065, + "learning_rate": 0.0008381047819519799, + "loss": 2.7919, + "step": 9140 + }, + { + "epoch": 0.2710612934792278, + "grad_norm": 0.13698570430278778, + "learning_rate": 0.0008380701175735841, + "loss": 2.7952, + "step": 9141 + }, + { + "epoch": 0.2710909468315393, + "grad_norm": 0.1493314951658249, + "learning_rate": 0.0008380354502015191, + "loss": 2.7833, + "step": 9142 + }, + { + "epoch": 0.27112060018385076, + "grad_norm": 0.15870440006256104, + "learning_rate": 0.0008380007798360924, + "loss": 2.8059, + "step": 9143 + }, + { + "epoch": 0.2711502535361623, + "grad_norm": 0.16097061336040497, + "learning_rate": 0.0008379661064776106, + "loss": 2.7833, + "step": 9144 + }, + { + "epoch": 0.27117990688847377, + "grad_norm": 0.16310004889965057, + "learning_rate": 0.0008379314301263811, + "loss": 2.7633, + "step": 9145 + }, + { + "epoch": 0.27120956024078524, + "grad_norm": 0.16497020423412323, + "learning_rate": 0.0008378967507827106, + "loss": 2.7459, + "step": 9146 + }, + { + "epoch": 0.2712392135930967, + "grad_norm": 0.14218449592590332, + "learning_rate": 0.0008378620684469064, + "loss": 2.7626, + "step": 9147 + }, + { + "epoch": 0.2712688669454082, + "grad_norm": 0.183176189661026, + "learning_rate": 0.0008378273831192758, + "loss": 2.7709, + "step": 9148 + }, + { + "epoch": 0.27129852029771967, + "grad_norm": 0.17010800540447235, + "learning_rate": 0.0008377926948001255, + "loss": 2.7908, + "step": 9149 + }, + { + "epoch": 0.27132817365003115, + "grad_norm": 0.13227912783622742, + "learning_rate": 0.0008377580034897631, + "loss": 2.7593, + "step": 9150 + }, + { + "epoch": 0.2713578270023426, + "grad_norm": 0.15227068960666656, + "learning_rate": 0.0008377233091884955, + "loss": 2.8048, + "step": 9151 + }, + { + "epoch": 0.2713874803546541, + "grad_norm": 0.11643572896718979, + "learning_rate": 0.00083768861189663, + "loss": 2.8016, + "step": 9152 + }, + { + "epoch": 0.2714171337069656, + "grad_norm": 0.13259842991828918, + "learning_rate": 0.000837653911614474, + "loss": 2.768, + "step": 9153 + }, + { + "epoch": 0.27144678705927705, + "grad_norm": 0.12904034554958344, + "learning_rate": 0.0008376192083423344, + "loss": 2.8035, + "step": 9154 + }, + { + "epoch": 0.2714764404115885, + "grad_norm": 0.1238260492682457, + "learning_rate": 0.0008375845020805189, + "loss": 2.7831, + "step": 9155 + }, + { + "epoch": 0.2715060937639, + "grad_norm": 0.13278000056743622, + "learning_rate": 0.0008375497928293348, + "loss": 2.7483, + "step": 9156 + }, + { + "epoch": 0.2715357471162115, + "grad_norm": 0.12026214599609375, + "learning_rate": 0.0008375150805890892, + "loss": 2.7795, + "step": 9157 + }, + { + "epoch": 0.27156540046852296, + "grad_norm": 0.12974582612514496, + "learning_rate": 0.0008374803653600898, + "loss": 2.7918, + "step": 9158 + }, + { + "epoch": 0.27159505382083443, + "grad_norm": 0.12435626983642578, + "learning_rate": 0.0008374456471426438, + "loss": 2.7564, + "step": 9159 + }, + { + "epoch": 0.2716247071731459, + "grad_norm": 0.13235408067703247, + "learning_rate": 0.0008374109259370586, + "loss": 2.8032, + "step": 9160 + }, + { + "epoch": 0.2716543605254574, + "grad_norm": 0.1387348175048828, + "learning_rate": 0.0008373762017436416, + "loss": 2.7591, + "step": 9161 + }, + { + "epoch": 0.27168401387776886, + "grad_norm": 0.15154346823692322, + "learning_rate": 0.0008373414745627006, + "loss": 2.7241, + "step": 9162 + }, + { + "epoch": 0.27171366723008034, + "grad_norm": 0.1282797008752823, + "learning_rate": 0.0008373067443945428, + "loss": 2.7687, + "step": 9163 + }, + { + "epoch": 0.2717433205823918, + "grad_norm": 0.11936761438846588, + "learning_rate": 0.0008372720112394761, + "loss": 2.7875, + "step": 9164 + }, + { + "epoch": 0.27177297393470334, + "grad_norm": 0.14478960633277893, + "learning_rate": 0.0008372372750978077, + "loss": 2.789, + "step": 9165 + }, + { + "epoch": 0.2718026272870148, + "grad_norm": 0.1301654577255249, + "learning_rate": 0.0008372025359698453, + "loss": 2.77, + "step": 9166 + }, + { + "epoch": 0.2718322806393263, + "grad_norm": 0.11454152315855026, + "learning_rate": 0.0008371677938558966, + "loss": 2.7737, + "step": 9167 + }, + { + "epoch": 0.27186193399163777, + "grad_norm": 0.12229862064123154, + "learning_rate": 0.0008371330487562692, + "loss": 2.7772, + "step": 9168 + }, + { + "epoch": 0.27189158734394925, + "grad_norm": 0.14770394563674927, + "learning_rate": 0.0008370983006712709, + "loss": 2.7669, + "step": 9169 + }, + { + "epoch": 0.2719212406962607, + "grad_norm": 0.1717119663953781, + "learning_rate": 0.0008370635496012092, + "loss": 2.7932, + "step": 9170 + }, + { + "epoch": 0.2719508940485722, + "grad_norm": 0.15617725253105164, + "learning_rate": 0.000837028795546392, + "loss": 2.7847, + "step": 9171 + }, + { + "epoch": 0.2719805474008837, + "grad_norm": 0.14206041395664215, + "learning_rate": 0.0008369940385071268, + "loss": 2.7784, + "step": 9172 + }, + { + "epoch": 0.27201020075319515, + "grad_norm": 0.13279768824577332, + "learning_rate": 0.0008369592784837216, + "loss": 2.7831, + "step": 9173 + }, + { + "epoch": 0.27203985410550663, + "grad_norm": 0.14869320392608643, + "learning_rate": 0.0008369245154764842, + "loss": 2.7872, + "step": 9174 + }, + { + "epoch": 0.2720695074578181, + "grad_norm": 0.13320115208625793, + "learning_rate": 0.0008368897494857223, + "loss": 2.7822, + "step": 9175 + }, + { + "epoch": 0.2720991608101296, + "grad_norm": 0.11026985943317413, + "learning_rate": 0.000836854980511744, + "loss": 2.7635, + "step": 9176 + }, + { + "epoch": 0.27212881416244106, + "grad_norm": 0.12571121752262115, + "learning_rate": 0.0008368202085548568, + "loss": 2.771, + "step": 9177 + }, + { + "epoch": 0.27215846751475253, + "grad_norm": 0.12914659082889557, + "learning_rate": 0.000836785433615369, + "loss": 2.7243, + "step": 9178 + }, + { + "epoch": 0.272188120867064, + "grad_norm": 0.11796267330646515, + "learning_rate": 0.0008367506556935884, + "loss": 2.8155, + "step": 9179 + }, + { + "epoch": 0.2722177742193755, + "grad_norm": 0.12858299911022186, + "learning_rate": 0.000836715874789823, + "loss": 2.7407, + "step": 9180 + }, + { + "epoch": 0.27224742757168696, + "grad_norm": 0.14279994368553162, + "learning_rate": 0.0008366810909043805, + "loss": 2.789, + "step": 9181 + }, + { + "epoch": 0.27227708092399844, + "grad_norm": 0.14715264737606049, + "learning_rate": 0.0008366463040375693, + "loss": 2.7917, + "step": 9182 + }, + { + "epoch": 0.2723067342763099, + "grad_norm": 0.15950730443000793, + "learning_rate": 0.0008366115141896972, + "loss": 2.8024, + "step": 9183 + }, + { + "epoch": 0.2723363876286214, + "grad_norm": 0.15171551704406738, + "learning_rate": 0.0008365767213610726, + "loss": 2.8018, + "step": 9184 + }, + { + "epoch": 0.2723660409809329, + "grad_norm": 0.14793725311756134, + "learning_rate": 0.0008365419255520031, + "loss": 2.77, + "step": 9185 + }, + { + "epoch": 0.2723956943332444, + "grad_norm": 0.15458454191684723, + "learning_rate": 0.0008365071267627973, + "loss": 2.761, + "step": 9186 + }, + { + "epoch": 0.2724253476855559, + "grad_norm": 0.1430247277021408, + "learning_rate": 0.0008364723249937629, + "loss": 2.7746, + "step": 9187 + }, + { + "epoch": 0.27245500103786735, + "grad_norm": 0.14554983377456665, + "learning_rate": 0.0008364375202452083, + "loss": 2.7576, + "step": 9188 + }, + { + "epoch": 0.2724846543901788, + "grad_norm": 0.15759001672267914, + "learning_rate": 0.0008364027125174419, + "loss": 2.7778, + "step": 9189 + }, + { + "epoch": 0.2725143077424903, + "grad_norm": 0.1634770780801773, + "learning_rate": 0.0008363679018107718, + "loss": 2.818, + "step": 9190 + }, + { + "epoch": 0.2725439610948018, + "grad_norm": 0.14816710352897644, + "learning_rate": 0.0008363330881255059, + "loss": 2.7601, + "step": 9191 + }, + { + "epoch": 0.27257361444711325, + "grad_norm": 0.12238233536481857, + "learning_rate": 0.0008362982714619529, + "loss": 2.7912, + "step": 9192 + }, + { + "epoch": 0.27260326779942473, + "grad_norm": 0.12613558769226074, + "learning_rate": 0.0008362634518204211, + "loss": 2.7891, + "step": 9193 + }, + { + "epoch": 0.2726329211517362, + "grad_norm": 0.1459135115146637, + "learning_rate": 0.0008362286292012185, + "loss": 2.7649, + "step": 9194 + }, + { + "epoch": 0.2726625745040477, + "grad_norm": 0.15048079192638397, + "learning_rate": 0.0008361938036046539, + "loss": 2.7847, + "step": 9195 + }, + { + "epoch": 0.27269222785635916, + "grad_norm": 0.15439705550670624, + "learning_rate": 0.0008361589750310353, + "loss": 2.7608, + "step": 9196 + }, + { + "epoch": 0.27272188120867064, + "grad_norm": 0.14484506845474243, + "learning_rate": 0.0008361241434806714, + "loss": 2.8144, + "step": 9197 + }, + { + "epoch": 0.2727515345609821, + "grad_norm": 0.1459444910287857, + "learning_rate": 0.0008360893089538703, + "loss": 2.7816, + "step": 9198 + }, + { + "epoch": 0.2727811879132936, + "grad_norm": 0.14541810750961304, + "learning_rate": 0.0008360544714509409, + "loss": 2.7637, + "step": 9199 + }, + { + "epoch": 0.27281084126560506, + "grad_norm": 0.14379724860191345, + "learning_rate": 0.0008360196309721915, + "loss": 2.821, + "step": 9200 + }, + { + "epoch": 0.27284049461791654, + "grad_norm": 0.15334154665470123, + "learning_rate": 0.0008359847875179304, + "loss": 2.7757, + "step": 9201 + }, + { + "epoch": 0.272870147970228, + "grad_norm": 0.13686588406562805, + "learning_rate": 0.0008359499410884665, + "loss": 2.7314, + "step": 9202 + }, + { + "epoch": 0.2728998013225395, + "grad_norm": 0.15264639258384705, + "learning_rate": 0.000835915091684108, + "loss": 2.7952, + "step": 9203 + }, + { + "epoch": 0.27292945467485097, + "grad_norm": 0.1421305239200592, + "learning_rate": 0.0008358802393051639, + "loss": 2.7319, + "step": 9204 + }, + { + "epoch": 0.27295910802716244, + "grad_norm": 0.14022818207740784, + "learning_rate": 0.0008358453839519426, + "loss": 2.768, + "step": 9205 + }, + { + "epoch": 0.272988761379474, + "grad_norm": 0.15444731712341309, + "learning_rate": 0.0008358105256247527, + "loss": 2.758, + "step": 9206 + }, + { + "epoch": 0.27301841473178545, + "grad_norm": 0.16653570532798767, + "learning_rate": 0.0008357756643239029, + "loss": 2.7487, + "step": 9207 + }, + { + "epoch": 0.27304806808409693, + "grad_norm": 0.15752451121807098, + "learning_rate": 0.0008357408000497022, + "loss": 2.7751, + "step": 9208 + }, + { + "epoch": 0.2730777214364084, + "grad_norm": 0.1425452083349228, + "learning_rate": 0.000835705932802459, + "loss": 2.8185, + "step": 9209 + }, + { + "epoch": 0.2731073747887199, + "grad_norm": 0.14813996851444244, + "learning_rate": 0.0008356710625824819, + "loss": 2.7636, + "step": 9210 + }, + { + "epoch": 0.27313702814103136, + "grad_norm": 0.16267947852611542, + "learning_rate": 0.0008356361893900803, + "loss": 2.7715, + "step": 9211 + }, + { + "epoch": 0.27316668149334283, + "grad_norm": 0.120135098695755, + "learning_rate": 0.0008356013132255624, + "loss": 2.7649, + "step": 9212 + }, + { + "epoch": 0.2731963348456543, + "grad_norm": 0.13787567615509033, + "learning_rate": 0.0008355664340892373, + "loss": 2.781, + "step": 9213 + }, + { + "epoch": 0.2732259881979658, + "grad_norm": 0.14923985302448273, + "learning_rate": 0.0008355315519814137, + "loss": 2.7302, + "step": 9214 + }, + { + "epoch": 0.27325564155027726, + "grad_norm": 0.13170498609542847, + "learning_rate": 0.0008354966669024008, + "loss": 2.8058, + "step": 9215 + }, + { + "epoch": 0.27328529490258874, + "grad_norm": 0.15733855962753296, + "learning_rate": 0.0008354617788525074, + "loss": 2.7908, + "step": 9216 + }, + { + "epoch": 0.2733149482549002, + "grad_norm": 0.1435629427433014, + "learning_rate": 0.0008354268878320422, + "loss": 2.8163, + "step": 9217 + }, + { + "epoch": 0.2733446016072117, + "grad_norm": 0.16505584120750427, + "learning_rate": 0.0008353919938413144, + "loss": 2.7932, + "step": 9218 + }, + { + "epoch": 0.27337425495952317, + "grad_norm": 0.15756675601005554, + "learning_rate": 0.0008353570968806328, + "loss": 2.7661, + "step": 9219 + }, + { + "epoch": 0.27340390831183464, + "grad_norm": 0.1520235389471054, + "learning_rate": 0.0008353221969503066, + "loss": 2.7581, + "step": 9220 + }, + { + "epoch": 0.2734335616641461, + "grad_norm": 0.15246249735355377, + "learning_rate": 0.0008352872940506448, + "loss": 2.7807, + "step": 9221 + }, + { + "epoch": 0.2734632150164576, + "grad_norm": 0.14010941982269287, + "learning_rate": 0.0008352523881819566, + "loss": 2.7749, + "step": 9222 + }, + { + "epoch": 0.27349286836876907, + "grad_norm": 0.14785532653331757, + "learning_rate": 0.0008352174793445508, + "loss": 2.8002, + "step": 9223 + }, + { + "epoch": 0.27352252172108055, + "grad_norm": 0.12808731198310852, + "learning_rate": 0.0008351825675387368, + "loss": 2.7592, + "step": 9224 + }, + { + "epoch": 0.273552175073392, + "grad_norm": 0.14523759484291077, + "learning_rate": 0.0008351476527648236, + "loss": 2.7542, + "step": 9225 + }, + { + "epoch": 0.2735818284257035, + "grad_norm": 0.15887705981731415, + "learning_rate": 0.0008351127350231202, + "loss": 2.7788, + "step": 9226 + }, + { + "epoch": 0.27361148177801503, + "grad_norm": 0.15427567064762115, + "learning_rate": 0.0008350778143139363, + "loss": 2.7551, + "step": 9227 + }, + { + "epoch": 0.2736411351303265, + "grad_norm": 0.1438121646642685, + "learning_rate": 0.0008350428906375806, + "loss": 2.7386, + "step": 9228 + }, + { + "epoch": 0.273670788482638, + "grad_norm": 0.13468968868255615, + "learning_rate": 0.0008350079639943629, + "loss": 2.7678, + "step": 9229 + }, + { + "epoch": 0.27370044183494946, + "grad_norm": 0.11581642180681229, + "learning_rate": 0.0008349730343845919, + "loss": 2.7549, + "step": 9230 + }, + { + "epoch": 0.27373009518726094, + "grad_norm": 0.13469421863555908, + "learning_rate": 0.0008349381018085773, + "loss": 2.7894, + "step": 9231 + }, + { + "epoch": 0.2737597485395724, + "grad_norm": 0.14466717839241028, + "learning_rate": 0.0008349031662666282, + "loss": 2.7934, + "step": 9232 + }, + { + "epoch": 0.2737894018918839, + "grad_norm": 0.13663846254348755, + "learning_rate": 0.0008348682277590542, + "loss": 2.7393, + "step": 9233 + }, + { + "epoch": 0.27381905524419536, + "grad_norm": 0.13323119282722473, + "learning_rate": 0.0008348332862861645, + "loss": 2.759, + "step": 9234 + }, + { + "epoch": 0.27384870859650684, + "grad_norm": 0.12427689880132675, + "learning_rate": 0.0008347983418482686, + "loss": 2.7744, + "step": 9235 + }, + { + "epoch": 0.2738783619488183, + "grad_norm": 0.1418977826833725, + "learning_rate": 0.000834763394445676, + "loss": 2.7959, + "step": 9236 + }, + { + "epoch": 0.2739080153011298, + "grad_norm": 0.15285025537014008, + "learning_rate": 0.000834728444078696, + "loss": 2.7583, + "step": 9237 + }, + { + "epoch": 0.27393766865344127, + "grad_norm": 0.15149621665477753, + "learning_rate": 0.0008346934907476382, + "loss": 2.7488, + "step": 9238 + }, + { + "epoch": 0.27396732200575274, + "grad_norm": 0.13408876955509186, + "learning_rate": 0.0008346585344528119, + "loss": 2.8049, + "step": 9239 + }, + { + "epoch": 0.2739969753580642, + "grad_norm": 0.13063238561153412, + "learning_rate": 0.000834623575194527, + "loss": 2.7657, + "step": 9240 + }, + { + "epoch": 0.2740266287103757, + "grad_norm": 0.12501530349254608, + "learning_rate": 0.000834588612973093, + "loss": 2.7827, + "step": 9241 + }, + { + "epoch": 0.2740562820626872, + "grad_norm": 0.13412241637706757, + "learning_rate": 0.0008345536477888193, + "loss": 2.7935, + "step": 9242 + }, + { + "epoch": 0.27408593541499865, + "grad_norm": 0.12974217534065247, + "learning_rate": 0.0008345186796420156, + "loss": 2.7938, + "step": 9243 + }, + { + "epoch": 0.2741155887673101, + "grad_norm": 0.13564717769622803, + "learning_rate": 0.0008344837085329917, + "loss": 2.7688, + "step": 9244 + }, + { + "epoch": 0.2741452421196216, + "grad_norm": 0.14080196619033813, + "learning_rate": 0.0008344487344620569, + "loss": 2.7885, + "step": 9245 + }, + { + "epoch": 0.2741748954719331, + "grad_norm": 0.12578940391540527, + "learning_rate": 0.0008344137574295214, + "loss": 2.7607, + "step": 9246 + }, + { + "epoch": 0.27420454882424455, + "grad_norm": 0.11232810467481613, + "learning_rate": 0.0008343787774356946, + "loss": 2.7915, + "step": 9247 + }, + { + "epoch": 0.2742342021765561, + "grad_norm": 0.13896812498569489, + "learning_rate": 0.0008343437944808862, + "loss": 2.7646, + "step": 9248 + }, + { + "epoch": 0.27426385552886756, + "grad_norm": 0.16428092122077942, + "learning_rate": 0.0008343088085654062, + "loss": 2.7835, + "step": 9249 + }, + { + "epoch": 0.27429350888117904, + "grad_norm": 0.15952692925930023, + "learning_rate": 0.0008342738196895644, + "loss": 2.8189, + "step": 9250 + }, + { + "epoch": 0.2743231622334905, + "grad_norm": 0.1445719450712204, + "learning_rate": 0.0008342388278536703, + "loss": 2.7804, + "step": 9251 + }, + { + "epoch": 0.274352815585802, + "grad_norm": 0.17355482280254364, + "learning_rate": 0.0008342038330580343, + "loss": 2.7942, + "step": 9252 + }, + { + "epoch": 0.27438246893811347, + "grad_norm": 0.1792811155319214, + "learning_rate": 0.0008341688353029659, + "loss": 2.8166, + "step": 9253 + }, + { + "epoch": 0.27441212229042494, + "grad_norm": 0.1629849225282669, + "learning_rate": 0.000834133834588775, + "loss": 2.7534, + "step": 9254 + }, + { + "epoch": 0.2744417756427364, + "grad_norm": 0.17353756725788116, + "learning_rate": 0.0008340988309157718, + "loss": 2.7681, + "step": 9255 + }, + { + "epoch": 0.2744714289950479, + "grad_norm": 0.18869014084339142, + "learning_rate": 0.000834063824284266, + "loss": 2.8026, + "step": 9256 + }, + { + "epoch": 0.27450108234735937, + "grad_norm": 0.17345167696475983, + "learning_rate": 0.0008340288146945678, + "loss": 2.7816, + "step": 9257 + }, + { + "epoch": 0.27453073569967085, + "grad_norm": 0.17061565816402435, + "learning_rate": 0.000833993802146987, + "loss": 2.7891, + "step": 9258 + }, + { + "epoch": 0.2745603890519823, + "grad_norm": 0.16556884348392487, + "learning_rate": 0.0008339587866418338, + "loss": 2.7771, + "step": 9259 + }, + { + "epoch": 0.2745900424042938, + "grad_norm": 0.15087661147117615, + "learning_rate": 0.0008339237681794182, + "loss": 2.8143, + "step": 9260 + }, + { + "epoch": 0.2746196957566053, + "grad_norm": 0.1559971570968628, + "learning_rate": 0.0008338887467600502, + "loss": 2.7699, + "step": 9261 + }, + { + "epoch": 0.27464934910891675, + "grad_norm": 0.14584152400493622, + "learning_rate": 0.0008338537223840403, + "loss": 2.8111, + "step": 9262 + }, + { + "epoch": 0.2746790024612282, + "grad_norm": 0.12805475294589996, + "learning_rate": 0.0008338186950516981, + "loss": 2.7829, + "step": 9263 + }, + { + "epoch": 0.2747086558135397, + "grad_norm": 0.14112994074821472, + "learning_rate": 0.0008337836647633344, + "loss": 2.7826, + "step": 9264 + }, + { + "epoch": 0.2747383091658512, + "grad_norm": 0.12709097564220428, + "learning_rate": 0.0008337486315192587, + "loss": 2.724, + "step": 9265 + }, + { + "epoch": 0.27476796251816266, + "grad_norm": 0.13888458907604218, + "learning_rate": 0.0008337135953197819, + "loss": 2.7707, + "step": 9266 + }, + { + "epoch": 0.27479761587047413, + "grad_norm": 0.13128143548965454, + "learning_rate": 0.0008336785561652136, + "loss": 2.7607, + "step": 9267 + }, + { + "epoch": 0.2748272692227856, + "grad_norm": 0.1342756748199463, + "learning_rate": 0.0008336435140558647, + "loss": 2.8411, + "step": 9268 + }, + { + "epoch": 0.27485692257509714, + "grad_norm": 0.1287342607975006, + "learning_rate": 0.0008336084689920451, + "loss": 2.7904, + "step": 9269 + }, + { + "epoch": 0.2748865759274086, + "grad_norm": 0.1261046975851059, + "learning_rate": 0.0008335734209740652, + "loss": 2.7563, + "step": 9270 + }, + { + "epoch": 0.2749162292797201, + "grad_norm": 0.11709538102149963, + "learning_rate": 0.0008335383700022354, + "loss": 2.79, + "step": 9271 + }, + { + "epoch": 0.27494588263203157, + "grad_norm": 0.11928924918174744, + "learning_rate": 0.0008335033160768662, + "loss": 2.7409, + "step": 9272 + }, + { + "epoch": 0.27497553598434304, + "grad_norm": 0.1423492580652237, + "learning_rate": 0.0008334682591982677, + "loss": 2.7258, + "step": 9273 + }, + { + "epoch": 0.2750051893366545, + "grad_norm": 0.1439565122127533, + "learning_rate": 0.0008334331993667506, + "loss": 2.7636, + "step": 9274 + }, + { + "epoch": 0.275034842688966, + "grad_norm": 0.12693294882774353, + "learning_rate": 0.0008333981365826253, + "loss": 2.7838, + "step": 9275 + }, + { + "epoch": 0.2750644960412775, + "grad_norm": 0.1089983806014061, + "learning_rate": 0.0008333630708462024, + "loss": 2.7421, + "step": 9276 + }, + { + "epoch": 0.27509414939358895, + "grad_norm": 0.10792676359415054, + "learning_rate": 0.000833328002157792, + "loss": 2.7639, + "step": 9277 + }, + { + "epoch": 0.2751238027459004, + "grad_norm": 0.11615826934576035, + "learning_rate": 0.0008332929305177052, + "loss": 2.7733, + "step": 9278 + }, + { + "epoch": 0.2751534560982119, + "grad_norm": 0.09729462116956711, + "learning_rate": 0.0008332578559262523, + "loss": 2.7593, + "step": 9279 + }, + { + "epoch": 0.2751831094505234, + "grad_norm": 0.11754119396209717, + "learning_rate": 0.0008332227783837437, + "loss": 2.7542, + "step": 9280 + }, + { + "epoch": 0.27521276280283485, + "grad_norm": 0.13094370067119598, + "learning_rate": 0.0008331876978904903, + "loss": 2.7908, + "step": 9281 + }, + { + "epoch": 0.27524241615514633, + "grad_norm": 0.1578371226787567, + "learning_rate": 0.0008331526144468027, + "loss": 2.7749, + "step": 9282 + }, + { + "epoch": 0.2752720695074578, + "grad_norm": 0.17909739911556244, + "learning_rate": 0.0008331175280529915, + "loss": 2.7772, + "step": 9283 + }, + { + "epoch": 0.2753017228597693, + "grad_norm": 0.1929299533367157, + "learning_rate": 0.0008330824387093672, + "loss": 2.7955, + "step": 9284 + }, + { + "epoch": 0.27533137621208076, + "grad_norm": 0.1596265733242035, + "learning_rate": 0.0008330473464162409, + "loss": 2.7664, + "step": 9285 + }, + { + "epoch": 0.27536102956439223, + "grad_norm": 0.1537838578224182, + "learning_rate": 0.0008330122511739231, + "loss": 2.7816, + "step": 9286 + }, + { + "epoch": 0.2753906829167037, + "grad_norm": 0.17545568943023682, + "learning_rate": 0.0008329771529827248, + "loss": 2.7898, + "step": 9287 + }, + { + "epoch": 0.2754203362690152, + "grad_norm": 0.18512730300426483, + "learning_rate": 0.0008329420518429566, + "loss": 2.7801, + "step": 9288 + }, + { + "epoch": 0.2754499896213267, + "grad_norm": 0.1710640788078308, + "learning_rate": 0.0008329069477549293, + "loss": 2.8012, + "step": 9289 + }, + { + "epoch": 0.2754796429736382, + "grad_norm": 0.1692173182964325, + "learning_rate": 0.000832871840718954, + "loss": 2.7942, + "step": 9290 + }, + { + "epoch": 0.27550929632594967, + "grad_norm": 0.15546011924743652, + "learning_rate": 0.0008328367307353412, + "loss": 2.8034, + "step": 9291 + }, + { + "epoch": 0.27553894967826115, + "grad_norm": 0.13472111523151398, + "learning_rate": 0.0008328016178044022, + "loss": 2.766, + "step": 9292 + }, + { + "epoch": 0.2755686030305726, + "grad_norm": 0.13919579982757568, + "learning_rate": 0.0008327665019264476, + "loss": 2.7872, + "step": 9293 + }, + { + "epoch": 0.2755982563828841, + "grad_norm": 0.1226266548037529, + "learning_rate": 0.0008327313831017886, + "loss": 2.7621, + "step": 9294 + }, + { + "epoch": 0.2756279097351956, + "grad_norm": 0.12657217681407928, + "learning_rate": 0.0008326962613307361, + "loss": 2.7769, + "step": 9295 + }, + { + "epoch": 0.27565756308750705, + "grad_norm": 0.12914225459098816, + "learning_rate": 0.000832661136613601, + "loss": 2.735, + "step": 9296 + }, + { + "epoch": 0.2756872164398185, + "grad_norm": 0.13661576807498932, + "learning_rate": 0.0008326260089506945, + "loss": 2.7509, + "step": 9297 + }, + { + "epoch": 0.27571686979213, + "grad_norm": 0.1331157386302948, + "learning_rate": 0.0008325908783423276, + "loss": 2.7581, + "step": 9298 + }, + { + "epoch": 0.2757465231444415, + "grad_norm": 0.13291718065738678, + "learning_rate": 0.0008325557447888115, + "loss": 2.7476, + "step": 9299 + }, + { + "epoch": 0.27577617649675296, + "grad_norm": 0.12854501605033875, + "learning_rate": 0.0008325206082904571, + "loss": 2.7587, + "step": 9300 + }, + { + "epoch": 0.27580582984906443, + "grad_norm": 0.13131098449230194, + "learning_rate": 0.0008324854688475756, + "loss": 2.7662, + "step": 9301 + }, + { + "epoch": 0.2758354832013759, + "grad_norm": 0.1354662925004959, + "learning_rate": 0.0008324503264604781, + "loss": 2.7689, + "step": 9302 + }, + { + "epoch": 0.2758651365536874, + "grad_norm": 0.14737968146800995, + "learning_rate": 0.000832415181129476, + "loss": 2.7738, + "step": 9303 + }, + { + "epoch": 0.27589478990599886, + "grad_norm": 0.15873503684997559, + "learning_rate": 0.0008323800328548805, + "loss": 2.7503, + "step": 9304 + }, + { + "epoch": 0.27592444325831034, + "grad_norm": 0.1784439980983734, + "learning_rate": 0.0008323448816370027, + "loss": 2.7813, + "step": 9305 + }, + { + "epoch": 0.2759540966106218, + "grad_norm": 0.16030706465244293, + "learning_rate": 0.000832309727476154, + "loss": 2.7494, + "step": 9306 + }, + { + "epoch": 0.2759837499629333, + "grad_norm": 0.14271564781665802, + "learning_rate": 0.0008322745703726454, + "loss": 2.7535, + "step": 9307 + }, + { + "epoch": 0.27601340331524477, + "grad_norm": 0.1510322540998459, + "learning_rate": 0.0008322394103267886, + "loss": 2.8203, + "step": 9308 + }, + { + "epoch": 0.27604305666755624, + "grad_norm": 0.15528403222560883, + "learning_rate": 0.0008322042473388949, + "loss": 2.7393, + "step": 9309 + }, + { + "epoch": 0.2760727100198678, + "grad_norm": 0.14856307208538055, + "learning_rate": 0.0008321690814092753, + "loss": 2.7712, + "step": 9310 + }, + { + "epoch": 0.27610236337217925, + "grad_norm": 0.14101047813892365, + "learning_rate": 0.0008321339125382417, + "loss": 2.7448, + "step": 9311 + }, + { + "epoch": 0.2761320167244907, + "grad_norm": 0.16774001717567444, + "learning_rate": 0.0008320987407261051, + "loss": 2.7688, + "step": 9312 + }, + { + "epoch": 0.2761616700768022, + "grad_norm": 0.15316903591156006, + "learning_rate": 0.0008320635659731773, + "loss": 2.7515, + "step": 9313 + }, + { + "epoch": 0.2761913234291137, + "grad_norm": 0.1249755248427391, + "learning_rate": 0.0008320283882797695, + "loss": 2.7419, + "step": 9314 + }, + { + "epoch": 0.27622097678142515, + "grad_norm": 0.13539119064807892, + "learning_rate": 0.0008319932076461936, + "loss": 2.7646, + "step": 9315 + }, + { + "epoch": 0.27625063013373663, + "grad_norm": 0.13577900826931, + "learning_rate": 0.0008319580240727604, + "loss": 2.7792, + "step": 9316 + }, + { + "epoch": 0.2762802834860481, + "grad_norm": 0.1671394258737564, + "learning_rate": 0.0008319228375597823, + "loss": 2.7762, + "step": 9317 + }, + { + "epoch": 0.2763099368383596, + "grad_norm": 0.17321528494358063, + "learning_rate": 0.0008318876481075703, + "loss": 2.7653, + "step": 9318 + }, + { + "epoch": 0.27633959019067106, + "grad_norm": 0.14341284334659576, + "learning_rate": 0.0008318524557164364, + "loss": 2.7833, + "step": 9319 + }, + { + "epoch": 0.27636924354298253, + "grad_norm": 0.10927271097898483, + "learning_rate": 0.0008318172603866919, + "loss": 2.7761, + "step": 9320 + }, + { + "epoch": 0.276398896895294, + "grad_norm": 0.10944513976573944, + "learning_rate": 0.0008317820621186488, + "loss": 2.7883, + "step": 9321 + }, + { + "epoch": 0.2764285502476055, + "grad_norm": 0.12162879854440689, + "learning_rate": 0.0008317468609126183, + "loss": 2.816, + "step": 9322 + }, + { + "epoch": 0.27645820359991696, + "grad_norm": 0.1320219486951828, + "learning_rate": 0.0008317116567689124, + "loss": 2.7677, + "step": 9323 + }, + { + "epoch": 0.27648785695222844, + "grad_norm": 0.13169531524181366, + "learning_rate": 0.0008316764496878431, + "loss": 2.786, + "step": 9324 + }, + { + "epoch": 0.2765175103045399, + "grad_norm": 0.14909693598747253, + "learning_rate": 0.0008316412396697217, + "loss": 2.7873, + "step": 9325 + }, + { + "epoch": 0.2765471636568514, + "grad_norm": 0.13888631761074066, + "learning_rate": 0.0008316060267148604, + "loss": 2.7518, + "step": 9326 + }, + { + "epoch": 0.27657681700916287, + "grad_norm": 0.15016427636146545, + "learning_rate": 0.0008315708108235706, + "loss": 2.7698, + "step": 9327 + }, + { + "epoch": 0.27660647036147434, + "grad_norm": 0.14705514907836914, + "learning_rate": 0.0008315355919961644, + "loss": 2.7977, + "step": 9328 + }, + { + "epoch": 0.2766361237137858, + "grad_norm": 0.1474228799343109, + "learning_rate": 0.0008315003702329538, + "loss": 2.8048, + "step": 9329 + }, + { + "epoch": 0.2766657770660973, + "grad_norm": 0.15880131721496582, + "learning_rate": 0.0008314651455342503, + "loss": 2.7992, + "step": 9330 + }, + { + "epoch": 0.2766954304184088, + "grad_norm": 0.1509322077035904, + "learning_rate": 0.0008314299179003661, + "loss": 2.7747, + "step": 9331 + }, + { + "epoch": 0.2767250837707203, + "grad_norm": 0.13876764476299286, + "learning_rate": 0.0008313946873316131, + "loss": 2.7695, + "step": 9332 + }, + { + "epoch": 0.2767547371230318, + "grad_norm": 0.12884412705898285, + "learning_rate": 0.0008313594538283033, + "loss": 2.7905, + "step": 9333 + }, + { + "epoch": 0.27678439047534326, + "grad_norm": 0.13055238127708435, + "learning_rate": 0.0008313242173907487, + "loss": 2.7722, + "step": 9334 + }, + { + "epoch": 0.27681404382765473, + "grad_norm": 0.1301439106464386, + "learning_rate": 0.0008312889780192612, + "loss": 2.7111, + "step": 9335 + }, + { + "epoch": 0.2768436971799662, + "grad_norm": 0.13142088055610657, + "learning_rate": 0.000831253735714153, + "loss": 2.7519, + "step": 9336 + }, + { + "epoch": 0.2768733505322777, + "grad_norm": 0.11563281714916229, + "learning_rate": 0.0008312184904757361, + "loss": 2.7829, + "step": 9337 + }, + { + "epoch": 0.27690300388458916, + "grad_norm": 0.10934474319219589, + "learning_rate": 0.0008311832423043226, + "loss": 2.7369, + "step": 9338 + }, + { + "epoch": 0.27693265723690064, + "grad_norm": 0.11103812605142593, + "learning_rate": 0.0008311479912002246, + "loss": 2.7821, + "step": 9339 + }, + { + "epoch": 0.2769623105892121, + "grad_norm": 0.1352771371603012, + "learning_rate": 0.0008311127371637544, + "loss": 2.7574, + "step": 9340 + }, + { + "epoch": 0.2769919639415236, + "grad_norm": 0.14014969766139984, + "learning_rate": 0.0008310774801952239, + "loss": 2.7926, + "step": 9341 + }, + { + "epoch": 0.27702161729383507, + "grad_norm": 0.14253270626068115, + "learning_rate": 0.0008310422202949456, + "loss": 2.7949, + "step": 9342 + }, + { + "epoch": 0.27705127064614654, + "grad_norm": 0.1325530707836151, + "learning_rate": 0.0008310069574632315, + "loss": 2.7441, + "step": 9343 + }, + { + "epoch": 0.277080923998458, + "grad_norm": 0.1243763267993927, + "learning_rate": 0.0008309716917003942, + "loss": 2.7688, + "step": 9344 + }, + { + "epoch": 0.2771105773507695, + "grad_norm": 0.10417038202285767, + "learning_rate": 0.0008309364230067456, + "loss": 2.7604, + "step": 9345 + }, + { + "epoch": 0.27714023070308097, + "grad_norm": 0.12004322558641434, + "learning_rate": 0.0008309011513825983, + "loss": 2.7573, + "step": 9346 + }, + { + "epoch": 0.27716988405539245, + "grad_norm": 0.12903393805027008, + "learning_rate": 0.0008308658768282644, + "loss": 2.7754, + "step": 9347 + }, + { + "epoch": 0.2771995374077039, + "grad_norm": 0.13034740090370178, + "learning_rate": 0.0008308305993440563, + "loss": 2.7725, + "step": 9348 + }, + { + "epoch": 0.2772291907600154, + "grad_norm": 0.13632944226264954, + "learning_rate": 0.0008307953189302866, + "loss": 2.781, + "step": 9349 + }, + { + "epoch": 0.2772588441123269, + "grad_norm": 0.1467476338148117, + "learning_rate": 0.0008307600355872676, + "loss": 2.7833, + "step": 9350 + }, + { + "epoch": 0.27728849746463835, + "grad_norm": 0.15224243700504303, + "learning_rate": 0.0008307247493153115, + "loss": 2.7825, + "step": 9351 + }, + { + "epoch": 0.2773181508169499, + "grad_norm": 0.14507973194122314, + "learning_rate": 0.0008306894601147312, + "loss": 2.7883, + "step": 9352 + }, + { + "epoch": 0.27734780416926136, + "grad_norm": 0.12934216856956482, + "learning_rate": 0.0008306541679858388, + "loss": 2.7812, + "step": 9353 + }, + { + "epoch": 0.27737745752157283, + "grad_norm": 0.11876314133405685, + "learning_rate": 0.000830618872928947, + "loss": 2.7936, + "step": 9354 + }, + { + "epoch": 0.2774071108738843, + "grad_norm": 0.15347516536712646, + "learning_rate": 0.0008305835749443685, + "loss": 2.7873, + "step": 9355 + }, + { + "epoch": 0.2774367642261958, + "grad_norm": 0.16453279554843903, + "learning_rate": 0.0008305482740324155, + "loss": 2.7761, + "step": 9356 + }, + { + "epoch": 0.27746641757850726, + "grad_norm": 0.15100765228271484, + "learning_rate": 0.0008305129701934009, + "loss": 2.7953, + "step": 9357 + }, + { + "epoch": 0.27749607093081874, + "grad_norm": 0.15002039074897766, + "learning_rate": 0.0008304776634276372, + "loss": 2.7752, + "step": 9358 + }, + { + "epoch": 0.2775257242831302, + "grad_norm": 0.14275600016117096, + "learning_rate": 0.0008304423537354371, + "loss": 2.7954, + "step": 9359 + }, + { + "epoch": 0.2775553776354417, + "grad_norm": 0.12492257356643677, + "learning_rate": 0.0008304070411171132, + "loss": 2.756, + "step": 9360 + }, + { + "epoch": 0.27758503098775317, + "grad_norm": 0.13391461968421936, + "learning_rate": 0.0008303717255729781, + "loss": 2.7846, + "step": 9361 + }, + { + "epoch": 0.27761468434006464, + "grad_norm": 0.13119447231292725, + "learning_rate": 0.0008303364071033448, + "loss": 2.798, + "step": 9362 + }, + { + "epoch": 0.2776443376923761, + "grad_norm": 0.11980824172496796, + "learning_rate": 0.000830301085708526, + "loss": 2.7515, + "step": 9363 + }, + { + "epoch": 0.2776739910446876, + "grad_norm": 0.11706209182739258, + "learning_rate": 0.0008302657613888342, + "loss": 2.7675, + "step": 9364 + }, + { + "epoch": 0.27770364439699907, + "grad_norm": 0.13024629652500153, + "learning_rate": 0.0008302304341445825, + "loss": 2.7546, + "step": 9365 + }, + { + "epoch": 0.27773329774931055, + "grad_norm": 0.13570839166641235, + "learning_rate": 0.0008301951039760838, + "loss": 2.777, + "step": 9366 + }, + { + "epoch": 0.277762951101622, + "grad_norm": 0.16482608020305634, + "learning_rate": 0.0008301597708836504, + "loss": 2.7671, + "step": 9367 + }, + { + "epoch": 0.2777926044539335, + "grad_norm": 0.1890353560447693, + "learning_rate": 0.0008301244348675958, + "loss": 2.7642, + "step": 9368 + }, + { + "epoch": 0.277822257806245, + "grad_norm": 0.20747241377830505, + "learning_rate": 0.0008300890959282326, + "loss": 2.7725, + "step": 9369 + }, + { + "epoch": 0.27785191115855645, + "grad_norm": 0.1895798146724701, + "learning_rate": 0.0008300537540658738, + "loss": 2.7386, + "step": 9370 + }, + { + "epoch": 0.27788156451086793, + "grad_norm": 0.15373283624649048, + "learning_rate": 0.0008300184092808325, + "loss": 2.7683, + "step": 9371 + }, + { + "epoch": 0.2779112178631794, + "grad_norm": 0.16378739476203918, + "learning_rate": 0.0008299830615734214, + "loss": 2.7719, + "step": 9372 + }, + { + "epoch": 0.27794087121549094, + "grad_norm": 0.16092976927757263, + "learning_rate": 0.0008299477109439536, + "loss": 2.768, + "step": 9373 + }, + { + "epoch": 0.2779705245678024, + "grad_norm": 0.14006437361240387, + "learning_rate": 0.0008299123573927422, + "loss": 2.7732, + "step": 9374 + }, + { + "epoch": 0.2780001779201139, + "grad_norm": 0.14264912903308868, + "learning_rate": 0.0008298770009201002, + "loss": 2.7707, + "step": 9375 + }, + { + "epoch": 0.27802983127242537, + "grad_norm": 0.14606069028377533, + "learning_rate": 0.000829841641526341, + "loss": 2.7253, + "step": 9376 + }, + { + "epoch": 0.27805948462473684, + "grad_norm": 0.1554901897907257, + "learning_rate": 0.0008298062792117772, + "loss": 2.7665, + "step": 9377 + }, + { + "epoch": 0.2780891379770483, + "grad_norm": 0.13871684670448303, + "learning_rate": 0.0008297709139767222, + "loss": 2.7786, + "step": 9378 + }, + { + "epoch": 0.2781187913293598, + "grad_norm": 0.12396441400051117, + "learning_rate": 0.0008297355458214892, + "loss": 2.7949, + "step": 9379 + }, + { + "epoch": 0.27814844468167127, + "grad_norm": 0.13923300802707672, + "learning_rate": 0.0008297001747463915, + "loss": 2.7578, + "step": 9380 + }, + { + "epoch": 0.27817809803398275, + "grad_norm": 0.12076635658740997, + "learning_rate": 0.0008296648007517417, + "loss": 2.7838, + "step": 9381 + }, + { + "epoch": 0.2782077513862942, + "grad_norm": 0.13434484601020813, + "learning_rate": 0.000829629423837854, + "loss": 2.7372, + "step": 9382 + }, + { + "epoch": 0.2782374047386057, + "grad_norm": 0.13668209314346313, + "learning_rate": 0.000829594044005041, + "loss": 2.7848, + "step": 9383 + }, + { + "epoch": 0.2782670580909172, + "grad_norm": 0.14870376884937286, + "learning_rate": 0.0008295586612536161, + "loss": 2.764, + "step": 9384 + }, + { + "epoch": 0.27829671144322865, + "grad_norm": 0.15044958889484406, + "learning_rate": 0.0008295232755838928, + "loss": 2.778, + "step": 9385 + }, + { + "epoch": 0.2783263647955401, + "grad_norm": 0.13394716382026672, + "learning_rate": 0.0008294878869961842, + "loss": 2.7685, + "step": 9386 + }, + { + "epoch": 0.2783560181478516, + "grad_norm": 0.14032705128192902, + "learning_rate": 0.0008294524954908039, + "loss": 2.7832, + "step": 9387 + }, + { + "epoch": 0.2783856715001631, + "grad_norm": 0.13334566354751587, + "learning_rate": 0.0008294171010680652, + "loss": 2.7544, + "step": 9388 + }, + { + "epoch": 0.27841532485247455, + "grad_norm": 0.12550105154514313, + "learning_rate": 0.0008293817037282815, + "loss": 2.7712, + "step": 9389 + }, + { + "epoch": 0.27844497820478603, + "grad_norm": 0.11852697283029556, + "learning_rate": 0.0008293463034717663, + "loss": 2.731, + "step": 9390 + }, + { + "epoch": 0.2784746315570975, + "grad_norm": 0.12894295156002045, + "learning_rate": 0.0008293109002988331, + "loss": 2.7377, + "step": 9391 + }, + { + "epoch": 0.278504284909409, + "grad_norm": 0.13117434084415436, + "learning_rate": 0.0008292754942097954, + "loss": 2.7706, + "step": 9392 + }, + { + "epoch": 0.2785339382617205, + "grad_norm": 0.1393730342388153, + "learning_rate": 0.0008292400852049664, + "loss": 2.7529, + "step": 9393 + }, + { + "epoch": 0.278563591614032, + "grad_norm": 0.13635455071926117, + "learning_rate": 0.0008292046732846601, + "loss": 2.7918, + "step": 9394 + }, + { + "epoch": 0.27859324496634347, + "grad_norm": 0.15003550052642822, + "learning_rate": 0.0008291692584491899, + "loss": 2.7816, + "step": 9395 + }, + { + "epoch": 0.27862289831865494, + "grad_norm": 0.14891989529132843, + "learning_rate": 0.0008291338406988695, + "loss": 2.7848, + "step": 9396 + }, + { + "epoch": 0.2786525516709664, + "grad_norm": 0.14456631243228912, + "learning_rate": 0.0008290984200340122, + "loss": 2.7848, + "step": 9397 + }, + { + "epoch": 0.2786822050232779, + "grad_norm": 0.14575466513633728, + "learning_rate": 0.0008290629964549321, + "loss": 2.7658, + "step": 9398 + }, + { + "epoch": 0.27871185837558937, + "grad_norm": 0.1508338749408722, + "learning_rate": 0.0008290275699619427, + "loss": 2.7841, + "step": 9399 + }, + { + "epoch": 0.27874151172790085, + "grad_norm": 0.14315231144428253, + "learning_rate": 0.0008289921405553576, + "loss": 2.7588, + "step": 9400 + }, + { + "epoch": 0.2787711650802123, + "grad_norm": 0.11880449205636978, + "learning_rate": 0.0008289567082354907, + "loss": 2.7691, + "step": 9401 + }, + { + "epoch": 0.2788008184325238, + "grad_norm": 0.11547009646892548, + "learning_rate": 0.0008289212730026557, + "loss": 2.7244, + "step": 9402 + }, + { + "epoch": 0.2788304717848353, + "grad_norm": 0.1188802719116211, + "learning_rate": 0.0008288858348571663, + "loss": 2.787, + "step": 9403 + }, + { + "epoch": 0.27886012513714675, + "grad_norm": 0.1212877705693245, + "learning_rate": 0.0008288503937993364, + "loss": 2.7659, + "step": 9404 + }, + { + "epoch": 0.27888977848945823, + "grad_norm": 0.10770712047815323, + "learning_rate": 0.0008288149498294799, + "loss": 2.7747, + "step": 9405 + }, + { + "epoch": 0.2789194318417697, + "grad_norm": 0.12746967375278473, + "learning_rate": 0.0008287795029479104, + "loss": 2.7834, + "step": 9406 + }, + { + "epoch": 0.2789490851940812, + "grad_norm": 0.15101870894432068, + "learning_rate": 0.0008287440531549421, + "loss": 2.779, + "step": 9407 + }, + { + "epoch": 0.27897873854639266, + "grad_norm": 0.16346555948257446, + "learning_rate": 0.0008287086004508887, + "loss": 2.7744, + "step": 9408 + }, + { + "epoch": 0.27900839189870413, + "grad_norm": 0.17937712371349335, + "learning_rate": 0.0008286731448360643, + "loss": 2.759, + "step": 9409 + }, + { + "epoch": 0.2790380452510156, + "grad_norm": 0.16988354921340942, + "learning_rate": 0.0008286376863107827, + "loss": 2.7803, + "step": 9410 + }, + { + "epoch": 0.2790676986033271, + "grad_norm": 0.17082232236862183, + "learning_rate": 0.000828602224875358, + "loss": 2.7755, + "step": 9411 + }, + { + "epoch": 0.27909735195563856, + "grad_norm": 0.16821172833442688, + "learning_rate": 0.0008285667605301044, + "loss": 2.7574, + "step": 9412 + }, + { + "epoch": 0.27912700530795004, + "grad_norm": 0.173268660902977, + "learning_rate": 0.0008285312932753355, + "loss": 2.7849, + "step": 9413 + }, + { + "epoch": 0.27915665866026157, + "grad_norm": 0.16473476588726044, + "learning_rate": 0.0008284958231113655, + "loss": 2.7639, + "step": 9414 + }, + { + "epoch": 0.27918631201257305, + "grad_norm": 0.1616613268852234, + "learning_rate": 0.0008284603500385089, + "loss": 2.7698, + "step": 9415 + }, + { + "epoch": 0.2792159653648845, + "grad_norm": 0.1449839472770691, + "learning_rate": 0.0008284248740570794, + "loss": 2.7562, + "step": 9416 + }, + { + "epoch": 0.279245618717196, + "grad_norm": 0.12528391182422638, + "learning_rate": 0.0008283893951673913, + "loss": 2.7595, + "step": 9417 + }, + { + "epoch": 0.2792752720695075, + "grad_norm": 0.15641431510448456, + "learning_rate": 0.0008283539133697586, + "loss": 2.798, + "step": 9418 + }, + { + "epoch": 0.27930492542181895, + "grad_norm": 0.14363403618335724, + "learning_rate": 0.0008283184286644958, + "loss": 2.7586, + "step": 9419 + }, + { + "epoch": 0.2793345787741304, + "grad_norm": 0.13913771510124207, + "learning_rate": 0.0008282829410519168, + "loss": 2.7683, + "step": 9420 + }, + { + "epoch": 0.2793642321264419, + "grad_norm": 0.14118267595767975, + "learning_rate": 0.0008282474505323361, + "loss": 2.7601, + "step": 9421 + }, + { + "epoch": 0.2793938854787534, + "grad_norm": 0.13476857542991638, + "learning_rate": 0.000828211957106068, + "loss": 2.7858, + "step": 9422 + }, + { + "epoch": 0.27942353883106485, + "grad_norm": 0.14428038895130157, + "learning_rate": 0.0008281764607734266, + "loss": 2.7543, + "step": 9423 + }, + { + "epoch": 0.27945319218337633, + "grad_norm": 0.13980650901794434, + "learning_rate": 0.0008281409615347262, + "loss": 2.8203, + "step": 9424 + }, + { + "epoch": 0.2794828455356878, + "grad_norm": 0.1166096031665802, + "learning_rate": 0.0008281054593902812, + "loss": 2.7515, + "step": 9425 + }, + { + "epoch": 0.2795124988879993, + "grad_norm": 0.1204146295785904, + "learning_rate": 0.0008280699543404063, + "loss": 2.7731, + "step": 9426 + }, + { + "epoch": 0.27954215224031076, + "grad_norm": 0.13060525059700012, + "learning_rate": 0.0008280344463854155, + "loss": 2.7535, + "step": 9427 + }, + { + "epoch": 0.27957180559262224, + "grad_norm": 0.18344007432460785, + "learning_rate": 0.0008279989355256235, + "loss": 2.7509, + "step": 9428 + }, + { + "epoch": 0.2796014589449337, + "grad_norm": 0.14334411919116974, + "learning_rate": 0.0008279634217613444, + "loss": 2.7897, + "step": 9429 + }, + { + "epoch": 0.2796311122972452, + "grad_norm": 0.11954814940690994, + "learning_rate": 0.000827927905092893, + "loss": 2.7805, + "step": 9430 + }, + { + "epoch": 0.27966076564955666, + "grad_norm": 0.13862410187721252, + "learning_rate": 0.0008278923855205838, + "loss": 2.7823, + "step": 9431 + }, + { + "epoch": 0.27969041900186814, + "grad_norm": 0.11476366966962814, + "learning_rate": 0.000827856863044731, + "loss": 2.7363, + "step": 9432 + }, + { + "epoch": 0.2797200723541796, + "grad_norm": 0.12253794819116592, + "learning_rate": 0.0008278213376656496, + "loss": 2.7801, + "step": 9433 + }, + { + "epoch": 0.2797497257064911, + "grad_norm": 0.11874625831842422, + "learning_rate": 0.0008277858093836541, + "loss": 2.8007, + "step": 9434 + }, + { + "epoch": 0.2797793790588026, + "grad_norm": 0.1343081146478653, + "learning_rate": 0.0008277502781990588, + "loss": 2.7715, + "step": 9435 + }, + { + "epoch": 0.2798090324111141, + "grad_norm": 0.1183243989944458, + "learning_rate": 0.0008277147441121786, + "loss": 2.7691, + "step": 9436 + }, + { + "epoch": 0.2798386857634256, + "grad_norm": 0.13847458362579346, + "learning_rate": 0.0008276792071233281, + "loss": 2.7926, + "step": 9437 + }, + { + "epoch": 0.27986833911573705, + "grad_norm": 0.16438470780849457, + "learning_rate": 0.000827643667232822, + "loss": 2.7736, + "step": 9438 + }, + { + "epoch": 0.27989799246804853, + "grad_norm": 0.16249942779541016, + "learning_rate": 0.0008276081244409747, + "loss": 2.7712, + "step": 9439 + }, + { + "epoch": 0.27992764582036, + "grad_norm": 0.14695066213607788, + "learning_rate": 0.0008275725787481017, + "loss": 2.7957, + "step": 9440 + }, + { + "epoch": 0.2799572991726715, + "grad_norm": 0.12414240837097168, + "learning_rate": 0.0008275370301545168, + "loss": 2.7789, + "step": 9441 + }, + { + "epoch": 0.27998695252498296, + "grad_norm": 0.13077042996883392, + "learning_rate": 0.0008275014786605357, + "loss": 2.7518, + "step": 9442 + }, + { + "epoch": 0.28001660587729443, + "grad_norm": 0.14031949639320374, + "learning_rate": 0.0008274659242664726, + "loss": 2.7761, + "step": 9443 + }, + { + "epoch": 0.2800462592296059, + "grad_norm": 0.12668199837207794, + "learning_rate": 0.0008274303669726426, + "loss": 2.7302, + "step": 9444 + }, + { + "epoch": 0.2800759125819174, + "grad_norm": 0.13638411462306976, + "learning_rate": 0.0008273948067793604, + "loss": 2.7704, + "step": 9445 + }, + { + "epoch": 0.28010556593422886, + "grad_norm": 0.12215325981378555, + "learning_rate": 0.000827359243686941, + "loss": 2.7562, + "step": 9446 + }, + { + "epoch": 0.28013521928654034, + "grad_norm": 0.14004798233509064, + "learning_rate": 0.0008273236776956994, + "loss": 2.75, + "step": 9447 + }, + { + "epoch": 0.2801648726388518, + "grad_norm": 0.13760773837566376, + "learning_rate": 0.0008272881088059504, + "loss": 2.712, + "step": 9448 + }, + { + "epoch": 0.2801945259911633, + "grad_norm": 0.12979352474212646, + "learning_rate": 0.0008272525370180091, + "loss": 2.7633, + "step": 9449 + }, + { + "epoch": 0.28022417934347477, + "grad_norm": 0.13302376866340637, + "learning_rate": 0.0008272169623321903, + "loss": 2.7813, + "step": 9450 + }, + { + "epoch": 0.28025383269578624, + "grad_norm": 0.1125781238079071, + "learning_rate": 0.0008271813847488091, + "loss": 2.7483, + "step": 9451 + }, + { + "epoch": 0.2802834860480977, + "grad_norm": 0.12616996467113495, + "learning_rate": 0.0008271458042681805, + "loss": 2.7556, + "step": 9452 + }, + { + "epoch": 0.2803131394004092, + "grad_norm": 0.13416527211666107, + "learning_rate": 0.0008271102208906199, + "loss": 2.7823, + "step": 9453 + }, + { + "epoch": 0.28034279275272067, + "grad_norm": 0.13903217017650604, + "learning_rate": 0.000827074634616442, + "loss": 2.7723, + "step": 9454 + }, + { + "epoch": 0.28037244610503215, + "grad_norm": 0.13635486364364624, + "learning_rate": 0.0008270390454459621, + "loss": 2.7917, + "step": 9455 + }, + { + "epoch": 0.2804020994573437, + "grad_norm": 0.13075852394104004, + "learning_rate": 0.0008270034533794955, + "loss": 2.7687, + "step": 9456 + }, + { + "epoch": 0.28043175280965515, + "grad_norm": 0.1432999074459076, + "learning_rate": 0.0008269678584173569, + "loss": 2.7672, + "step": 9457 + }, + { + "epoch": 0.28046140616196663, + "grad_norm": 0.13105860352516174, + "learning_rate": 0.0008269322605598618, + "loss": 2.7758, + "step": 9458 + }, + { + "epoch": 0.2804910595142781, + "grad_norm": 0.12909066677093506, + "learning_rate": 0.0008268966598073256, + "loss": 2.7484, + "step": 9459 + }, + { + "epoch": 0.2805207128665896, + "grad_norm": 0.156936913728714, + "learning_rate": 0.0008268610561600633, + "loss": 2.7731, + "step": 9460 + }, + { + "epoch": 0.28055036621890106, + "grad_norm": 0.13072460889816284, + "learning_rate": 0.0008268254496183903, + "loss": 2.7572, + "step": 9461 + }, + { + "epoch": 0.28058001957121254, + "grad_norm": 0.12344307452440262, + "learning_rate": 0.0008267898401826217, + "loss": 2.7529, + "step": 9462 + }, + { + "epoch": 0.280609672923524, + "grad_norm": 0.11681448668241501, + "learning_rate": 0.000826754227853073, + "loss": 2.769, + "step": 9463 + }, + { + "epoch": 0.2806393262758355, + "grad_norm": 0.12738896906375885, + "learning_rate": 0.0008267186126300597, + "loss": 2.7841, + "step": 9464 + }, + { + "epoch": 0.28066897962814696, + "grad_norm": 0.13431331515312195, + "learning_rate": 0.0008266829945138967, + "loss": 2.7758, + "step": 9465 + }, + { + "epoch": 0.28069863298045844, + "grad_norm": 0.15320265293121338, + "learning_rate": 0.0008266473735048999, + "loss": 2.7551, + "step": 9466 + }, + { + "epoch": 0.2807282863327699, + "grad_norm": 0.12519347667694092, + "learning_rate": 0.0008266117496033845, + "loss": 2.7843, + "step": 9467 + }, + { + "epoch": 0.2807579396850814, + "grad_norm": 0.15247268974781036, + "learning_rate": 0.000826576122809666, + "loss": 2.7921, + "step": 9468 + }, + { + "epoch": 0.28078759303739287, + "grad_norm": 0.17263545095920563, + "learning_rate": 0.0008265404931240599, + "loss": 2.7353, + "step": 9469 + }, + { + "epoch": 0.28081724638970434, + "grad_norm": 0.15273062884807587, + "learning_rate": 0.0008265048605468816, + "loss": 2.7351, + "step": 9470 + }, + { + "epoch": 0.2808468997420158, + "grad_norm": 0.14524848759174347, + "learning_rate": 0.0008264692250784468, + "loss": 2.7391, + "step": 9471 + }, + { + "epoch": 0.2808765530943273, + "grad_norm": 0.14704318344593048, + "learning_rate": 0.0008264335867190711, + "loss": 2.7762, + "step": 9472 + }, + { + "epoch": 0.2809062064466388, + "grad_norm": 0.1454956978559494, + "learning_rate": 0.0008263979454690698, + "loss": 2.7368, + "step": 9473 + }, + { + "epoch": 0.28093585979895025, + "grad_norm": 0.13095083832740784, + "learning_rate": 0.0008263623013287587, + "loss": 2.7812, + "step": 9474 + }, + { + "epoch": 0.2809655131512617, + "grad_norm": 0.1256333589553833, + "learning_rate": 0.0008263266542984534, + "loss": 2.7824, + "step": 9475 + }, + { + "epoch": 0.2809951665035732, + "grad_norm": 0.1397462636232376, + "learning_rate": 0.0008262910043784695, + "loss": 2.7843, + "step": 9476 + }, + { + "epoch": 0.28102481985588473, + "grad_norm": 0.13712289929389954, + "learning_rate": 0.0008262553515691228, + "loss": 2.7583, + "step": 9477 + }, + { + "epoch": 0.2810544732081962, + "grad_norm": 0.14176008105278015, + "learning_rate": 0.0008262196958707289, + "loss": 2.7578, + "step": 9478 + }, + { + "epoch": 0.2810841265605077, + "grad_norm": 0.15551002323627472, + "learning_rate": 0.0008261840372836037, + "loss": 2.7781, + "step": 9479 + }, + { + "epoch": 0.28111377991281916, + "grad_norm": 0.15784671902656555, + "learning_rate": 0.0008261483758080628, + "loss": 2.7731, + "step": 9480 + }, + { + "epoch": 0.28114343326513064, + "grad_norm": 0.16470880806446075, + "learning_rate": 0.0008261127114444221, + "loss": 2.7707, + "step": 9481 + }, + { + "epoch": 0.2811730866174421, + "grad_norm": 0.16050247848033905, + "learning_rate": 0.0008260770441929974, + "loss": 2.7464, + "step": 9482 + }, + { + "epoch": 0.2812027399697536, + "grad_norm": 0.15125775337219238, + "learning_rate": 0.0008260413740541044, + "loss": 2.7609, + "step": 9483 + }, + { + "epoch": 0.28123239332206507, + "grad_norm": 0.1294538378715515, + "learning_rate": 0.000826005701028059, + "loss": 2.7958, + "step": 9484 + }, + { + "epoch": 0.28126204667437654, + "grad_norm": 0.1426464319229126, + "learning_rate": 0.0008259700251151774, + "loss": 2.7633, + "step": 9485 + }, + { + "epoch": 0.281291700026688, + "grad_norm": 0.158647358417511, + "learning_rate": 0.0008259343463157752, + "loss": 2.7561, + "step": 9486 + }, + { + "epoch": 0.2813213533789995, + "grad_norm": 0.16285982728004456, + "learning_rate": 0.0008258986646301683, + "loss": 2.7693, + "step": 9487 + }, + { + "epoch": 0.28135100673131097, + "grad_norm": 0.1687815636396408, + "learning_rate": 0.0008258629800586728, + "loss": 2.7661, + "step": 9488 + }, + { + "epoch": 0.28138066008362245, + "grad_norm": 0.15285441279411316, + "learning_rate": 0.0008258272926016048, + "loss": 2.7651, + "step": 9489 + }, + { + "epoch": 0.2814103134359339, + "grad_norm": 0.13684797286987305, + "learning_rate": 0.0008257916022592801, + "loss": 2.7383, + "step": 9490 + }, + { + "epoch": 0.2814399667882454, + "grad_norm": 0.12743833661079407, + "learning_rate": 0.0008257559090320148, + "loss": 2.7241, + "step": 9491 + }, + { + "epoch": 0.2814696201405569, + "grad_norm": 0.1328544020652771, + "learning_rate": 0.0008257202129201252, + "loss": 2.7848, + "step": 9492 + }, + { + "epoch": 0.28149927349286835, + "grad_norm": 0.12963420152664185, + "learning_rate": 0.000825684513923927, + "loss": 2.7744, + "step": 9493 + }, + { + "epoch": 0.2815289268451798, + "grad_norm": 0.1172836497426033, + "learning_rate": 0.0008256488120437366, + "loss": 2.7845, + "step": 9494 + }, + { + "epoch": 0.2815585801974913, + "grad_norm": 0.11897015571594238, + "learning_rate": 0.0008256131072798701, + "loss": 2.7661, + "step": 9495 + }, + { + "epoch": 0.2815882335498028, + "grad_norm": 0.11711946129798889, + "learning_rate": 0.0008255773996326436, + "loss": 2.7738, + "step": 9496 + }, + { + "epoch": 0.2816178869021143, + "grad_norm": 0.12335605919361115, + "learning_rate": 0.0008255416891023733, + "loss": 2.7534, + "step": 9497 + }, + { + "epoch": 0.2816475402544258, + "grad_norm": 0.10777845233678818, + "learning_rate": 0.0008255059756893755, + "loss": 2.7837, + "step": 9498 + }, + { + "epoch": 0.28167719360673726, + "grad_norm": 0.11787823587656021, + "learning_rate": 0.0008254702593939665, + "loss": 2.7575, + "step": 9499 + }, + { + "epoch": 0.28170684695904874, + "grad_norm": 0.12539491057395935, + "learning_rate": 0.0008254345402164625, + "loss": 2.762, + "step": 9500 + }, + { + "epoch": 0.2817365003113602, + "grad_norm": 0.13018016517162323, + "learning_rate": 0.0008253988181571797, + "loss": 2.7504, + "step": 9501 + }, + { + "epoch": 0.2817661536636717, + "grad_norm": 0.14105473458766937, + "learning_rate": 0.0008253630932164344, + "loss": 2.7723, + "step": 9502 + }, + { + "epoch": 0.28179580701598317, + "grad_norm": 0.14908891916275024, + "learning_rate": 0.0008253273653945432, + "loss": 2.7671, + "step": 9503 + }, + { + "epoch": 0.28182546036829464, + "grad_norm": 0.13609816133975983, + "learning_rate": 0.0008252916346918223, + "loss": 2.7485, + "step": 9504 + }, + { + "epoch": 0.2818551137206061, + "grad_norm": 0.14456699788570404, + "learning_rate": 0.0008252559011085882, + "loss": 2.7828, + "step": 9505 + }, + { + "epoch": 0.2818847670729176, + "grad_norm": 0.1541229784488678, + "learning_rate": 0.0008252201646451573, + "loss": 2.7695, + "step": 9506 + }, + { + "epoch": 0.2819144204252291, + "grad_norm": 0.1625542938709259, + "learning_rate": 0.0008251844253018459, + "loss": 2.8015, + "step": 9507 + }, + { + "epoch": 0.28194407377754055, + "grad_norm": 0.1415029913187027, + "learning_rate": 0.0008251486830789707, + "loss": 2.7553, + "step": 9508 + }, + { + "epoch": 0.281973727129852, + "grad_norm": 0.14850392937660217, + "learning_rate": 0.000825112937976848, + "loss": 2.7965, + "step": 9509 + }, + { + "epoch": 0.2820033804821635, + "grad_norm": 0.17937994003295898, + "learning_rate": 0.0008250771899957945, + "loss": 2.7962, + "step": 9510 + }, + { + "epoch": 0.282033033834475, + "grad_norm": 0.16722317039966583, + "learning_rate": 0.0008250414391361265, + "loss": 2.7705, + "step": 9511 + }, + { + "epoch": 0.28206268718678645, + "grad_norm": 0.168606698513031, + "learning_rate": 0.000825005685398161, + "loss": 2.8169, + "step": 9512 + }, + { + "epoch": 0.28209234053909793, + "grad_norm": 0.1428631842136383, + "learning_rate": 0.0008249699287822144, + "loss": 2.7863, + "step": 9513 + }, + { + "epoch": 0.2821219938914094, + "grad_norm": 0.11787213385105133, + "learning_rate": 0.0008249341692886031, + "loss": 2.7575, + "step": 9514 + }, + { + "epoch": 0.2821516472437209, + "grad_norm": 0.13590575754642487, + "learning_rate": 0.000824898406917644, + "loss": 2.775, + "step": 9515 + }, + { + "epoch": 0.28218130059603236, + "grad_norm": 0.13787685334682465, + "learning_rate": 0.0008248626416696538, + "loss": 2.7669, + "step": 9516 + }, + { + "epoch": 0.28221095394834383, + "grad_norm": 0.15223059058189392, + "learning_rate": 0.0008248268735449491, + "loss": 2.756, + "step": 9517 + }, + { + "epoch": 0.28224060730065537, + "grad_norm": 0.14526870846748352, + "learning_rate": 0.0008247911025438469, + "loss": 2.7583, + "step": 9518 + }, + { + "epoch": 0.28227026065296684, + "grad_norm": 0.1362442523241043, + "learning_rate": 0.0008247553286666634, + "loss": 2.7716, + "step": 9519 + }, + { + "epoch": 0.2822999140052783, + "grad_norm": 0.13543520867824554, + "learning_rate": 0.0008247195519137158, + "loss": 2.7467, + "step": 9520 + }, + { + "epoch": 0.2823295673575898, + "grad_norm": 0.12782979011535645, + "learning_rate": 0.000824683772285321, + "loss": 2.746, + "step": 9521 + }, + { + "epoch": 0.28235922070990127, + "grad_norm": 0.11729152500629425, + "learning_rate": 0.0008246479897817956, + "loss": 2.7706, + "step": 9522 + }, + { + "epoch": 0.28238887406221275, + "grad_norm": 0.11741137504577637, + "learning_rate": 0.0008246122044034564, + "loss": 2.7352, + "step": 9523 + }, + { + "epoch": 0.2824185274145242, + "grad_norm": 0.1215057522058487, + "learning_rate": 0.0008245764161506205, + "loss": 2.7719, + "step": 9524 + }, + { + "epoch": 0.2824481807668357, + "grad_norm": 0.10416721552610397, + "learning_rate": 0.0008245406250236047, + "loss": 2.7753, + "step": 9525 + }, + { + "epoch": 0.2824778341191472, + "grad_norm": 0.10366376489400864, + "learning_rate": 0.0008245048310227261, + "loss": 2.7474, + "step": 9526 + }, + { + "epoch": 0.28250748747145865, + "grad_norm": 0.11820834130048752, + "learning_rate": 0.0008244690341483015, + "loss": 2.7949, + "step": 9527 + }, + { + "epoch": 0.2825371408237701, + "grad_norm": 0.12388622015714645, + "learning_rate": 0.0008244332344006476, + "loss": 2.7527, + "step": 9528 + }, + { + "epoch": 0.2825667941760816, + "grad_norm": 0.12322190403938293, + "learning_rate": 0.000824397431780082, + "loss": 2.7339, + "step": 9529 + }, + { + "epoch": 0.2825964475283931, + "grad_norm": 0.11513634026050568, + "learning_rate": 0.0008243616262869213, + "loss": 2.7898, + "step": 9530 + }, + { + "epoch": 0.28262610088070456, + "grad_norm": 0.11776549369096756, + "learning_rate": 0.0008243258179214828, + "loss": 2.7536, + "step": 9531 + }, + { + "epoch": 0.28265575423301603, + "grad_norm": 0.14434528350830078, + "learning_rate": 0.0008242900066840837, + "loss": 2.7926, + "step": 9532 + }, + { + "epoch": 0.2826854075853275, + "grad_norm": 0.16386286914348602, + "learning_rate": 0.0008242541925750406, + "loss": 2.7954, + "step": 9533 + }, + { + "epoch": 0.282715060937639, + "grad_norm": 0.16202346980571747, + "learning_rate": 0.0008242183755946713, + "loss": 2.7331, + "step": 9534 + }, + { + "epoch": 0.28274471428995046, + "grad_norm": 0.14556767046451569, + "learning_rate": 0.0008241825557432924, + "loss": 2.77, + "step": 9535 + }, + { + "epoch": 0.28277436764226194, + "grad_norm": 0.14836759865283966, + "learning_rate": 0.0008241467330212213, + "loss": 2.7598, + "step": 9536 + }, + { + "epoch": 0.2828040209945734, + "grad_norm": 0.15903118252754211, + "learning_rate": 0.0008241109074287753, + "loss": 2.773, + "step": 9537 + }, + { + "epoch": 0.2828336743468849, + "grad_norm": 0.14085142314434052, + "learning_rate": 0.0008240750789662716, + "loss": 2.7554, + "step": 9538 + }, + { + "epoch": 0.2828633276991964, + "grad_norm": 0.13518641889095306, + "learning_rate": 0.0008240392476340275, + "loss": 2.7732, + "step": 9539 + }, + { + "epoch": 0.2828929810515079, + "grad_norm": 0.15568068623542786, + "learning_rate": 0.0008240034134323602, + "loss": 2.7646, + "step": 9540 + }, + { + "epoch": 0.2829226344038194, + "grad_norm": 0.15104235708713531, + "learning_rate": 0.000823967576361587, + "loss": 2.751, + "step": 9541 + }, + { + "epoch": 0.28295228775613085, + "grad_norm": 0.12518541514873505, + "learning_rate": 0.0008239317364220253, + "loss": 2.7785, + "step": 9542 + }, + { + "epoch": 0.2829819411084423, + "grad_norm": 0.1422407031059265, + "learning_rate": 0.0008238958936139926, + "loss": 2.7365, + "step": 9543 + }, + { + "epoch": 0.2830115944607538, + "grad_norm": 0.1405271738767624, + "learning_rate": 0.0008238600479378061, + "loss": 2.7605, + "step": 9544 + }, + { + "epoch": 0.2830412478130653, + "grad_norm": 0.16321569681167603, + "learning_rate": 0.0008238241993937833, + "loss": 2.8248, + "step": 9545 + }, + { + "epoch": 0.28307090116537675, + "grad_norm": 0.20279183983802795, + "learning_rate": 0.0008237883479822416, + "loss": 2.785, + "step": 9546 + }, + { + "epoch": 0.28310055451768823, + "grad_norm": 0.17672081291675568, + "learning_rate": 0.0008237524937034986, + "loss": 2.7797, + "step": 9547 + }, + { + "epoch": 0.2831302078699997, + "grad_norm": 0.1341918408870697, + "learning_rate": 0.0008237166365578716, + "loss": 2.7491, + "step": 9548 + }, + { + "epoch": 0.2831598612223112, + "grad_norm": 0.14440271258354187, + "learning_rate": 0.000823680776545678, + "loss": 2.7299, + "step": 9549 + }, + { + "epoch": 0.28318951457462266, + "grad_norm": 0.16073505580425262, + "learning_rate": 0.000823644913667236, + "loss": 2.7568, + "step": 9550 + }, + { + "epoch": 0.28321916792693413, + "grad_norm": 0.12937524914741516, + "learning_rate": 0.0008236090479228624, + "loss": 2.7638, + "step": 9551 + }, + { + "epoch": 0.2832488212792456, + "grad_norm": 0.14643852412700653, + "learning_rate": 0.0008235731793128754, + "loss": 2.768, + "step": 9552 + }, + { + "epoch": 0.2832784746315571, + "grad_norm": 0.17092375457286835, + "learning_rate": 0.0008235373078375921, + "loss": 2.8037, + "step": 9553 + }, + { + "epoch": 0.28330812798386856, + "grad_norm": 0.16120751202106476, + "learning_rate": 0.0008235014334973305, + "loss": 2.7759, + "step": 9554 + }, + { + "epoch": 0.28333778133618004, + "grad_norm": 0.1745573878288269, + "learning_rate": 0.0008234655562924082, + "loss": 2.7299, + "step": 9555 + }, + { + "epoch": 0.2833674346884915, + "grad_norm": 0.17984704673290253, + "learning_rate": 0.0008234296762231429, + "loss": 2.7869, + "step": 9556 + }, + { + "epoch": 0.283397088040803, + "grad_norm": 0.1596478521823883, + "learning_rate": 0.0008233937932898523, + "loss": 2.7357, + "step": 9557 + }, + { + "epoch": 0.28342674139311447, + "grad_norm": 0.1704491674900055, + "learning_rate": 0.0008233579074928541, + "loss": 2.7663, + "step": 9558 + }, + { + "epoch": 0.28345639474542594, + "grad_norm": 0.13919366896152496, + "learning_rate": 0.0008233220188324661, + "loss": 2.7567, + "step": 9559 + }, + { + "epoch": 0.2834860480977375, + "grad_norm": 0.14595043659210205, + "learning_rate": 0.0008232861273090062, + "loss": 2.7574, + "step": 9560 + }, + { + "epoch": 0.28351570145004895, + "grad_norm": 0.16149993240833282, + "learning_rate": 0.0008232502329227923, + "loss": 2.7742, + "step": 9561 + }, + { + "epoch": 0.2835453548023604, + "grad_norm": 0.1347668170928955, + "learning_rate": 0.0008232143356741418, + "loss": 2.7776, + "step": 9562 + }, + { + "epoch": 0.2835750081546719, + "grad_norm": 0.14644469320774078, + "learning_rate": 0.000823178435563373, + "loss": 2.7433, + "step": 9563 + }, + { + "epoch": 0.2836046615069834, + "grad_norm": 0.13089218735694885, + "learning_rate": 0.0008231425325908037, + "loss": 2.7255, + "step": 9564 + }, + { + "epoch": 0.28363431485929486, + "grad_norm": 0.15245011448860168, + "learning_rate": 0.0008231066267567517, + "loss": 2.7647, + "step": 9565 + }, + { + "epoch": 0.28366396821160633, + "grad_norm": 0.14462031424045563, + "learning_rate": 0.0008230707180615353, + "loss": 2.7555, + "step": 9566 + }, + { + "epoch": 0.2836936215639178, + "grad_norm": 0.13534604012966156, + "learning_rate": 0.000823034806505472, + "loss": 2.7826, + "step": 9567 + }, + { + "epoch": 0.2837232749162293, + "grad_norm": 0.1369147002696991, + "learning_rate": 0.0008229988920888801, + "loss": 2.7562, + "step": 9568 + }, + { + "epoch": 0.28375292826854076, + "grad_norm": 0.12821955978870392, + "learning_rate": 0.0008229629748120777, + "loss": 2.7839, + "step": 9569 + }, + { + "epoch": 0.28378258162085224, + "grad_norm": 0.13263791799545288, + "learning_rate": 0.0008229270546753827, + "loss": 2.7498, + "step": 9570 + }, + { + "epoch": 0.2838122349731637, + "grad_norm": 0.11658617109060287, + "learning_rate": 0.000822891131679113, + "loss": 2.7799, + "step": 9571 + }, + { + "epoch": 0.2838418883254752, + "grad_norm": 0.12401565164327621, + "learning_rate": 0.000822855205823587, + "loss": 2.7939, + "step": 9572 + }, + { + "epoch": 0.28387154167778667, + "grad_norm": 0.13490897417068481, + "learning_rate": 0.0008228192771091229, + "loss": 2.7403, + "step": 9573 + }, + { + "epoch": 0.28390119503009814, + "grad_norm": 0.15657250583171844, + "learning_rate": 0.0008227833455360385, + "loss": 2.7542, + "step": 9574 + }, + { + "epoch": 0.2839308483824096, + "grad_norm": 0.1532307118177414, + "learning_rate": 0.0008227474111046522, + "loss": 2.7972, + "step": 9575 + }, + { + "epoch": 0.2839605017347211, + "grad_norm": 0.1529376208782196, + "learning_rate": 0.0008227114738152822, + "loss": 2.7552, + "step": 9576 + }, + { + "epoch": 0.28399015508703257, + "grad_norm": 0.13744930922985077, + "learning_rate": 0.0008226755336682468, + "loss": 2.7788, + "step": 9577 + }, + { + "epoch": 0.28401980843934405, + "grad_norm": 0.11201605945825577, + "learning_rate": 0.000822639590663864, + "loss": 2.7393, + "step": 9578 + }, + { + "epoch": 0.2840494617916555, + "grad_norm": 0.14482608437538147, + "learning_rate": 0.0008226036448024523, + "loss": 2.7568, + "step": 9579 + }, + { + "epoch": 0.284079115143967, + "grad_norm": 0.14433269202709198, + "learning_rate": 0.00082256769608433, + "loss": 2.7928, + "step": 9580 + }, + { + "epoch": 0.28410876849627853, + "grad_norm": 0.1470859795808792, + "learning_rate": 0.0008225317445098153, + "loss": 2.7745, + "step": 9581 + }, + { + "epoch": 0.28413842184859, + "grad_norm": 0.1344345659017563, + "learning_rate": 0.0008224957900792267, + "loss": 2.7414, + "step": 9582 + }, + { + "epoch": 0.2841680752009015, + "grad_norm": 0.1284054070711136, + "learning_rate": 0.0008224598327928825, + "loss": 2.7745, + "step": 9583 + }, + { + "epoch": 0.28419772855321296, + "grad_norm": 0.13356196880340576, + "learning_rate": 0.0008224238726511012, + "loss": 2.7678, + "step": 9584 + }, + { + "epoch": 0.28422738190552443, + "grad_norm": 0.13216708600521088, + "learning_rate": 0.0008223879096542011, + "loss": 2.7643, + "step": 9585 + }, + { + "epoch": 0.2842570352578359, + "grad_norm": 0.11846547573804855, + "learning_rate": 0.0008223519438025007, + "loss": 2.7523, + "step": 9586 + }, + { + "epoch": 0.2842866886101474, + "grad_norm": 0.12848608195781708, + "learning_rate": 0.0008223159750963186, + "loss": 2.7558, + "step": 9587 + }, + { + "epoch": 0.28431634196245886, + "grad_norm": 0.15192802250385284, + "learning_rate": 0.0008222800035359729, + "loss": 2.8011, + "step": 9588 + }, + { + "epoch": 0.28434599531477034, + "grad_norm": 0.1391276717185974, + "learning_rate": 0.000822244029121783, + "loss": 2.7828, + "step": 9589 + }, + { + "epoch": 0.2843756486670818, + "grad_norm": 0.15791605412960052, + "learning_rate": 0.0008222080518540665, + "loss": 2.7745, + "step": 9590 + }, + { + "epoch": 0.2844053020193933, + "grad_norm": 0.16442129015922546, + "learning_rate": 0.0008221720717331425, + "loss": 2.7544, + "step": 9591 + }, + { + "epoch": 0.28443495537170477, + "grad_norm": 0.14835225045681, + "learning_rate": 0.0008221360887593296, + "loss": 2.7824, + "step": 9592 + }, + { + "epoch": 0.28446460872401624, + "grad_norm": 0.1345965713262558, + "learning_rate": 0.0008221001029329462, + "loss": 2.7635, + "step": 9593 + }, + { + "epoch": 0.2844942620763277, + "grad_norm": 0.16111616790294647, + "learning_rate": 0.0008220641142543112, + "loss": 2.7701, + "step": 9594 + }, + { + "epoch": 0.2845239154286392, + "grad_norm": 0.17801505327224731, + "learning_rate": 0.0008220281227237431, + "loss": 2.7644, + "step": 9595 + }, + { + "epoch": 0.28455356878095067, + "grad_norm": 0.162623330950737, + "learning_rate": 0.0008219921283415608, + "loss": 2.7383, + "step": 9596 + }, + { + "epoch": 0.28458322213326215, + "grad_norm": 0.1425267606973648, + "learning_rate": 0.0008219561311080827, + "loss": 2.7855, + "step": 9597 + }, + { + "epoch": 0.2846128754855736, + "grad_norm": 0.14847058057785034, + "learning_rate": 0.0008219201310236282, + "loss": 2.7591, + "step": 9598 + }, + { + "epoch": 0.2846425288378851, + "grad_norm": 0.14199689030647278, + "learning_rate": 0.0008218841280885153, + "loss": 2.7653, + "step": 9599 + }, + { + "epoch": 0.2846721821901966, + "grad_norm": 0.12342703342437744, + "learning_rate": 0.0008218481223030634, + "loss": 2.7987, + "step": 9600 + }, + { + "epoch": 0.2847018355425081, + "grad_norm": 0.10885552316904068, + "learning_rate": 0.000821812113667591, + "loss": 2.7638, + "step": 9601 + }, + { + "epoch": 0.2847314888948196, + "grad_norm": 0.12287482619285583, + "learning_rate": 0.0008217761021824172, + "loss": 2.7542, + "step": 9602 + }, + { + "epoch": 0.28476114224713106, + "grad_norm": 0.12781749665737152, + "learning_rate": 0.0008217400878478608, + "loss": 2.7642, + "step": 9603 + }, + { + "epoch": 0.28479079559944254, + "grad_norm": 0.12455471605062485, + "learning_rate": 0.0008217040706642407, + "loss": 2.75, + "step": 9604 + }, + { + "epoch": 0.284820448951754, + "grad_norm": 0.12273130565881729, + "learning_rate": 0.0008216680506318757, + "loss": 2.7533, + "step": 9605 + }, + { + "epoch": 0.2848501023040655, + "grad_norm": 0.1477767378091812, + "learning_rate": 0.000821632027751085, + "loss": 2.7727, + "step": 9606 + }, + { + "epoch": 0.28487975565637697, + "grad_norm": 0.14577917754650116, + "learning_rate": 0.0008215960020221874, + "loss": 2.8064, + "step": 9607 + }, + { + "epoch": 0.28490940900868844, + "grad_norm": 0.15456363558769226, + "learning_rate": 0.0008215599734455022, + "loss": 2.775, + "step": 9608 + }, + { + "epoch": 0.2849390623609999, + "grad_norm": 0.15134985744953156, + "learning_rate": 0.0008215239420213481, + "loss": 2.7787, + "step": 9609 + }, + { + "epoch": 0.2849687157133114, + "grad_norm": 0.12015711516141891, + "learning_rate": 0.0008214879077500444, + "loss": 2.7513, + "step": 9610 + }, + { + "epoch": 0.28499836906562287, + "grad_norm": 0.11863939464092255, + "learning_rate": 0.0008214518706319099, + "loss": 2.7982, + "step": 9611 + }, + { + "epoch": 0.28502802241793435, + "grad_norm": 0.12529917061328888, + "learning_rate": 0.0008214158306672641, + "loss": 2.7725, + "step": 9612 + }, + { + "epoch": 0.2850576757702458, + "grad_norm": 0.12123146653175354, + "learning_rate": 0.0008213797878564257, + "loss": 2.761, + "step": 9613 + }, + { + "epoch": 0.2850873291225573, + "grad_norm": 0.1267065703868866, + "learning_rate": 0.0008213437421997143, + "loss": 2.7855, + "step": 9614 + }, + { + "epoch": 0.2851169824748688, + "grad_norm": 0.1289156824350357, + "learning_rate": 0.000821307693697449, + "loss": 2.7624, + "step": 9615 + }, + { + "epoch": 0.28514663582718025, + "grad_norm": 0.12391284108161926, + "learning_rate": 0.0008212716423499488, + "loss": 2.7984, + "step": 9616 + }, + { + "epoch": 0.2851762891794917, + "grad_norm": 0.12918615341186523, + "learning_rate": 0.0008212355881575331, + "loss": 2.781, + "step": 9617 + }, + { + "epoch": 0.2852059425318032, + "grad_norm": 0.1408531367778778, + "learning_rate": 0.0008211995311205211, + "loss": 2.7504, + "step": 9618 + }, + { + "epoch": 0.2852355958841147, + "grad_norm": 0.1381378173828125, + "learning_rate": 0.0008211634712392321, + "loss": 2.7669, + "step": 9619 + }, + { + "epoch": 0.28526524923642615, + "grad_norm": 0.13402926921844482, + "learning_rate": 0.0008211274085139854, + "loss": 2.7362, + "step": 9620 + }, + { + "epoch": 0.28529490258873763, + "grad_norm": 0.13588126003742218, + "learning_rate": 0.0008210913429451003, + "loss": 2.787, + "step": 9621 + }, + { + "epoch": 0.28532455594104916, + "grad_norm": 0.1458558738231659, + "learning_rate": 0.0008210552745328965, + "loss": 2.7598, + "step": 9622 + }, + { + "epoch": 0.28535420929336064, + "grad_norm": 0.15411792695522308, + "learning_rate": 0.000821019203277693, + "loss": 2.7527, + "step": 9623 + }, + { + "epoch": 0.2853838626456721, + "grad_norm": 0.164224773645401, + "learning_rate": 0.0008209831291798094, + "loss": 2.7472, + "step": 9624 + }, + { + "epoch": 0.2854135159979836, + "grad_norm": 0.139065220952034, + "learning_rate": 0.0008209470522395651, + "loss": 2.8009, + "step": 9625 + }, + { + "epoch": 0.28544316935029507, + "grad_norm": 0.15565937757492065, + "learning_rate": 0.0008209109724572794, + "loss": 2.769, + "step": 9626 + }, + { + "epoch": 0.28547282270260654, + "grad_norm": 0.1284896284341812, + "learning_rate": 0.000820874889833272, + "loss": 2.7921, + "step": 9627 + }, + { + "epoch": 0.285502476054918, + "grad_norm": 0.1502954065799713, + "learning_rate": 0.0008208388043678625, + "loss": 2.7734, + "step": 9628 + }, + { + "epoch": 0.2855321294072295, + "grad_norm": 0.16023659706115723, + "learning_rate": 0.0008208027160613704, + "loss": 2.7742, + "step": 9629 + }, + { + "epoch": 0.28556178275954097, + "grad_norm": 0.1511797457933426, + "learning_rate": 0.000820766624914115, + "loss": 2.7898, + "step": 9630 + }, + { + "epoch": 0.28559143611185245, + "grad_norm": 0.1352044641971588, + "learning_rate": 0.0008207305309264161, + "loss": 2.7675, + "step": 9631 + }, + { + "epoch": 0.2856210894641639, + "grad_norm": 0.145717054605484, + "learning_rate": 0.0008206944340985933, + "loss": 2.754, + "step": 9632 + }, + { + "epoch": 0.2856507428164754, + "grad_norm": 0.13340458273887634, + "learning_rate": 0.0008206583344309664, + "loss": 2.7639, + "step": 9633 + }, + { + "epoch": 0.2856803961687869, + "grad_norm": 0.1284772902727127, + "learning_rate": 0.0008206222319238547, + "loss": 2.7738, + "step": 9634 + }, + { + "epoch": 0.28571004952109835, + "grad_norm": 0.13391149044036865, + "learning_rate": 0.0008205861265775782, + "loss": 2.7591, + "step": 9635 + }, + { + "epoch": 0.28573970287340983, + "grad_norm": 0.1416623741388321, + "learning_rate": 0.0008205500183924565, + "loss": 2.7461, + "step": 9636 + }, + { + "epoch": 0.2857693562257213, + "grad_norm": 0.12236212939023972, + "learning_rate": 0.0008205139073688092, + "loss": 2.7609, + "step": 9637 + }, + { + "epoch": 0.2857990095780328, + "grad_norm": 0.12403500080108643, + "learning_rate": 0.0008204777935069566, + "loss": 2.7817, + "step": 9638 + }, + { + "epoch": 0.28582866293034426, + "grad_norm": 0.1411985605955124, + "learning_rate": 0.0008204416768072178, + "loss": 2.7538, + "step": 9639 + }, + { + "epoch": 0.28585831628265573, + "grad_norm": 0.15515075623989105, + "learning_rate": 0.000820405557269913, + "loss": 2.7574, + "step": 9640 + }, + { + "epoch": 0.2858879696349672, + "grad_norm": 0.14447905123233795, + "learning_rate": 0.0008203694348953622, + "loss": 2.7649, + "step": 9641 + }, + { + "epoch": 0.2859176229872787, + "grad_norm": 0.15088534355163574, + "learning_rate": 0.000820333309683885, + "loss": 2.7563, + "step": 9642 + }, + { + "epoch": 0.2859472763395902, + "grad_norm": 0.15351933240890503, + "learning_rate": 0.0008202971816358011, + "loss": 2.7892, + "step": 9643 + }, + { + "epoch": 0.2859769296919017, + "grad_norm": 0.16955000162124634, + "learning_rate": 0.000820261050751431, + "loss": 2.7653, + "step": 9644 + }, + { + "epoch": 0.28600658304421317, + "grad_norm": 0.17039424180984497, + "learning_rate": 0.0008202249170310942, + "loss": 2.7384, + "step": 9645 + }, + { + "epoch": 0.28603623639652465, + "grad_norm": 0.150063157081604, + "learning_rate": 0.0008201887804751107, + "loss": 2.7403, + "step": 9646 + }, + { + "epoch": 0.2860658897488361, + "grad_norm": 0.14871260523796082, + "learning_rate": 0.0008201526410838007, + "loss": 2.7494, + "step": 9647 + }, + { + "epoch": 0.2860955431011476, + "grad_norm": 0.150632843375206, + "learning_rate": 0.000820116498857484, + "loss": 2.7413, + "step": 9648 + }, + { + "epoch": 0.2861251964534591, + "grad_norm": 0.14947806298732758, + "learning_rate": 0.000820080353796481, + "loss": 2.7488, + "step": 9649 + }, + { + "epoch": 0.28615484980577055, + "grad_norm": 0.15183191001415253, + "learning_rate": 0.0008200442059011113, + "loss": 2.7988, + "step": 9650 + }, + { + "epoch": 0.286184503158082, + "grad_norm": 0.1424822360277176, + "learning_rate": 0.0008200080551716953, + "loss": 2.7497, + "step": 9651 + }, + { + "epoch": 0.2862141565103935, + "grad_norm": 0.14537103474140167, + "learning_rate": 0.000819971901608553, + "loss": 2.7243, + "step": 9652 + }, + { + "epoch": 0.286243809862705, + "grad_norm": 0.1487027108669281, + "learning_rate": 0.0008199357452120047, + "loss": 2.7314, + "step": 9653 + }, + { + "epoch": 0.28627346321501645, + "grad_norm": 0.12620234489440918, + "learning_rate": 0.0008198995859823703, + "loss": 2.7809, + "step": 9654 + }, + { + "epoch": 0.28630311656732793, + "grad_norm": 0.11989116668701172, + "learning_rate": 0.0008198634239199703, + "loss": 2.7464, + "step": 9655 + }, + { + "epoch": 0.2863327699196394, + "grad_norm": 0.11206524074077606, + "learning_rate": 0.0008198272590251246, + "loss": 2.7653, + "step": 9656 + }, + { + "epoch": 0.2863624232719509, + "grad_norm": 0.12136298418045044, + "learning_rate": 0.0008197910912981538, + "loss": 2.7641, + "step": 9657 + }, + { + "epoch": 0.28639207662426236, + "grad_norm": 0.11660151183605194, + "learning_rate": 0.000819754920739378, + "loss": 2.7586, + "step": 9658 + }, + { + "epoch": 0.28642172997657384, + "grad_norm": 0.12624496221542358, + "learning_rate": 0.0008197187473491173, + "loss": 2.7917, + "step": 9659 + }, + { + "epoch": 0.2864513833288853, + "grad_norm": 0.134680837392807, + "learning_rate": 0.0008196825711276923, + "loss": 2.7718, + "step": 9660 + }, + { + "epoch": 0.2864810366811968, + "grad_norm": 0.14592060446739197, + "learning_rate": 0.0008196463920754234, + "loss": 2.7722, + "step": 9661 + }, + { + "epoch": 0.28651069003350826, + "grad_norm": 0.1252610683441162, + "learning_rate": 0.0008196102101926306, + "loss": 2.7503, + "step": 9662 + }, + { + "epoch": 0.28654034338581974, + "grad_norm": 0.1302160769701004, + "learning_rate": 0.0008195740254796347, + "loss": 2.7678, + "step": 9663 + }, + { + "epoch": 0.28656999673813127, + "grad_norm": 0.11546877771615982, + "learning_rate": 0.000819537837936756, + "loss": 2.7632, + "step": 9664 + }, + { + "epoch": 0.28659965009044275, + "grad_norm": 0.12234565615653992, + "learning_rate": 0.0008195016475643147, + "loss": 2.755, + "step": 9665 + }, + { + "epoch": 0.2866293034427542, + "grad_norm": 0.11370806396007538, + "learning_rate": 0.0008194654543626316, + "loss": 2.7623, + "step": 9666 + }, + { + "epoch": 0.2866589567950657, + "grad_norm": 0.10704001039266586, + "learning_rate": 0.000819429258332027, + "loss": 2.7255, + "step": 9667 + }, + { + "epoch": 0.2866886101473772, + "grad_norm": 0.11871360242366791, + "learning_rate": 0.0008193930594728216, + "loss": 2.765, + "step": 9668 + }, + { + "epoch": 0.28671826349968865, + "grad_norm": 0.1200123205780983, + "learning_rate": 0.0008193568577853356, + "loss": 2.7963, + "step": 9669 + }, + { + "epoch": 0.28674791685200013, + "grad_norm": 0.12325746566057205, + "learning_rate": 0.00081932065326989, + "loss": 2.7589, + "step": 9670 + }, + { + "epoch": 0.2867775702043116, + "grad_norm": 0.13662219047546387, + "learning_rate": 0.0008192844459268052, + "loss": 2.7502, + "step": 9671 + }, + { + "epoch": 0.2868072235566231, + "grad_norm": 0.14885061979293823, + "learning_rate": 0.0008192482357564018, + "loss": 2.7962, + "step": 9672 + }, + { + "epoch": 0.28683687690893456, + "grad_norm": 0.15796184539794922, + "learning_rate": 0.0008192120227590006, + "loss": 2.7735, + "step": 9673 + }, + { + "epoch": 0.28686653026124603, + "grad_norm": 0.15985874831676483, + "learning_rate": 0.000819175806934922, + "loss": 2.7711, + "step": 9674 + }, + { + "epoch": 0.2868961836135575, + "grad_norm": 0.16522513329982758, + "learning_rate": 0.0008191395882844867, + "loss": 2.7478, + "step": 9675 + }, + { + "epoch": 0.286925836965869, + "grad_norm": 0.143838033080101, + "learning_rate": 0.0008191033668080159, + "loss": 2.7669, + "step": 9676 + }, + { + "epoch": 0.28695549031818046, + "grad_norm": 0.1749596744775772, + "learning_rate": 0.0008190671425058298, + "loss": 2.7641, + "step": 9677 + }, + { + "epoch": 0.28698514367049194, + "grad_norm": 0.1632600724697113, + "learning_rate": 0.0008190309153782493, + "loss": 2.7426, + "step": 9678 + }, + { + "epoch": 0.2870147970228034, + "grad_norm": 0.16263175010681152, + "learning_rate": 0.0008189946854255954, + "loss": 2.7628, + "step": 9679 + }, + { + "epoch": 0.2870444503751149, + "grad_norm": 0.16655084490776062, + "learning_rate": 0.000818958452648189, + "loss": 2.7706, + "step": 9680 + }, + { + "epoch": 0.28707410372742637, + "grad_norm": 0.14354456961154938, + "learning_rate": 0.0008189222170463504, + "loss": 2.7689, + "step": 9681 + }, + { + "epoch": 0.28710375707973784, + "grad_norm": 0.14905573427677155, + "learning_rate": 0.0008188859786204011, + "loss": 2.795, + "step": 9682 + }, + { + "epoch": 0.2871334104320493, + "grad_norm": 0.1456969827413559, + "learning_rate": 0.0008188497373706616, + "loss": 2.7524, + "step": 9683 + }, + { + "epoch": 0.2871630637843608, + "grad_norm": 0.13036175072193146, + "learning_rate": 0.000818813493297453, + "loss": 2.7487, + "step": 9684 + }, + { + "epoch": 0.2871927171366723, + "grad_norm": 0.1356305330991745, + "learning_rate": 0.0008187772464010961, + "loss": 2.7678, + "step": 9685 + }, + { + "epoch": 0.2872223704889838, + "grad_norm": 0.12949666380882263, + "learning_rate": 0.000818740996681912, + "loss": 2.7549, + "step": 9686 + }, + { + "epoch": 0.2872520238412953, + "grad_norm": 0.15323911607265472, + "learning_rate": 0.0008187047441402217, + "loss": 2.7788, + "step": 9687 + }, + { + "epoch": 0.28728167719360675, + "grad_norm": 0.15262041985988617, + "learning_rate": 0.0008186684887763463, + "loss": 2.7255, + "step": 9688 + }, + { + "epoch": 0.28731133054591823, + "grad_norm": 0.1604713648557663, + "learning_rate": 0.0008186322305906065, + "loss": 2.7859, + "step": 9689 + }, + { + "epoch": 0.2873409838982297, + "grad_norm": 0.17616423964500427, + "learning_rate": 0.0008185959695833238, + "loss": 2.8148, + "step": 9690 + }, + { + "epoch": 0.2873706372505412, + "grad_norm": 0.178582563996315, + "learning_rate": 0.0008185597057548189, + "loss": 2.7786, + "step": 9691 + }, + { + "epoch": 0.28740029060285266, + "grad_norm": 0.15713848173618317, + "learning_rate": 0.0008185234391054133, + "loss": 2.7765, + "step": 9692 + }, + { + "epoch": 0.28742994395516414, + "grad_norm": 0.16959133744239807, + "learning_rate": 0.0008184871696354279, + "loss": 2.7665, + "step": 9693 + }, + { + "epoch": 0.2874595973074756, + "grad_norm": 0.14461442828178406, + "learning_rate": 0.0008184508973451839, + "loss": 2.7429, + "step": 9694 + }, + { + "epoch": 0.2874892506597871, + "grad_norm": 0.12714296579360962, + "learning_rate": 0.0008184146222350026, + "loss": 2.779, + "step": 9695 + }, + { + "epoch": 0.28751890401209856, + "grad_norm": 0.13524891436100006, + "learning_rate": 0.0008183783443052053, + "loss": 2.7004, + "step": 9696 + }, + { + "epoch": 0.28754855736441004, + "grad_norm": 0.1290867179632187, + "learning_rate": 0.000818342063556113, + "loss": 2.7933, + "step": 9697 + }, + { + "epoch": 0.2875782107167215, + "grad_norm": 0.11896796524524689, + "learning_rate": 0.0008183057799880469, + "loss": 2.7343, + "step": 9698 + }, + { + "epoch": 0.287607864069033, + "grad_norm": 0.1254274845123291, + "learning_rate": 0.0008182694936013286, + "loss": 2.7626, + "step": 9699 + }, + { + "epoch": 0.28763751742134447, + "grad_norm": 0.12597598135471344, + "learning_rate": 0.0008182332043962794, + "loss": 2.7627, + "step": 9700 + }, + { + "epoch": 0.28766717077365594, + "grad_norm": 0.12237483263015747, + "learning_rate": 0.0008181969123732206, + "loss": 2.766, + "step": 9701 + }, + { + "epoch": 0.2876968241259674, + "grad_norm": 0.09640014916658401, + "learning_rate": 0.0008181606175324734, + "loss": 2.8011, + "step": 9702 + }, + { + "epoch": 0.2877264774782789, + "grad_norm": 0.11400311440229416, + "learning_rate": 0.0008181243198743594, + "loss": 2.7664, + "step": 9703 + }, + { + "epoch": 0.2877561308305904, + "grad_norm": 0.11688421666622162, + "learning_rate": 0.0008180880193991997, + "loss": 2.7432, + "step": 9704 + }, + { + "epoch": 0.2877857841829019, + "grad_norm": 0.12133223563432693, + "learning_rate": 0.0008180517161073162, + "loss": 2.7519, + "step": 9705 + }, + { + "epoch": 0.2878154375352134, + "grad_norm": 0.13103029131889343, + "learning_rate": 0.0008180154099990302, + "loss": 2.7553, + "step": 9706 + }, + { + "epoch": 0.28784509088752486, + "grad_norm": 0.15530981123447418, + "learning_rate": 0.0008179791010746631, + "loss": 2.7889, + "step": 9707 + }, + { + "epoch": 0.28787474423983633, + "grad_norm": 0.15791307389736176, + "learning_rate": 0.0008179427893345364, + "loss": 2.7502, + "step": 9708 + }, + { + "epoch": 0.2879043975921478, + "grad_norm": 0.13747639954090118, + "learning_rate": 0.000817906474778972, + "loss": 2.7523, + "step": 9709 + }, + { + "epoch": 0.2879340509444593, + "grad_norm": 0.13836951553821564, + "learning_rate": 0.0008178701574082909, + "loss": 2.7738, + "step": 9710 + }, + { + "epoch": 0.28796370429677076, + "grad_norm": 0.16607102751731873, + "learning_rate": 0.000817833837222815, + "loss": 2.768, + "step": 9711 + }, + { + "epoch": 0.28799335764908224, + "grad_norm": 0.1561957150697708, + "learning_rate": 0.0008177975142228661, + "loss": 2.7608, + "step": 9712 + }, + { + "epoch": 0.2880230110013937, + "grad_norm": 0.14793932437896729, + "learning_rate": 0.0008177611884087654, + "loss": 2.7625, + "step": 9713 + }, + { + "epoch": 0.2880526643537052, + "grad_norm": 0.15761138498783112, + "learning_rate": 0.0008177248597808351, + "loss": 2.7625, + "step": 9714 + }, + { + "epoch": 0.28808231770601667, + "grad_norm": 0.16018974781036377, + "learning_rate": 0.0008176885283393967, + "loss": 2.7366, + "step": 9715 + }, + { + "epoch": 0.28811197105832814, + "grad_norm": 0.13504824042320251, + "learning_rate": 0.0008176521940847717, + "loss": 2.7718, + "step": 9716 + }, + { + "epoch": 0.2881416244106396, + "grad_norm": 0.1201774850487709, + "learning_rate": 0.0008176158570172818, + "loss": 2.7594, + "step": 9717 + }, + { + "epoch": 0.2881712777629511, + "grad_norm": 0.13948172330856323, + "learning_rate": 0.0008175795171372491, + "loss": 2.766, + "step": 9718 + }, + { + "epoch": 0.28820093111526257, + "grad_norm": 0.12614506483078003, + "learning_rate": 0.0008175431744449953, + "loss": 2.7622, + "step": 9719 + }, + { + "epoch": 0.28823058446757405, + "grad_norm": 0.1428331583738327, + "learning_rate": 0.0008175068289408423, + "loss": 2.7713, + "step": 9720 + }, + { + "epoch": 0.2882602378198855, + "grad_norm": 0.1380709707736969, + "learning_rate": 0.0008174704806251118, + "loss": 2.7498, + "step": 9721 + }, + { + "epoch": 0.288289891172197, + "grad_norm": 0.13662874698638916, + "learning_rate": 0.0008174341294981256, + "loss": 2.7477, + "step": 9722 + }, + { + "epoch": 0.2883195445245085, + "grad_norm": 0.13344727456569672, + "learning_rate": 0.0008173977755602057, + "loss": 2.7633, + "step": 9723 + }, + { + "epoch": 0.28834919787681995, + "grad_norm": 0.12168218940496445, + "learning_rate": 0.000817361418811674, + "loss": 2.7703, + "step": 9724 + }, + { + "epoch": 0.2883788512291314, + "grad_norm": 0.13275980949401855, + "learning_rate": 0.0008173250592528524, + "loss": 2.7691, + "step": 9725 + }, + { + "epoch": 0.28840850458144296, + "grad_norm": 0.15620926022529602, + "learning_rate": 0.0008172886968840632, + "loss": 2.7857, + "step": 9726 + }, + { + "epoch": 0.28843815793375444, + "grad_norm": 0.14937534928321838, + "learning_rate": 0.000817252331705628, + "loss": 2.8089, + "step": 9727 + }, + { + "epoch": 0.2884678112860659, + "grad_norm": 0.1481245458126068, + "learning_rate": 0.0008172159637178689, + "loss": 2.7851, + "step": 9728 + }, + { + "epoch": 0.2884974646383774, + "grad_norm": 0.13516627252101898, + "learning_rate": 0.000817179592921108, + "loss": 2.7466, + "step": 9729 + }, + { + "epoch": 0.28852711799068886, + "grad_norm": 0.13030913472175598, + "learning_rate": 0.0008171432193156673, + "loss": 2.7525, + "step": 9730 + }, + { + "epoch": 0.28855677134300034, + "grad_norm": 0.1437724083662033, + "learning_rate": 0.000817106842901869, + "loss": 2.7795, + "step": 9731 + }, + { + "epoch": 0.2885864246953118, + "grad_norm": 0.15893685817718506, + "learning_rate": 0.0008170704636800353, + "loss": 2.7885, + "step": 9732 + }, + { + "epoch": 0.2886160780476233, + "grad_norm": 0.16819708049297333, + "learning_rate": 0.000817034081650488, + "loss": 2.77, + "step": 9733 + }, + { + "epoch": 0.28864573139993477, + "grad_norm": 0.1330518275499344, + "learning_rate": 0.0008169976968135498, + "loss": 2.7646, + "step": 9734 + }, + { + "epoch": 0.28867538475224624, + "grad_norm": 0.13407373428344727, + "learning_rate": 0.0008169613091695422, + "loss": 2.7701, + "step": 9735 + }, + { + "epoch": 0.2887050381045577, + "grad_norm": 0.13091537356376648, + "learning_rate": 0.0008169249187187879, + "loss": 2.7781, + "step": 9736 + }, + { + "epoch": 0.2887346914568692, + "grad_norm": 0.13509437441825867, + "learning_rate": 0.0008168885254616092, + "loss": 2.7702, + "step": 9737 + }, + { + "epoch": 0.2887643448091807, + "grad_norm": 0.12667034566402435, + "learning_rate": 0.0008168521293983282, + "loss": 2.7783, + "step": 9738 + }, + { + "epoch": 0.28879399816149215, + "grad_norm": 0.1277810037136078, + "learning_rate": 0.0008168157305292672, + "loss": 2.7663, + "step": 9739 + }, + { + "epoch": 0.2888236515138036, + "grad_norm": 0.14255940914154053, + "learning_rate": 0.0008167793288547485, + "loss": 2.7724, + "step": 9740 + }, + { + "epoch": 0.2888533048661151, + "grad_norm": 0.14529576897621155, + "learning_rate": 0.0008167429243750943, + "loss": 2.782, + "step": 9741 + }, + { + "epoch": 0.2888829582184266, + "grad_norm": 0.13585998117923737, + "learning_rate": 0.0008167065170906274, + "loss": 2.7384, + "step": 9742 + }, + { + "epoch": 0.28891261157073805, + "grad_norm": 0.15350420773029327, + "learning_rate": 0.0008166701070016698, + "loss": 2.745, + "step": 9743 + }, + { + "epoch": 0.28894226492304953, + "grad_norm": 0.1671678125858307, + "learning_rate": 0.0008166336941085441, + "loss": 2.7679, + "step": 9744 + }, + { + "epoch": 0.288971918275361, + "grad_norm": 0.13034939765930176, + "learning_rate": 0.0008165972784115726, + "loss": 2.7672, + "step": 9745 + }, + { + "epoch": 0.2890015716276725, + "grad_norm": 0.13424751162528992, + "learning_rate": 0.0008165608599110779, + "loss": 2.8048, + "step": 9746 + }, + { + "epoch": 0.289031224979984, + "grad_norm": 0.14639541506767273, + "learning_rate": 0.0008165244386073824, + "loss": 2.7661, + "step": 9747 + }, + { + "epoch": 0.2890608783322955, + "grad_norm": 0.14201529324054718, + "learning_rate": 0.0008164880145008087, + "loss": 2.7679, + "step": 9748 + }, + { + "epoch": 0.28909053168460697, + "grad_norm": 0.13194546103477478, + "learning_rate": 0.0008164515875916794, + "loss": 2.746, + "step": 9749 + }, + { + "epoch": 0.28912018503691844, + "grad_norm": 0.12272284924983978, + "learning_rate": 0.0008164151578803169, + "loss": 2.7433, + "step": 9750 + }, + { + "epoch": 0.2891498383892299, + "grad_norm": 0.13237228989601135, + "learning_rate": 0.0008163787253670439, + "loss": 2.7694, + "step": 9751 + }, + { + "epoch": 0.2891794917415414, + "grad_norm": 0.13715524971485138, + "learning_rate": 0.0008163422900521829, + "loss": 2.7877, + "step": 9752 + }, + { + "epoch": 0.28920914509385287, + "grad_norm": 0.14106324315071106, + "learning_rate": 0.0008163058519360567, + "loss": 2.7715, + "step": 9753 + }, + { + "epoch": 0.28923879844616435, + "grad_norm": 0.1479358971118927, + "learning_rate": 0.0008162694110189878, + "loss": 2.7929, + "step": 9754 + }, + { + "epoch": 0.2892684517984758, + "grad_norm": 0.14679162204265594, + "learning_rate": 0.0008162329673012991, + "loss": 2.793, + "step": 9755 + }, + { + "epoch": 0.2892981051507873, + "grad_norm": 0.12230825424194336, + "learning_rate": 0.0008161965207833131, + "loss": 2.7837, + "step": 9756 + }, + { + "epoch": 0.2893277585030988, + "grad_norm": 0.14859776198863983, + "learning_rate": 0.0008161600714653526, + "loss": 2.7576, + "step": 9757 + }, + { + "epoch": 0.28935741185541025, + "grad_norm": 0.1240663230419159, + "learning_rate": 0.0008161236193477406, + "loss": 2.7722, + "step": 9758 + }, + { + "epoch": 0.2893870652077217, + "grad_norm": 0.11924824863672256, + "learning_rate": 0.0008160871644307994, + "loss": 2.7789, + "step": 9759 + }, + { + "epoch": 0.2894167185600332, + "grad_norm": 0.11959008127450943, + "learning_rate": 0.0008160507067148524, + "loss": 2.7794, + "step": 9760 + }, + { + "epoch": 0.2894463719123447, + "grad_norm": 0.11719564348459244, + "learning_rate": 0.000816014246200222, + "loss": 2.788, + "step": 9761 + }, + { + "epoch": 0.28947602526465616, + "grad_norm": 0.1311263144016266, + "learning_rate": 0.0008159777828872311, + "loss": 2.8003, + "step": 9762 + }, + { + "epoch": 0.28950567861696763, + "grad_norm": 0.14414529502391815, + "learning_rate": 0.0008159413167762029, + "loss": 2.7552, + "step": 9763 + }, + { + "epoch": 0.2895353319692791, + "grad_norm": 0.14058050513267517, + "learning_rate": 0.00081590484786746, + "loss": 2.8028, + "step": 9764 + }, + { + "epoch": 0.2895649853215906, + "grad_norm": 0.14881199598312378, + "learning_rate": 0.0008158683761613255, + "loss": 2.7496, + "step": 9765 + }, + { + "epoch": 0.28959463867390206, + "grad_norm": 0.15454483032226562, + "learning_rate": 0.0008158319016581221, + "loss": 2.744, + "step": 9766 + }, + { + "epoch": 0.28962429202621354, + "grad_norm": 0.1578584462404251, + "learning_rate": 0.0008157954243581733, + "loss": 2.7691, + "step": 9767 + }, + { + "epoch": 0.28965394537852507, + "grad_norm": 0.15327900648117065, + "learning_rate": 0.0008157589442618016, + "loss": 2.7867, + "step": 9768 + }, + { + "epoch": 0.28968359873083654, + "grad_norm": 0.15346771478652954, + "learning_rate": 0.0008157224613693304, + "loss": 2.7786, + "step": 9769 + }, + { + "epoch": 0.289713252083148, + "grad_norm": 0.15265700221061707, + "learning_rate": 0.0008156859756810825, + "loss": 2.7487, + "step": 9770 + }, + { + "epoch": 0.2897429054354595, + "grad_norm": 0.13526535034179688, + "learning_rate": 0.0008156494871973811, + "loss": 2.7995, + "step": 9771 + }, + { + "epoch": 0.289772558787771, + "grad_norm": 0.14178675413131714, + "learning_rate": 0.0008156129959185494, + "loss": 2.7538, + "step": 9772 + }, + { + "epoch": 0.28980221214008245, + "grad_norm": 0.15500003099441528, + "learning_rate": 0.0008155765018449104, + "loss": 2.7592, + "step": 9773 + }, + { + "epoch": 0.2898318654923939, + "grad_norm": 0.14544259011745453, + "learning_rate": 0.0008155400049767872, + "loss": 2.7758, + "step": 9774 + }, + { + "epoch": 0.2898615188447054, + "grad_norm": 0.12916086614131927, + "learning_rate": 0.0008155035053145032, + "loss": 2.7486, + "step": 9775 + }, + { + "epoch": 0.2898911721970169, + "grad_norm": 0.13074763119220734, + "learning_rate": 0.0008154670028583814, + "loss": 2.7401, + "step": 9776 + }, + { + "epoch": 0.28992082554932835, + "grad_norm": 0.1507699340581894, + "learning_rate": 0.0008154304976087455, + "loss": 2.7958, + "step": 9777 + }, + { + "epoch": 0.28995047890163983, + "grad_norm": 0.1437666416168213, + "learning_rate": 0.0008153939895659181, + "loss": 2.7744, + "step": 9778 + }, + { + "epoch": 0.2899801322539513, + "grad_norm": 0.1389603316783905, + "learning_rate": 0.0008153574787302228, + "loss": 2.7527, + "step": 9779 + }, + { + "epoch": 0.2900097856062628, + "grad_norm": 0.139456644654274, + "learning_rate": 0.0008153209651019828, + "loss": 2.7451, + "step": 9780 + }, + { + "epoch": 0.29003943895857426, + "grad_norm": 0.14370381832122803, + "learning_rate": 0.0008152844486815218, + "loss": 2.7827, + "step": 9781 + }, + { + "epoch": 0.29006909231088573, + "grad_norm": 0.14319990575313568, + "learning_rate": 0.0008152479294691627, + "loss": 2.7576, + "step": 9782 + }, + { + "epoch": 0.2900987456631972, + "grad_norm": 0.13571085035800934, + "learning_rate": 0.0008152114074652291, + "loss": 2.7666, + "step": 9783 + }, + { + "epoch": 0.2901283990155087, + "grad_norm": 0.13536056876182556, + "learning_rate": 0.0008151748826700445, + "loss": 2.7644, + "step": 9784 + }, + { + "epoch": 0.29015805236782016, + "grad_norm": 0.11842226982116699, + "learning_rate": 0.000815138355083932, + "loss": 2.7746, + "step": 9785 + }, + { + "epoch": 0.29018770572013164, + "grad_norm": 0.12112955749034882, + "learning_rate": 0.0008151018247072155, + "loss": 2.7874, + "step": 9786 + }, + { + "epoch": 0.2902173590724431, + "grad_norm": 0.11759878695011139, + "learning_rate": 0.0008150652915402181, + "loss": 2.7692, + "step": 9787 + }, + { + "epoch": 0.2902470124247546, + "grad_norm": 0.11261989176273346, + "learning_rate": 0.0008150287555832634, + "loss": 2.7947, + "step": 9788 + }, + { + "epoch": 0.2902766657770661, + "grad_norm": 0.12042097747325897, + "learning_rate": 0.0008149922168366752, + "loss": 2.7884, + "step": 9789 + }, + { + "epoch": 0.2903063191293776, + "grad_norm": 0.1266292929649353, + "learning_rate": 0.0008149556753007768, + "loss": 2.7813, + "step": 9790 + }, + { + "epoch": 0.2903359724816891, + "grad_norm": 0.13996873795986176, + "learning_rate": 0.0008149191309758917, + "loss": 2.7567, + "step": 9791 + }, + { + "epoch": 0.29036562583400055, + "grad_norm": 0.14410267770290375, + "learning_rate": 0.0008148825838623437, + "loss": 2.7457, + "step": 9792 + }, + { + "epoch": 0.290395279186312, + "grad_norm": 0.14617682993412018, + "learning_rate": 0.0008148460339604564, + "loss": 2.7604, + "step": 9793 + }, + { + "epoch": 0.2904249325386235, + "grad_norm": 0.13056926429271698, + "learning_rate": 0.0008148094812705535, + "loss": 2.7661, + "step": 9794 + }, + { + "epoch": 0.290454585890935, + "grad_norm": 0.12837626039981842, + "learning_rate": 0.0008147729257929585, + "loss": 2.7588, + "step": 9795 + }, + { + "epoch": 0.29048423924324646, + "grad_norm": 0.11911740154027939, + "learning_rate": 0.0008147363675279953, + "loss": 2.7806, + "step": 9796 + }, + { + "epoch": 0.29051389259555793, + "grad_norm": 0.1335511952638626, + "learning_rate": 0.0008146998064759874, + "loss": 2.7697, + "step": 9797 + }, + { + "epoch": 0.2905435459478694, + "grad_norm": 0.12719303369522095, + "learning_rate": 0.0008146632426372589, + "loss": 2.7796, + "step": 9798 + }, + { + "epoch": 0.2905731993001809, + "grad_norm": 0.12442170083522797, + "learning_rate": 0.0008146266760121331, + "loss": 2.7353, + "step": 9799 + }, + { + "epoch": 0.29060285265249236, + "grad_norm": 0.1398932933807373, + "learning_rate": 0.0008145901066009344, + "loss": 2.7632, + "step": 9800 + }, + { + "epoch": 0.29063250600480384, + "grad_norm": 0.15569959580898285, + "learning_rate": 0.0008145535344039861, + "loss": 2.7834, + "step": 9801 + }, + { + "epoch": 0.2906621593571153, + "grad_norm": 0.16080816090106964, + "learning_rate": 0.0008145169594216122, + "loss": 2.7852, + "step": 9802 + }, + { + "epoch": 0.2906918127094268, + "grad_norm": 0.15318642556667328, + "learning_rate": 0.0008144803816541368, + "loss": 2.7323, + "step": 9803 + }, + { + "epoch": 0.29072146606173827, + "grad_norm": 0.12850308418273926, + "learning_rate": 0.0008144438011018836, + "loss": 2.7885, + "step": 9804 + }, + { + "epoch": 0.29075111941404974, + "grad_norm": 0.1370285600423813, + "learning_rate": 0.0008144072177651766, + "loss": 2.7862, + "step": 9805 + }, + { + "epoch": 0.2907807727663612, + "grad_norm": 0.14138972759246826, + "learning_rate": 0.0008143706316443395, + "loss": 2.7952, + "step": 9806 + }, + { + "epoch": 0.2908104261186727, + "grad_norm": 0.16544991731643677, + "learning_rate": 0.0008143340427396968, + "loss": 2.7508, + "step": 9807 + }, + { + "epoch": 0.29084007947098417, + "grad_norm": 0.1714453399181366, + "learning_rate": 0.0008142974510515719, + "loss": 2.7777, + "step": 9808 + }, + { + "epoch": 0.29086973282329565, + "grad_norm": 0.1844255030155182, + "learning_rate": 0.0008142608565802894, + "loss": 2.76, + "step": 9809 + }, + { + "epoch": 0.2908993861756072, + "grad_norm": 0.19719816744327545, + "learning_rate": 0.000814224259326173, + "loss": 2.7903, + "step": 9810 + }, + { + "epoch": 0.29092903952791865, + "grad_norm": 0.1539435088634491, + "learning_rate": 0.0008141876592895467, + "loss": 2.7593, + "step": 9811 + }, + { + "epoch": 0.29095869288023013, + "grad_norm": 0.13810446858406067, + "learning_rate": 0.0008141510564707348, + "loss": 2.7641, + "step": 9812 + }, + { + "epoch": 0.2909883462325416, + "grad_norm": 0.13756796717643738, + "learning_rate": 0.0008141144508700616, + "loss": 2.7768, + "step": 9813 + }, + { + "epoch": 0.2910179995848531, + "grad_norm": 0.12422539293766022, + "learning_rate": 0.0008140778424878508, + "loss": 2.7609, + "step": 9814 + }, + { + "epoch": 0.29104765293716456, + "grad_norm": 0.13468267023563385, + "learning_rate": 0.0008140412313244268, + "loss": 2.791, + "step": 9815 + }, + { + "epoch": 0.29107730628947603, + "grad_norm": 0.12549912929534912, + "learning_rate": 0.0008140046173801138, + "loss": 2.7678, + "step": 9816 + }, + { + "epoch": 0.2911069596417875, + "grad_norm": 0.13681554794311523, + "learning_rate": 0.0008139680006552362, + "loss": 2.7834, + "step": 9817 + }, + { + "epoch": 0.291136612994099, + "grad_norm": 0.1325029730796814, + "learning_rate": 0.0008139313811501178, + "loss": 2.7747, + "step": 9818 + }, + { + "epoch": 0.29116626634641046, + "grad_norm": 0.11896881461143494, + "learning_rate": 0.0008138947588650833, + "loss": 2.7666, + "step": 9819 + }, + { + "epoch": 0.29119591969872194, + "grad_norm": 0.12636698782444, + "learning_rate": 0.0008138581338004567, + "loss": 2.7685, + "step": 9820 + }, + { + "epoch": 0.2912255730510334, + "grad_norm": 0.14245671033859253, + "learning_rate": 0.0008138215059565626, + "loss": 2.7976, + "step": 9821 + }, + { + "epoch": 0.2912552264033449, + "grad_norm": 0.14356139302253723, + "learning_rate": 0.000813784875333725, + "loss": 2.7809, + "step": 9822 + }, + { + "epoch": 0.29128487975565637, + "grad_norm": 0.1422760784626007, + "learning_rate": 0.0008137482419322686, + "loss": 2.7398, + "step": 9823 + }, + { + "epoch": 0.29131453310796784, + "grad_norm": 0.153423011302948, + "learning_rate": 0.0008137116057525178, + "loss": 2.7611, + "step": 9824 + }, + { + "epoch": 0.2913441864602793, + "grad_norm": 0.15942123532295227, + "learning_rate": 0.0008136749667947967, + "loss": 2.7679, + "step": 9825 + }, + { + "epoch": 0.2913738398125908, + "grad_norm": 0.16306862235069275, + "learning_rate": 0.0008136383250594299, + "loss": 2.78, + "step": 9826 + }, + { + "epoch": 0.29140349316490227, + "grad_norm": 0.14301200211048126, + "learning_rate": 0.0008136016805467418, + "loss": 2.7733, + "step": 9827 + }, + { + "epoch": 0.29143314651721375, + "grad_norm": 0.1315230429172516, + "learning_rate": 0.0008135650332570572, + "loss": 2.8022, + "step": 9828 + }, + { + "epoch": 0.2914627998695252, + "grad_norm": 0.13907913863658905, + "learning_rate": 0.0008135283831907005, + "loss": 2.7735, + "step": 9829 + }, + { + "epoch": 0.29149245322183676, + "grad_norm": 0.15114152431488037, + "learning_rate": 0.000813491730347996, + "loss": 2.7698, + "step": 9830 + }, + { + "epoch": 0.29152210657414823, + "grad_norm": 0.135970339179039, + "learning_rate": 0.0008134550747292684, + "loss": 2.7445, + "step": 9831 + }, + { + "epoch": 0.2915517599264597, + "grad_norm": 0.14234709739685059, + "learning_rate": 0.0008134184163348424, + "loss": 2.7702, + "step": 9832 + }, + { + "epoch": 0.2915814132787712, + "grad_norm": 0.14386247098445892, + "learning_rate": 0.0008133817551650424, + "loss": 2.7886, + "step": 9833 + }, + { + "epoch": 0.29161106663108266, + "grad_norm": 0.1589609682559967, + "learning_rate": 0.0008133450912201932, + "loss": 2.756, + "step": 9834 + }, + { + "epoch": 0.29164071998339414, + "grad_norm": 0.14286567270755768, + "learning_rate": 0.0008133084245006194, + "loss": 2.7805, + "step": 9835 + }, + { + "epoch": 0.2916703733357056, + "grad_norm": 0.12110071629285812, + "learning_rate": 0.0008132717550066459, + "loss": 2.7895, + "step": 9836 + }, + { + "epoch": 0.2917000266880171, + "grad_norm": 0.1321154087781906, + "learning_rate": 0.000813235082738597, + "loss": 2.7831, + "step": 9837 + }, + { + "epoch": 0.29172968004032857, + "grad_norm": 0.1388998180627823, + "learning_rate": 0.000813198407696798, + "loss": 2.7253, + "step": 9838 + }, + { + "epoch": 0.29175933339264004, + "grad_norm": 0.13556337356567383, + "learning_rate": 0.000813161729881573, + "loss": 2.7593, + "step": 9839 + }, + { + "epoch": 0.2917889867449515, + "grad_norm": 0.1336812674999237, + "learning_rate": 0.0008131250492932474, + "loss": 2.787, + "step": 9840 + }, + { + "epoch": 0.291818640097263, + "grad_norm": 0.12097351998090744, + "learning_rate": 0.0008130883659321455, + "loss": 2.7701, + "step": 9841 + }, + { + "epoch": 0.29184829344957447, + "grad_norm": 0.14380377531051636, + "learning_rate": 0.0008130516797985925, + "loss": 2.772, + "step": 9842 + }, + { + "epoch": 0.29187794680188595, + "grad_norm": 0.13606108725070953, + "learning_rate": 0.0008130149908929132, + "loss": 2.7786, + "step": 9843 + }, + { + "epoch": 0.2919076001541974, + "grad_norm": 0.1245349645614624, + "learning_rate": 0.0008129782992154323, + "loss": 2.7553, + "step": 9844 + }, + { + "epoch": 0.2919372535065089, + "grad_norm": 0.12679755687713623, + "learning_rate": 0.0008129416047664748, + "loss": 2.7538, + "step": 9845 + }, + { + "epoch": 0.2919669068588204, + "grad_norm": 0.1112760677933693, + "learning_rate": 0.0008129049075463658, + "loss": 2.7508, + "step": 9846 + }, + { + "epoch": 0.29199656021113185, + "grad_norm": 0.13629570603370667, + "learning_rate": 0.0008128682075554301, + "loss": 2.7356, + "step": 9847 + }, + { + "epoch": 0.2920262135634433, + "grad_norm": 0.13477784395217896, + "learning_rate": 0.0008128315047939927, + "loss": 2.7476, + "step": 9848 + }, + { + "epoch": 0.2920558669157548, + "grad_norm": 0.10692048817873001, + "learning_rate": 0.0008127947992623788, + "loss": 2.7722, + "step": 9849 + }, + { + "epoch": 0.2920855202680663, + "grad_norm": 0.12440717220306396, + "learning_rate": 0.000812758090960913, + "loss": 2.7425, + "step": 9850 + }, + { + "epoch": 0.2921151736203778, + "grad_norm": 0.14923541247844696, + "learning_rate": 0.0008127213798899208, + "loss": 2.7893, + "step": 9851 + }, + { + "epoch": 0.2921448269726893, + "grad_norm": 0.1340404748916626, + "learning_rate": 0.000812684666049727, + "loss": 2.7828, + "step": 9852 + }, + { + "epoch": 0.29217448032500076, + "grad_norm": 0.14205025136470795, + "learning_rate": 0.0008126479494406568, + "loss": 2.7454, + "step": 9853 + }, + { + "epoch": 0.29220413367731224, + "grad_norm": 0.15343798696994781, + "learning_rate": 0.0008126112300630354, + "loss": 2.7551, + "step": 9854 + }, + { + "epoch": 0.2922337870296237, + "grad_norm": 0.14817175269126892, + "learning_rate": 0.000812574507917188, + "loss": 2.7609, + "step": 9855 + }, + { + "epoch": 0.2922634403819352, + "grad_norm": 0.12901374697685242, + "learning_rate": 0.0008125377830034395, + "loss": 2.764, + "step": 9856 + }, + { + "epoch": 0.29229309373424667, + "grad_norm": 0.13729329407215118, + "learning_rate": 0.0008125010553221152, + "loss": 2.7565, + "step": 9857 + }, + { + "epoch": 0.29232274708655814, + "grad_norm": 0.18107189238071442, + "learning_rate": 0.0008124643248735408, + "loss": 2.776, + "step": 9858 + }, + { + "epoch": 0.2923524004388696, + "grad_norm": 0.1946459710597992, + "learning_rate": 0.0008124275916580408, + "loss": 2.7698, + "step": 9859 + }, + { + "epoch": 0.2923820537911811, + "grad_norm": 0.1521175056695938, + "learning_rate": 0.000812390855675941, + "loss": 2.7286, + "step": 9860 + }, + { + "epoch": 0.29241170714349257, + "grad_norm": 0.12745985388755798, + "learning_rate": 0.0008123541169275665, + "loss": 2.758, + "step": 9861 + }, + { + "epoch": 0.29244136049580405, + "grad_norm": 0.13259269297122955, + "learning_rate": 0.0008123173754132427, + "loss": 2.7492, + "step": 9862 + }, + { + "epoch": 0.2924710138481155, + "grad_norm": 0.13889525830745697, + "learning_rate": 0.000812280631133295, + "loss": 2.7406, + "step": 9863 + }, + { + "epoch": 0.292500667200427, + "grad_norm": 0.13404794037342072, + "learning_rate": 0.0008122438840880486, + "loss": 2.7931, + "step": 9864 + }, + { + "epoch": 0.2925303205527385, + "grad_norm": 0.14411665499210358, + "learning_rate": 0.0008122071342778292, + "loss": 2.7749, + "step": 9865 + }, + { + "epoch": 0.29255997390504995, + "grad_norm": 0.14623600244522095, + "learning_rate": 0.0008121703817029617, + "loss": 2.8047, + "step": 9866 + }, + { + "epoch": 0.29258962725736143, + "grad_norm": 0.1331629455089569, + "learning_rate": 0.0008121336263637722, + "loss": 2.7436, + "step": 9867 + }, + { + "epoch": 0.2926192806096729, + "grad_norm": 0.1508730947971344, + "learning_rate": 0.0008120968682605858, + "loss": 2.7396, + "step": 9868 + }, + { + "epoch": 0.2926489339619844, + "grad_norm": 0.15676473081111908, + "learning_rate": 0.0008120601073937279, + "loss": 2.7384, + "step": 9869 + }, + { + "epoch": 0.29267858731429586, + "grad_norm": 0.17201103270053864, + "learning_rate": 0.0008120233437635244, + "loss": 2.7621, + "step": 9870 + }, + { + "epoch": 0.29270824066660733, + "grad_norm": 0.18838919699192047, + "learning_rate": 0.0008119865773703006, + "loss": 2.756, + "step": 9871 + }, + { + "epoch": 0.29273789401891886, + "grad_norm": 0.17504432797431946, + "learning_rate": 0.0008119498082143819, + "loss": 2.6913, + "step": 9872 + }, + { + "epoch": 0.29276754737123034, + "grad_norm": 0.1289861798286438, + "learning_rate": 0.0008119130362960942, + "loss": 2.7526, + "step": 9873 + }, + { + "epoch": 0.2927972007235418, + "grad_norm": 0.13427092134952545, + "learning_rate": 0.0008118762616157631, + "loss": 2.729, + "step": 9874 + }, + { + "epoch": 0.2928268540758533, + "grad_norm": 0.13769984245300293, + "learning_rate": 0.0008118394841737141, + "loss": 2.7433, + "step": 9875 + }, + { + "epoch": 0.29285650742816477, + "grad_norm": 0.13696052134037018, + "learning_rate": 0.0008118027039702732, + "loss": 2.7304, + "step": 9876 + }, + { + "epoch": 0.29288616078047625, + "grad_norm": 0.12702205777168274, + "learning_rate": 0.0008117659210057656, + "loss": 2.7263, + "step": 9877 + }, + { + "epoch": 0.2929158141327877, + "grad_norm": 0.10226481407880783, + "learning_rate": 0.0008117291352805172, + "loss": 2.7314, + "step": 9878 + }, + { + "epoch": 0.2929454674850992, + "grad_norm": 0.13203299045562744, + "learning_rate": 0.0008116923467948537, + "loss": 2.7504, + "step": 9879 + }, + { + "epoch": 0.2929751208374107, + "grad_norm": 0.11602730304002762, + "learning_rate": 0.0008116555555491012, + "loss": 2.7569, + "step": 9880 + }, + { + "epoch": 0.29300477418972215, + "grad_norm": 0.11244141310453415, + "learning_rate": 0.0008116187615435852, + "loss": 2.7707, + "step": 9881 + }, + { + "epoch": 0.2930344275420336, + "grad_norm": 0.13327430188655853, + "learning_rate": 0.0008115819647786316, + "loss": 2.7662, + "step": 9882 + }, + { + "epoch": 0.2930640808943451, + "grad_norm": 0.13584072887897491, + "learning_rate": 0.0008115451652545661, + "loss": 2.7321, + "step": 9883 + }, + { + "epoch": 0.2930937342466566, + "grad_norm": 0.1333814412355423, + "learning_rate": 0.0008115083629717148, + "loss": 2.7487, + "step": 9884 + }, + { + "epoch": 0.29312338759896805, + "grad_norm": 0.13049009442329407, + "learning_rate": 0.0008114715579304034, + "loss": 2.7476, + "step": 9885 + }, + { + "epoch": 0.29315304095127953, + "grad_norm": 0.1348867565393448, + "learning_rate": 0.000811434750130958, + "loss": 2.7683, + "step": 9886 + }, + { + "epoch": 0.293182694303591, + "grad_norm": 0.1442258358001709, + "learning_rate": 0.0008113979395737044, + "loss": 2.772, + "step": 9887 + }, + { + "epoch": 0.2932123476559025, + "grad_norm": 0.1586603969335556, + "learning_rate": 0.0008113611262589685, + "loss": 2.7535, + "step": 9888 + }, + { + "epoch": 0.29324200100821396, + "grad_norm": 0.1523243635892868, + "learning_rate": 0.0008113243101870765, + "loss": 2.7592, + "step": 9889 + }, + { + "epoch": 0.29327165436052544, + "grad_norm": 0.13729485869407654, + "learning_rate": 0.0008112874913583543, + "loss": 2.7548, + "step": 9890 + }, + { + "epoch": 0.2933013077128369, + "grad_norm": 0.12636472284793854, + "learning_rate": 0.0008112506697731278, + "loss": 2.7861, + "step": 9891 + }, + { + "epoch": 0.2933309610651484, + "grad_norm": 0.11681444942951202, + "learning_rate": 0.0008112138454317233, + "loss": 2.7739, + "step": 9892 + }, + { + "epoch": 0.2933606144174599, + "grad_norm": 0.13022682070732117, + "learning_rate": 0.0008111770183344667, + "loss": 2.7997, + "step": 9893 + }, + { + "epoch": 0.2933902677697714, + "grad_norm": 0.14027881622314453, + "learning_rate": 0.0008111401884816843, + "loss": 2.7788, + "step": 9894 + }, + { + "epoch": 0.29341992112208287, + "grad_norm": 0.14048205316066742, + "learning_rate": 0.000811103355873702, + "loss": 2.7801, + "step": 9895 + }, + { + "epoch": 0.29344957447439435, + "grad_norm": 0.12870419025421143, + "learning_rate": 0.0008110665205108463, + "loss": 2.757, + "step": 9896 + }, + { + "epoch": 0.2934792278267058, + "grad_norm": 0.13881054520606995, + "learning_rate": 0.0008110296823934429, + "loss": 2.7832, + "step": 9897 + }, + { + "epoch": 0.2935088811790173, + "grad_norm": 0.1591908186674118, + "learning_rate": 0.0008109928415218184, + "loss": 2.7722, + "step": 9898 + }, + { + "epoch": 0.2935385345313288, + "grad_norm": 0.1510862559080124, + "learning_rate": 0.0008109559978962988, + "loss": 2.7849, + "step": 9899 + }, + { + "epoch": 0.29356818788364025, + "grad_norm": 0.16670891642570496, + "learning_rate": 0.0008109191515172108, + "loss": 2.7581, + "step": 9900 + }, + { + "epoch": 0.29359784123595173, + "grad_norm": 0.18060044944286346, + "learning_rate": 0.0008108823023848799, + "loss": 2.7836, + "step": 9901 + }, + { + "epoch": 0.2936274945882632, + "grad_norm": 0.16770930588245392, + "learning_rate": 0.0008108454504996331, + "loss": 2.7629, + "step": 9902 + }, + { + "epoch": 0.2936571479405747, + "grad_norm": 0.1700611263513565, + "learning_rate": 0.0008108085958617965, + "loss": 2.7402, + "step": 9903 + }, + { + "epoch": 0.29368680129288616, + "grad_norm": 0.14484676718711853, + "learning_rate": 0.0008107717384716963, + "loss": 2.7782, + "step": 9904 + }, + { + "epoch": 0.29371645464519763, + "grad_norm": 0.15505270659923553, + "learning_rate": 0.0008107348783296591, + "loss": 2.7926, + "step": 9905 + }, + { + "epoch": 0.2937461079975091, + "grad_norm": 0.15308384597301483, + "learning_rate": 0.0008106980154360112, + "loss": 2.7813, + "step": 9906 + }, + { + "epoch": 0.2937757613498206, + "grad_norm": 0.13011574745178223, + "learning_rate": 0.000810661149791079, + "loss": 2.7167, + "step": 9907 + }, + { + "epoch": 0.29380541470213206, + "grad_norm": 0.12698902189731598, + "learning_rate": 0.0008106242813951892, + "loss": 2.7497, + "step": 9908 + }, + { + "epoch": 0.29383506805444354, + "grad_norm": 0.1217847540974617, + "learning_rate": 0.0008105874102486679, + "loss": 2.7604, + "step": 9909 + }, + { + "epoch": 0.293864721406755, + "grad_norm": 0.13344526290893555, + "learning_rate": 0.0008105505363518417, + "loss": 2.766, + "step": 9910 + }, + { + "epoch": 0.2938943747590665, + "grad_norm": 0.14037597179412842, + "learning_rate": 0.0008105136597050372, + "loss": 2.7494, + "step": 9911 + }, + { + "epoch": 0.29392402811137797, + "grad_norm": 0.121159128844738, + "learning_rate": 0.0008104767803085811, + "loss": 2.7399, + "step": 9912 + }, + { + "epoch": 0.29395368146368944, + "grad_norm": 0.12891927361488342, + "learning_rate": 0.0008104398981627996, + "loss": 2.7591, + "step": 9913 + }, + { + "epoch": 0.293983334816001, + "grad_norm": 0.1362382471561432, + "learning_rate": 0.0008104030132680198, + "loss": 2.7663, + "step": 9914 + }, + { + "epoch": 0.29401298816831245, + "grad_norm": 0.14552684128284454, + "learning_rate": 0.0008103661256245678, + "loss": 2.803, + "step": 9915 + }, + { + "epoch": 0.2940426415206239, + "grad_norm": 0.14928601682186127, + "learning_rate": 0.0008103292352327706, + "loss": 2.7883, + "step": 9916 + }, + { + "epoch": 0.2940722948729354, + "grad_norm": 0.1424020528793335, + "learning_rate": 0.0008102923420929547, + "loss": 2.7903, + "step": 9917 + }, + { + "epoch": 0.2941019482252469, + "grad_norm": 0.15814277529716492, + "learning_rate": 0.0008102554462054468, + "loss": 2.7761, + "step": 9918 + }, + { + "epoch": 0.29413160157755835, + "grad_norm": 0.14674197137355804, + "learning_rate": 0.0008102185475705739, + "loss": 2.7677, + "step": 9919 + }, + { + "epoch": 0.29416125492986983, + "grad_norm": 0.13697290420532227, + "learning_rate": 0.0008101816461886624, + "loss": 2.7403, + "step": 9920 + }, + { + "epoch": 0.2941909082821813, + "grad_norm": 0.13619233667850494, + "learning_rate": 0.000810144742060039, + "loss": 2.7528, + "step": 9921 + }, + { + "epoch": 0.2942205616344928, + "grad_norm": 0.13637444376945496, + "learning_rate": 0.0008101078351850308, + "loss": 2.784, + "step": 9922 + }, + { + "epoch": 0.29425021498680426, + "grad_norm": 0.14187400043010712, + "learning_rate": 0.0008100709255639645, + "loss": 2.7208, + "step": 9923 + }, + { + "epoch": 0.29427986833911574, + "grad_norm": 0.13035190105438232, + "learning_rate": 0.0008100340131971669, + "loss": 2.7677, + "step": 9924 + }, + { + "epoch": 0.2943095216914272, + "grad_norm": 0.13379830121994019, + "learning_rate": 0.0008099970980849649, + "loss": 2.7608, + "step": 9925 + }, + { + "epoch": 0.2943391750437387, + "grad_norm": 0.12981414794921875, + "learning_rate": 0.0008099601802276855, + "loss": 2.7881, + "step": 9926 + }, + { + "epoch": 0.29436882839605016, + "grad_norm": 0.12697641551494598, + "learning_rate": 0.0008099232596256554, + "loss": 2.7609, + "step": 9927 + }, + { + "epoch": 0.29439848174836164, + "grad_norm": 0.12366899847984314, + "learning_rate": 0.0008098863362792018, + "loss": 2.7534, + "step": 9928 + }, + { + "epoch": 0.2944281351006731, + "grad_norm": 0.12176292389631271, + "learning_rate": 0.0008098494101886513, + "loss": 2.8019, + "step": 9929 + }, + { + "epoch": 0.2944577884529846, + "grad_norm": 0.12412666529417038, + "learning_rate": 0.0008098124813543311, + "loss": 2.7593, + "step": 9930 + }, + { + "epoch": 0.29448744180529607, + "grad_norm": 0.12441686540842056, + "learning_rate": 0.0008097755497765682, + "loss": 2.7862, + "step": 9931 + }, + { + "epoch": 0.29451709515760754, + "grad_norm": 0.142483651638031, + "learning_rate": 0.0008097386154556896, + "loss": 2.7398, + "step": 9932 + }, + { + "epoch": 0.294546748509919, + "grad_norm": 0.15143592655658722, + "learning_rate": 0.0008097016783920226, + "loss": 2.7829, + "step": 9933 + }, + { + "epoch": 0.29457640186223055, + "grad_norm": 0.14544221758842468, + "learning_rate": 0.0008096647385858939, + "loss": 2.7791, + "step": 9934 + }, + { + "epoch": 0.29460605521454203, + "grad_norm": 0.1412251740694046, + "learning_rate": 0.0008096277960376308, + "loss": 2.7467, + "step": 9935 + }, + { + "epoch": 0.2946357085668535, + "grad_norm": 0.17001941800117493, + "learning_rate": 0.0008095908507475605, + "loss": 2.7584, + "step": 9936 + }, + { + "epoch": 0.294665361919165, + "grad_norm": 0.175062894821167, + "learning_rate": 0.0008095539027160099, + "loss": 2.768, + "step": 9937 + }, + { + "epoch": 0.29469501527147646, + "grad_norm": 0.1620101034641266, + "learning_rate": 0.0008095169519433066, + "loss": 2.7701, + "step": 9938 + }, + { + "epoch": 0.29472466862378793, + "grad_norm": 0.1298029124736786, + "learning_rate": 0.0008094799984297773, + "loss": 2.7656, + "step": 9939 + }, + { + "epoch": 0.2947543219760994, + "grad_norm": 0.12635107338428497, + "learning_rate": 0.0008094430421757497, + "loss": 2.7337, + "step": 9940 + }, + { + "epoch": 0.2947839753284109, + "grad_norm": 0.14729821681976318, + "learning_rate": 0.0008094060831815509, + "loss": 2.7824, + "step": 9941 + }, + { + "epoch": 0.29481362868072236, + "grad_norm": 0.1623670905828476, + "learning_rate": 0.0008093691214475081, + "loss": 2.7873, + "step": 9942 + }, + { + "epoch": 0.29484328203303384, + "grad_norm": 0.14111872017383575, + "learning_rate": 0.0008093321569739484, + "loss": 2.7747, + "step": 9943 + }, + { + "epoch": 0.2948729353853453, + "grad_norm": 0.15260878205299377, + "learning_rate": 0.0008092951897611995, + "loss": 2.796, + "step": 9944 + }, + { + "epoch": 0.2949025887376568, + "grad_norm": 0.17166578769683838, + "learning_rate": 0.0008092582198095886, + "loss": 2.7828, + "step": 9945 + }, + { + "epoch": 0.29493224208996827, + "grad_norm": 0.15206506848335266, + "learning_rate": 0.0008092212471194431, + "loss": 2.7456, + "step": 9946 + }, + { + "epoch": 0.29496189544227974, + "grad_norm": 0.11937327682971954, + "learning_rate": 0.0008091842716910904, + "loss": 2.758, + "step": 9947 + }, + { + "epoch": 0.2949915487945912, + "grad_norm": 0.13376998901367188, + "learning_rate": 0.0008091472935248578, + "loss": 2.7659, + "step": 9948 + }, + { + "epoch": 0.2950212021469027, + "grad_norm": 0.1173524484038353, + "learning_rate": 0.000809110312621073, + "loss": 2.8003, + "step": 9949 + }, + { + "epoch": 0.29505085549921417, + "grad_norm": 0.1207086518406868, + "learning_rate": 0.0008090733289800631, + "loss": 2.7603, + "step": 9950 + }, + { + "epoch": 0.29508050885152565, + "grad_norm": 0.14358267188072205, + "learning_rate": 0.0008090363426021561, + "loss": 2.8076, + "step": 9951 + }, + { + "epoch": 0.2951101622038371, + "grad_norm": 0.15479163825511932, + "learning_rate": 0.000808999353487679, + "loss": 2.7921, + "step": 9952 + }, + { + "epoch": 0.2951398155561486, + "grad_norm": 0.13508762419223785, + "learning_rate": 0.0008089623616369597, + "loss": 2.7616, + "step": 9953 + }, + { + "epoch": 0.2951694689084601, + "grad_norm": 0.13809016346931458, + "learning_rate": 0.0008089253670503256, + "loss": 2.7297, + "step": 9954 + }, + { + "epoch": 0.2951991222607716, + "grad_norm": 0.138101726770401, + "learning_rate": 0.0008088883697281044, + "loss": 2.7665, + "step": 9955 + }, + { + "epoch": 0.2952287756130831, + "grad_norm": 0.14045055210590363, + "learning_rate": 0.0008088513696706236, + "loss": 2.7809, + "step": 9956 + }, + { + "epoch": 0.29525842896539456, + "grad_norm": 0.14024226367473602, + "learning_rate": 0.0008088143668782111, + "loss": 2.756, + "step": 9957 + }, + { + "epoch": 0.29528808231770604, + "grad_norm": 0.1536264419555664, + "learning_rate": 0.0008087773613511942, + "loss": 2.769, + "step": 9958 + }, + { + "epoch": 0.2953177356700175, + "grad_norm": 0.14800097048282623, + "learning_rate": 0.0008087403530899008, + "loss": 2.7861, + "step": 9959 + }, + { + "epoch": 0.295347389022329, + "grad_norm": 0.1367587298154831, + "learning_rate": 0.0008087033420946586, + "loss": 2.7782, + "step": 9960 + }, + { + "epoch": 0.29537704237464046, + "grad_norm": 0.1461760252714157, + "learning_rate": 0.0008086663283657954, + "loss": 2.7657, + "step": 9961 + }, + { + "epoch": 0.29540669572695194, + "grad_norm": 0.13530252873897552, + "learning_rate": 0.0008086293119036386, + "loss": 2.7291, + "step": 9962 + }, + { + "epoch": 0.2954363490792634, + "grad_norm": 0.12921032309532166, + "learning_rate": 0.0008085922927085165, + "loss": 2.7665, + "step": 9963 + }, + { + "epoch": 0.2954660024315749, + "grad_norm": 0.15076278150081635, + "learning_rate": 0.0008085552707807566, + "loss": 2.77, + "step": 9964 + }, + { + "epoch": 0.29549565578388637, + "grad_norm": 0.17763786017894745, + "learning_rate": 0.0008085182461206868, + "loss": 2.7715, + "step": 9965 + }, + { + "epoch": 0.29552530913619784, + "grad_norm": 0.20320938527584076, + "learning_rate": 0.000808481218728635, + "loss": 2.7359, + "step": 9966 + }, + { + "epoch": 0.2955549624885093, + "grad_norm": 0.18627530336380005, + "learning_rate": 0.0008084441886049292, + "loss": 2.772, + "step": 9967 + }, + { + "epoch": 0.2955846158408208, + "grad_norm": 0.15998725593090057, + "learning_rate": 0.000808407155749897, + "loss": 2.7485, + "step": 9968 + }, + { + "epoch": 0.2956142691931323, + "grad_norm": 0.15487632155418396, + "learning_rate": 0.0008083701201638665, + "loss": 2.7659, + "step": 9969 + }, + { + "epoch": 0.29564392254544375, + "grad_norm": 0.13457392156124115, + "learning_rate": 0.0008083330818471657, + "loss": 2.7587, + "step": 9970 + }, + { + "epoch": 0.2956735758977552, + "grad_norm": 0.11797770857810974, + "learning_rate": 0.0008082960408001225, + "loss": 2.7665, + "step": 9971 + }, + { + "epoch": 0.2957032292500667, + "grad_norm": 0.13921082019805908, + "learning_rate": 0.000808258997023065, + "loss": 2.7386, + "step": 9972 + }, + { + "epoch": 0.2957328826023782, + "grad_norm": 0.13178148865699768, + "learning_rate": 0.0008082219505163211, + "loss": 2.775, + "step": 9973 + }, + { + "epoch": 0.29576253595468965, + "grad_norm": 0.14144815504550934, + "learning_rate": 0.000808184901280219, + "loss": 2.7454, + "step": 9974 + }, + { + "epoch": 0.29579218930700113, + "grad_norm": 0.13653215765953064, + "learning_rate": 0.0008081478493150866, + "loss": 2.7484, + "step": 9975 + }, + { + "epoch": 0.29582184265931266, + "grad_norm": 0.1207585483789444, + "learning_rate": 0.0008081107946212522, + "loss": 2.7385, + "step": 9976 + }, + { + "epoch": 0.29585149601162414, + "grad_norm": 0.12541072070598602, + "learning_rate": 0.0008080737371990438, + "loss": 2.7728, + "step": 9977 + }, + { + "epoch": 0.2958811493639356, + "grad_norm": 0.117327481508255, + "learning_rate": 0.0008080366770487895, + "loss": 2.7572, + "step": 9978 + }, + { + "epoch": 0.2959108027162471, + "grad_norm": 0.11358563601970673, + "learning_rate": 0.0008079996141708177, + "loss": 2.7366, + "step": 9979 + }, + { + "epoch": 0.29594045606855857, + "grad_norm": 0.12640903890132904, + "learning_rate": 0.0008079625485654563, + "loss": 2.7546, + "step": 9980 + }, + { + "epoch": 0.29597010942087004, + "grad_norm": 0.115998275578022, + "learning_rate": 0.0008079254802330338, + "loss": 2.7445, + "step": 9981 + }, + { + "epoch": 0.2959997627731815, + "grad_norm": 0.11624164879322052, + "learning_rate": 0.0008078884091738781, + "loss": 2.7893, + "step": 9982 + }, + { + "epoch": 0.296029416125493, + "grad_norm": 0.12266950309276581, + "learning_rate": 0.0008078513353883179, + "loss": 2.7832, + "step": 9983 + }, + { + "epoch": 0.29605906947780447, + "grad_norm": 0.108538419008255, + "learning_rate": 0.0008078142588766813, + "loss": 2.7541, + "step": 9984 + }, + { + "epoch": 0.29608872283011595, + "grad_norm": 0.1271570920944214, + "learning_rate": 0.0008077771796392966, + "loss": 2.794, + "step": 9985 + }, + { + "epoch": 0.2961183761824274, + "grad_norm": 0.12324798852205276, + "learning_rate": 0.0008077400976764919, + "loss": 2.7848, + "step": 9986 + }, + { + "epoch": 0.2961480295347389, + "grad_norm": 0.12589877843856812, + "learning_rate": 0.0008077030129885961, + "loss": 2.7597, + "step": 9987 + }, + { + "epoch": 0.2961776828870504, + "grad_norm": 0.13535486161708832, + "learning_rate": 0.0008076659255759371, + "loss": 2.7514, + "step": 9988 + }, + { + "epoch": 0.29620733623936185, + "grad_norm": 0.1468721628189087, + "learning_rate": 0.0008076288354388436, + "loss": 2.7227, + "step": 9989 + }, + { + "epoch": 0.2962369895916733, + "grad_norm": 0.15436409413814545, + "learning_rate": 0.000807591742577644, + "loss": 2.7448, + "step": 9990 + }, + { + "epoch": 0.2962666429439848, + "grad_norm": 0.14317427575588226, + "learning_rate": 0.0008075546469926666, + "loss": 2.7789, + "step": 9991 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 0.13365726172924042, + "learning_rate": 0.0008075175486842401, + "loss": 2.7452, + "step": 9992 + }, + { + "epoch": 0.29632594964860776, + "grad_norm": 0.14156565070152283, + "learning_rate": 0.000807480447652693, + "loss": 2.7425, + "step": 9993 + }, + { + "epoch": 0.29635560300091923, + "grad_norm": 0.1448909044265747, + "learning_rate": 0.0008074433438983537, + "loss": 2.7416, + "step": 9994 + }, + { + "epoch": 0.2963852563532307, + "grad_norm": 0.12643422186374664, + "learning_rate": 0.0008074062374215507, + "loss": 2.7781, + "step": 9995 + }, + { + "epoch": 0.2964149097055422, + "grad_norm": 0.14363977313041687, + "learning_rate": 0.0008073691282226128, + "loss": 2.7896, + "step": 9996 + }, + { + "epoch": 0.2964445630578537, + "grad_norm": 0.14402075111865997, + "learning_rate": 0.0008073320163018685, + "loss": 2.7833, + "step": 9997 + }, + { + "epoch": 0.2964742164101652, + "grad_norm": 0.13787627220153809, + "learning_rate": 0.0008072949016596464, + "loss": 2.7673, + "step": 9998 + }, + { + "epoch": 0.29650386976247667, + "grad_norm": 0.14843213558197021, + "learning_rate": 0.0008072577842962754, + "loss": 2.7496, + "step": 9999 + }, + { + "epoch": 0.29653352311478814, + "grad_norm": 0.14385439455509186, + "learning_rate": 0.0008072206642120839, + "loss": 2.7832, + "step": 10000 + }, + { + "epoch": 0.2965631764670996, + "grad_norm": 0.1345471739768982, + "learning_rate": 0.0008071835414074005, + "loss": 2.7572, + "step": 10001 + }, + { + "epoch": 0.2965928298194111, + "grad_norm": 0.1147291511297226, + "learning_rate": 0.0008071464158825541, + "loss": 2.7912, + "step": 10002 + }, + { + "epoch": 0.2966224831717226, + "grad_norm": 0.1255134791135788, + "learning_rate": 0.0008071092876378736, + "loss": 2.7576, + "step": 10003 + }, + { + "epoch": 0.29665213652403405, + "grad_norm": 0.11742231249809265, + "learning_rate": 0.0008070721566736877, + "loss": 2.7544, + "step": 10004 + }, + { + "epoch": 0.2966817898763455, + "grad_norm": 0.11376597732305527, + "learning_rate": 0.0008070350229903251, + "loss": 2.7473, + "step": 10005 + }, + { + "epoch": 0.296711443228657, + "grad_norm": 0.11813115328550339, + "learning_rate": 0.0008069978865881147, + "loss": 2.751, + "step": 10006 + }, + { + "epoch": 0.2967410965809685, + "grad_norm": 0.12441672384738922, + "learning_rate": 0.0008069607474673852, + "loss": 2.7489, + "step": 10007 + }, + { + "epoch": 0.29677074993327995, + "grad_norm": 0.14891740679740906, + "learning_rate": 0.0008069236056284656, + "loss": 2.7837, + "step": 10008 + }, + { + "epoch": 0.29680040328559143, + "grad_norm": 0.14444328844547272, + "learning_rate": 0.0008068864610716849, + "loss": 2.7607, + "step": 10009 + }, + { + "epoch": 0.2968300566379029, + "grad_norm": 0.15151892602443695, + "learning_rate": 0.0008068493137973718, + "loss": 2.7443, + "step": 10010 + }, + { + "epoch": 0.2968597099902144, + "grad_norm": 0.1570359319448471, + "learning_rate": 0.0008068121638058554, + "loss": 2.7758, + "step": 10011 + }, + { + "epoch": 0.29688936334252586, + "grad_norm": 0.15129472315311432, + "learning_rate": 0.0008067750110974648, + "loss": 2.7875, + "step": 10012 + }, + { + "epoch": 0.29691901669483733, + "grad_norm": 0.158717080950737, + "learning_rate": 0.0008067378556725287, + "loss": 2.7614, + "step": 10013 + }, + { + "epoch": 0.2969486700471488, + "grad_norm": 0.1504155546426773, + "learning_rate": 0.000806700697531376, + "loss": 2.7587, + "step": 10014 + }, + { + "epoch": 0.2969783233994603, + "grad_norm": 0.14301075041294098, + "learning_rate": 0.0008066635366743363, + "loss": 2.7372, + "step": 10015 + }, + { + "epoch": 0.29700797675177176, + "grad_norm": 0.15338140726089478, + "learning_rate": 0.0008066263731017382, + "loss": 2.7627, + "step": 10016 + }, + { + "epoch": 0.29703763010408324, + "grad_norm": 0.13766899704933167, + "learning_rate": 0.0008065892068139109, + "loss": 2.7739, + "step": 10017 + }, + { + "epoch": 0.29706728345639477, + "grad_norm": 0.13694660365581512, + "learning_rate": 0.0008065520378111836, + "loss": 2.7538, + "step": 10018 + }, + { + "epoch": 0.29709693680870625, + "grad_norm": 0.13281852006912231, + "learning_rate": 0.0008065148660938854, + "loss": 2.7483, + "step": 10019 + }, + { + "epoch": 0.2971265901610177, + "grad_norm": 0.1267395317554474, + "learning_rate": 0.0008064776916623456, + "loss": 2.7717, + "step": 10020 + }, + { + "epoch": 0.2971562435133292, + "grad_norm": 0.1285095363855362, + "learning_rate": 0.0008064405145168929, + "loss": 2.7571, + "step": 10021 + }, + { + "epoch": 0.2971858968656407, + "grad_norm": 0.12375786155462265, + "learning_rate": 0.000806403334657857, + "loss": 2.7816, + "step": 10022 + }, + { + "epoch": 0.29721555021795215, + "grad_norm": 0.12582948803901672, + "learning_rate": 0.0008063661520855671, + "loss": 2.7746, + "step": 10023 + }, + { + "epoch": 0.2972452035702636, + "grad_norm": 0.11923113465309143, + "learning_rate": 0.0008063289668003522, + "loss": 2.7804, + "step": 10024 + }, + { + "epoch": 0.2972748569225751, + "grad_norm": 0.14780302345752716, + "learning_rate": 0.0008062917788025417, + "loss": 2.7908, + "step": 10025 + }, + { + "epoch": 0.2973045102748866, + "grad_norm": 0.15511594712734222, + "learning_rate": 0.000806254588092465, + "loss": 2.7902, + "step": 10026 + }, + { + "epoch": 0.29733416362719806, + "grad_norm": 0.1486460268497467, + "learning_rate": 0.0008062173946704513, + "loss": 2.7832, + "step": 10027 + }, + { + "epoch": 0.29736381697950953, + "grad_norm": 0.15279245376586914, + "learning_rate": 0.00080618019853683, + "loss": 2.767, + "step": 10028 + }, + { + "epoch": 0.297393470331821, + "grad_norm": 0.1603286862373352, + "learning_rate": 0.0008061429996919305, + "loss": 2.7613, + "step": 10029 + }, + { + "epoch": 0.2974231236841325, + "grad_norm": 0.16076667606830597, + "learning_rate": 0.0008061057981360822, + "loss": 2.7531, + "step": 10030 + }, + { + "epoch": 0.29745277703644396, + "grad_norm": 0.15852059423923492, + "learning_rate": 0.0008060685938696146, + "loss": 2.7636, + "step": 10031 + }, + { + "epoch": 0.29748243038875544, + "grad_norm": 0.14578108489513397, + "learning_rate": 0.0008060313868928571, + "loss": 2.7994, + "step": 10032 + }, + { + "epoch": 0.2975120837410669, + "grad_norm": 0.13145798444747925, + "learning_rate": 0.000805994177206139, + "loss": 2.7588, + "step": 10033 + }, + { + "epoch": 0.2975417370933784, + "grad_norm": 0.16346335411071777, + "learning_rate": 0.0008059569648097899, + "loss": 2.7483, + "step": 10034 + }, + { + "epoch": 0.29757139044568987, + "grad_norm": 0.18824544548988342, + "learning_rate": 0.0008059197497041395, + "loss": 2.7845, + "step": 10035 + }, + { + "epoch": 0.29760104379800134, + "grad_norm": 0.14746108651161194, + "learning_rate": 0.0008058825318895171, + "loss": 2.7408, + "step": 10036 + }, + { + "epoch": 0.2976306971503128, + "grad_norm": 0.11959972232580185, + "learning_rate": 0.0008058453113662524, + "loss": 2.78, + "step": 10037 + }, + { + "epoch": 0.29766035050262435, + "grad_norm": 0.14527627825737, + "learning_rate": 0.000805808088134675, + "loss": 2.789, + "step": 10038 + }, + { + "epoch": 0.2976900038549358, + "grad_norm": 0.12358640879392624, + "learning_rate": 0.0008057708621951145, + "loss": 2.7587, + "step": 10039 + }, + { + "epoch": 0.2977196572072473, + "grad_norm": 0.1476866751909256, + "learning_rate": 0.0008057336335479004, + "loss": 2.7624, + "step": 10040 + }, + { + "epoch": 0.2977493105595588, + "grad_norm": 0.15040847659111023, + "learning_rate": 0.0008056964021933625, + "loss": 2.7357, + "step": 10041 + }, + { + "epoch": 0.29777896391187025, + "grad_norm": 0.13318447768688202, + "learning_rate": 0.0008056591681318307, + "loss": 2.7659, + "step": 10042 + }, + { + "epoch": 0.29780861726418173, + "grad_norm": 0.14568889141082764, + "learning_rate": 0.0008056219313636344, + "loss": 2.7592, + "step": 10043 + }, + { + "epoch": 0.2978382706164932, + "grad_norm": 0.14875200390815735, + "learning_rate": 0.0008055846918891034, + "loss": 2.7562, + "step": 10044 + }, + { + "epoch": 0.2978679239688047, + "grad_norm": 0.16352388262748718, + "learning_rate": 0.0008055474497085676, + "loss": 2.7553, + "step": 10045 + }, + { + "epoch": 0.29789757732111616, + "grad_norm": 0.15256445109844208, + "learning_rate": 0.0008055102048223566, + "loss": 2.7978, + "step": 10046 + }, + { + "epoch": 0.29792723067342763, + "grad_norm": 0.16456635296344757, + "learning_rate": 0.0008054729572308003, + "loss": 2.7652, + "step": 10047 + }, + { + "epoch": 0.2979568840257391, + "grad_norm": 0.1589871346950531, + "learning_rate": 0.0008054357069342286, + "loss": 2.771, + "step": 10048 + }, + { + "epoch": 0.2979865373780506, + "grad_norm": 0.1607089340686798, + "learning_rate": 0.0008053984539329711, + "loss": 2.769, + "step": 10049 + }, + { + "epoch": 0.29801619073036206, + "grad_norm": 0.14970608055591583, + "learning_rate": 0.0008053611982273581, + "loss": 2.7541, + "step": 10050 + }, + { + "epoch": 0.29804584408267354, + "grad_norm": 0.12197592109441757, + "learning_rate": 0.0008053239398177191, + "loss": 2.7502, + "step": 10051 + }, + { + "epoch": 0.298075497434985, + "grad_norm": 0.1273515224456787, + "learning_rate": 0.0008052866787043843, + "loss": 2.7685, + "step": 10052 + }, + { + "epoch": 0.2981051507872965, + "grad_norm": 0.11691060662269592, + "learning_rate": 0.0008052494148876834, + "loss": 2.7266, + "step": 10053 + }, + { + "epoch": 0.29813480413960797, + "grad_norm": 0.1360665112733841, + "learning_rate": 0.0008052121483679468, + "loss": 2.7453, + "step": 10054 + }, + { + "epoch": 0.29816445749191944, + "grad_norm": 0.14064902067184448, + "learning_rate": 0.000805174879145504, + "loss": 2.7911, + "step": 10055 + }, + { + "epoch": 0.2981941108442309, + "grad_norm": 0.14029276371002197, + "learning_rate": 0.0008051376072206856, + "loss": 2.7598, + "step": 10056 + }, + { + "epoch": 0.2982237641965424, + "grad_norm": 0.14317293465137482, + "learning_rate": 0.0008051003325938209, + "loss": 2.7545, + "step": 10057 + }, + { + "epoch": 0.29825341754885387, + "grad_norm": 0.13628073036670685, + "learning_rate": 0.0008050630552652406, + "loss": 2.7749, + "step": 10058 + }, + { + "epoch": 0.2982830709011654, + "grad_norm": 0.1277952343225479, + "learning_rate": 0.0008050257752352745, + "loss": 2.7825, + "step": 10059 + }, + { + "epoch": 0.2983127242534769, + "grad_norm": 0.1341298669576645, + "learning_rate": 0.0008049884925042528, + "loss": 2.7998, + "step": 10060 + }, + { + "epoch": 0.29834237760578836, + "grad_norm": 0.14958222210407257, + "learning_rate": 0.0008049512070725058, + "loss": 2.7612, + "step": 10061 + }, + { + "epoch": 0.29837203095809983, + "grad_norm": 0.14927899837493896, + "learning_rate": 0.0008049139189403633, + "loss": 2.7794, + "step": 10062 + }, + { + "epoch": 0.2984016843104113, + "grad_norm": 0.12819990515708923, + "learning_rate": 0.0008048766281081559, + "loss": 2.75, + "step": 10063 + }, + { + "epoch": 0.2984313376627228, + "grad_norm": 0.11411966383457184, + "learning_rate": 0.0008048393345762136, + "loss": 2.7348, + "step": 10064 + }, + { + "epoch": 0.29846099101503426, + "grad_norm": 0.1268002688884735, + "learning_rate": 0.0008048020383448666, + "loss": 2.762, + "step": 10065 + }, + { + "epoch": 0.29849064436734574, + "grad_norm": 0.1373661905527115, + "learning_rate": 0.0008047647394144453, + "loss": 2.7417, + "step": 10066 + }, + { + "epoch": 0.2985202977196572, + "grad_norm": 0.12964007258415222, + "learning_rate": 0.0008047274377852798, + "loss": 2.7795, + "step": 10067 + }, + { + "epoch": 0.2985499510719687, + "grad_norm": 0.11873216927051544, + "learning_rate": 0.0008046901334577006, + "loss": 2.7994, + "step": 10068 + }, + { + "epoch": 0.29857960442428017, + "grad_norm": 0.12781599164009094, + "learning_rate": 0.0008046528264320379, + "loss": 2.8022, + "step": 10069 + }, + { + "epoch": 0.29860925777659164, + "grad_norm": 0.13754940032958984, + "learning_rate": 0.0008046155167086222, + "loss": 2.7497, + "step": 10070 + }, + { + "epoch": 0.2986389111289031, + "grad_norm": 0.13036541640758514, + "learning_rate": 0.0008045782042877839, + "loss": 2.7671, + "step": 10071 + }, + { + "epoch": 0.2986685644812146, + "grad_norm": 0.11982205510139465, + "learning_rate": 0.0008045408891698532, + "loss": 2.7313, + "step": 10072 + }, + { + "epoch": 0.29869821783352607, + "grad_norm": 0.13028423488140106, + "learning_rate": 0.0008045035713551607, + "loss": 2.7689, + "step": 10073 + }, + { + "epoch": 0.29872787118583755, + "grad_norm": 0.14598074555397034, + "learning_rate": 0.0008044662508440368, + "loss": 2.7597, + "step": 10074 + }, + { + "epoch": 0.298757524538149, + "grad_norm": 0.143087238073349, + "learning_rate": 0.0008044289276368119, + "loss": 2.7787, + "step": 10075 + }, + { + "epoch": 0.2987871778904605, + "grad_norm": 0.12680380046367645, + "learning_rate": 0.0008043916017338167, + "loss": 2.7939, + "step": 10076 + }, + { + "epoch": 0.298816831242772, + "grad_norm": 0.15623286366462708, + "learning_rate": 0.0008043542731353817, + "loss": 2.7782, + "step": 10077 + }, + { + "epoch": 0.29884648459508345, + "grad_norm": 0.1581532210111618, + "learning_rate": 0.0008043169418418373, + "loss": 2.7514, + "step": 10078 + }, + { + "epoch": 0.2988761379473949, + "grad_norm": 0.146126389503479, + "learning_rate": 0.0008042796078535139, + "loss": 2.766, + "step": 10079 + }, + { + "epoch": 0.29890579129970646, + "grad_norm": 0.1413591355085373, + "learning_rate": 0.0008042422711707427, + "loss": 2.7523, + "step": 10080 + }, + { + "epoch": 0.29893544465201793, + "grad_norm": 0.15578944981098175, + "learning_rate": 0.0008042049317938538, + "loss": 2.7362, + "step": 10081 + }, + { + "epoch": 0.2989650980043294, + "grad_norm": 0.13424403965473175, + "learning_rate": 0.0008041675897231779, + "loss": 2.7821, + "step": 10082 + }, + { + "epoch": 0.2989947513566409, + "grad_norm": 0.12555085122585297, + "learning_rate": 0.0008041302449590461, + "loss": 2.7713, + "step": 10083 + }, + { + "epoch": 0.29902440470895236, + "grad_norm": 0.12881125509738922, + "learning_rate": 0.0008040928975017884, + "loss": 2.7619, + "step": 10084 + }, + { + "epoch": 0.29905405806126384, + "grad_norm": 0.1250818520784378, + "learning_rate": 0.0008040555473517361, + "loss": 2.7728, + "step": 10085 + }, + { + "epoch": 0.2990837114135753, + "grad_norm": 0.13532604277133942, + "learning_rate": 0.0008040181945092198, + "loss": 2.7405, + "step": 10086 + }, + { + "epoch": 0.2991133647658868, + "grad_norm": 0.13971182703971863, + "learning_rate": 0.0008039808389745702, + "loss": 2.7378, + "step": 10087 + }, + { + "epoch": 0.29914301811819827, + "grad_norm": 0.15132075548171997, + "learning_rate": 0.0008039434807481181, + "loss": 2.7992, + "step": 10088 + }, + { + "epoch": 0.29917267147050974, + "grad_norm": 0.15133531391620636, + "learning_rate": 0.0008039061198301941, + "loss": 2.7192, + "step": 10089 + }, + { + "epoch": 0.2992023248228212, + "grad_norm": 0.12228042632341385, + "learning_rate": 0.0008038687562211295, + "loss": 2.7293, + "step": 10090 + }, + { + "epoch": 0.2992319781751327, + "grad_norm": 0.13503290712833405, + "learning_rate": 0.0008038313899212548, + "loss": 2.7474, + "step": 10091 + }, + { + "epoch": 0.29926163152744417, + "grad_norm": 0.13536466658115387, + "learning_rate": 0.0008037940209309008, + "loss": 2.7334, + "step": 10092 + }, + { + "epoch": 0.29929128487975565, + "grad_norm": 0.1501549631357193, + "learning_rate": 0.0008037566492503989, + "loss": 2.7576, + "step": 10093 + }, + { + "epoch": 0.2993209382320671, + "grad_norm": 0.1516350507736206, + "learning_rate": 0.0008037192748800795, + "loss": 2.7412, + "step": 10094 + }, + { + "epoch": 0.2993505915843786, + "grad_norm": 0.1480002999305725, + "learning_rate": 0.0008036818978202738, + "loss": 2.7139, + "step": 10095 + }, + { + "epoch": 0.2993802449366901, + "grad_norm": 0.13304470479488373, + "learning_rate": 0.000803644518071313, + "loss": 2.7821, + "step": 10096 + }, + { + "epoch": 0.29940989828900155, + "grad_norm": 0.15209874510765076, + "learning_rate": 0.0008036071356335278, + "loss": 2.8037, + "step": 10097 + }, + { + "epoch": 0.29943955164131303, + "grad_norm": 0.16807857155799866, + "learning_rate": 0.000803569750507249, + "loss": 2.7735, + "step": 10098 + }, + { + "epoch": 0.2994692049936245, + "grad_norm": 0.14213837683200836, + "learning_rate": 0.0008035323626928082, + "loss": 2.7818, + "step": 10099 + }, + { + "epoch": 0.299498858345936, + "grad_norm": 0.11648068577051163, + "learning_rate": 0.0008034949721905363, + "loss": 2.7559, + "step": 10100 + }, + { + "epoch": 0.2995285116982475, + "grad_norm": 0.14935605227947235, + "learning_rate": 0.0008034575790007643, + "loss": 2.7703, + "step": 10101 + }, + { + "epoch": 0.299558165050559, + "grad_norm": 0.1418946534395218, + "learning_rate": 0.0008034201831238233, + "loss": 2.7588, + "step": 10102 + }, + { + "epoch": 0.29958781840287046, + "grad_norm": 0.1125485748052597, + "learning_rate": 0.0008033827845600445, + "loss": 2.7399, + "step": 10103 + }, + { + "epoch": 0.29961747175518194, + "grad_norm": 0.11464453488588333, + "learning_rate": 0.0008033453833097591, + "loss": 2.7637, + "step": 10104 + }, + { + "epoch": 0.2996471251074934, + "grad_norm": 0.12087144702672958, + "learning_rate": 0.000803307979373298, + "loss": 2.7589, + "step": 10105 + }, + { + "epoch": 0.2996767784598049, + "grad_norm": 0.12733949720859528, + "learning_rate": 0.0008032705727509929, + "loss": 2.7759, + "step": 10106 + }, + { + "epoch": 0.29970643181211637, + "grad_norm": 0.1292087733745575, + "learning_rate": 0.0008032331634431749, + "loss": 2.7737, + "step": 10107 + }, + { + "epoch": 0.29973608516442785, + "grad_norm": 0.14843691885471344, + "learning_rate": 0.0008031957514501751, + "loss": 2.7887, + "step": 10108 + }, + { + "epoch": 0.2997657385167393, + "grad_norm": 0.1726723313331604, + "learning_rate": 0.0008031583367723249, + "loss": 2.7537, + "step": 10109 + }, + { + "epoch": 0.2997953918690508, + "grad_norm": 0.17986363172531128, + "learning_rate": 0.0008031209194099556, + "loss": 2.7943, + "step": 10110 + }, + { + "epoch": 0.2998250452213623, + "grad_norm": 0.17317411303520203, + "learning_rate": 0.0008030834993633984, + "loss": 2.7688, + "step": 10111 + }, + { + "epoch": 0.29985469857367375, + "grad_norm": 0.1684320718050003, + "learning_rate": 0.0008030460766329849, + "loss": 2.7546, + "step": 10112 + }, + { + "epoch": 0.2998843519259852, + "grad_norm": 0.15211887657642365, + "learning_rate": 0.0008030086512190464, + "loss": 2.7713, + "step": 10113 + }, + { + "epoch": 0.2999140052782967, + "grad_norm": 0.13537631928920746, + "learning_rate": 0.0008029712231219142, + "loss": 2.7605, + "step": 10114 + }, + { + "epoch": 0.2999436586306082, + "grad_norm": 0.1296815276145935, + "learning_rate": 0.0008029337923419199, + "loss": 2.7761, + "step": 10115 + }, + { + "epoch": 0.29997331198291965, + "grad_norm": 0.1279601901769638, + "learning_rate": 0.0008028963588793949, + "loss": 2.7241, + "step": 10116 + }, + { + "epoch": 0.30000296533523113, + "grad_norm": 0.12992005050182343, + "learning_rate": 0.0008028589227346705, + "loss": 2.7751, + "step": 10117 + }, + { + "epoch": 0.3000326186875426, + "grad_norm": 0.11163236945867538, + "learning_rate": 0.0008028214839080784, + "loss": 2.763, + "step": 10118 + }, + { + "epoch": 0.3000622720398541, + "grad_norm": 0.12643486261367798, + "learning_rate": 0.0008027840423999502, + "loss": 2.7448, + "step": 10119 + }, + { + "epoch": 0.30009192539216556, + "grad_norm": 0.144647017121315, + "learning_rate": 0.0008027465982106172, + "loss": 2.7718, + "step": 10120 + }, + { + "epoch": 0.30012157874447704, + "grad_norm": 0.12658344209194183, + "learning_rate": 0.0008027091513404112, + "loss": 2.7498, + "step": 10121 + }, + { + "epoch": 0.30015123209678857, + "grad_norm": 0.129033163189888, + "learning_rate": 0.0008026717017896636, + "loss": 2.7348, + "step": 10122 + }, + { + "epoch": 0.30018088544910004, + "grad_norm": 0.14054031670093536, + "learning_rate": 0.0008026342495587063, + "loss": 2.775, + "step": 10123 + }, + { + "epoch": 0.3002105388014115, + "grad_norm": 0.12458688020706177, + "learning_rate": 0.0008025967946478705, + "loss": 2.7498, + "step": 10124 + }, + { + "epoch": 0.300240192153723, + "grad_norm": 0.11217480897903442, + "learning_rate": 0.0008025593370574884, + "loss": 2.7176, + "step": 10125 + }, + { + "epoch": 0.30026984550603447, + "grad_norm": 0.13330145180225372, + "learning_rate": 0.0008025218767878914, + "loss": 2.7586, + "step": 10126 + }, + { + "epoch": 0.30029949885834595, + "grad_norm": 0.12495575100183487, + "learning_rate": 0.0008024844138394112, + "loss": 2.7608, + "step": 10127 + }, + { + "epoch": 0.3003291522106574, + "grad_norm": 0.13489842414855957, + "learning_rate": 0.0008024469482123796, + "loss": 2.776, + "step": 10128 + }, + { + "epoch": 0.3003588055629689, + "grad_norm": 0.1487455666065216, + "learning_rate": 0.0008024094799071284, + "loss": 2.7285, + "step": 10129 + }, + { + "epoch": 0.3003884589152804, + "grad_norm": 0.16046980023384094, + "learning_rate": 0.0008023720089239892, + "loss": 2.7913, + "step": 10130 + }, + { + "epoch": 0.30041811226759185, + "grad_norm": 0.18437761068344116, + "learning_rate": 0.000802334535263294, + "loss": 2.7787, + "step": 10131 + }, + { + "epoch": 0.30044776561990333, + "grad_norm": 0.15257711708545685, + "learning_rate": 0.0008022970589253748, + "loss": 2.7512, + "step": 10132 + }, + { + "epoch": 0.3004774189722148, + "grad_norm": 0.12409637123346329, + "learning_rate": 0.000802259579910563, + "loss": 2.7413, + "step": 10133 + }, + { + "epoch": 0.3005070723245263, + "grad_norm": 0.13258887827396393, + "learning_rate": 0.0008022220982191909, + "loss": 2.7352, + "step": 10134 + }, + { + "epoch": 0.30053672567683776, + "grad_norm": 0.148854598402977, + "learning_rate": 0.0008021846138515903, + "loss": 2.754, + "step": 10135 + }, + { + "epoch": 0.30056637902914923, + "grad_norm": 0.1519237905740738, + "learning_rate": 0.0008021471268080929, + "loss": 2.7957, + "step": 10136 + }, + { + "epoch": 0.3005960323814607, + "grad_norm": 0.16621366143226624, + "learning_rate": 0.0008021096370890308, + "loss": 2.7675, + "step": 10137 + }, + { + "epoch": 0.3006256857337722, + "grad_norm": 0.14876024425029755, + "learning_rate": 0.0008020721446947361, + "loss": 2.7545, + "step": 10138 + }, + { + "epoch": 0.30065533908608366, + "grad_norm": 0.1331765204668045, + "learning_rate": 0.0008020346496255407, + "loss": 2.7641, + "step": 10139 + }, + { + "epoch": 0.30068499243839514, + "grad_norm": 0.15977151691913605, + "learning_rate": 0.0008019971518817768, + "loss": 2.7544, + "step": 10140 + }, + { + "epoch": 0.3007146457907066, + "grad_norm": 0.1451970785856247, + "learning_rate": 0.0008019596514637761, + "loss": 2.7252, + "step": 10141 + }, + { + "epoch": 0.30074429914301815, + "grad_norm": 0.14368367195129395, + "learning_rate": 0.0008019221483718708, + "loss": 2.7092, + "step": 10142 + }, + { + "epoch": 0.3007739524953296, + "grad_norm": 0.14765413105487823, + "learning_rate": 0.0008018846426063932, + "loss": 2.7902, + "step": 10143 + }, + { + "epoch": 0.3008036058476411, + "grad_norm": 0.1474740207195282, + "learning_rate": 0.0008018471341676752, + "loss": 2.7543, + "step": 10144 + }, + { + "epoch": 0.3008332591999526, + "grad_norm": 0.14614279568195343, + "learning_rate": 0.000801809623056049, + "loss": 2.772, + "step": 10145 + }, + { + "epoch": 0.30086291255226405, + "grad_norm": 0.15168236196041107, + "learning_rate": 0.0008017721092718469, + "loss": 2.769, + "step": 10146 + }, + { + "epoch": 0.3008925659045755, + "grad_norm": 0.12753093242645264, + "learning_rate": 0.0008017345928154007, + "loss": 2.7489, + "step": 10147 + }, + { + "epoch": 0.300922219256887, + "grad_norm": 0.1315978765487671, + "learning_rate": 0.0008016970736870432, + "loss": 2.7631, + "step": 10148 + }, + { + "epoch": 0.3009518726091985, + "grad_norm": 0.14368006587028503, + "learning_rate": 0.0008016595518871061, + "loss": 2.7835, + "step": 10149 + }, + { + "epoch": 0.30098152596150995, + "grad_norm": 0.11751601099967957, + "learning_rate": 0.0008016220274159221, + "loss": 2.7531, + "step": 10150 + }, + { + "epoch": 0.30101117931382143, + "grad_norm": 0.11479967832565308, + "learning_rate": 0.0008015845002738232, + "loss": 2.763, + "step": 10151 + }, + { + "epoch": 0.3010408326661329, + "grad_norm": 0.13381414115428925, + "learning_rate": 0.0008015469704611417, + "loss": 2.7764, + "step": 10152 + }, + { + "epoch": 0.3010704860184444, + "grad_norm": 0.13325409591197968, + "learning_rate": 0.00080150943797821, + "loss": 2.7474, + "step": 10153 + }, + { + "epoch": 0.30110013937075586, + "grad_norm": 0.12762512266635895, + "learning_rate": 0.0008014719028253606, + "loss": 2.769, + "step": 10154 + }, + { + "epoch": 0.30112979272306734, + "grad_norm": 0.14342190325260162, + "learning_rate": 0.0008014343650029256, + "loss": 2.7696, + "step": 10155 + }, + { + "epoch": 0.3011594460753788, + "grad_norm": 0.15589632093906403, + "learning_rate": 0.0008013968245112377, + "loss": 2.7578, + "step": 10156 + }, + { + "epoch": 0.3011890994276903, + "grad_norm": 0.16594275832176208, + "learning_rate": 0.0008013592813506291, + "loss": 2.7501, + "step": 10157 + }, + { + "epoch": 0.30121875278000176, + "grad_norm": 0.14867933094501495, + "learning_rate": 0.0008013217355214324, + "loss": 2.746, + "step": 10158 + }, + { + "epoch": 0.30124840613231324, + "grad_norm": 0.13156749308109283, + "learning_rate": 0.0008012841870239799, + "loss": 2.7571, + "step": 10159 + }, + { + "epoch": 0.3012780594846247, + "grad_norm": 0.1273948699235916, + "learning_rate": 0.0008012466358586044, + "loss": 2.7794, + "step": 10160 + }, + { + "epoch": 0.3013077128369362, + "grad_norm": 0.1312495917081833, + "learning_rate": 0.0008012090820256381, + "loss": 2.7166, + "step": 10161 + }, + { + "epoch": 0.30133736618924767, + "grad_norm": 0.12292052805423737, + "learning_rate": 0.0008011715255254137, + "loss": 2.7403, + "step": 10162 + }, + { + "epoch": 0.3013670195415592, + "grad_norm": 0.1307496726512909, + "learning_rate": 0.0008011339663582638, + "loss": 2.7776, + "step": 10163 + }, + { + "epoch": 0.3013966728938707, + "grad_norm": 0.15652398765087128, + "learning_rate": 0.0008010964045245208, + "loss": 2.7509, + "step": 10164 + }, + { + "epoch": 0.30142632624618215, + "grad_norm": 0.15057310461997986, + "learning_rate": 0.0008010588400245176, + "loss": 2.7377, + "step": 10165 + }, + { + "epoch": 0.30145597959849363, + "grad_norm": 0.15020950138568878, + "learning_rate": 0.0008010212728585866, + "loss": 2.7711, + "step": 10166 + }, + { + "epoch": 0.3014856329508051, + "grad_norm": 0.16089707612991333, + "learning_rate": 0.0008009837030270606, + "loss": 2.8007, + "step": 10167 + }, + { + "epoch": 0.3015152863031166, + "grad_norm": 0.14207260310649872, + "learning_rate": 0.0008009461305302722, + "loss": 2.7716, + "step": 10168 + }, + { + "epoch": 0.30154493965542806, + "grad_norm": 0.15305356681346893, + "learning_rate": 0.0008009085553685542, + "loss": 2.7403, + "step": 10169 + }, + { + "epoch": 0.30157459300773953, + "grad_norm": 0.11751044541597366, + "learning_rate": 0.0008008709775422393, + "loss": 2.7859, + "step": 10170 + }, + { + "epoch": 0.301604246360051, + "grad_norm": 0.12156768143177032, + "learning_rate": 0.0008008333970516601, + "loss": 2.784, + "step": 10171 + }, + { + "epoch": 0.3016338997123625, + "grad_norm": 0.11971190571784973, + "learning_rate": 0.0008007958138971497, + "loss": 2.7687, + "step": 10172 + }, + { + "epoch": 0.30166355306467396, + "grad_norm": 0.10338442772626877, + "learning_rate": 0.0008007582280790408, + "loss": 2.7272, + "step": 10173 + }, + { + "epoch": 0.30169320641698544, + "grad_norm": 0.12304290384054184, + "learning_rate": 0.000800720639597666, + "loss": 2.7554, + "step": 10174 + }, + { + "epoch": 0.3017228597692969, + "grad_norm": 0.1277129054069519, + "learning_rate": 0.0008006830484533585, + "loss": 2.7785, + "step": 10175 + }, + { + "epoch": 0.3017525131216084, + "grad_norm": 0.11985503137111664, + "learning_rate": 0.0008006454546464508, + "loss": 2.7765, + "step": 10176 + }, + { + "epoch": 0.30178216647391987, + "grad_norm": 0.12003887444734573, + "learning_rate": 0.000800607858177276, + "loss": 2.7466, + "step": 10177 + }, + { + "epoch": 0.30181181982623134, + "grad_norm": 0.11288759112358093, + "learning_rate": 0.0008005702590461672, + "loss": 2.7182, + "step": 10178 + }, + { + "epoch": 0.3018414731785428, + "grad_norm": 0.10879471153020859, + "learning_rate": 0.000800532657253457, + "loss": 2.7561, + "step": 10179 + }, + { + "epoch": 0.3018711265308543, + "grad_norm": 0.12805195152759552, + "learning_rate": 0.0008004950527994787, + "loss": 2.7142, + "step": 10180 + }, + { + "epoch": 0.30190077988316577, + "grad_norm": 0.14266976714134216, + "learning_rate": 0.0008004574456845651, + "loss": 2.7745, + "step": 10181 + }, + { + "epoch": 0.30193043323547725, + "grad_norm": 0.15768392384052277, + "learning_rate": 0.000800419835909049, + "loss": 2.7692, + "step": 10182 + }, + { + "epoch": 0.3019600865877887, + "grad_norm": 0.14427092671394348, + "learning_rate": 0.0008003822234732639, + "loss": 2.777, + "step": 10183 + }, + { + "epoch": 0.30198973994010025, + "grad_norm": 0.1359475702047348, + "learning_rate": 0.0008003446083775425, + "loss": 2.7585, + "step": 10184 + }, + { + "epoch": 0.30201939329241173, + "grad_norm": 0.16372330486774445, + "learning_rate": 0.0008003069906222182, + "loss": 2.7545, + "step": 10185 + }, + { + "epoch": 0.3020490466447232, + "grad_norm": 0.15718373656272888, + "learning_rate": 0.0008002693702076239, + "loss": 2.7801, + "step": 10186 + }, + { + "epoch": 0.3020786999970347, + "grad_norm": 0.1485607922077179, + "learning_rate": 0.0008002317471340928, + "loss": 2.7876, + "step": 10187 + }, + { + "epoch": 0.30210835334934616, + "grad_norm": 0.15780343115329742, + "learning_rate": 0.000800194121401958, + "loss": 2.7855, + "step": 10188 + }, + { + "epoch": 0.30213800670165764, + "grad_norm": 0.146774023771286, + "learning_rate": 0.0008001564930115528, + "loss": 2.7474, + "step": 10189 + }, + { + "epoch": 0.3021676600539691, + "grad_norm": 0.1471858024597168, + "learning_rate": 0.0008001188619632103, + "loss": 2.7558, + "step": 10190 + }, + { + "epoch": 0.3021973134062806, + "grad_norm": 0.13563331961631775, + "learning_rate": 0.0008000812282572636, + "loss": 2.7725, + "step": 10191 + }, + { + "epoch": 0.30222696675859206, + "grad_norm": 0.12990298867225647, + "learning_rate": 0.0008000435918940464, + "loss": 2.7464, + "step": 10192 + }, + { + "epoch": 0.30225662011090354, + "grad_norm": 0.12435082346200943, + "learning_rate": 0.0008000059528738916, + "loss": 2.7898, + "step": 10193 + }, + { + "epoch": 0.302286273463215, + "grad_norm": 0.145744189620018, + "learning_rate": 0.0007999683111971325, + "loss": 2.7422, + "step": 10194 + }, + { + "epoch": 0.3023159268155265, + "grad_norm": 0.14172618091106415, + "learning_rate": 0.0007999306668641025, + "loss": 2.7389, + "step": 10195 + }, + { + "epoch": 0.30234558016783797, + "grad_norm": 0.13254164159297943, + "learning_rate": 0.000799893019875135, + "loss": 2.7491, + "step": 10196 + }, + { + "epoch": 0.30237523352014944, + "grad_norm": 0.12764622271060944, + "learning_rate": 0.0007998553702305635, + "loss": 2.7906, + "step": 10197 + }, + { + "epoch": 0.3024048868724609, + "grad_norm": 0.11437182873487473, + "learning_rate": 0.000799817717930721, + "loss": 2.7751, + "step": 10198 + }, + { + "epoch": 0.3024345402247724, + "grad_norm": 0.14040637016296387, + "learning_rate": 0.0007997800629759413, + "loss": 2.7648, + "step": 10199 + }, + { + "epoch": 0.3024641935770839, + "grad_norm": 0.14396105706691742, + "learning_rate": 0.0007997424053665576, + "loss": 2.7599, + "step": 10200 + }, + { + "epoch": 0.30249384692939535, + "grad_norm": 0.1303364634513855, + "learning_rate": 0.0007997047451029035, + "loss": 2.7592, + "step": 10201 + }, + { + "epoch": 0.3025235002817068, + "grad_norm": 0.11543060094118118, + "learning_rate": 0.0007996670821853123, + "loss": 2.7368, + "step": 10202 + }, + { + "epoch": 0.3025531536340183, + "grad_norm": 0.1422603279352188, + "learning_rate": 0.0007996294166141178, + "loss": 2.7619, + "step": 10203 + }, + { + "epoch": 0.3025828069863298, + "grad_norm": 0.15077407658100128, + "learning_rate": 0.0007995917483896533, + "loss": 2.775, + "step": 10204 + }, + { + "epoch": 0.3026124603386413, + "grad_norm": 0.1383466273546219, + "learning_rate": 0.0007995540775122525, + "loss": 2.757, + "step": 10205 + }, + { + "epoch": 0.3026421136909528, + "grad_norm": 0.15311741828918457, + "learning_rate": 0.0007995164039822489, + "loss": 2.7681, + "step": 10206 + }, + { + "epoch": 0.30267176704326426, + "grad_norm": 0.12995241582393646, + "learning_rate": 0.0007994787277999762, + "loss": 2.737, + "step": 10207 + }, + { + "epoch": 0.30270142039557574, + "grad_norm": 0.11671153455972672, + "learning_rate": 0.0007994410489657679, + "loss": 2.7541, + "step": 10208 + }, + { + "epoch": 0.3027310737478872, + "grad_norm": 0.13250881433486938, + "learning_rate": 0.0007994033674799577, + "loss": 2.7353, + "step": 10209 + }, + { + "epoch": 0.3027607271001987, + "grad_norm": 0.12923642992973328, + "learning_rate": 0.0007993656833428793, + "loss": 2.7301, + "step": 10210 + }, + { + "epoch": 0.30279038045251017, + "grad_norm": 0.11920484900474548, + "learning_rate": 0.0007993279965548664, + "loss": 2.7926, + "step": 10211 + }, + { + "epoch": 0.30282003380482164, + "grad_norm": 0.12254033982753754, + "learning_rate": 0.0007992903071162527, + "loss": 2.7562, + "step": 10212 + }, + { + "epoch": 0.3028496871571331, + "grad_norm": 0.12985730171203613, + "learning_rate": 0.000799252615027372, + "loss": 2.7708, + "step": 10213 + }, + { + "epoch": 0.3028793405094446, + "grad_norm": 0.14713899791240692, + "learning_rate": 0.000799214920288558, + "loss": 2.7577, + "step": 10214 + }, + { + "epoch": 0.30290899386175607, + "grad_norm": 0.13219870626926422, + "learning_rate": 0.0007991772229001444, + "loss": 2.7583, + "step": 10215 + }, + { + "epoch": 0.30293864721406755, + "grad_norm": 0.1323373019695282, + "learning_rate": 0.0007991395228624653, + "loss": 2.7832, + "step": 10216 + }, + { + "epoch": 0.302968300566379, + "grad_norm": 0.14322611689567566, + "learning_rate": 0.0007991018201758543, + "loss": 2.7717, + "step": 10217 + }, + { + "epoch": 0.3029979539186905, + "grad_norm": 0.13813459873199463, + "learning_rate": 0.0007990641148406455, + "loss": 2.7837, + "step": 10218 + }, + { + "epoch": 0.303027607271002, + "grad_norm": 0.12105681002140045, + "learning_rate": 0.0007990264068571724, + "loss": 2.7272, + "step": 10219 + }, + { + "epoch": 0.30305726062331345, + "grad_norm": 0.1404504030942917, + "learning_rate": 0.0007989886962257694, + "loss": 2.7439, + "step": 10220 + }, + { + "epoch": 0.3030869139756249, + "grad_norm": 0.16817454993724823, + "learning_rate": 0.0007989509829467699, + "loss": 2.745, + "step": 10221 + }, + { + "epoch": 0.3031165673279364, + "grad_norm": 0.1755223423242569, + "learning_rate": 0.0007989132670205082, + "loss": 2.7839, + "step": 10222 + }, + { + "epoch": 0.3031462206802479, + "grad_norm": 0.1587154120206833, + "learning_rate": 0.0007988755484473183, + "loss": 2.7964, + "step": 10223 + }, + { + "epoch": 0.30317587403255936, + "grad_norm": 0.14552219212055206, + "learning_rate": 0.000798837827227534, + "loss": 2.7933, + "step": 10224 + }, + { + "epoch": 0.30320552738487083, + "grad_norm": 0.15085983276367188, + "learning_rate": 0.0007988001033614895, + "loss": 2.7476, + "step": 10225 + }, + { + "epoch": 0.30323518073718236, + "grad_norm": 0.14344486594200134, + "learning_rate": 0.0007987623768495189, + "loss": 2.7432, + "step": 10226 + }, + { + "epoch": 0.30326483408949384, + "grad_norm": 0.15260803699493408, + "learning_rate": 0.0007987246476919561, + "loss": 2.7328, + "step": 10227 + }, + { + "epoch": 0.3032944874418053, + "grad_norm": 0.150547057390213, + "learning_rate": 0.0007986869158891352, + "loss": 2.73, + "step": 10228 + }, + { + "epoch": 0.3033241407941168, + "grad_norm": 0.12070389837026596, + "learning_rate": 0.0007986491814413905, + "loss": 2.7213, + "step": 10229 + }, + { + "epoch": 0.30335379414642827, + "grad_norm": 0.13622640073299408, + "learning_rate": 0.000798611444349056, + "loss": 2.7683, + "step": 10230 + }, + { + "epoch": 0.30338344749873974, + "grad_norm": 0.1276421993970871, + "learning_rate": 0.0007985737046124658, + "loss": 2.7625, + "step": 10231 + }, + { + "epoch": 0.3034131008510512, + "grad_norm": 0.1272302120923996, + "learning_rate": 0.0007985359622319543, + "loss": 2.7605, + "step": 10232 + }, + { + "epoch": 0.3034427542033627, + "grad_norm": 0.1469445526599884, + "learning_rate": 0.0007984982172078557, + "loss": 2.762, + "step": 10233 + }, + { + "epoch": 0.3034724075556742, + "grad_norm": 0.17538288235664368, + "learning_rate": 0.0007984604695405039, + "loss": 2.7711, + "step": 10234 + }, + { + "epoch": 0.30350206090798565, + "grad_norm": 0.15697330236434937, + "learning_rate": 0.0007984227192302336, + "loss": 2.7441, + "step": 10235 + }, + { + "epoch": 0.3035317142602971, + "grad_norm": 0.13373596966266632, + "learning_rate": 0.0007983849662773788, + "loss": 2.7597, + "step": 10236 + }, + { + "epoch": 0.3035613676126086, + "grad_norm": 0.14157791435718536, + "learning_rate": 0.000798347210682274, + "loss": 2.753, + "step": 10237 + }, + { + "epoch": 0.3035910209649201, + "grad_norm": 0.137089341878891, + "learning_rate": 0.0007983094524452534, + "loss": 2.7514, + "step": 10238 + }, + { + "epoch": 0.30362067431723155, + "grad_norm": 0.13221873342990875, + "learning_rate": 0.0007982716915666515, + "loss": 2.7646, + "step": 10239 + }, + { + "epoch": 0.30365032766954303, + "grad_norm": 0.1548909991979599, + "learning_rate": 0.0007982339280468024, + "loss": 2.7572, + "step": 10240 + }, + { + "epoch": 0.3036799810218545, + "grad_norm": 0.14354489743709564, + "learning_rate": 0.0007981961618860407, + "loss": 2.7372, + "step": 10241 + }, + { + "epoch": 0.303709634374166, + "grad_norm": 0.15073582530021667, + "learning_rate": 0.0007981583930847008, + "loss": 2.7276, + "step": 10242 + }, + { + "epoch": 0.30373928772647746, + "grad_norm": 0.12917840480804443, + "learning_rate": 0.0007981206216431172, + "loss": 2.7643, + "step": 10243 + }, + { + "epoch": 0.30376894107878893, + "grad_norm": 0.13489721715450287, + "learning_rate": 0.0007980828475616244, + "loss": 2.7704, + "step": 10244 + }, + { + "epoch": 0.3037985944311004, + "grad_norm": 0.15151774883270264, + "learning_rate": 0.0007980450708405567, + "loss": 2.7399, + "step": 10245 + }, + { + "epoch": 0.30382824778341194, + "grad_norm": 0.1453513205051422, + "learning_rate": 0.0007980072914802488, + "loss": 2.7496, + "step": 10246 + }, + { + "epoch": 0.3038579011357234, + "grad_norm": 0.14842917025089264, + "learning_rate": 0.0007979695094810351, + "loss": 2.7578, + "step": 10247 + }, + { + "epoch": 0.3038875544880349, + "grad_norm": 0.14555853605270386, + "learning_rate": 0.0007979317248432503, + "loss": 2.767, + "step": 10248 + }, + { + "epoch": 0.30391720784034637, + "grad_norm": 0.14190275967121124, + "learning_rate": 0.0007978939375672291, + "loss": 2.7323, + "step": 10249 + }, + { + "epoch": 0.30394686119265785, + "grad_norm": 0.12290649861097336, + "learning_rate": 0.0007978561476533057, + "loss": 2.7851, + "step": 10250 + }, + { + "epoch": 0.3039765145449693, + "grad_norm": 0.11983111500740051, + "learning_rate": 0.0007978183551018151, + "loss": 2.7423, + "step": 10251 + }, + { + "epoch": 0.3040061678972808, + "grad_norm": 0.1351463496685028, + "learning_rate": 0.0007977805599130918, + "loss": 2.7278, + "step": 10252 + }, + { + "epoch": 0.3040358212495923, + "grad_norm": 0.14105187356472015, + "learning_rate": 0.0007977427620874707, + "loss": 2.7557, + "step": 10253 + }, + { + "epoch": 0.30406547460190375, + "grad_norm": 0.1371159851551056, + "learning_rate": 0.000797704961625286, + "loss": 2.7374, + "step": 10254 + }, + { + "epoch": 0.3040951279542152, + "grad_norm": 0.1297418177127838, + "learning_rate": 0.0007976671585268731, + "loss": 2.7801, + "step": 10255 + }, + { + "epoch": 0.3041247813065267, + "grad_norm": 0.13772054016590118, + "learning_rate": 0.0007976293527925662, + "loss": 2.7316, + "step": 10256 + }, + { + "epoch": 0.3041544346588382, + "grad_norm": 0.12324799597263336, + "learning_rate": 0.0007975915444227004, + "loss": 2.7444, + "step": 10257 + }, + { + "epoch": 0.30418408801114966, + "grad_norm": 0.1308123916387558, + "learning_rate": 0.0007975537334176104, + "loss": 2.7253, + "step": 10258 + }, + { + "epoch": 0.30421374136346113, + "grad_norm": 0.12573929131031036, + "learning_rate": 0.000797515919777631, + "loss": 2.7248, + "step": 10259 + }, + { + "epoch": 0.3042433947157726, + "grad_norm": 0.12954695522785187, + "learning_rate": 0.000797478103503097, + "loss": 2.7675, + "step": 10260 + }, + { + "epoch": 0.3042730480680841, + "grad_norm": 0.12885889410972595, + "learning_rate": 0.0007974402845943434, + "loss": 2.7581, + "step": 10261 + }, + { + "epoch": 0.30430270142039556, + "grad_norm": 0.12600679695606232, + "learning_rate": 0.000797402463051705, + "loss": 2.7611, + "step": 10262 + }, + { + "epoch": 0.30433235477270704, + "grad_norm": 0.12615501880645752, + "learning_rate": 0.0007973646388755167, + "loss": 2.7597, + "step": 10263 + }, + { + "epoch": 0.3043620081250185, + "grad_norm": 0.12594769895076752, + "learning_rate": 0.0007973268120661135, + "loss": 2.7556, + "step": 10264 + }, + { + "epoch": 0.30439166147733, + "grad_norm": 0.12645936012268066, + "learning_rate": 0.0007972889826238303, + "loss": 2.8074, + "step": 10265 + }, + { + "epoch": 0.30442131482964147, + "grad_norm": 0.11729322373867035, + "learning_rate": 0.0007972511505490022, + "loss": 2.7663, + "step": 10266 + }, + { + "epoch": 0.304450968181953, + "grad_norm": 0.12604108452796936, + "learning_rate": 0.0007972133158419641, + "loss": 2.7291, + "step": 10267 + }, + { + "epoch": 0.3044806215342645, + "grad_norm": 0.1092807725071907, + "learning_rate": 0.0007971754785030512, + "loss": 2.7342, + "step": 10268 + }, + { + "epoch": 0.30451027488657595, + "grad_norm": 0.1274643838405609, + "learning_rate": 0.0007971376385325984, + "loss": 2.8186, + "step": 10269 + }, + { + "epoch": 0.3045399282388874, + "grad_norm": 0.13977010548114777, + "learning_rate": 0.0007970997959309406, + "loss": 2.7116, + "step": 10270 + }, + { + "epoch": 0.3045695815911989, + "grad_norm": 0.15225253999233246, + "learning_rate": 0.0007970619506984134, + "loss": 2.726, + "step": 10271 + }, + { + "epoch": 0.3045992349435104, + "grad_norm": 0.16640695929527283, + "learning_rate": 0.0007970241028353514, + "loss": 2.7552, + "step": 10272 + }, + { + "epoch": 0.30462888829582185, + "grad_norm": 0.142167329788208, + "learning_rate": 0.0007969862523420901, + "loss": 2.7629, + "step": 10273 + }, + { + "epoch": 0.30465854164813333, + "grad_norm": 0.1293201446533203, + "learning_rate": 0.0007969483992189644, + "loss": 2.7298, + "step": 10274 + }, + { + "epoch": 0.3046881950004448, + "grad_norm": 0.1452990472316742, + "learning_rate": 0.0007969105434663098, + "loss": 2.7413, + "step": 10275 + }, + { + "epoch": 0.3047178483527563, + "grad_norm": 0.14942748844623566, + "learning_rate": 0.0007968726850844614, + "loss": 2.7698, + "step": 10276 + }, + { + "epoch": 0.30474750170506776, + "grad_norm": 0.14498957991600037, + "learning_rate": 0.0007968348240737544, + "loss": 2.7546, + "step": 10277 + }, + { + "epoch": 0.30477715505737923, + "grad_norm": 0.12767046689987183, + "learning_rate": 0.000796796960434524, + "loss": 2.7537, + "step": 10278 + }, + { + "epoch": 0.3048068084096907, + "grad_norm": 0.12352238595485687, + "learning_rate": 0.0007967590941671057, + "loss": 2.761, + "step": 10279 + }, + { + "epoch": 0.3048364617620022, + "grad_norm": 0.14371415972709656, + "learning_rate": 0.0007967212252718345, + "loss": 2.7602, + "step": 10280 + }, + { + "epoch": 0.30486611511431366, + "grad_norm": 0.12600547075271606, + "learning_rate": 0.0007966833537490461, + "loss": 2.7561, + "step": 10281 + }, + { + "epoch": 0.30489576846662514, + "grad_norm": 0.12313104420900345, + "learning_rate": 0.0007966454795990756, + "loss": 2.7725, + "step": 10282 + }, + { + "epoch": 0.3049254218189366, + "grad_norm": 0.12206391990184784, + "learning_rate": 0.0007966076028222584, + "loss": 2.7568, + "step": 10283 + }, + { + "epoch": 0.3049550751712481, + "grad_norm": 0.12849916517734528, + "learning_rate": 0.0007965697234189302, + "loss": 2.7796, + "step": 10284 + }, + { + "epoch": 0.30498472852355957, + "grad_norm": 0.15279513597488403, + "learning_rate": 0.0007965318413894261, + "loss": 2.7325, + "step": 10285 + }, + { + "epoch": 0.30501438187587104, + "grad_norm": 0.18906140327453613, + "learning_rate": 0.0007964939567340814, + "loss": 2.7716, + "step": 10286 + }, + { + "epoch": 0.3050440352281825, + "grad_norm": 0.21430785953998566, + "learning_rate": 0.000796456069453232, + "loss": 2.7645, + "step": 10287 + }, + { + "epoch": 0.30507368858049405, + "grad_norm": 0.18825611472129822, + "learning_rate": 0.0007964181795472132, + "loss": 2.759, + "step": 10288 + }, + { + "epoch": 0.3051033419328055, + "grad_norm": 0.15250645577907562, + "learning_rate": 0.0007963802870163605, + "loss": 2.7508, + "step": 10289 + }, + { + "epoch": 0.305132995285117, + "grad_norm": 0.13534192740917206, + "learning_rate": 0.0007963423918610096, + "loss": 2.7556, + "step": 10290 + }, + { + "epoch": 0.3051626486374285, + "grad_norm": 0.15035882592201233, + "learning_rate": 0.0007963044940814958, + "loss": 2.696, + "step": 10291 + }, + { + "epoch": 0.30519230198973996, + "grad_norm": 0.1364871859550476, + "learning_rate": 0.000796266593678155, + "loss": 2.77, + "step": 10292 + }, + { + "epoch": 0.30522195534205143, + "grad_norm": 0.1344217211008072, + "learning_rate": 0.0007962286906513225, + "loss": 2.7327, + "step": 10293 + }, + { + "epoch": 0.3052516086943629, + "grad_norm": 0.1491462141275406, + "learning_rate": 0.0007961907850013343, + "loss": 2.7421, + "step": 10294 + }, + { + "epoch": 0.3052812620466744, + "grad_norm": 0.14120379090309143, + "learning_rate": 0.0007961528767285258, + "loss": 2.7456, + "step": 10295 + }, + { + "epoch": 0.30531091539898586, + "grad_norm": 0.12112089991569519, + "learning_rate": 0.0007961149658332327, + "loss": 2.7496, + "step": 10296 + }, + { + "epoch": 0.30534056875129734, + "grad_norm": 0.1417355090379715, + "learning_rate": 0.0007960770523157908, + "loss": 2.7672, + "step": 10297 + }, + { + "epoch": 0.3053702221036088, + "grad_norm": 0.122139111161232, + "learning_rate": 0.0007960391361765356, + "loss": 2.7584, + "step": 10298 + }, + { + "epoch": 0.3053998754559203, + "grad_norm": 0.13341589272022247, + "learning_rate": 0.0007960012174158031, + "loss": 2.7534, + "step": 10299 + }, + { + "epoch": 0.30542952880823176, + "grad_norm": 0.11270782351493835, + "learning_rate": 0.0007959632960339292, + "loss": 2.7541, + "step": 10300 + }, + { + "epoch": 0.30545918216054324, + "grad_norm": 0.14489887654781342, + "learning_rate": 0.0007959253720312494, + "loss": 2.7725, + "step": 10301 + }, + { + "epoch": 0.3054888355128547, + "grad_norm": 0.14539870619773865, + "learning_rate": 0.0007958874454080995, + "loss": 2.7477, + "step": 10302 + }, + { + "epoch": 0.3055184888651662, + "grad_norm": 0.14376945793628693, + "learning_rate": 0.0007958495161648156, + "loss": 2.7668, + "step": 10303 + }, + { + "epoch": 0.30554814221747767, + "grad_norm": 0.14547763764858246, + "learning_rate": 0.0007958115843017335, + "loss": 2.7591, + "step": 10304 + }, + { + "epoch": 0.30557779556978915, + "grad_norm": 0.15329068899154663, + "learning_rate": 0.000795773649819189, + "loss": 2.7772, + "step": 10305 + }, + { + "epoch": 0.3056074489221006, + "grad_norm": 0.15236502885818481, + "learning_rate": 0.000795735712717518, + "loss": 2.7742, + "step": 10306 + }, + { + "epoch": 0.3056371022744121, + "grad_norm": 0.161446675658226, + "learning_rate": 0.0007956977729970566, + "loss": 2.7686, + "step": 10307 + }, + { + "epoch": 0.3056667556267236, + "grad_norm": 0.15385501086711884, + "learning_rate": 0.0007956598306581407, + "loss": 2.7378, + "step": 10308 + }, + { + "epoch": 0.3056964089790351, + "grad_norm": 0.13352790474891663, + "learning_rate": 0.0007956218857011061, + "loss": 2.7436, + "step": 10309 + }, + { + "epoch": 0.3057260623313466, + "grad_norm": 0.14534787833690643, + "learning_rate": 0.000795583938126289, + "loss": 2.8281, + "step": 10310 + }, + { + "epoch": 0.30575571568365806, + "grad_norm": 0.16128793358802795, + "learning_rate": 0.0007955459879340254, + "loss": 2.7556, + "step": 10311 + }, + { + "epoch": 0.30578536903596953, + "grad_norm": 0.16452953219413757, + "learning_rate": 0.0007955080351246515, + "loss": 2.7571, + "step": 10312 + }, + { + "epoch": 0.305815022388281, + "grad_norm": 0.15279388427734375, + "learning_rate": 0.0007954700796985031, + "loss": 2.7411, + "step": 10313 + }, + { + "epoch": 0.3058446757405925, + "grad_norm": 0.1451595425605774, + "learning_rate": 0.0007954321216559163, + "loss": 2.7171, + "step": 10314 + }, + { + "epoch": 0.30587432909290396, + "grad_norm": 0.1347319483757019, + "learning_rate": 0.0007953941609972275, + "loss": 2.7734, + "step": 10315 + }, + { + "epoch": 0.30590398244521544, + "grad_norm": 0.13417501747608185, + "learning_rate": 0.0007953561977227728, + "loss": 2.7939, + "step": 10316 + }, + { + "epoch": 0.3059336357975269, + "grad_norm": 0.14638499915599823, + "learning_rate": 0.0007953182318328881, + "loss": 2.7632, + "step": 10317 + }, + { + "epoch": 0.3059632891498384, + "grad_norm": 0.12857165932655334, + "learning_rate": 0.0007952802633279097, + "loss": 2.7389, + "step": 10318 + }, + { + "epoch": 0.30599294250214987, + "grad_norm": 0.12139031291007996, + "learning_rate": 0.0007952422922081741, + "loss": 2.7305, + "step": 10319 + }, + { + "epoch": 0.30602259585446134, + "grad_norm": 0.13569539785385132, + "learning_rate": 0.0007952043184740172, + "loss": 2.7658, + "step": 10320 + }, + { + "epoch": 0.3060522492067728, + "grad_norm": 0.1251164972782135, + "learning_rate": 0.0007951663421257754, + "loss": 2.73, + "step": 10321 + }, + { + "epoch": 0.3060819025590843, + "grad_norm": 0.11174827814102173, + "learning_rate": 0.000795128363163785, + "loss": 2.7511, + "step": 10322 + }, + { + "epoch": 0.30611155591139577, + "grad_norm": 0.13151967525482178, + "learning_rate": 0.0007950903815883823, + "loss": 2.8157, + "step": 10323 + }, + { + "epoch": 0.30614120926370725, + "grad_norm": 0.12326539307832718, + "learning_rate": 0.0007950523973999037, + "loss": 2.7509, + "step": 10324 + }, + { + "epoch": 0.3061708626160187, + "grad_norm": 0.12080827355384827, + "learning_rate": 0.0007950144105986852, + "loss": 2.7474, + "step": 10325 + }, + { + "epoch": 0.3062005159683302, + "grad_norm": 0.13159024715423584, + "learning_rate": 0.0007949764211850637, + "loss": 2.7943, + "step": 10326 + }, + { + "epoch": 0.3062301693206417, + "grad_norm": 0.11657323688268661, + "learning_rate": 0.0007949384291593753, + "loss": 2.7965, + "step": 10327 + }, + { + "epoch": 0.30625982267295315, + "grad_norm": 0.12888792157173157, + "learning_rate": 0.0007949004345219565, + "loss": 2.7743, + "step": 10328 + }, + { + "epoch": 0.30628947602526463, + "grad_norm": 0.14016376435756683, + "learning_rate": 0.0007948624372731437, + "loss": 2.7509, + "step": 10329 + }, + { + "epoch": 0.30631912937757616, + "grad_norm": 0.17734414339065552, + "learning_rate": 0.0007948244374132733, + "loss": 2.7637, + "step": 10330 + }, + { + "epoch": 0.30634878272988764, + "grad_norm": 0.20277903974056244, + "learning_rate": 0.0007947864349426821, + "loss": 2.76, + "step": 10331 + }, + { + "epoch": 0.3063784360821991, + "grad_norm": 0.14355872571468353, + "learning_rate": 0.0007947484298617063, + "loss": 2.7488, + "step": 10332 + }, + { + "epoch": 0.3064080894345106, + "grad_norm": 0.13140559196472168, + "learning_rate": 0.0007947104221706826, + "loss": 2.746, + "step": 10333 + }, + { + "epoch": 0.30643774278682206, + "grad_norm": 0.14463163912296295, + "learning_rate": 0.0007946724118699475, + "loss": 2.7413, + "step": 10334 + }, + { + "epoch": 0.30646739613913354, + "grad_norm": 0.14086872339248657, + "learning_rate": 0.0007946343989598377, + "loss": 2.7638, + "step": 10335 + }, + { + "epoch": 0.306497049491445, + "grad_norm": 0.13997948169708252, + "learning_rate": 0.0007945963834406895, + "loss": 2.7663, + "step": 10336 + }, + { + "epoch": 0.3065267028437565, + "grad_norm": 0.13537487387657166, + "learning_rate": 0.0007945583653128401, + "loss": 2.762, + "step": 10337 + }, + { + "epoch": 0.30655635619606797, + "grad_norm": 0.1359030306339264, + "learning_rate": 0.0007945203445766254, + "loss": 2.7396, + "step": 10338 + }, + { + "epoch": 0.30658600954837945, + "grad_norm": 0.14113815128803253, + "learning_rate": 0.0007944823212323828, + "loss": 2.7821, + "step": 10339 + }, + { + "epoch": 0.3066156629006909, + "grad_norm": 0.15351077914237976, + "learning_rate": 0.0007944442952804487, + "loss": 2.7503, + "step": 10340 + }, + { + "epoch": 0.3066453162530024, + "grad_norm": 0.13750700652599335, + "learning_rate": 0.0007944062667211598, + "loss": 2.783, + "step": 10341 + }, + { + "epoch": 0.3066749696053139, + "grad_norm": 0.13221558928489685, + "learning_rate": 0.0007943682355548527, + "loss": 2.7629, + "step": 10342 + }, + { + "epoch": 0.30670462295762535, + "grad_norm": 0.13397341966629028, + "learning_rate": 0.0007943302017818645, + "loss": 2.7596, + "step": 10343 + }, + { + "epoch": 0.3067342763099368, + "grad_norm": 0.11559660732746124, + "learning_rate": 0.0007942921654025318, + "loss": 2.735, + "step": 10344 + }, + { + "epoch": 0.3067639296622483, + "grad_norm": 0.11410047113895416, + "learning_rate": 0.0007942541264171914, + "loss": 2.7473, + "step": 10345 + }, + { + "epoch": 0.3067935830145598, + "grad_norm": 0.1353592723608017, + "learning_rate": 0.0007942160848261803, + "loss": 2.7472, + "step": 10346 + }, + { + "epoch": 0.30682323636687125, + "grad_norm": 0.14039073884487152, + "learning_rate": 0.0007941780406298353, + "loss": 2.7628, + "step": 10347 + }, + { + "epoch": 0.30685288971918273, + "grad_norm": 0.14628811180591583, + "learning_rate": 0.0007941399938284933, + "loss": 2.7645, + "step": 10348 + }, + { + "epoch": 0.3068825430714942, + "grad_norm": 0.13860398530960083, + "learning_rate": 0.0007941019444224909, + "loss": 2.7695, + "step": 10349 + }, + { + "epoch": 0.30691219642380574, + "grad_norm": 0.14445455372333527, + "learning_rate": 0.0007940638924121654, + "loss": 2.7226, + "step": 10350 + }, + { + "epoch": 0.3069418497761172, + "grad_norm": 0.14020881056785583, + "learning_rate": 0.0007940258377978537, + "loss": 2.7936, + "step": 10351 + }, + { + "epoch": 0.3069715031284287, + "grad_norm": 0.1662714183330536, + "learning_rate": 0.0007939877805798928, + "loss": 2.7489, + "step": 10352 + }, + { + "epoch": 0.30700115648074017, + "grad_norm": 0.17848345637321472, + "learning_rate": 0.0007939497207586197, + "loss": 2.7412, + "step": 10353 + }, + { + "epoch": 0.30703080983305164, + "grad_norm": 0.13781407475471497, + "learning_rate": 0.0007939116583343712, + "loss": 2.7432, + "step": 10354 + }, + { + "epoch": 0.3070604631853631, + "grad_norm": 0.13336217403411865, + "learning_rate": 0.0007938735933074846, + "loss": 2.7715, + "step": 10355 + }, + { + "epoch": 0.3070901165376746, + "grad_norm": 0.1264890730381012, + "learning_rate": 0.0007938355256782969, + "loss": 2.7504, + "step": 10356 + }, + { + "epoch": 0.30711976988998607, + "grad_norm": 0.12821292877197266, + "learning_rate": 0.000793797455447145, + "loss": 2.7316, + "step": 10357 + }, + { + "epoch": 0.30714942324229755, + "grad_norm": 0.1354653686285019, + "learning_rate": 0.0007937593826143664, + "loss": 2.7738, + "step": 10358 + }, + { + "epoch": 0.307179076594609, + "grad_norm": 0.1346152424812317, + "learning_rate": 0.000793721307180298, + "loss": 2.7325, + "step": 10359 + }, + { + "epoch": 0.3072087299469205, + "grad_norm": 0.1340363770723343, + "learning_rate": 0.000793683229145277, + "loss": 2.7593, + "step": 10360 + }, + { + "epoch": 0.307238383299232, + "grad_norm": 0.136399045586586, + "learning_rate": 0.0007936451485096406, + "loss": 2.758, + "step": 10361 + }, + { + "epoch": 0.30726803665154345, + "grad_norm": 0.1327098309993744, + "learning_rate": 0.0007936070652737261, + "loss": 2.7968, + "step": 10362 + }, + { + "epoch": 0.30729769000385493, + "grad_norm": 0.11935020983219147, + "learning_rate": 0.0007935689794378705, + "loss": 2.7997, + "step": 10363 + }, + { + "epoch": 0.3073273433561664, + "grad_norm": 0.14532576501369476, + "learning_rate": 0.0007935308910024113, + "loss": 2.7453, + "step": 10364 + }, + { + "epoch": 0.3073569967084779, + "grad_norm": 0.13772451877593994, + "learning_rate": 0.0007934927999676855, + "loss": 2.7503, + "step": 10365 + }, + { + "epoch": 0.30738665006078936, + "grad_norm": 0.13126540184020996, + "learning_rate": 0.0007934547063340307, + "loss": 2.7394, + "step": 10366 + }, + { + "epoch": 0.30741630341310083, + "grad_norm": 0.15023620426654816, + "learning_rate": 0.0007934166101017841, + "loss": 2.7776, + "step": 10367 + }, + { + "epoch": 0.3074459567654123, + "grad_norm": 0.15659593045711517, + "learning_rate": 0.0007933785112712831, + "loss": 2.7523, + "step": 10368 + }, + { + "epoch": 0.3074756101177238, + "grad_norm": 0.16185957193374634, + "learning_rate": 0.0007933404098428651, + "loss": 2.7815, + "step": 10369 + }, + { + "epoch": 0.30750526347003526, + "grad_norm": 0.17923951148986816, + "learning_rate": 0.0007933023058168671, + "loss": 2.7926, + "step": 10370 + }, + { + "epoch": 0.3075349168223468, + "grad_norm": 0.14197835326194763, + "learning_rate": 0.0007932641991936271, + "loss": 2.7103, + "step": 10371 + }, + { + "epoch": 0.30756457017465827, + "grad_norm": 0.1323508620262146, + "learning_rate": 0.0007932260899734822, + "loss": 2.7881, + "step": 10372 + }, + { + "epoch": 0.30759422352696975, + "grad_norm": 0.14099521934986115, + "learning_rate": 0.0007931879781567699, + "loss": 2.7841, + "step": 10373 + }, + { + "epoch": 0.3076238768792812, + "grad_norm": 0.13880079984664917, + "learning_rate": 0.0007931498637438279, + "loss": 2.7694, + "step": 10374 + }, + { + "epoch": 0.3076535302315927, + "grad_norm": 0.1591419279575348, + "learning_rate": 0.0007931117467349934, + "loss": 2.7727, + "step": 10375 + }, + { + "epoch": 0.3076831835839042, + "grad_norm": 0.1295151561498642, + "learning_rate": 0.000793073627130604, + "loss": 2.7606, + "step": 10376 + }, + { + "epoch": 0.30771283693621565, + "grad_norm": 0.13166183233261108, + "learning_rate": 0.0007930355049309975, + "loss": 2.7671, + "step": 10377 + }, + { + "epoch": 0.3077424902885271, + "grad_norm": 0.12579600512981415, + "learning_rate": 0.0007929973801365113, + "loss": 2.7542, + "step": 10378 + }, + { + "epoch": 0.3077721436408386, + "grad_norm": 0.113739974796772, + "learning_rate": 0.000792959252747483, + "loss": 2.7582, + "step": 10379 + }, + { + "epoch": 0.3078017969931501, + "grad_norm": 0.13542640209197998, + "learning_rate": 0.0007929211227642501, + "loss": 2.7361, + "step": 10380 + }, + { + "epoch": 0.30783145034546155, + "grad_norm": 0.1566009521484375, + "learning_rate": 0.0007928829901871503, + "loss": 2.7829, + "step": 10381 + }, + { + "epoch": 0.30786110369777303, + "grad_norm": 0.13874435424804688, + "learning_rate": 0.0007928448550165216, + "loss": 2.7637, + "step": 10382 + }, + { + "epoch": 0.3078907570500845, + "grad_norm": 0.1371157467365265, + "learning_rate": 0.0007928067172527013, + "loss": 2.7125, + "step": 10383 + }, + { + "epoch": 0.307920410402396, + "grad_norm": 0.1318536102771759, + "learning_rate": 0.0007927685768960274, + "loss": 2.7525, + "step": 10384 + }, + { + "epoch": 0.30795006375470746, + "grad_norm": 0.1475839912891388, + "learning_rate": 0.0007927304339468373, + "loss": 2.7599, + "step": 10385 + }, + { + "epoch": 0.30797971710701894, + "grad_norm": 0.167664036154747, + "learning_rate": 0.000792692288405469, + "loss": 2.7963, + "step": 10386 + }, + { + "epoch": 0.3080093704593304, + "grad_norm": 0.16658316552639008, + "learning_rate": 0.0007926541402722603, + "loss": 2.764, + "step": 10387 + }, + { + "epoch": 0.3080390238116419, + "grad_norm": 0.15414927899837494, + "learning_rate": 0.0007926159895475491, + "loss": 2.7527, + "step": 10388 + }, + { + "epoch": 0.30806867716395336, + "grad_norm": 0.16084086894989014, + "learning_rate": 0.0007925778362316728, + "loss": 2.7588, + "step": 10389 + }, + { + "epoch": 0.30809833051626484, + "grad_norm": 0.14351503551006317, + "learning_rate": 0.0007925396803249697, + "loss": 2.7655, + "step": 10390 + }, + { + "epoch": 0.3081279838685763, + "grad_norm": 0.12813663482666016, + "learning_rate": 0.0007925015218277774, + "loss": 2.7717, + "step": 10391 + }, + { + "epoch": 0.30815763722088785, + "grad_norm": 0.13175521790981293, + "learning_rate": 0.0007924633607404341, + "loss": 2.7451, + "step": 10392 + }, + { + "epoch": 0.3081872905731993, + "grad_norm": 0.159021258354187, + "learning_rate": 0.0007924251970632774, + "loss": 2.7353, + "step": 10393 + }, + { + "epoch": 0.3082169439255108, + "grad_norm": 0.14919476211071014, + "learning_rate": 0.0007923870307966456, + "loss": 2.7183, + "step": 10394 + }, + { + "epoch": 0.3082465972778223, + "grad_norm": 0.13288278877735138, + "learning_rate": 0.0007923488619408762, + "loss": 2.7627, + "step": 10395 + }, + { + "epoch": 0.30827625063013375, + "grad_norm": 0.13309288024902344, + "learning_rate": 0.0007923106904963075, + "loss": 2.7385, + "step": 10396 + }, + { + "epoch": 0.30830590398244523, + "grad_norm": 0.1582489311695099, + "learning_rate": 0.0007922725164632775, + "loss": 2.7458, + "step": 10397 + }, + { + "epoch": 0.3083355573347567, + "grad_norm": 0.13574403524398804, + "learning_rate": 0.0007922343398421241, + "loss": 2.779, + "step": 10398 + }, + { + "epoch": 0.3083652106870682, + "grad_norm": 0.14240670204162598, + "learning_rate": 0.0007921961606331858, + "loss": 2.7575, + "step": 10399 + }, + { + "epoch": 0.30839486403937966, + "grad_norm": 0.14434589445590973, + "learning_rate": 0.0007921579788368001, + "loss": 2.7634, + "step": 10400 + }, + { + "epoch": 0.30842451739169113, + "grad_norm": 0.15746669471263885, + "learning_rate": 0.0007921197944533052, + "loss": 2.7521, + "step": 10401 + }, + { + "epoch": 0.3084541707440026, + "grad_norm": 0.1266583800315857, + "learning_rate": 0.0007920816074830395, + "loss": 2.7323, + "step": 10402 + }, + { + "epoch": 0.3084838240963141, + "grad_norm": 0.15549160540103912, + "learning_rate": 0.0007920434179263412, + "loss": 2.7625, + "step": 10403 + }, + { + "epoch": 0.30851347744862556, + "grad_norm": 0.13058696687221527, + "learning_rate": 0.0007920052257835481, + "loss": 2.7397, + "step": 10404 + }, + { + "epoch": 0.30854313080093704, + "grad_norm": 0.13562844693660736, + "learning_rate": 0.0007919670310549987, + "loss": 2.7781, + "step": 10405 + }, + { + "epoch": 0.3085727841532485, + "grad_norm": 0.1454116702079773, + "learning_rate": 0.0007919288337410311, + "loss": 2.744, + "step": 10406 + }, + { + "epoch": 0.30860243750556, + "grad_norm": 0.1307656615972519, + "learning_rate": 0.0007918906338419835, + "loss": 2.7728, + "step": 10407 + }, + { + "epoch": 0.30863209085787147, + "grad_norm": 0.11288585513830185, + "learning_rate": 0.0007918524313581943, + "loss": 2.7502, + "step": 10408 + }, + { + "epoch": 0.30866174421018294, + "grad_norm": 0.12945865094661713, + "learning_rate": 0.0007918142262900017, + "loss": 2.766, + "step": 10409 + }, + { + "epoch": 0.3086913975624944, + "grad_norm": 0.12858353555202484, + "learning_rate": 0.000791776018637744, + "loss": 2.738, + "step": 10410 + }, + { + "epoch": 0.3087210509148059, + "grad_norm": 0.14684619009494781, + "learning_rate": 0.0007917378084017596, + "loss": 2.7475, + "step": 10411 + }, + { + "epoch": 0.30875070426711737, + "grad_norm": 0.13457152247428894, + "learning_rate": 0.0007916995955823869, + "loss": 2.78, + "step": 10412 + }, + { + "epoch": 0.3087803576194289, + "grad_norm": 0.11978582292795181, + "learning_rate": 0.0007916613801799643, + "loss": 2.7732, + "step": 10413 + }, + { + "epoch": 0.3088100109717404, + "grad_norm": 0.12777310609817505, + "learning_rate": 0.00079162316219483, + "loss": 2.7649, + "step": 10414 + }, + { + "epoch": 0.30883966432405185, + "grad_norm": 0.1265532225370407, + "learning_rate": 0.0007915849416273225, + "loss": 2.7233, + "step": 10415 + }, + { + "epoch": 0.30886931767636333, + "grad_norm": 0.13099487125873566, + "learning_rate": 0.0007915467184777803, + "loss": 2.7799, + "step": 10416 + }, + { + "epoch": 0.3088989710286748, + "grad_norm": 0.12786565721035004, + "learning_rate": 0.0007915084927465419, + "loss": 2.7458, + "step": 10417 + }, + { + "epoch": 0.3089286243809863, + "grad_norm": 0.12617149949073792, + "learning_rate": 0.0007914702644339457, + "loss": 2.7977, + "step": 10418 + }, + { + "epoch": 0.30895827773329776, + "grad_norm": 0.1279786080121994, + "learning_rate": 0.0007914320335403304, + "loss": 2.7699, + "step": 10419 + }, + { + "epoch": 0.30898793108560924, + "grad_norm": 0.12440627813339233, + "learning_rate": 0.0007913938000660343, + "loss": 2.7346, + "step": 10420 + }, + { + "epoch": 0.3090175844379207, + "grad_norm": 0.11379311233758926, + "learning_rate": 0.0007913555640113961, + "loss": 2.7452, + "step": 10421 + }, + { + "epoch": 0.3090472377902322, + "grad_norm": 0.11010633409023285, + "learning_rate": 0.0007913173253767543, + "loss": 2.7832, + "step": 10422 + }, + { + "epoch": 0.30907689114254366, + "grad_norm": 0.12881693243980408, + "learning_rate": 0.0007912790841624477, + "loss": 2.7622, + "step": 10423 + }, + { + "epoch": 0.30910654449485514, + "grad_norm": 0.13640761375427246, + "learning_rate": 0.0007912408403688149, + "loss": 2.7622, + "step": 10424 + }, + { + "epoch": 0.3091361978471666, + "grad_norm": 0.1272430717945099, + "learning_rate": 0.0007912025939961943, + "loss": 2.757, + "step": 10425 + }, + { + "epoch": 0.3091658511994781, + "grad_norm": 0.12392377108335495, + "learning_rate": 0.0007911643450449248, + "loss": 2.7569, + "step": 10426 + }, + { + "epoch": 0.30919550455178957, + "grad_norm": 0.1252516359090805, + "learning_rate": 0.0007911260935153451, + "loss": 2.7498, + "step": 10427 + }, + { + "epoch": 0.30922515790410104, + "grad_norm": 0.138655886054039, + "learning_rate": 0.0007910878394077938, + "loss": 2.7704, + "step": 10428 + }, + { + "epoch": 0.3092548112564125, + "grad_norm": 0.16704179346561432, + "learning_rate": 0.0007910495827226097, + "loss": 2.7501, + "step": 10429 + }, + { + "epoch": 0.309284464608724, + "grad_norm": 0.16936863958835602, + "learning_rate": 0.0007910113234601317, + "loss": 2.7621, + "step": 10430 + }, + { + "epoch": 0.3093141179610355, + "grad_norm": 0.17370744049549103, + "learning_rate": 0.0007909730616206983, + "loss": 2.7394, + "step": 10431 + }, + { + "epoch": 0.30934377131334695, + "grad_norm": 0.1876881867647171, + "learning_rate": 0.0007909347972046486, + "loss": 2.7639, + "step": 10432 + }, + { + "epoch": 0.3093734246656584, + "grad_norm": 0.16655124723911285, + "learning_rate": 0.0007908965302123214, + "loss": 2.7832, + "step": 10433 + }, + { + "epoch": 0.30940307801796996, + "grad_norm": 0.17737504839897156, + "learning_rate": 0.0007908582606440555, + "loss": 2.7755, + "step": 10434 + }, + { + "epoch": 0.30943273137028143, + "grad_norm": 0.18064382672309875, + "learning_rate": 0.0007908199885001897, + "loss": 2.7814, + "step": 10435 + }, + { + "epoch": 0.3094623847225929, + "grad_norm": 0.2042022943496704, + "learning_rate": 0.0007907817137810629, + "loss": 2.7521, + "step": 10436 + }, + { + "epoch": 0.3094920380749044, + "grad_norm": 0.1665320098400116, + "learning_rate": 0.0007907434364870142, + "loss": 2.7249, + "step": 10437 + }, + { + "epoch": 0.30952169142721586, + "grad_norm": 0.1492040455341339, + "learning_rate": 0.0007907051566183825, + "loss": 2.7485, + "step": 10438 + }, + { + "epoch": 0.30955134477952734, + "grad_norm": 0.1376655101776123, + "learning_rate": 0.0007906668741755066, + "loss": 2.7679, + "step": 10439 + }, + { + "epoch": 0.3095809981318388, + "grad_norm": 0.12731856107711792, + "learning_rate": 0.0007906285891587259, + "loss": 2.7543, + "step": 10440 + }, + { + "epoch": 0.3096106514841503, + "grad_norm": 0.12392235547304153, + "learning_rate": 0.0007905903015683789, + "loss": 2.7542, + "step": 10441 + }, + { + "epoch": 0.30964030483646177, + "grad_norm": 0.11885065585374832, + "learning_rate": 0.0007905520114048051, + "loss": 2.7371, + "step": 10442 + }, + { + "epoch": 0.30966995818877324, + "grad_norm": 0.1069028452038765, + "learning_rate": 0.0007905137186683431, + "loss": 2.7662, + "step": 10443 + }, + { + "epoch": 0.3096996115410847, + "grad_norm": 0.10715138912200928, + "learning_rate": 0.0007904754233593325, + "loss": 2.7197, + "step": 10444 + }, + { + "epoch": 0.3097292648933962, + "grad_norm": 0.09907443076372147, + "learning_rate": 0.0007904371254781121, + "loss": 2.7495, + "step": 10445 + }, + { + "epoch": 0.30975891824570767, + "grad_norm": 0.11955969780683517, + "learning_rate": 0.0007903988250250212, + "loss": 2.7483, + "step": 10446 + }, + { + "epoch": 0.30978857159801915, + "grad_norm": 0.11998266726732254, + "learning_rate": 0.0007903605220003986, + "loss": 2.7329, + "step": 10447 + }, + { + "epoch": 0.3098182249503306, + "grad_norm": 0.1018722727894783, + "learning_rate": 0.0007903222164045838, + "loss": 2.7231, + "step": 10448 + }, + { + "epoch": 0.3098478783026421, + "grad_norm": 0.10219525545835495, + "learning_rate": 0.0007902839082379161, + "loss": 2.7477, + "step": 10449 + }, + { + "epoch": 0.3098775316549536, + "grad_norm": 0.12371411174535751, + "learning_rate": 0.0007902455975007344, + "loss": 2.7664, + "step": 10450 + }, + { + "epoch": 0.30990718500726505, + "grad_norm": 0.12580989301204681, + "learning_rate": 0.0007902072841933783, + "loss": 2.7675, + "step": 10451 + }, + { + "epoch": 0.3099368383595765, + "grad_norm": 0.1158287525177002, + "learning_rate": 0.0007901689683161868, + "loss": 2.7246, + "step": 10452 + }, + { + "epoch": 0.309966491711888, + "grad_norm": 0.12339465320110321, + "learning_rate": 0.0007901306498694993, + "loss": 2.758, + "step": 10453 + }, + { + "epoch": 0.30999614506419954, + "grad_norm": 0.14919285476207733, + "learning_rate": 0.000790092328853655, + "loss": 2.754, + "step": 10454 + }, + { + "epoch": 0.310025798416511, + "grad_norm": 0.17500323057174683, + "learning_rate": 0.0007900540052689932, + "loss": 2.7378, + "step": 10455 + }, + { + "epoch": 0.3100554517688225, + "grad_norm": 0.1731070578098297, + "learning_rate": 0.0007900156791158538, + "loss": 2.7659, + "step": 10456 + }, + { + "epoch": 0.31008510512113396, + "grad_norm": 0.1582673192024231, + "learning_rate": 0.0007899773503945755, + "loss": 2.774, + "step": 10457 + }, + { + "epoch": 0.31011475847344544, + "grad_norm": 0.13051725924015045, + "learning_rate": 0.000789939019105498, + "loss": 2.7269, + "step": 10458 + }, + { + "epoch": 0.3101444118257569, + "grad_norm": 0.13726869225502014, + "learning_rate": 0.0007899006852489609, + "loss": 2.7777, + "step": 10459 + }, + { + "epoch": 0.3101740651780684, + "grad_norm": 0.12750735878944397, + "learning_rate": 0.0007898623488253033, + "loss": 2.758, + "step": 10460 + }, + { + "epoch": 0.31020371853037987, + "grad_norm": 0.1314094513654709, + "learning_rate": 0.0007898240098348649, + "loss": 2.7314, + "step": 10461 + }, + { + "epoch": 0.31023337188269134, + "grad_norm": 0.13039188086986542, + "learning_rate": 0.0007897856682779851, + "loss": 2.7056, + "step": 10462 + }, + { + "epoch": 0.3102630252350028, + "grad_norm": 0.13462355732917786, + "learning_rate": 0.0007897473241550036, + "loss": 2.7534, + "step": 10463 + }, + { + "epoch": 0.3102926785873143, + "grad_norm": 0.12065165489912033, + "learning_rate": 0.0007897089774662597, + "loss": 2.7212, + "step": 10464 + }, + { + "epoch": 0.3103223319396258, + "grad_norm": 0.13034972548484802, + "learning_rate": 0.0007896706282120932, + "loss": 2.7754, + "step": 10465 + }, + { + "epoch": 0.31035198529193725, + "grad_norm": 0.14678223431110382, + "learning_rate": 0.0007896322763928434, + "loss": 2.7592, + "step": 10466 + }, + { + "epoch": 0.3103816386442487, + "grad_norm": 0.14490433037281036, + "learning_rate": 0.00078959392200885, + "loss": 2.7408, + "step": 10467 + }, + { + "epoch": 0.3104112919965602, + "grad_norm": 0.14575132727622986, + "learning_rate": 0.0007895555650604529, + "loss": 2.7446, + "step": 10468 + }, + { + "epoch": 0.3104409453488717, + "grad_norm": 0.13945665955543518, + "learning_rate": 0.0007895172055479916, + "loss": 2.7958, + "step": 10469 + }, + { + "epoch": 0.31047059870118315, + "grad_norm": 0.12792156636714935, + "learning_rate": 0.0007894788434718057, + "loss": 2.7123, + "step": 10470 + }, + { + "epoch": 0.31050025205349463, + "grad_norm": 0.12920057773590088, + "learning_rate": 0.0007894404788322349, + "loss": 2.7561, + "step": 10471 + }, + { + "epoch": 0.3105299054058061, + "grad_norm": 0.12852275371551514, + "learning_rate": 0.000789402111629619, + "loss": 2.7808, + "step": 10472 + }, + { + "epoch": 0.3105595587581176, + "grad_norm": 0.1344766914844513, + "learning_rate": 0.0007893637418642976, + "loss": 2.7129, + "step": 10473 + }, + { + "epoch": 0.31058921211042906, + "grad_norm": 0.13527420163154602, + "learning_rate": 0.0007893253695366107, + "loss": 2.7324, + "step": 10474 + }, + { + "epoch": 0.3106188654627406, + "grad_norm": 0.1496642380952835, + "learning_rate": 0.0007892869946468981, + "loss": 2.7085, + "step": 10475 + }, + { + "epoch": 0.31064851881505207, + "grad_norm": 0.1375284343957901, + "learning_rate": 0.0007892486171954995, + "loss": 2.7871, + "step": 10476 + }, + { + "epoch": 0.31067817216736354, + "grad_norm": 0.14679744839668274, + "learning_rate": 0.0007892102371827545, + "loss": 2.7587, + "step": 10477 + }, + { + "epoch": 0.310707825519675, + "grad_norm": 0.15587250888347626, + "learning_rate": 0.0007891718546090035, + "loss": 2.7779, + "step": 10478 + }, + { + "epoch": 0.3107374788719865, + "grad_norm": 0.14334464073181152, + "learning_rate": 0.000789133469474586, + "loss": 2.7487, + "step": 10479 + }, + { + "epoch": 0.31076713222429797, + "grad_norm": 0.1274605691432953, + "learning_rate": 0.0007890950817798419, + "loss": 2.7452, + "step": 10480 + }, + { + "epoch": 0.31079678557660945, + "grad_norm": 0.15621992945671082, + "learning_rate": 0.0007890566915251113, + "loss": 2.7959, + "step": 10481 + }, + { + "epoch": 0.3108264389289209, + "grad_norm": 0.14794999361038208, + "learning_rate": 0.000789018298710734, + "loss": 2.749, + "step": 10482 + }, + { + "epoch": 0.3108560922812324, + "grad_norm": 0.12949195504188538, + "learning_rate": 0.0007889799033370502, + "loss": 2.7603, + "step": 10483 + }, + { + "epoch": 0.3108857456335439, + "grad_norm": 0.14213694632053375, + "learning_rate": 0.0007889415054043997, + "loss": 2.7597, + "step": 10484 + }, + { + "epoch": 0.31091539898585535, + "grad_norm": 0.11271162331104279, + "learning_rate": 0.0007889031049131225, + "loss": 2.7567, + "step": 10485 + }, + { + "epoch": 0.3109450523381668, + "grad_norm": 0.12924115359783173, + "learning_rate": 0.0007888647018635588, + "loss": 2.7508, + "step": 10486 + }, + { + "epoch": 0.3109747056904783, + "grad_norm": 0.131354421377182, + "learning_rate": 0.0007888262962560486, + "loss": 2.7774, + "step": 10487 + }, + { + "epoch": 0.3110043590427898, + "grad_norm": 0.14489127695560455, + "learning_rate": 0.0007887878880909318, + "loss": 2.7292, + "step": 10488 + }, + { + "epoch": 0.31103401239510126, + "grad_norm": 0.1236901581287384, + "learning_rate": 0.0007887494773685488, + "loss": 2.7666, + "step": 10489 + }, + { + "epoch": 0.31106366574741273, + "grad_norm": 0.14798873662948608, + "learning_rate": 0.0007887110640892398, + "loss": 2.7346, + "step": 10490 + }, + { + "epoch": 0.3110933190997242, + "grad_norm": 0.15629972517490387, + "learning_rate": 0.0007886726482533445, + "loss": 2.7484, + "step": 10491 + }, + { + "epoch": 0.3111229724520357, + "grad_norm": 0.11875274032354355, + "learning_rate": 0.0007886342298612035, + "loss": 2.743, + "step": 10492 + }, + { + "epoch": 0.31115262580434716, + "grad_norm": 0.13243719935417175, + "learning_rate": 0.0007885958089131566, + "loss": 2.7938, + "step": 10493 + }, + { + "epoch": 0.31118227915665864, + "grad_norm": 0.18486225605010986, + "learning_rate": 0.0007885573854095447, + "loss": 2.7666, + "step": 10494 + }, + { + "epoch": 0.3112119325089701, + "grad_norm": 0.18991324305534363, + "learning_rate": 0.0007885189593507074, + "loss": 2.7681, + "step": 10495 + }, + { + "epoch": 0.31124158586128164, + "grad_norm": 0.14867381751537323, + "learning_rate": 0.0007884805307369851, + "loss": 2.7452, + "step": 10496 + }, + { + "epoch": 0.3112712392135931, + "grad_norm": 0.15682728588581085, + "learning_rate": 0.0007884420995687183, + "loss": 2.7501, + "step": 10497 + }, + { + "epoch": 0.3113008925659046, + "grad_norm": 0.1306236833333969, + "learning_rate": 0.0007884036658462472, + "loss": 2.7424, + "step": 10498 + }, + { + "epoch": 0.3113305459182161, + "grad_norm": 0.1299743801355362, + "learning_rate": 0.000788365229569912, + "loss": 2.7499, + "step": 10499 + }, + { + "epoch": 0.31136019927052755, + "grad_norm": 0.14544086158275604, + "learning_rate": 0.0007883267907400533, + "loss": 2.7469, + "step": 10500 + }, + { + "epoch": 0.311389852622839, + "grad_norm": 0.13079893589019775, + "learning_rate": 0.0007882883493570115, + "loss": 2.7577, + "step": 10501 + }, + { + "epoch": 0.3114195059751505, + "grad_norm": 0.13424761593341827, + "learning_rate": 0.0007882499054211267, + "loss": 2.7632, + "step": 10502 + }, + { + "epoch": 0.311449159327462, + "grad_norm": 0.13993491232395172, + "learning_rate": 0.0007882114589327396, + "loss": 2.7513, + "step": 10503 + }, + { + "epoch": 0.31147881267977345, + "grad_norm": 0.14282922446727753, + "learning_rate": 0.0007881730098921905, + "loss": 2.762, + "step": 10504 + }, + { + "epoch": 0.31150846603208493, + "grad_norm": 0.12193500995635986, + "learning_rate": 0.0007881345582998198, + "loss": 2.7401, + "step": 10505 + }, + { + "epoch": 0.3115381193843964, + "grad_norm": 0.12208286672830582, + "learning_rate": 0.0007880961041559683, + "loss": 2.7573, + "step": 10506 + }, + { + "epoch": 0.3115677727367079, + "grad_norm": 0.12092237919569016, + "learning_rate": 0.0007880576474609764, + "loss": 2.7528, + "step": 10507 + }, + { + "epoch": 0.31159742608901936, + "grad_norm": 0.13116781413555145, + "learning_rate": 0.0007880191882151846, + "loss": 2.7089, + "step": 10508 + }, + { + "epoch": 0.31162707944133083, + "grad_norm": 0.15294377505779266, + "learning_rate": 0.0007879807264189333, + "loss": 2.763, + "step": 10509 + }, + { + "epoch": 0.3116567327936423, + "grad_norm": 0.15112166106700897, + "learning_rate": 0.0007879422620725633, + "loss": 2.7881, + "step": 10510 + }, + { + "epoch": 0.3116863861459538, + "grad_norm": 0.14777429401874542, + "learning_rate": 0.0007879037951764152, + "loss": 2.7243, + "step": 10511 + }, + { + "epoch": 0.31171603949826526, + "grad_norm": 0.15295062959194183, + "learning_rate": 0.0007878653257308295, + "loss": 2.7689, + "step": 10512 + }, + { + "epoch": 0.31174569285057674, + "grad_norm": 0.17854043841362, + "learning_rate": 0.0007878268537361469, + "loss": 2.7134, + "step": 10513 + }, + { + "epoch": 0.3117753462028882, + "grad_norm": 0.1697879433631897, + "learning_rate": 0.0007877883791927081, + "loss": 2.7619, + "step": 10514 + }, + { + "epoch": 0.3118049995551997, + "grad_norm": 0.14917320013046265, + "learning_rate": 0.000787749902100854, + "loss": 2.7619, + "step": 10515 + }, + { + "epoch": 0.31183465290751117, + "grad_norm": 0.13883933424949646, + "learning_rate": 0.0007877114224609249, + "loss": 2.7728, + "step": 10516 + }, + { + "epoch": 0.3118643062598227, + "grad_norm": 0.13788829743862152, + "learning_rate": 0.0007876729402732618, + "loss": 2.7124, + "step": 10517 + }, + { + "epoch": 0.3118939596121342, + "grad_norm": 0.12804248929023743, + "learning_rate": 0.0007876344555382055, + "loss": 2.7584, + "step": 10518 + }, + { + "epoch": 0.31192361296444565, + "grad_norm": 0.12736430764198303, + "learning_rate": 0.0007875959682560968, + "loss": 2.7503, + "step": 10519 + }, + { + "epoch": 0.3119532663167571, + "grad_norm": 0.12012766301631927, + "learning_rate": 0.0007875574784272763, + "loss": 2.736, + "step": 10520 + }, + { + "epoch": 0.3119829196690686, + "grad_norm": 0.10594359785318375, + "learning_rate": 0.000787518986052085, + "loss": 2.7665, + "step": 10521 + }, + { + "epoch": 0.3120125730213801, + "grad_norm": 0.12371047586202621, + "learning_rate": 0.0007874804911308639, + "loss": 2.7636, + "step": 10522 + }, + { + "epoch": 0.31204222637369156, + "grad_norm": 0.12503083050251007, + "learning_rate": 0.0007874419936639536, + "loss": 2.7169, + "step": 10523 + }, + { + "epoch": 0.31207187972600303, + "grad_norm": 0.11639060080051422, + "learning_rate": 0.0007874034936516949, + "loss": 2.7484, + "step": 10524 + }, + { + "epoch": 0.3121015330783145, + "grad_norm": 0.11787469685077667, + "learning_rate": 0.0007873649910944292, + "loss": 2.7656, + "step": 10525 + }, + { + "epoch": 0.312131186430626, + "grad_norm": 0.121648408472538, + "learning_rate": 0.000787326485992497, + "loss": 2.7279, + "step": 10526 + }, + { + "epoch": 0.31216083978293746, + "grad_norm": 0.13150838017463684, + "learning_rate": 0.0007872879783462395, + "loss": 2.7772, + "step": 10527 + }, + { + "epoch": 0.31219049313524894, + "grad_norm": 0.1338718980550766, + "learning_rate": 0.0007872494681559978, + "loss": 2.7213, + "step": 10528 + }, + { + "epoch": 0.3122201464875604, + "grad_norm": 0.1412484496831894, + "learning_rate": 0.0007872109554221125, + "loss": 2.7371, + "step": 10529 + }, + { + "epoch": 0.3122497998398719, + "grad_norm": 0.15727928280830383, + "learning_rate": 0.0007871724401449251, + "loss": 2.7445, + "step": 10530 + }, + { + "epoch": 0.31227945319218336, + "grad_norm": 0.1591072678565979, + "learning_rate": 0.0007871339223247762, + "loss": 2.752, + "step": 10531 + }, + { + "epoch": 0.31230910654449484, + "grad_norm": 0.15636864304542542, + "learning_rate": 0.0007870954019620073, + "loss": 2.7608, + "step": 10532 + }, + { + "epoch": 0.3123387598968063, + "grad_norm": 0.13662514090538025, + "learning_rate": 0.0007870568790569594, + "loss": 2.7377, + "step": 10533 + }, + { + "epoch": 0.3123684132491178, + "grad_norm": 0.12673956155776978, + "learning_rate": 0.0007870183536099736, + "loss": 2.7286, + "step": 10534 + }, + { + "epoch": 0.31239806660142927, + "grad_norm": 0.14005637168884277, + "learning_rate": 0.0007869798256213908, + "loss": 2.7487, + "step": 10535 + }, + { + "epoch": 0.31242771995374075, + "grad_norm": 0.12814942002296448, + "learning_rate": 0.0007869412950915524, + "loss": 2.7844, + "step": 10536 + }, + { + "epoch": 0.3124573733060522, + "grad_norm": 0.11778143793344498, + "learning_rate": 0.0007869027620207998, + "loss": 2.7239, + "step": 10537 + }, + { + "epoch": 0.31248702665836375, + "grad_norm": 0.13168196380138397, + "learning_rate": 0.0007868642264094737, + "loss": 2.7493, + "step": 10538 + }, + { + "epoch": 0.31251668001067523, + "grad_norm": 0.14084580540657043, + "learning_rate": 0.0007868256882579159, + "loss": 2.7569, + "step": 10539 + }, + { + "epoch": 0.3125463333629867, + "grad_norm": 0.13800804316997528, + "learning_rate": 0.0007867871475664673, + "loss": 2.7377, + "step": 10540 + }, + { + "epoch": 0.3125759867152982, + "grad_norm": 0.13279205560684204, + "learning_rate": 0.0007867486043354692, + "loss": 2.7465, + "step": 10541 + }, + { + "epoch": 0.31260564006760966, + "grad_norm": 0.14140978455543518, + "learning_rate": 0.0007867100585652631, + "loss": 2.7522, + "step": 10542 + }, + { + "epoch": 0.31263529341992113, + "grad_norm": 0.1410800665616989, + "learning_rate": 0.0007866715102561902, + "loss": 2.7445, + "step": 10543 + }, + { + "epoch": 0.3126649467722326, + "grad_norm": 0.13848601281642914, + "learning_rate": 0.0007866329594085919, + "loss": 2.74, + "step": 10544 + }, + { + "epoch": 0.3126946001245441, + "grad_norm": 0.13026443123817444, + "learning_rate": 0.0007865944060228094, + "loss": 2.7672, + "step": 10545 + }, + { + "epoch": 0.31272425347685556, + "grad_norm": 0.11640472710132599, + "learning_rate": 0.0007865558500991844, + "loss": 2.7518, + "step": 10546 + }, + { + "epoch": 0.31275390682916704, + "grad_norm": 0.11911450326442719, + "learning_rate": 0.000786517291638058, + "loss": 2.7421, + "step": 10547 + }, + { + "epoch": 0.3127835601814785, + "grad_norm": 0.124411940574646, + "learning_rate": 0.0007864787306397721, + "loss": 2.7443, + "step": 10548 + }, + { + "epoch": 0.31281321353379, + "grad_norm": 0.11570625007152557, + "learning_rate": 0.0007864401671046676, + "loss": 2.7662, + "step": 10549 + }, + { + "epoch": 0.31284286688610147, + "grad_norm": 0.1132110133767128, + "learning_rate": 0.0007864016010330863, + "loss": 2.794, + "step": 10550 + }, + { + "epoch": 0.31287252023841294, + "grad_norm": 0.132551372051239, + "learning_rate": 0.0007863630324253697, + "loss": 2.75, + "step": 10551 + }, + { + "epoch": 0.3129021735907244, + "grad_norm": 0.1643187552690506, + "learning_rate": 0.0007863244612818592, + "loss": 2.6984, + "step": 10552 + }, + { + "epoch": 0.3129318269430359, + "grad_norm": 0.18010199069976807, + "learning_rate": 0.0007862858876028965, + "loss": 2.7537, + "step": 10553 + }, + { + "epoch": 0.31296148029534737, + "grad_norm": 0.19148148596286774, + "learning_rate": 0.0007862473113888233, + "loss": 2.7209, + "step": 10554 + }, + { + "epoch": 0.31299113364765885, + "grad_norm": 0.1513475924730301, + "learning_rate": 0.0007862087326399808, + "loss": 2.7785, + "step": 10555 + }, + { + "epoch": 0.3130207869999703, + "grad_norm": 0.15449143946170807, + "learning_rate": 0.000786170151356711, + "loss": 2.7469, + "step": 10556 + }, + { + "epoch": 0.3130504403522818, + "grad_norm": 0.16103194653987885, + "learning_rate": 0.0007861315675393553, + "loss": 2.7683, + "step": 10557 + }, + { + "epoch": 0.31308009370459333, + "grad_norm": 0.1508917212486267, + "learning_rate": 0.0007860929811882554, + "loss": 2.7673, + "step": 10558 + }, + { + "epoch": 0.3131097470569048, + "grad_norm": 0.14557337760925293, + "learning_rate": 0.0007860543923037531, + "loss": 2.7813, + "step": 10559 + }, + { + "epoch": 0.3131394004092163, + "grad_norm": 0.15165114402770996, + "learning_rate": 0.0007860158008861901, + "loss": 2.746, + "step": 10560 + }, + { + "epoch": 0.31316905376152776, + "grad_norm": 0.16156005859375, + "learning_rate": 0.0007859772069359081, + "loss": 2.7293, + "step": 10561 + }, + { + "epoch": 0.31319870711383924, + "grad_norm": 0.15694956481456757, + "learning_rate": 0.0007859386104532486, + "loss": 2.7831, + "step": 10562 + }, + { + "epoch": 0.3132283604661507, + "grad_norm": 0.14167265594005585, + "learning_rate": 0.000785900011438554, + "loss": 2.7402, + "step": 10563 + }, + { + "epoch": 0.3132580138184622, + "grad_norm": 0.1141243502497673, + "learning_rate": 0.0007858614098921655, + "loss": 2.7433, + "step": 10564 + }, + { + "epoch": 0.31328766717077366, + "grad_norm": 0.14021331071853638, + "learning_rate": 0.0007858228058144252, + "loss": 2.7494, + "step": 10565 + }, + { + "epoch": 0.31331732052308514, + "grad_norm": 0.13222777843475342, + "learning_rate": 0.0007857841992056749, + "loss": 2.7341, + "step": 10566 + }, + { + "epoch": 0.3133469738753966, + "grad_norm": 0.11572054028511047, + "learning_rate": 0.0007857455900662564, + "loss": 2.7285, + "step": 10567 + }, + { + "epoch": 0.3133766272277081, + "grad_norm": 0.14769437909126282, + "learning_rate": 0.0007857069783965119, + "loss": 2.7484, + "step": 10568 + }, + { + "epoch": 0.31340628058001957, + "grad_norm": 0.14692284166812897, + "learning_rate": 0.0007856683641967828, + "loss": 2.7356, + "step": 10569 + }, + { + "epoch": 0.31343593393233105, + "grad_norm": 0.14682641625404358, + "learning_rate": 0.0007856297474674112, + "loss": 2.7536, + "step": 10570 + }, + { + "epoch": 0.3134655872846425, + "grad_norm": 0.13328619301319122, + "learning_rate": 0.0007855911282087394, + "loss": 2.7502, + "step": 10571 + }, + { + "epoch": 0.313495240636954, + "grad_norm": 0.10845541208982468, + "learning_rate": 0.0007855525064211091, + "loss": 2.7449, + "step": 10572 + }, + { + "epoch": 0.3135248939892655, + "grad_norm": 0.10894706100225449, + "learning_rate": 0.0007855138821048623, + "loss": 2.7439, + "step": 10573 + }, + { + "epoch": 0.31355454734157695, + "grad_norm": 0.11133959889411926, + "learning_rate": 0.0007854752552603411, + "loss": 2.7373, + "step": 10574 + }, + { + "epoch": 0.3135842006938884, + "grad_norm": 0.11211752891540527, + "learning_rate": 0.0007854366258878874, + "loss": 2.7476, + "step": 10575 + }, + { + "epoch": 0.3136138540461999, + "grad_norm": 0.10693582147359848, + "learning_rate": 0.0007853979939878435, + "loss": 2.7626, + "step": 10576 + }, + { + "epoch": 0.3136435073985114, + "grad_norm": 0.11926605552434921, + "learning_rate": 0.0007853593595605513, + "loss": 2.738, + "step": 10577 + }, + { + "epoch": 0.31367316075082285, + "grad_norm": 0.13393911719322205, + "learning_rate": 0.0007853207226063531, + "loss": 2.7977, + "step": 10578 + }, + { + "epoch": 0.3137028141031344, + "grad_norm": 0.1329624205827713, + "learning_rate": 0.0007852820831255909, + "loss": 2.72, + "step": 10579 + }, + { + "epoch": 0.31373246745544586, + "grad_norm": 0.14860396087169647, + "learning_rate": 0.0007852434411186068, + "loss": 2.7756, + "step": 10580 + }, + { + "epoch": 0.31376212080775734, + "grad_norm": 0.1417611837387085, + "learning_rate": 0.0007852047965857432, + "loss": 2.7273, + "step": 10581 + }, + { + "epoch": 0.3137917741600688, + "grad_norm": 0.14901208877563477, + "learning_rate": 0.0007851661495273421, + "loss": 2.7603, + "step": 10582 + }, + { + "epoch": 0.3138214275123803, + "grad_norm": 0.15592753887176514, + "learning_rate": 0.0007851274999437456, + "loss": 2.7279, + "step": 10583 + }, + { + "epoch": 0.31385108086469177, + "grad_norm": 0.1458013653755188, + "learning_rate": 0.0007850888478352964, + "loss": 2.7373, + "step": 10584 + }, + { + "epoch": 0.31388073421700324, + "grad_norm": 0.1251181662082672, + "learning_rate": 0.0007850501932023364, + "loss": 2.763, + "step": 10585 + }, + { + "epoch": 0.3139103875693147, + "grad_norm": 0.1440366953611374, + "learning_rate": 0.0007850115360452082, + "loss": 2.7682, + "step": 10586 + }, + { + "epoch": 0.3139400409216262, + "grad_norm": 0.1499941051006317, + "learning_rate": 0.0007849728763642537, + "loss": 2.775, + "step": 10587 + }, + { + "epoch": 0.31396969427393767, + "grad_norm": 0.13396281003952026, + "learning_rate": 0.0007849342141598156, + "loss": 2.7525, + "step": 10588 + }, + { + "epoch": 0.31399934762624915, + "grad_norm": 0.14159296452999115, + "learning_rate": 0.0007848955494322361, + "loss": 2.7682, + "step": 10589 + }, + { + "epoch": 0.3140290009785606, + "grad_norm": 0.13517943024635315, + "learning_rate": 0.0007848568821818576, + "loss": 2.7755, + "step": 10590 + }, + { + "epoch": 0.3140586543308721, + "grad_norm": 0.13067595660686493, + "learning_rate": 0.0007848182124090224, + "loss": 2.7682, + "step": 10591 + }, + { + "epoch": 0.3140883076831836, + "grad_norm": 0.12315063923597336, + "learning_rate": 0.0007847795401140731, + "loss": 2.7207, + "step": 10592 + }, + { + "epoch": 0.31411796103549505, + "grad_norm": 0.12125451117753983, + "learning_rate": 0.0007847408652973522, + "loss": 2.7523, + "step": 10593 + }, + { + "epoch": 0.31414761438780653, + "grad_norm": 0.13858333230018616, + "learning_rate": 0.000784702187959202, + "loss": 2.7549, + "step": 10594 + }, + { + "epoch": 0.314177267740118, + "grad_norm": 0.14985184371471405, + "learning_rate": 0.0007846635080999651, + "loss": 2.7616, + "step": 10595 + }, + { + "epoch": 0.3142069210924295, + "grad_norm": 0.14513280987739563, + "learning_rate": 0.0007846248257199839, + "loss": 2.7458, + "step": 10596 + }, + { + "epoch": 0.31423657444474096, + "grad_norm": 0.14069467782974243, + "learning_rate": 0.0007845861408196009, + "loss": 2.7664, + "step": 10597 + }, + { + "epoch": 0.31426622779705243, + "grad_norm": 0.14500631392002106, + "learning_rate": 0.0007845474533991591, + "loss": 2.7471, + "step": 10598 + }, + { + "epoch": 0.3142958811493639, + "grad_norm": 0.1405453234910965, + "learning_rate": 0.0007845087634590005, + "loss": 2.7516, + "step": 10599 + }, + { + "epoch": 0.31432553450167544, + "grad_norm": 0.133841872215271, + "learning_rate": 0.0007844700709994681, + "loss": 2.7763, + "step": 10600 + }, + { + "epoch": 0.3143551878539869, + "grad_norm": 0.13846223056316376, + "learning_rate": 0.0007844313760209043, + "loss": 2.7263, + "step": 10601 + }, + { + "epoch": 0.3143848412062984, + "grad_norm": 0.11560788005590439, + "learning_rate": 0.0007843926785236519, + "loss": 2.7649, + "step": 10602 + }, + { + "epoch": 0.31441449455860987, + "grad_norm": 0.13331671059131622, + "learning_rate": 0.0007843539785080533, + "loss": 2.7734, + "step": 10603 + }, + { + "epoch": 0.31444414791092135, + "grad_norm": 0.1502983719110489, + "learning_rate": 0.0007843152759744516, + "loss": 2.7702, + "step": 10604 + }, + { + "epoch": 0.3144738012632328, + "grad_norm": 0.1291169673204422, + "learning_rate": 0.0007842765709231894, + "loss": 2.7626, + "step": 10605 + }, + { + "epoch": 0.3145034546155443, + "grad_norm": 0.11983446776866913, + "learning_rate": 0.0007842378633546093, + "loss": 2.7438, + "step": 10606 + }, + { + "epoch": 0.3145331079678558, + "grad_norm": 0.11331596225500107, + "learning_rate": 0.0007841991532690542, + "loss": 2.7465, + "step": 10607 + }, + { + "epoch": 0.31456276132016725, + "grad_norm": 0.12293583899736404, + "learning_rate": 0.0007841604406668667, + "loss": 2.7432, + "step": 10608 + }, + { + "epoch": 0.3145924146724787, + "grad_norm": 0.11715284734964371, + "learning_rate": 0.0007841217255483896, + "loss": 2.7676, + "step": 10609 + }, + { + "epoch": 0.3146220680247902, + "grad_norm": 0.10862971097230911, + "learning_rate": 0.0007840830079139661, + "loss": 2.7691, + "step": 10610 + }, + { + "epoch": 0.3146517213771017, + "grad_norm": 0.10806631296873093, + "learning_rate": 0.0007840442877639389, + "loss": 2.7658, + "step": 10611 + }, + { + "epoch": 0.31468137472941315, + "grad_norm": 0.1149684488773346, + "learning_rate": 0.0007840055650986505, + "loss": 2.7742, + "step": 10612 + }, + { + "epoch": 0.31471102808172463, + "grad_norm": 0.12802067399024963, + "learning_rate": 0.0007839668399184441, + "loss": 2.7259, + "step": 10613 + }, + { + "epoch": 0.3147406814340361, + "grad_norm": 0.14305828511714935, + "learning_rate": 0.0007839281122236628, + "loss": 2.7556, + "step": 10614 + }, + { + "epoch": 0.3147703347863476, + "grad_norm": 0.1504461020231247, + "learning_rate": 0.0007838893820146492, + "loss": 2.7653, + "step": 10615 + }, + { + "epoch": 0.31479998813865906, + "grad_norm": 0.14426453411579132, + "learning_rate": 0.0007838506492917463, + "loss": 2.7731, + "step": 10616 + }, + { + "epoch": 0.31482964149097054, + "grad_norm": 0.15940366685390472, + "learning_rate": 0.0007838119140552973, + "loss": 2.7768, + "step": 10617 + }, + { + "epoch": 0.314859294843282, + "grad_norm": 0.16268481314182281, + "learning_rate": 0.0007837731763056451, + "loss": 2.7446, + "step": 10618 + }, + { + "epoch": 0.3148889481955935, + "grad_norm": 0.16690124571323395, + "learning_rate": 0.0007837344360431325, + "loss": 2.7921, + "step": 10619 + }, + { + "epoch": 0.31491860154790496, + "grad_norm": 0.1662382185459137, + "learning_rate": 0.000783695693268103, + "loss": 2.7417, + "step": 10620 + }, + { + "epoch": 0.3149482549002165, + "grad_norm": 0.17887629568576813, + "learning_rate": 0.0007836569479808994, + "loss": 2.7077, + "step": 10621 + }, + { + "epoch": 0.31497790825252797, + "grad_norm": 0.1571376919746399, + "learning_rate": 0.0007836182001818648, + "loss": 2.7771, + "step": 10622 + }, + { + "epoch": 0.31500756160483945, + "grad_norm": 0.16531887650489807, + "learning_rate": 0.0007835794498713422, + "loss": 2.7654, + "step": 10623 + }, + { + "epoch": 0.3150372149571509, + "grad_norm": 0.14338979125022888, + "learning_rate": 0.0007835406970496751, + "loss": 2.7572, + "step": 10624 + }, + { + "epoch": 0.3150668683094624, + "grad_norm": 0.16315153241157532, + "learning_rate": 0.0007835019417172064, + "loss": 2.7395, + "step": 10625 + }, + { + "epoch": 0.3150965216617739, + "grad_norm": 0.14478440582752228, + "learning_rate": 0.0007834631838742794, + "loss": 2.7474, + "step": 10626 + }, + { + "epoch": 0.31512617501408535, + "grad_norm": 0.12576521933078766, + "learning_rate": 0.0007834244235212373, + "loss": 2.7342, + "step": 10627 + }, + { + "epoch": 0.31515582836639683, + "grad_norm": 0.1386062502861023, + "learning_rate": 0.0007833856606584231, + "loss": 2.7433, + "step": 10628 + }, + { + "epoch": 0.3151854817187083, + "grad_norm": 0.14436008036136627, + "learning_rate": 0.0007833468952861803, + "loss": 2.7697, + "step": 10629 + }, + { + "epoch": 0.3152151350710198, + "grad_norm": 0.12626902759075165, + "learning_rate": 0.0007833081274048521, + "loss": 2.7515, + "step": 10630 + }, + { + "epoch": 0.31524478842333126, + "grad_norm": 0.10596974194049835, + "learning_rate": 0.000783269357014782, + "loss": 2.7546, + "step": 10631 + }, + { + "epoch": 0.31527444177564273, + "grad_norm": 0.11371791362762451, + "learning_rate": 0.0007832305841163128, + "loss": 2.7417, + "step": 10632 + }, + { + "epoch": 0.3153040951279542, + "grad_norm": 0.11666443198919296, + "learning_rate": 0.0007831918087097883, + "loss": 2.7216, + "step": 10633 + }, + { + "epoch": 0.3153337484802657, + "grad_norm": 0.10480658710002899, + "learning_rate": 0.0007831530307955518, + "loss": 2.7665, + "step": 10634 + }, + { + "epoch": 0.31536340183257716, + "grad_norm": 0.12384239584207535, + "learning_rate": 0.0007831142503739465, + "loss": 2.7536, + "step": 10635 + }, + { + "epoch": 0.31539305518488864, + "grad_norm": 0.12016322463750839, + "learning_rate": 0.000783075467445316, + "loss": 2.775, + "step": 10636 + }, + { + "epoch": 0.3154227085372001, + "grad_norm": 0.11909075081348419, + "learning_rate": 0.0007830366820100037, + "loss": 2.7671, + "step": 10637 + }, + { + "epoch": 0.3154523618895116, + "grad_norm": 0.1311125010251999, + "learning_rate": 0.0007829978940683529, + "loss": 2.7165, + "step": 10638 + }, + { + "epoch": 0.31548201524182307, + "grad_norm": 0.12693433463573456, + "learning_rate": 0.0007829591036207071, + "loss": 2.7263, + "step": 10639 + }, + { + "epoch": 0.31551166859413454, + "grad_norm": 0.13025815784931183, + "learning_rate": 0.00078292031066741, + "loss": 2.7783, + "step": 10640 + }, + { + "epoch": 0.315541321946446, + "grad_norm": 0.1368090659379959, + "learning_rate": 0.0007828815152088049, + "loss": 2.7443, + "step": 10641 + }, + { + "epoch": 0.31557097529875755, + "grad_norm": 0.14434579014778137, + "learning_rate": 0.0007828427172452355, + "loss": 2.7216, + "step": 10642 + }, + { + "epoch": 0.315600628651069, + "grad_norm": 0.15359722077846527, + "learning_rate": 0.0007828039167770451, + "loss": 2.7598, + "step": 10643 + }, + { + "epoch": 0.3156302820033805, + "grad_norm": 0.1439627707004547, + "learning_rate": 0.0007827651138045777, + "loss": 2.771, + "step": 10644 + }, + { + "epoch": 0.315659935355692, + "grad_norm": 0.1378818154335022, + "learning_rate": 0.0007827263083281765, + "loss": 2.7522, + "step": 10645 + }, + { + "epoch": 0.31568958870800345, + "grad_norm": 0.1251365840435028, + "learning_rate": 0.0007826875003481855, + "loss": 2.764, + "step": 10646 + }, + { + "epoch": 0.31571924206031493, + "grad_norm": 0.12115974724292755, + "learning_rate": 0.0007826486898649482, + "loss": 2.7498, + "step": 10647 + }, + { + "epoch": 0.3157488954126264, + "grad_norm": 0.13193996250629425, + "learning_rate": 0.0007826098768788079, + "loss": 2.7901, + "step": 10648 + }, + { + "epoch": 0.3157785487649379, + "grad_norm": 0.15646278858184814, + "learning_rate": 0.000782571061390109, + "loss": 2.7713, + "step": 10649 + }, + { + "epoch": 0.31580820211724936, + "grad_norm": 0.15738779306411743, + "learning_rate": 0.0007825322433991946, + "loss": 2.7824, + "step": 10650 + }, + { + "epoch": 0.31583785546956084, + "grad_norm": 0.14477349817752838, + "learning_rate": 0.0007824934229064089, + "loss": 2.7773, + "step": 10651 + }, + { + "epoch": 0.3158675088218723, + "grad_norm": 0.15085001289844513, + "learning_rate": 0.0007824545999120953, + "loss": 2.7677, + "step": 10652 + }, + { + "epoch": 0.3158971621741838, + "grad_norm": 0.15062779188156128, + "learning_rate": 0.0007824157744165979, + "loss": 2.7447, + "step": 10653 + }, + { + "epoch": 0.31592681552649526, + "grad_norm": 0.1551516056060791, + "learning_rate": 0.0007823769464202601, + "loss": 2.7787, + "step": 10654 + }, + { + "epoch": 0.31595646887880674, + "grad_norm": 0.1686614602804184, + "learning_rate": 0.000782338115923426, + "loss": 2.7169, + "step": 10655 + }, + { + "epoch": 0.3159861222311182, + "grad_norm": 0.16598564386367798, + "learning_rate": 0.0007822992829264395, + "loss": 2.7544, + "step": 10656 + }, + { + "epoch": 0.3160157755834297, + "grad_norm": 0.1561289131641388, + "learning_rate": 0.0007822604474296444, + "loss": 2.7328, + "step": 10657 + }, + { + "epoch": 0.31604542893574117, + "grad_norm": 0.15748655796051025, + "learning_rate": 0.0007822216094333848, + "loss": 2.7456, + "step": 10658 + }, + { + "epoch": 0.31607508228805264, + "grad_norm": 0.15510907769203186, + "learning_rate": 0.0007821827689380041, + "loss": 2.7476, + "step": 10659 + }, + { + "epoch": 0.3161047356403641, + "grad_norm": 0.14072179794311523, + "learning_rate": 0.0007821439259438466, + "loss": 2.7608, + "step": 10660 + }, + { + "epoch": 0.3161343889926756, + "grad_norm": 0.14116992056369781, + "learning_rate": 0.0007821050804512562, + "loss": 2.7335, + "step": 10661 + }, + { + "epoch": 0.31616404234498713, + "grad_norm": 0.16437304019927979, + "learning_rate": 0.0007820662324605769, + "loss": 2.7383, + "step": 10662 + }, + { + "epoch": 0.3161936956972986, + "grad_norm": 0.1217183917760849, + "learning_rate": 0.0007820273819721527, + "loss": 2.7475, + "step": 10663 + }, + { + "epoch": 0.3162233490496101, + "grad_norm": 0.12576091289520264, + "learning_rate": 0.0007819885289863278, + "loss": 2.7674, + "step": 10664 + }, + { + "epoch": 0.31625300240192156, + "grad_norm": 0.12426267564296722, + "learning_rate": 0.0007819496735034458, + "loss": 2.7827, + "step": 10665 + }, + { + "epoch": 0.31628265575423303, + "grad_norm": 0.12497309595346451, + "learning_rate": 0.0007819108155238513, + "loss": 2.7221, + "step": 10666 + }, + { + "epoch": 0.3163123091065445, + "grad_norm": 0.1271679848432541, + "learning_rate": 0.0007818719550478878, + "loss": 2.765, + "step": 10667 + }, + { + "epoch": 0.316341962458856, + "grad_norm": 0.11674649268388748, + "learning_rate": 0.0007818330920759001, + "loss": 2.7667, + "step": 10668 + }, + { + "epoch": 0.31637161581116746, + "grad_norm": 0.11000815778970718, + "learning_rate": 0.0007817942266082319, + "loss": 2.7594, + "step": 10669 + }, + { + "epoch": 0.31640126916347894, + "grad_norm": 0.12049450725317001, + "learning_rate": 0.0007817553586452273, + "loss": 2.7394, + "step": 10670 + }, + { + "epoch": 0.3164309225157904, + "grad_norm": 0.13201695680618286, + "learning_rate": 0.0007817164881872308, + "loss": 2.7239, + "step": 10671 + }, + { + "epoch": 0.3164605758681019, + "grad_norm": 0.1388504058122635, + "learning_rate": 0.0007816776152345864, + "loss": 2.7786, + "step": 10672 + }, + { + "epoch": 0.31649022922041337, + "grad_norm": 0.1433560699224472, + "learning_rate": 0.0007816387397876383, + "loss": 2.7102, + "step": 10673 + }, + { + "epoch": 0.31651988257272484, + "grad_norm": 0.12959617376327515, + "learning_rate": 0.0007815998618467308, + "loss": 2.7304, + "step": 10674 + }, + { + "epoch": 0.3165495359250363, + "grad_norm": 0.10223573446273804, + "learning_rate": 0.0007815609814122082, + "loss": 2.703, + "step": 10675 + }, + { + "epoch": 0.3165791892773478, + "grad_norm": 0.1114981397986412, + "learning_rate": 0.000781522098484415, + "loss": 2.7457, + "step": 10676 + }, + { + "epoch": 0.31660884262965927, + "grad_norm": 0.12259338051080704, + "learning_rate": 0.0007814832130636951, + "loss": 2.7845, + "step": 10677 + }, + { + "epoch": 0.31663849598197075, + "grad_norm": 0.11766564100980759, + "learning_rate": 0.0007814443251503931, + "loss": 2.8008, + "step": 10678 + }, + { + "epoch": 0.3166681493342822, + "grad_norm": 0.12559841573238373, + "learning_rate": 0.0007814054347448532, + "loss": 2.7327, + "step": 10679 + }, + { + "epoch": 0.3166978026865937, + "grad_norm": 0.1349814385175705, + "learning_rate": 0.0007813665418474198, + "loss": 2.7567, + "step": 10680 + }, + { + "epoch": 0.3167274560389052, + "grad_norm": 0.14293785393238068, + "learning_rate": 0.0007813276464584375, + "loss": 2.7893, + "step": 10681 + }, + { + "epoch": 0.31675710939121665, + "grad_norm": 0.1467715948820114, + "learning_rate": 0.0007812887485782507, + "loss": 2.7572, + "step": 10682 + }, + { + "epoch": 0.3167867627435282, + "grad_norm": 0.14972157776355743, + "learning_rate": 0.0007812498482072037, + "loss": 2.7157, + "step": 10683 + }, + { + "epoch": 0.31681641609583966, + "grad_norm": 0.14070215821266174, + "learning_rate": 0.0007812109453456409, + "loss": 2.7486, + "step": 10684 + }, + { + "epoch": 0.31684606944815114, + "grad_norm": 0.16750440001487732, + "learning_rate": 0.0007811720399939071, + "loss": 2.7769, + "step": 10685 + }, + { + "epoch": 0.3168757228004626, + "grad_norm": 0.16638149321079254, + "learning_rate": 0.0007811331321523465, + "loss": 2.7266, + "step": 10686 + }, + { + "epoch": 0.3169053761527741, + "grad_norm": 0.18884354829788208, + "learning_rate": 0.0007810942218213037, + "loss": 2.7617, + "step": 10687 + }, + { + "epoch": 0.31693502950508556, + "grad_norm": 0.19707466661930084, + "learning_rate": 0.0007810553090011234, + "loss": 2.7873, + "step": 10688 + }, + { + "epoch": 0.31696468285739704, + "grad_norm": 0.17081627249717712, + "learning_rate": 0.0007810163936921502, + "loss": 2.761, + "step": 10689 + }, + { + "epoch": 0.3169943362097085, + "grad_norm": 0.13178181648254395, + "learning_rate": 0.0007809774758947283, + "loss": 2.7252, + "step": 10690 + }, + { + "epoch": 0.31702398956202, + "grad_norm": 0.14326471090316772, + "learning_rate": 0.0007809385556092029, + "loss": 2.7566, + "step": 10691 + }, + { + "epoch": 0.31705364291433147, + "grad_norm": 0.1341821700334549, + "learning_rate": 0.0007808996328359184, + "loss": 2.7436, + "step": 10692 + }, + { + "epoch": 0.31708329626664294, + "grad_norm": 0.12106886506080627, + "learning_rate": 0.0007808607075752192, + "loss": 2.7022, + "step": 10693 + }, + { + "epoch": 0.3171129496189544, + "grad_norm": 0.11920400708913803, + "learning_rate": 0.0007808217798274501, + "loss": 2.7632, + "step": 10694 + }, + { + "epoch": 0.3171426029712659, + "grad_norm": 0.13992281258106232, + "learning_rate": 0.0007807828495929564, + "loss": 2.7619, + "step": 10695 + }, + { + "epoch": 0.3171722563235774, + "grad_norm": 0.14554740488529205, + "learning_rate": 0.000780743916872082, + "loss": 2.7371, + "step": 10696 + }, + { + "epoch": 0.31720190967588885, + "grad_norm": 0.1190292164683342, + "learning_rate": 0.0007807049816651722, + "loss": 2.7257, + "step": 10697 + }, + { + "epoch": 0.3172315630282003, + "grad_norm": 0.12330209463834763, + "learning_rate": 0.0007806660439725716, + "loss": 2.7104, + "step": 10698 + }, + { + "epoch": 0.3172612163805118, + "grad_norm": 0.12040528655052185, + "learning_rate": 0.0007806271037946251, + "loss": 2.7486, + "step": 10699 + }, + { + "epoch": 0.3172908697328233, + "grad_norm": 0.13183076679706573, + "learning_rate": 0.0007805881611316771, + "loss": 2.7257, + "step": 10700 + }, + { + "epoch": 0.31732052308513475, + "grad_norm": 0.14786987006664276, + "learning_rate": 0.000780549215984073, + "loss": 2.7417, + "step": 10701 + }, + { + "epoch": 0.31735017643744623, + "grad_norm": 0.17075157165527344, + "learning_rate": 0.0007805102683521574, + "loss": 2.7429, + "step": 10702 + }, + { + "epoch": 0.3173798297897577, + "grad_norm": 0.15184703469276428, + "learning_rate": 0.0007804713182362753, + "loss": 2.756, + "step": 10703 + }, + { + "epoch": 0.31740948314206924, + "grad_norm": 0.11678843945264816, + "learning_rate": 0.0007804323656367716, + "loss": 2.7875, + "step": 10704 + }, + { + "epoch": 0.3174391364943807, + "grad_norm": 0.1509263664484024, + "learning_rate": 0.000780393410553991, + "loss": 2.757, + "step": 10705 + }, + { + "epoch": 0.3174687898466922, + "grad_norm": 0.15292130410671234, + "learning_rate": 0.0007803544529882785, + "loss": 2.7755, + "step": 10706 + }, + { + "epoch": 0.31749844319900367, + "grad_norm": 0.1292896717786789, + "learning_rate": 0.0007803154929399794, + "loss": 2.7215, + "step": 10707 + }, + { + "epoch": 0.31752809655131514, + "grad_norm": 0.13430795073509216, + "learning_rate": 0.0007802765304094384, + "loss": 2.7788, + "step": 10708 + }, + { + "epoch": 0.3175577499036266, + "grad_norm": 0.12970098853111267, + "learning_rate": 0.0007802375653970006, + "loss": 2.7351, + "step": 10709 + }, + { + "epoch": 0.3175874032559381, + "grad_norm": 0.13017933070659637, + "learning_rate": 0.0007801985979030112, + "loss": 2.7849, + "step": 10710 + }, + { + "epoch": 0.31761705660824957, + "grad_norm": 0.12263365089893341, + "learning_rate": 0.0007801596279278151, + "loss": 2.7622, + "step": 10711 + }, + { + "epoch": 0.31764670996056105, + "grad_norm": 0.11336734890937805, + "learning_rate": 0.0007801206554717573, + "loss": 2.7346, + "step": 10712 + }, + { + "epoch": 0.3176763633128725, + "grad_norm": 0.1320856362581253, + "learning_rate": 0.0007800816805351831, + "loss": 2.7555, + "step": 10713 + }, + { + "epoch": 0.317706016665184, + "grad_norm": 0.11611784249544144, + "learning_rate": 0.0007800427031184374, + "loss": 2.7406, + "step": 10714 + }, + { + "epoch": 0.3177356700174955, + "grad_norm": 0.117008738219738, + "learning_rate": 0.0007800037232218657, + "loss": 2.7273, + "step": 10715 + }, + { + "epoch": 0.31776532336980695, + "grad_norm": 0.1134147047996521, + "learning_rate": 0.0007799647408458128, + "loss": 2.7583, + "step": 10716 + }, + { + "epoch": 0.3177949767221184, + "grad_norm": 0.12546521425247192, + "learning_rate": 0.000779925755990624, + "loss": 2.7673, + "step": 10717 + }, + { + "epoch": 0.3178246300744299, + "grad_norm": 0.1412908434867859, + "learning_rate": 0.0007798867686566449, + "loss": 2.7709, + "step": 10718 + }, + { + "epoch": 0.3178542834267414, + "grad_norm": 0.11665835231542587, + "learning_rate": 0.0007798477788442202, + "loss": 2.7365, + "step": 10719 + }, + { + "epoch": 0.31788393677905286, + "grad_norm": 0.11884767562150955, + "learning_rate": 0.0007798087865536953, + "loss": 2.7507, + "step": 10720 + }, + { + "epoch": 0.31791359013136433, + "grad_norm": 0.11771974712610245, + "learning_rate": 0.0007797697917854158, + "loss": 2.7488, + "step": 10721 + }, + { + "epoch": 0.3179432434836758, + "grad_norm": 0.12382162362337112, + "learning_rate": 0.0007797307945397266, + "loss": 2.7396, + "step": 10722 + }, + { + "epoch": 0.3179728968359873, + "grad_norm": 0.11202426999807358, + "learning_rate": 0.0007796917948169733, + "loss": 2.7751, + "step": 10723 + }, + { + "epoch": 0.31800255018829876, + "grad_norm": 0.12656882405281067, + "learning_rate": 0.0007796527926175011, + "loss": 2.7187, + "step": 10724 + }, + { + "epoch": 0.3180322035406103, + "grad_norm": 0.13764695823192596, + "learning_rate": 0.0007796137879416554, + "loss": 2.7928, + "step": 10725 + }, + { + "epoch": 0.31806185689292177, + "grad_norm": 0.18092423677444458, + "learning_rate": 0.0007795747807897816, + "loss": 2.7661, + "step": 10726 + }, + { + "epoch": 0.31809151024523324, + "grad_norm": 0.1900225430727005, + "learning_rate": 0.0007795357711622252, + "loss": 2.7697, + "step": 10727 + }, + { + "epoch": 0.3181211635975447, + "grad_norm": 0.1854754239320755, + "learning_rate": 0.0007794967590593315, + "loss": 2.7587, + "step": 10728 + }, + { + "epoch": 0.3181508169498562, + "grad_norm": 0.15958915650844574, + "learning_rate": 0.0007794577444814461, + "loss": 2.7513, + "step": 10729 + }, + { + "epoch": 0.3181804703021677, + "grad_norm": 0.1662672609090805, + "learning_rate": 0.0007794187274289145, + "loss": 2.7776, + "step": 10730 + }, + { + "epoch": 0.31821012365447915, + "grad_norm": 0.15903286635875702, + "learning_rate": 0.0007793797079020818, + "loss": 2.6858, + "step": 10731 + }, + { + "epoch": 0.3182397770067906, + "grad_norm": 0.16030922532081604, + "learning_rate": 0.0007793406859012939, + "loss": 2.7243, + "step": 10732 + }, + { + "epoch": 0.3182694303591021, + "grad_norm": 0.1394396275281906, + "learning_rate": 0.0007793016614268964, + "loss": 2.7399, + "step": 10733 + }, + { + "epoch": 0.3182990837114136, + "grad_norm": 0.1451479196548462, + "learning_rate": 0.0007792626344792347, + "loss": 2.7485, + "step": 10734 + }, + { + "epoch": 0.31832873706372505, + "grad_norm": 0.13535955548286438, + "learning_rate": 0.0007792236050586545, + "loss": 2.7606, + "step": 10735 + }, + { + "epoch": 0.31835839041603653, + "grad_norm": 0.12983764708042145, + "learning_rate": 0.0007791845731655013, + "loss": 2.7506, + "step": 10736 + }, + { + "epoch": 0.318388043768348, + "grad_norm": 0.1256982535123825, + "learning_rate": 0.0007791455388001208, + "loss": 2.7698, + "step": 10737 + }, + { + "epoch": 0.3184176971206595, + "grad_norm": 0.13452664017677307, + "learning_rate": 0.0007791065019628585, + "loss": 2.7654, + "step": 10738 + }, + { + "epoch": 0.31844735047297096, + "grad_norm": 0.15558350086212158, + "learning_rate": 0.0007790674626540605, + "loss": 2.7421, + "step": 10739 + }, + { + "epoch": 0.31847700382528243, + "grad_norm": 0.15525910258293152, + "learning_rate": 0.000779028420874072, + "loss": 2.7407, + "step": 10740 + }, + { + "epoch": 0.3185066571775939, + "grad_norm": 0.14947408437728882, + "learning_rate": 0.000778989376623239, + "loss": 2.7245, + "step": 10741 + }, + { + "epoch": 0.3185363105299054, + "grad_norm": 0.13038459420204163, + "learning_rate": 0.0007789503299019072, + "loss": 2.7482, + "step": 10742 + }, + { + "epoch": 0.31856596388221686, + "grad_norm": 0.1301497370004654, + "learning_rate": 0.0007789112807104224, + "loss": 2.7389, + "step": 10743 + }, + { + "epoch": 0.31859561723452834, + "grad_norm": 0.13649824261665344, + "learning_rate": 0.0007788722290491301, + "loss": 2.7649, + "step": 10744 + }, + { + "epoch": 0.3186252705868398, + "grad_norm": 0.12672150135040283, + "learning_rate": 0.0007788331749183766, + "loss": 2.7908, + "step": 10745 + }, + { + "epoch": 0.31865492393915135, + "grad_norm": 0.12369589507579803, + "learning_rate": 0.0007787941183185073, + "loss": 2.7522, + "step": 10746 + }, + { + "epoch": 0.3186845772914628, + "grad_norm": 0.12047302722930908, + "learning_rate": 0.0007787550592498684, + "loss": 2.7432, + "step": 10747 + }, + { + "epoch": 0.3187142306437743, + "grad_norm": 0.1150144711136818, + "learning_rate": 0.0007787159977128055, + "loss": 2.7158, + "step": 10748 + }, + { + "epoch": 0.3187438839960858, + "grad_norm": 0.11195351183414459, + "learning_rate": 0.0007786769337076646, + "loss": 2.7635, + "step": 10749 + }, + { + "epoch": 0.31877353734839725, + "grad_norm": 0.11837231367826462, + "learning_rate": 0.0007786378672347916, + "loss": 2.7399, + "step": 10750 + }, + { + "epoch": 0.3188031907007087, + "grad_norm": 0.11613138765096664, + "learning_rate": 0.0007785987982945324, + "loss": 2.7436, + "step": 10751 + }, + { + "epoch": 0.3188328440530202, + "grad_norm": 0.11859480291604996, + "learning_rate": 0.0007785597268872332, + "loss": 2.7504, + "step": 10752 + }, + { + "epoch": 0.3188624974053317, + "grad_norm": 0.11699580401182175, + "learning_rate": 0.0007785206530132397, + "loss": 2.7452, + "step": 10753 + }, + { + "epoch": 0.31889215075764316, + "grad_norm": 0.1380472630262375, + "learning_rate": 0.000778481576672898, + "loss": 2.7402, + "step": 10754 + }, + { + "epoch": 0.31892180410995463, + "grad_norm": 0.13563303649425507, + "learning_rate": 0.0007784424978665541, + "loss": 2.7356, + "step": 10755 + }, + { + "epoch": 0.3189514574622661, + "grad_norm": 0.1559830605983734, + "learning_rate": 0.0007784034165945543, + "loss": 2.7283, + "step": 10756 + }, + { + "epoch": 0.3189811108145776, + "grad_norm": 0.16130506992340088, + "learning_rate": 0.0007783643328572443, + "loss": 2.7649, + "step": 10757 + }, + { + "epoch": 0.31901076416688906, + "grad_norm": 0.17583724856376648, + "learning_rate": 0.0007783252466549703, + "loss": 2.7518, + "step": 10758 + }, + { + "epoch": 0.31904041751920054, + "grad_norm": 0.16389407217502594, + "learning_rate": 0.0007782861579880785, + "loss": 2.7628, + "step": 10759 + }, + { + "epoch": 0.319070070871512, + "grad_norm": 0.14167405664920807, + "learning_rate": 0.000778247066856915, + "loss": 2.8035, + "step": 10760 + }, + { + "epoch": 0.3190997242238235, + "grad_norm": 0.12347980588674545, + "learning_rate": 0.000778207973261826, + "loss": 2.7464, + "step": 10761 + }, + { + "epoch": 0.31912937757613496, + "grad_norm": 0.12014193832874298, + "learning_rate": 0.0007781688772031576, + "loss": 2.7216, + "step": 10762 + }, + { + "epoch": 0.31915903092844644, + "grad_norm": 0.1138811707496643, + "learning_rate": 0.0007781297786812562, + "loss": 2.7534, + "step": 10763 + }, + { + "epoch": 0.3191886842807579, + "grad_norm": 0.1318514049053192, + "learning_rate": 0.0007780906776964677, + "loss": 2.7461, + "step": 10764 + }, + { + "epoch": 0.3192183376330694, + "grad_norm": 0.11307931691408157, + "learning_rate": 0.0007780515742491386, + "loss": 2.7788, + "step": 10765 + }, + { + "epoch": 0.3192479909853809, + "grad_norm": 0.13252192735671997, + "learning_rate": 0.000778012468339615, + "loss": 2.7321, + "step": 10766 + }, + { + "epoch": 0.3192776443376924, + "grad_norm": 0.13361282646656036, + "learning_rate": 0.0007779733599682434, + "loss": 2.7649, + "step": 10767 + }, + { + "epoch": 0.3193072976900039, + "grad_norm": 0.14763793349266052, + "learning_rate": 0.0007779342491353698, + "loss": 2.7455, + "step": 10768 + }, + { + "epoch": 0.31933695104231535, + "grad_norm": 0.158365860581398, + "learning_rate": 0.0007778951358413409, + "loss": 2.7666, + "step": 10769 + }, + { + "epoch": 0.31936660439462683, + "grad_norm": 0.15459580719470978, + "learning_rate": 0.0007778560200865028, + "loss": 2.7871, + "step": 10770 + }, + { + "epoch": 0.3193962577469383, + "grad_norm": 0.1351480782032013, + "learning_rate": 0.0007778169018712018, + "loss": 2.7746, + "step": 10771 + }, + { + "epoch": 0.3194259110992498, + "grad_norm": 0.14047342538833618, + "learning_rate": 0.0007777777811957847, + "loss": 2.7575, + "step": 10772 + }, + { + "epoch": 0.31945556445156126, + "grad_norm": 0.13496574759483337, + "learning_rate": 0.0007777386580605975, + "loss": 2.7427, + "step": 10773 + }, + { + "epoch": 0.31948521780387273, + "grad_norm": 0.12376981228590012, + "learning_rate": 0.0007776995324659869, + "loss": 2.7369, + "step": 10774 + }, + { + "epoch": 0.3195148711561842, + "grad_norm": 0.14374807476997375, + "learning_rate": 0.0007776604044122992, + "loss": 2.7475, + "step": 10775 + }, + { + "epoch": 0.3195445245084957, + "grad_norm": 0.13712824881076813, + "learning_rate": 0.0007776212738998811, + "loss": 2.7393, + "step": 10776 + }, + { + "epoch": 0.31957417786080716, + "grad_norm": 0.13981185853481293, + "learning_rate": 0.0007775821409290788, + "loss": 2.7393, + "step": 10777 + }, + { + "epoch": 0.31960383121311864, + "grad_norm": 0.12522833049297333, + "learning_rate": 0.000777543005500239, + "loss": 2.742, + "step": 10778 + }, + { + "epoch": 0.3196334845654301, + "grad_norm": 0.1313609480857849, + "learning_rate": 0.0007775038676137083, + "loss": 2.7727, + "step": 10779 + }, + { + "epoch": 0.3196631379177416, + "grad_norm": 0.11064968258142471, + "learning_rate": 0.0007774647272698332, + "loss": 2.7765, + "step": 10780 + }, + { + "epoch": 0.31969279127005307, + "grad_norm": 0.11576291173696518, + "learning_rate": 0.0007774255844689604, + "loss": 2.7485, + "step": 10781 + }, + { + "epoch": 0.31972244462236454, + "grad_norm": 0.12490308284759521, + "learning_rate": 0.0007773864392114365, + "loss": 2.735, + "step": 10782 + }, + { + "epoch": 0.319752097974676, + "grad_norm": 0.11791924387216568, + "learning_rate": 0.0007773472914976079, + "loss": 2.7459, + "step": 10783 + }, + { + "epoch": 0.3197817513269875, + "grad_norm": 0.11483268439769745, + "learning_rate": 0.0007773081413278214, + "loss": 2.7288, + "step": 10784 + }, + { + "epoch": 0.31981140467929897, + "grad_norm": 0.14043262600898743, + "learning_rate": 0.0007772689887024238, + "loss": 2.7553, + "step": 10785 + }, + { + "epoch": 0.31984105803161045, + "grad_norm": 0.1466572880744934, + "learning_rate": 0.0007772298336217617, + "loss": 2.767, + "step": 10786 + }, + { + "epoch": 0.319870711383922, + "grad_norm": 0.14890243113040924, + "learning_rate": 0.0007771906760861818, + "loss": 2.7468, + "step": 10787 + }, + { + "epoch": 0.31990036473623346, + "grad_norm": 0.12993940711021423, + "learning_rate": 0.0007771515160960309, + "loss": 2.7243, + "step": 10788 + }, + { + "epoch": 0.31993001808854493, + "grad_norm": 0.14304228127002716, + "learning_rate": 0.0007771123536516558, + "loss": 2.7841, + "step": 10789 + }, + { + "epoch": 0.3199596714408564, + "grad_norm": 0.14787410199642181, + "learning_rate": 0.0007770731887534031, + "loss": 2.7403, + "step": 10790 + }, + { + "epoch": 0.3199893247931679, + "grad_norm": 0.1253897249698639, + "learning_rate": 0.00077703402140162, + "loss": 2.7411, + "step": 10791 + }, + { + "epoch": 0.32001897814547936, + "grad_norm": 0.13075794279575348, + "learning_rate": 0.0007769948515966529, + "loss": 2.7265, + "step": 10792 + }, + { + "epoch": 0.32004863149779084, + "grad_norm": 0.12596650421619415, + "learning_rate": 0.0007769556793388488, + "loss": 2.7591, + "step": 10793 + }, + { + "epoch": 0.3200782848501023, + "grad_norm": 0.12398447096347809, + "learning_rate": 0.0007769165046285548, + "loss": 2.7547, + "step": 10794 + }, + { + "epoch": 0.3201079382024138, + "grad_norm": 0.12191497534513474, + "learning_rate": 0.0007768773274661176, + "loss": 2.7621, + "step": 10795 + }, + { + "epoch": 0.32013759155472526, + "grad_norm": 0.11443372070789337, + "learning_rate": 0.000776838147851884, + "loss": 2.7181, + "step": 10796 + }, + { + "epoch": 0.32016724490703674, + "grad_norm": 0.1112380400300026, + "learning_rate": 0.0007767989657862011, + "loss": 2.731, + "step": 10797 + }, + { + "epoch": 0.3201968982593482, + "grad_norm": 0.10383453965187073, + "learning_rate": 0.0007767597812694159, + "loss": 2.758, + "step": 10798 + }, + { + "epoch": 0.3202265516116597, + "grad_norm": 0.12409725785255432, + "learning_rate": 0.0007767205943018753, + "loss": 2.7435, + "step": 10799 + }, + { + "epoch": 0.32025620496397117, + "grad_norm": 0.12678077816963196, + "learning_rate": 0.0007766814048839265, + "loss": 2.7336, + "step": 10800 + }, + { + "epoch": 0.32028585831628265, + "grad_norm": 0.12607669830322266, + "learning_rate": 0.0007766422130159162, + "loss": 2.7661, + "step": 10801 + }, + { + "epoch": 0.3203155116685941, + "grad_norm": 0.14048020541667938, + "learning_rate": 0.0007766030186981916, + "loss": 2.7355, + "step": 10802 + }, + { + "epoch": 0.3203451650209056, + "grad_norm": 0.1438475251197815, + "learning_rate": 0.0007765638219310998, + "loss": 2.7256, + "step": 10803 + }, + { + "epoch": 0.3203748183732171, + "grad_norm": 0.13684719800949097, + "learning_rate": 0.000776524622714988, + "loss": 2.7557, + "step": 10804 + }, + { + "epoch": 0.32040447172552855, + "grad_norm": 0.15049226582050323, + "learning_rate": 0.0007764854210502031, + "loss": 2.769, + "step": 10805 + }, + { + "epoch": 0.32043412507784, + "grad_norm": 0.1411493569612503, + "learning_rate": 0.0007764462169370924, + "loss": 2.7576, + "step": 10806 + }, + { + "epoch": 0.3204637784301515, + "grad_norm": 0.1392858624458313, + "learning_rate": 0.000776407010376003, + "loss": 2.7365, + "step": 10807 + }, + { + "epoch": 0.32049343178246303, + "grad_norm": 0.11178004741668701, + "learning_rate": 0.0007763678013672821, + "loss": 2.7837, + "step": 10808 + }, + { + "epoch": 0.3205230851347745, + "grad_norm": 0.13702844083309174, + "learning_rate": 0.0007763285899112767, + "loss": 2.732, + "step": 10809 + }, + { + "epoch": 0.320552738487086, + "grad_norm": 0.13563981652259827, + "learning_rate": 0.0007762893760083344, + "loss": 2.6991, + "step": 10810 + }, + { + "epoch": 0.32058239183939746, + "grad_norm": 0.1398538053035736, + "learning_rate": 0.000776250159658802, + "loss": 2.7105, + "step": 10811 + }, + { + "epoch": 0.32061204519170894, + "grad_norm": 0.14277341961860657, + "learning_rate": 0.0007762109408630273, + "loss": 2.7611, + "step": 10812 + }, + { + "epoch": 0.3206416985440204, + "grad_norm": 0.14800195395946503, + "learning_rate": 0.0007761717196213574, + "loss": 2.7454, + "step": 10813 + }, + { + "epoch": 0.3206713518963319, + "grad_norm": 0.1772318184375763, + "learning_rate": 0.0007761324959341393, + "loss": 2.7308, + "step": 10814 + }, + { + "epoch": 0.32070100524864337, + "grad_norm": 0.1923232525587082, + "learning_rate": 0.0007760932698017204, + "loss": 2.7162, + "step": 10815 + }, + { + "epoch": 0.32073065860095484, + "grad_norm": 0.1624705195426941, + "learning_rate": 0.0007760540412244484, + "loss": 2.7275, + "step": 10816 + }, + { + "epoch": 0.3207603119532663, + "grad_norm": 0.1666538566350937, + "learning_rate": 0.0007760148102026705, + "loss": 2.7177, + "step": 10817 + }, + { + "epoch": 0.3207899653055778, + "grad_norm": 0.17944903671741486, + "learning_rate": 0.000775975576736734, + "loss": 2.7468, + "step": 10818 + }, + { + "epoch": 0.32081961865788927, + "grad_norm": 0.19687454402446747, + "learning_rate": 0.0007759363408269866, + "loss": 2.7736, + "step": 10819 + }, + { + "epoch": 0.32084927201020075, + "grad_norm": 0.19958584010601044, + "learning_rate": 0.0007758971024737753, + "loss": 2.7719, + "step": 10820 + }, + { + "epoch": 0.3208789253625122, + "grad_norm": 0.19552496075630188, + "learning_rate": 0.0007758578616774478, + "loss": 2.7362, + "step": 10821 + }, + { + "epoch": 0.3209085787148237, + "grad_norm": 0.14718978106975555, + "learning_rate": 0.0007758186184383518, + "loss": 2.763, + "step": 10822 + }, + { + "epoch": 0.3209382320671352, + "grad_norm": 0.14513501524925232, + "learning_rate": 0.0007757793727568343, + "loss": 2.7371, + "step": 10823 + }, + { + "epoch": 0.32096788541944665, + "grad_norm": 0.13844509422779083, + "learning_rate": 0.0007757401246332434, + "loss": 2.7741, + "step": 10824 + }, + { + "epoch": 0.32099753877175813, + "grad_norm": 0.12623605132102966, + "learning_rate": 0.0007757008740679263, + "loss": 2.7409, + "step": 10825 + }, + { + "epoch": 0.3210271921240696, + "grad_norm": 0.12270570546388626, + "learning_rate": 0.0007756616210612305, + "loss": 2.7382, + "step": 10826 + }, + { + "epoch": 0.3210568454763811, + "grad_norm": 0.13863711059093475, + "learning_rate": 0.0007756223656135039, + "loss": 2.7753, + "step": 10827 + }, + { + "epoch": 0.32108649882869256, + "grad_norm": 0.11399886012077332, + "learning_rate": 0.0007755831077250938, + "loss": 2.7252, + "step": 10828 + }, + { + "epoch": 0.3211161521810041, + "grad_norm": 0.12461636960506439, + "learning_rate": 0.0007755438473963479, + "loss": 2.7338, + "step": 10829 + }, + { + "epoch": 0.32114580553331556, + "grad_norm": 0.1190204918384552, + "learning_rate": 0.0007755045846276141, + "loss": 2.7402, + "step": 10830 + }, + { + "epoch": 0.32117545888562704, + "grad_norm": 0.11675746738910675, + "learning_rate": 0.0007754653194192399, + "loss": 2.7549, + "step": 10831 + }, + { + "epoch": 0.3212051122379385, + "grad_norm": 0.1708255410194397, + "learning_rate": 0.000775426051771573, + "loss": 2.7474, + "step": 10832 + }, + { + "epoch": 0.32123476559025, + "grad_norm": 0.11445440351963043, + "learning_rate": 0.0007753867816849611, + "loss": 2.7401, + "step": 10833 + }, + { + "epoch": 0.32126441894256147, + "grad_norm": 0.12600919604301453, + "learning_rate": 0.000775347509159752, + "loss": 2.7569, + "step": 10834 + }, + { + "epoch": 0.32129407229487295, + "grad_norm": 0.12854522466659546, + "learning_rate": 0.0007753082341962934, + "loss": 2.745, + "step": 10835 + }, + { + "epoch": 0.3213237256471844, + "grad_norm": 0.11245785653591156, + "learning_rate": 0.0007752689567949332, + "loss": 2.7525, + "step": 10836 + }, + { + "epoch": 0.3213533789994959, + "grad_norm": 0.12539273500442505, + "learning_rate": 0.0007752296769560192, + "loss": 2.7598, + "step": 10837 + }, + { + "epoch": 0.3213830323518074, + "grad_norm": 0.12249375134706497, + "learning_rate": 0.000775190394679899, + "loss": 2.7479, + "step": 10838 + }, + { + "epoch": 0.32141268570411885, + "grad_norm": 0.12762846052646637, + "learning_rate": 0.0007751511099669207, + "loss": 2.754, + "step": 10839 + }, + { + "epoch": 0.3214423390564303, + "grad_norm": 0.12196175009012222, + "learning_rate": 0.0007751118228174321, + "loss": 2.7124, + "step": 10840 + }, + { + "epoch": 0.3214719924087418, + "grad_norm": 0.13179895281791687, + "learning_rate": 0.0007750725332317811, + "loss": 2.7477, + "step": 10841 + }, + { + "epoch": 0.3215016457610533, + "grad_norm": 0.1269065886735916, + "learning_rate": 0.0007750332412103156, + "loss": 2.7603, + "step": 10842 + }, + { + "epoch": 0.32153129911336475, + "grad_norm": 0.11037624627351761, + "learning_rate": 0.0007749939467533836, + "loss": 2.7587, + "step": 10843 + }, + { + "epoch": 0.32156095246567623, + "grad_norm": 0.13481029868125916, + "learning_rate": 0.0007749546498613329, + "loss": 2.7537, + "step": 10844 + }, + { + "epoch": 0.3215906058179877, + "grad_norm": 0.13991139829158783, + "learning_rate": 0.0007749153505345114, + "loss": 2.7333, + "step": 10845 + }, + { + "epoch": 0.3216202591702992, + "grad_norm": 0.1396397203207016, + "learning_rate": 0.0007748760487732676, + "loss": 2.7361, + "step": 10846 + }, + { + "epoch": 0.32164991252261066, + "grad_norm": 0.13415716588497162, + "learning_rate": 0.0007748367445779492, + "loss": 2.7581, + "step": 10847 + }, + { + "epoch": 0.32167956587492214, + "grad_norm": 0.15861386060714722, + "learning_rate": 0.0007747974379489041, + "loss": 2.7438, + "step": 10848 + }, + { + "epoch": 0.3217092192272336, + "grad_norm": 0.13121236860752106, + "learning_rate": 0.0007747581288864804, + "loss": 2.7864, + "step": 10849 + }, + { + "epoch": 0.32173887257954514, + "grad_norm": 0.12361809611320496, + "learning_rate": 0.0007747188173910266, + "loss": 2.763, + "step": 10850 + }, + { + "epoch": 0.3217685259318566, + "grad_norm": 0.14797958731651306, + "learning_rate": 0.0007746795034628904, + "loss": 2.7313, + "step": 10851 + }, + { + "epoch": 0.3217981792841681, + "grad_norm": 0.130004420876503, + "learning_rate": 0.00077464018710242, + "loss": 2.7744, + "step": 10852 + }, + { + "epoch": 0.32182783263647957, + "grad_norm": 0.1396298110485077, + "learning_rate": 0.0007746008683099637, + "loss": 2.7331, + "step": 10853 + }, + { + "epoch": 0.32185748598879105, + "grad_norm": 0.18203379213809967, + "learning_rate": 0.0007745615470858694, + "loss": 2.7607, + "step": 10854 + }, + { + "epoch": 0.3218871393411025, + "grad_norm": 0.16204676032066345, + "learning_rate": 0.0007745222234304856, + "loss": 2.7683, + "step": 10855 + }, + { + "epoch": 0.321916792693414, + "grad_norm": 0.15173272788524628, + "learning_rate": 0.0007744828973441603, + "loss": 2.7239, + "step": 10856 + }, + { + "epoch": 0.3219464460457255, + "grad_norm": 0.1499367356300354, + "learning_rate": 0.000774443568827242, + "loss": 2.7565, + "step": 10857 + }, + { + "epoch": 0.32197609939803695, + "grad_norm": 0.14054323732852936, + "learning_rate": 0.0007744042378800786, + "loss": 2.7381, + "step": 10858 + }, + { + "epoch": 0.32200575275034843, + "grad_norm": 0.12858006358146667, + "learning_rate": 0.0007743649045030187, + "loss": 2.7335, + "step": 10859 + }, + { + "epoch": 0.3220354061026599, + "grad_norm": 0.11888623237609863, + "learning_rate": 0.0007743255686964106, + "loss": 2.743, + "step": 10860 + }, + { + "epoch": 0.3220650594549714, + "grad_norm": 0.12987791001796722, + "learning_rate": 0.0007742862304606022, + "loss": 2.7449, + "step": 10861 + }, + { + "epoch": 0.32209471280728286, + "grad_norm": 0.12458200007677078, + "learning_rate": 0.0007742468897959422, + "loss": 2.7183, + "step": 10862 + }, + { + "epoch": 0.32212436615959433, + "grad_norm": 0.12676090002059937, + "learning_rate": 0.0007742075467027791, + "loss": 2.7376, + "step": 10863 + }, + { + "epoch": 0.3221540195119058, + "grad_norm": 0.13920964300632477, + "learning_rate": 0.000774168201181461, + "loss": 2.786, + "step": 10864 + }, + { + "epoch": 0.3221836728642173, + "grad_norm": 0.14142689108848572, + "learning_rate": 0.0007741288532323365, + "loss": 2.7442, + "step": 10865 + }, + { + "epoch": 0.32221332621652876, + "grad_norm": 0.16072525084018707, + "learning_rate": 0.0007740895028557539, + "loss": 2.7466, + "step": 10866 + }, + { + "epoch": 0.32224297956884024, + "grad_norm": 0.16054172813892365, + "learning_rate": 0.0007740501500520617, + "loss": 2.7303, + "step": 10867 + }, + { + "epoch": 0.3222726329211517, + "grad_norm": 0.15420472621917725, + "learning_rate": 0.0007740107948216084, + "loss": 2.7271, + "step": 10868 + }, + { + "epoch": 0.3223022862734632, + "grad_norm": 0.1335654854774475, + "learning_rate": 0.0007739714371647424, + "loss": 2.7497, + "step": 10869 + }, + { + "epoch": 0.3223319396257747, + "grad_norm": 0.14866791665554047, + "learning_rate": 0.0007739320770818124, + "loss": 2.7216, + "step": 10870 + }, + { + "epoch": 0.3223615929780862, + "grad_norm": 0.174810528755188, + "learning_rate": 0.0007738927145731668, + "loss": 2.7482, + "step": 10871 + }, + { + "epoch": 0.3223912463303977, + "grad_norm": 0.13654161989688873, + "learning_rate": 0.0007738533496391542, + "loss": 2.7092, + "step": 10872 + }, + { + "epoch": 0.32242089968270915, + "grad_norm": 0.12741222977638245, + "learning_rate": 0.0007738139822801232, + "loss": 2.7402, + "step": 10873 + }, + { + "epoch": 0.3224505530350206, + "grad_norm": 0.14677633345127106, + "learning_rate": 0.0007737746124964223, + "loss": 2.7246, + "step": 10874 + }, + { + "epoch": 0.3224802063873321, + "grad_norm": 0.12217477709054947, + "learning_rate": 0.0007737352402884002, + "loss": 2.7706, + "step": 10875 + }, + { + "epoch": 0.3225098597396436, + "grad_norm": 0.12090058624744415, + "learning_rate": 0.0007736958656564057, + "loss": 2.7494, + "step": 10876 + }, + { + "epoch": 0.32253951309195505, + "grad_norm": 0.13919955492019653, + "learning_rate": 0.0007736564886007873, + "loss": 2.6957, + "step": 10877 + }, + { + "epoch": 0.32256916644426653, + "grad_norm": 0.13843144476413727, + "learning_rate": 0.0007736171091218936, + "loss": 2.7352, + "step": 10878 + }, + { + "epoch": 0.322598819796578, + "grad_norm": 0.14644645154476166, + "learning_rate": 0.0007735777272200736, + "loss": 2.7472, + "step": 10879 + }, + { + "epoch": 0.3226284731488895, + "grad_norm": 0.139519602060318, + "learning_rate": 0.0007735383428956757, + "loss": 2.737, + "step": 10880 + }, + { + "epoch": 0.32265812650120096, + "grad_norm": 0.12908707559108734, + "learning_rate": 0.0007734989561490489, + "loss": 2.753, + "step": 10881 + }, + { + "epoch": 0.32268777985351244, + "grad_norm": 0.11552344262599945, + "learning_rate": 0.0007734595669805418, + "loss": 2.7255, + "step": 10882 + }, + { + "epoch": 0.3227174332058239, + "grad_norm": 0.1299416720867157, + "learning_rate": 0.0007734201753905035, + "loss": 2.729, + "step": 10883 + }, + { + "epoch": 0.3227470865581354, + "grad_norm": 0.12801961600780487, + "learning_rate": 0.0007733807813792826, + "loss": 2.7393, + "step": 10884 + }, + { + "epoch": 0.32277673991044686, + "grad_norm": 0.12103865295648575, + "learning_rate": 0.0007733413849472278, + "loss": 2.7763, + "step": 10885 + }, + { + "epoch": 0.32280639326275834, + "grad_norm": 0.13308179378509521, + "learning_rate": 0.0007733019860946881, + "loss": 2.7559, + "step": 10886 + }, + { + "epoch": 0.3228360466150698, + "grad_norm": 0.13597814738750458, + "learning_rate": 0.0007732625848220125, + "loss": 2.7566, + "step": 10887 + }, + { + "epoch": 0.3228656999673813, + "grad_norm": 0.13018906116485596, + "learning_rate": 0.0007732231811295498, + "loss": 2.7445, + "step": 10888 + }, + { + "epoch": 0.32289535331969277, + "grad_norm": 0.12943875789642334, + "learning_rate": 0.0007731837750176489, + "loss": 2.7802, + "step": 10889 + }, + { + "epoch": 0.32292500667200424, + "grad_norm": 0.11131870001554489, + "learning_rate": 0.0007731443664866589, + "loss": 2.7562, + "step": 10890 + }, + { + "epoch": 0.3229546600243158, + "grad_norm": 0.10887891054153442, + "learning_rate": 0.0007731049555369285, + "loss": 2.7542, + "step": 10891 + }, + { + "epoch": 0.32298431337662725, + "grad_norm": 0.1220618486404419, + "learning_rate": 0.0007730655421688069, + "loss": 2.7798, + "step": 10892 + }, + { + "epoch": 0.32301396672893873, + "grad_norm": 0.15752951800823212, + "learning_rate": 0.0007730261263826432, + "loss": 2.7352, + "step": 10893 + }, + { + "epoch": 0.3230436200812502, + "grad_norm": 0.17345625162124634, + "learning_rate": 0.000772986708178786, + "loss": 2.7855, + "step": 10894 + }, + { + "epoch": 0.3230732734335617, + "grad_norm": 0.17453697323799133, + "learning_rate": 0.0007729472875575848, + "loss": 2.7489, + "step": 10895 + }, + { + "epoch": 0.32310292678587316, + "grad_norm": 0.15203744173049927, + "learning_rate": 0.0007729078645193886, + "loss": 2.7339, + "step": 10896 + }, + { + "epoch": 0.32313258013818463, + "grad_norm": 0.14577937126159668, + "learning_rate": 0.0007728684390645461, + "loss": 2.7109, + "step": 10897 + }, + { + "epoch": 0.3231622334904961, + "grad_norm": 0.12223582714796066, + "learning_rate": 0.0007728290111934071, + "loss": 2.7534, + "step": 10898 + }, + { + "epoch": 0.3231918868428076, + "grad_norm": 0.12334778904914856, + "learning_rate": 0.0007727895809063202, + "loss": 2.7311, + "step": 10899 + }, + { + "epoch": 0.32322154019511906, + "grad_norm": 0.14440369606018066, + "learning_rate": 0.0007727501482036348, + "loss": 2.7407, + "step": 10900 + }, + { + "epoch": 0.32325119354743054, + "grad_norm": 0.15957091748714447, + "learning_rate": 0.0007727107130856999, + "loss": 2.7546, + "step": 10901 + }, + { + "epoch": 0.323280846899742, + "grad_norm": 0.1271725744009018, + "learning_rate": 0.0007726712755528649, + "loss": 2.7545, + "step": 10902 + }, + { + "epoch": 0.3233105002520535, + "grad_norm": 0.12170428037643433, + "learning_rate": 0.000772631835605479, + "loss": 2.7263, + "step": 10903 + }, + { + "epoch": 0.32334015360436497, + "grad_norm": 0.12108857929706573, + "learning_rate": 0.0007725923932438914, + "loss": 2.7468, + "step": 10904 + }, + { + "epoch": 0.32336980695667644, + "grad_norm": 0.13712291419506073, + "learning_rate": 0.0007725529484684513, + "loss": 2.7482, + "step": 10905 + }, + { + "epoch": 0.3233994603089879, + "grad_norm": 0.11503151804208755, + "learning_rate": 0.000772513501279508, + "loss": 2.7527, + "step": 10906 + }, + { + "epoch": 0.3234291136612994, + "grad_norm": 0.1068546250462532, + "learning_rate": 0.0007724740516774109, + "loss": 2.7573, + "step": 10907 + }, + { + "epoch": 0.32345876701361087, + "grad_norm": 0.1144535019993782, + "learning_rate": 0.0007724345996625095, + "loss": 2.748, + "step": 10908 + }, + { + "epoch": 0.32348842036592235, + "grad_norm": 0.10856352746486664, + "learning_rate": 0.0007723951452351527, + "loss": 2.7318, + "step": 10909 + }, + { + "epoch": 0.3235180737182338, + "grad_norm": 0.11639559268951416, + "learning_rate": 0.0007723556883956903, + "loss": 2.7621, + "step": 10910 + }, + { + "epoch": 0.3235477270705453, + "grad_norm": 0.1154695600271225, + "learning_rate": 0.0007723162291444715, + "loss": 2.7471, + "step": 10911 + }, + { + "epoch": 0.32357738042285683, + "grad_norm": 0.12314198166131973, + "learning_rate": 0.0007722767674818458, + "loss": 2.7103, + "step": 10912 + }, + { + "epoch": 0.3236070337751683, + "grad_norm": 0.13687768578529358, + "learning_rate": 0.0007722373034081625, + "loss": 2.7403, + "step": 10913 + }, + { + "epoch": 0.3236366871274798, + "grad_norm": 0.11338244378566742, + "learning_rate": 0.0007721978369237711, + "loss": 2.7753, + "step": 10914 + }, + { + "epoch": 0.32366634047979126, + "grad_norm": 0.12228762358427048, + "learning_rate": 0.0007721583680290212, + "loss": 2.7201, + "step": 10915 + }, + { + "epoch": 0.32369599383210274, + "grad_norm": 0.1206929013133049, + "learning_rate": 0.0007721188967242623, + "loss": 2.7865, + "step": 10916 + }, + { + "epoch": 0.3237256471844142, + "grad_norm": 0.11168327927589417, + "learning_rate": 0.0007720794230098438, + "loss": 2.743, + "step": 10917 + }, + { + "epoch": 0.3237553005367257, + "grad_norm": 0.10914678126573563, + "learning_rate": 0.0007720399468861153, + "loss": 2.7394, + "step": 10918 + }, + { + "epoch": 0.32378495388903716, + "grad_norm": 0.12985754013061523, + "learning_rate": 0.0007720004683534263, + "loss": 2.7734, + "step": 10919 + }, + { + "epoch": 0.32381460724134864, + "grad_norm": 0.17057783901691437, + "learning_rate": 0.0007719609874121265, + "loss": 2.7655, + "step": 10920 + }, + { + "epoch": 0.3238442605936601, + "grad_norm": 0.19598513841629028, + "learning_rate": 0.0007719215040625655, + "loss": 2.7947, + "step": 10921 + }, + { + "epoch": 0.3238739139459716, + "grad_norm": 0.2193889021873474, + "learning_rate": 0.000771882018305093, + "loss": 2.7613, + "step": 10922 + }, + { + "epoch": 0.32390356729828307, + "grad_norm": 0.16427898406982422, + "learning_rate": 0.0007718425301400585, + "loss": 2.7561, + "step": 10923 + }, + { + "epoch": 0.32393322065059454, + "grad_norm": 0.15771082043647766, + "learning_rate": 0.0007718030395678118, + "loss": 2.7583, + "step": 10924 + }, + { + "epoch": 0.323962874002906, + "grad_norm": 0.1467820703983307, + "learning_rate": 0.0007717635465887023, + "loss": 2.7473, + "step": 10925 + }, + { + "epoch": 0.3239925273552175, + "grad_norm": 0.15947748720645905, + "learning_rate": 0.0007717240512030801, + "loss": 2.7711, + "step": 10926 + }, + { + "epoch": 0.324022180707529, + "grad_norm": 0.14234262704849243, + "learning_rate": 0.0007716845534112949, + "loss": 2.7759, + "step": 10927 + }, + { + "epoch": 0.32405183405984045, + "grad_norm": 0.14729437232017517, + "learning_rate": 0.0007716450532136961, + "loss": 2.7642, + "step": 10928 + }, + { + "epoch": 0.3240814874121519, + "grad_norm": 0.15134257078170776, + "learning_rate": 0.0007716055506106339, + "loss": 2.7644, + "step": 10929 + }, + { + "epoch": 0.3241111407644634, + "grad_norm": 0.1485934853553772, + "learning_rate": 0.0007715660456024578, + "loss": 2.7758, + "step": 10930 + }, + { + "epoch": 0.3241407941167749, + "grad_norm": 0.13813987374305725, + "learning_rate": 0.0007715265381895179, + "loss": 2.7422, + "step": 10931 + }, + { + "epoch": 0.32417044746908635, + "grad_norm": 0.1290975660085678, + "learning_rate": 0.0007714870283721637, + "loss": 2.7532, + "step": 10932 + }, + { + "epoch": 0.3242001008213979, + "grad_norm": 0.13218937814235687, + "learning_rate": 0.0007714475161507455, + "loss": 2.7149, + "step": 10933 + }, + { + "epoch": 0.32422975417370936, + "grad_norm": 0.1332622617483139, + "learning_rate": 0.0007714080015256129, + "loss": 2.7345, + "step": 10934 + }, + { + "epoch": 0.32425940752602084, + "grad_norm": 0.128301739692688, + "learning_rate": 0.0007713684844971157, + "loss": 2.7372, + "step": 10935 + }, + { + "epoch": 0.3242890608783323, + "grad_norm": 0.14847779273986816, + "learning_rate": 0.0007713289650656041, + "loss": 2.7682, + "step": 10936 + }, + { + "epoch": 0.3243187142306438, + "grad_norm": 0.14421100914478302, + "learning_rate": 0.0007712894432314279, + "loss": 2.8075, + "step": 10937 + }, + { + "epoch": 0.32434836758295527, + "grad_norm": 0.1378175914287567, + "learning_rate": 0.0007712499189949371, + "loss": 2.7358, + "step": 10938 + }, + { + "epoch": 0.32437802093526674, + "grad_norm": 0.119951531291008, + "learning_rate": 0.0007712103923564819, + "loss": 2.737, + "step": 10939 + }, + { + "epoch": 0.3244076742875782, + "grad_norm": 0.12143946439027786, + "learning_rate": 0.0007711708633164118, + "loss": 2.7261, + "step": 10940 + }, + { + "epoch": 0.3244373276398897, + "grad_norm": 0.13153903186321259, + "learning_rate": 0.0007711313318750774, + "loss": 2.7535, + "step": 10941 + }, + { + "epoch": 0.32446698099220117, + "grad_norm": 0.13478849828243256, + "learning_rate": 0.0007710917980328285, + "loss": 2.74, + "step": 10942 + }, + { + "epoch": 0.32449663434451265, + "grad_norm": 0.12920930981636047, + "learning_rate": 0.0007710522617900152, + "loss": 2.7371, + "step": 10943 + }, + { + "epoch": 0.3245262876968241, + "grad_norm": 0.15512225031852722, + "learning_rate": 0.0007710127231469876, + "loss": 2.7211, + "step": 10944 + }, + { + "epoch": 0.3245559410491356, + "grad_norm": 0.1641542911529541, + "learning_rate": 0.0007709731821040956, + "loss": 2.7747, + "step": 10945 + }, + { + "epoch": 0.3245855944014471, + "grad_norm": 0.16072313487529755, + "learning_rate": 0.0007709336386616898, + "loss": 2.7586, + "step": 10946 + }, + { + "epoch": 0.32461524775375855, + "grad_norm": 0.13642443716526031, + "learning_rate": 0.00077089409282012, + "loss": 2.7037, + "step": 10947 + }, + { + "epoch": 0.32464490110607, + "grad_norm": 0.16539232432842255, + "learning_rate": 0.0007708545445797366, + "loss": 2.7658, + "step": 10948 + }, + { + "epoch": 0.3246745544583815, + "grad_norm": 0.14109648764133453, + "learning_rate": 0.0007708149939408898, + "loss": 2.7791, + "step": 10949 + }, + { + "epoch": 0.324704207810693, + "grad_norm": 0.132168248295784, + "learning_rate": 0.0007707754409039296, + "loss": 2.7632, + "step": 10950 + }, + { + "epoch": 0.32473386116300446, + "grad_norm": 0.14687548577785492, + "learning_rate": 0.0007707358854692064, + "loss": 2.7545, + "step": 10951 + }, + { + "epoch": 0.32476351451531593, + "grad_norm": 0.1336919665336609, + "learning_rate": 0.0007706963276370704, + "loss": 2.6802, + "step": 10952 + }, + { + "epoch": 0.3247931678676274, + "grad_norm": 0.11208511143922806, + "learning_rate": 0.0007706567674078719, + "loss": 2.7397, + "step": 10953 + }, + { + "epoch": 0.32482282121993894, + "grad_norm": 0.11986538022756577, + "learning_rate": 0.0007706172047819615, + "loss": 2.7405, + "step": 10954 + }, + { + "epoch": 0.3248524745722504, + "grad_norm": 0.112055703997612, + "learning_rate": 0.0007705776397596893, + "loss": 2.7403, + "step": 10955 + }, + { + "epoch": 0.3248821279245619, + "grad_norm": 0.12417983263731003, + "learning_rate": 0.0007705380723414055, + "loss": 2.7421, + "step": 10956 + }, + { + "epoch": 0.32491178127687337, + "grad_norm": 0.1476924568414688, + "learning_rate": 0.0007704985025274607, + "loss": 2.7344, + "step": 10957 + }, + { + "epoch": 0.32494143462918484, + "grad_norm": 0.1283760666847229, + "learning_rate": 0.0007704589303182051, + "loss": 2.7341, + "step": 10958 + }, + { + "epoch": 0.3249710879814963, + "grad_norm": 0.13200125098228455, + "learning_rate": 0.0007704193557139893, + "loss": 2.7733, + "step": 10959 + }, + { + "epoch": 0.3250007413338078, + "grad_norm": 0.12869639694690704, + "learning_rate": 0.0007703797787151638, + "loss": 2.7356, + "step": 10960 + }, + { + "epoch": 0.3250303946861193, + "grad_norm": 0.11314135789871216, + "learning_rate": 0.000770340199322079, + "loss": 2.7319, + "step": 10961 + }, + { + "epoch": 0.32506004803843075, + "grad_norm": 0.11569339781999588, + "learning_rate": 0.0007703006175350853, + "loss": 2.7267, + "step": 10962 + }, + { + "epoch": 0.3250897013907422, + "grad_norm": 0.11204716563224792, + "learning_rate": 0.0007702610333545333, + "loss": 2.7711, + "step": 10963 + }, + { + "epoch": 0.3251193547430537, + "grad_norm": 0.11792299896478653, + "learning_rate": 0.0007702214467807732, + "loss": 2.7756, + "step": 10964 + }, + { + "epoch": 0.3251490080953652, + "grad_norm": 0.10708655416965485, + "learning_rate": 0.0007701818578141559, + "loss": 2.7461, + "step": 10965 + }, + { + "epoch": 0.32517866144767665, + "grad_norm": 0.13464438915252686, + "learning_rate": 0.0007701422664550318, + "loss": 2.7512, + "step": 10966 + }, + { + "epoch": 0.32520831479998813, + "grad_norm": 0.13959786295890808, + "learning_rate": 0.0007701026727037518, + "loss": 2.7568, + "step": 10967 + }, + { + "epoch": 0.3252379681522996, + "grad_norm": 0.13341711461544037, + "learning_rate": 0.0007700630765606661, + "loss": 2.7577, + "step": 10968 + }, + { + "epoch": 0.3252676215046111, + "grad_norm": 0.13862916827201843, + "learning_rate": 0.0007700234780261255, + "loss": 2.7502, + "step": 10969 + }, + { + "epoch": 0.32529727485692256, + "grad_norm": 0.13256973028182983, + "learning_rate": 0.0007699838771004808, + "loss": 2.7577, + "step": 10970 + }, + { + "epoch": 0.32532692820923403, + "grad_norm": 0.12722761929035187, + "learning_rate": 0.0007699442737840823, + "loss": 2.7743, + "step": 10971 + }, + { + "epoch": 0.3253565815615455, + "grad_norm": 0.14271385967731476, + "learning_rate": 0.0007699046680772811, + "loss": 2.7416, + "step": 10972 + }, + { + "epoch": 0.325386234913857, + "grad_norm": 0.15490812063217163, + "learning_rate": 0.0007698650599804276, + "loss": 2.7891, + "step": 10973 + }, + { + "epoch": 0.32541588826616846, + "grad_norm": 0.1722327023744583, + "learning_rate": 0.0007698254494938728, + "loss": 2.7812, + "step": 10974 + }, + { + "epoch": 0.32544554161848, + "grad_norm": 0.158044695854187, + "learning_rate": 0.0007697858366179673, + "loss": 2.7381, + "step": 10975 + }, + { + "epoch": 0.32547519497079147, + "grad_norm": 0.16046084463596344, + "learning_rate": 0.0007697462213530619, + "loss": 2.7436, + "step": 10976 + }, + { + "epoch": 0.32550484832310295, + "grad_norm": 0.1416674554347992, + "learning_rate": 0.0007697066036995074, + "loss": 2.735, + "step": 10977 + }, + { + "epoch": 0.3255345016754144, + "grad_norm": 0.13522258400917053, + "learning_rate": 0.0007696669836576547, + "loss": 2.7424, + "step": 10978 + }, + { + "epoch": 0.3255641550277259, + "grad_norm": 0.14505304396152496, + "learning_rate": 0.0007696273612278543, + "loss": 2.7248, + "step": 10979 + }, + { + "epoch": 0.3255938083800374, + "grad_norm": 0.13898953795433044, + "learning_rate": 0.0007695877364104576, + "loss": 2.7195, + "step": 10980 + }, + { + "epoch": 0.32562346173234885, + "grad_norm": 0.13313688337802887, + "learning_rate": 0.0007695481092058152, + "loss": 2.7571, + "step": 10981 + }, + { + "epoch": 0.3256531150846603, + "grad_norm": 0.14759935438632965, + "learning_rate": 0.0007695084796142779, + "loss": 2.7335, + "step": 10982 + }, + { + "epoch": 0.3256827684369718, + "grad_norm": 0.1632157266139984, + "learning_rate": 0.0007694688476361968, + "loss": 2.744, + "step": 10983 + }, + { + "epoch": 0.3257124217892833, + "grad_norm": 0.15911678969860077, + "learning_rate": 0.000769429213271923, + "loss": 2.7531, + "step": 10984 + }, + { + "epoch": 0.32574207514159476, + "grad_norm": 0.13457387685775757, + "learning_rate": 0.0007693895765218071, + "loss": 2.7574, + "step": 10985 + }, + { + "epoch": 0.32577172849390623, + "grad_norm": 0.12979459762573242, + "learning_rate": 0.0007693499373862001, + "loss": 2.7648, + "step": 10986 + }, + { + "epoch": 0.3258013818462177, + "grad_norm": 0.12970900535583496, + "learning_rate": 0.0007693102958654534, + "loss": 2.7642, + "step": 10987 + }, + { + "epoch": 0.3258310351985292, + "grad_norm": 0.13717783987522125, + "learning_rate": 0.0007692706519599178, + "loss": 2.7341, + "step": 10988 + }, + { + "epoch": 0.32586068855084066, + "grad_norm": 0.12566913664340973, + "learning_rate": 0.0007692310056699443, + "loss": 2.7626, + "step": 10989 + }, + { + "epoch": 0.32589034190315214, + "grad_norm": 0.1491677314043045, + "learning_rate": 0.000769191356995884, + "loss": 2.7529, + "step": 10990 + }, + { + "epoch": 0.3259199952554636, + "grad_norm": 0.1452973335981369, + "learning_rate": 0.000769151705938088, + "loss": 2.7311, + "step": 10991 + }, + { + "epoch": 0.3259496486077751, + "grad_norm": 0.12325482815504074, + "learning_rate": 0.0007691120524969075, + "loss": 2.7591, + "step": 10992 + }, + { + "epoch": 0.32597930196008656, + "grad_norm": 0.11539129167795181, + "learning_rate": 0.0007690723966726936, + "loss": 2.7716, + "step": 10993 + }, + { + "epoch": 0.32600895531239804, + "grad_norm": 0.11424731463193893, + "learning_rate": 0.0007690327384657973, + "loss": 2.7711, + "step": 10994 + }, + { + "epoch": 0.3260386086647096, + "grad_norm": 0.12679924070835114, + "learning_rate": 0.0007689930778765701, + "loss": 2.7297, + "step": 10995 + }, + { + "epoch": 0.32606826201702105, + "grad_norm": 0.1155487522482872, + "learning_rate": 0.0007689534149053631, + "loss": 2.7657, + "step": 10996 + }, + { + "epoch": 0.3260979153693325, + "grad_norm": 0.11928165704011917, + "learning_rate": 0.0007689137495525271, + "loss": 2.7434, + "step": 10997 + }, + { + "epoch": 0.326127568721644, + "grad_norm": 0.1166486144065857, + "learning_rate": 0.000768874081818414, + "loss": 2.7465, + "step": 10998 + }, + { + "epoch": 0.3261572220739555, + "grad_norm": 0.12879322469234467, + "learning_rate": 0.0007688344117033747, + "loss": 2.7623, + "step": 10999 + }, + { + "epoch": 0.32618687542626695, + "grad_norm": 0.14586538076400757, + "learning_rate": 0.0007687947392077606, + "loss": 2.7349, + "step": 11000 + }, + { + "epoch": 0.32621652877857843, + "grad_norm": 0.1387348473072052, + "learning_rate": 0.0007687550643319228, + "loss": 2.7212, + "step": 11001 + }, + { + "epoch": 0.3262461821308899, + "grad_norm": 0.15117426216602325, + "learning_rate": 0.0007687153870762127, + "loss": 2.7825, + "step": 11002 + }, + { + "epoch": 0.3262758354832014, + "grad_norm": 0.14050054550170898, + "learning_rate": 0.0007686757074409818, + "loss": 2.7582, + "step": 11003 + }, + { + "epoch": 0.32630548883551286, + "grad_norm": 0.1312161535024643, + "learning_rate": 0.0007686360254265814, + "loss": 2.7896, + "step": 11004 + }, + { + "epoch": 0.32633514218782433, + "grad_norm": 0.14410850405693054, + "learning_rate": 0.0007685963410333631, + "loss": 2.7751, + "step": 11005 + }, + { + "epoch": 0.3263647955401358, + "grad_norm": 0.1519741714000702, + "learning_rate": 0.0007685566542616779, + "loss": 2.764, + "step": 11006 + }, + { + "epoch": 0.3263944488924473, + "grad_norm": 0.1470990628004074, + "learning_rate": 0.0007685169651118774, + "loss": 2.7563, + "step": 11007 + }, + { + "epoch": 0.32642410224475876, + "grad_norm": 0.1275092214345932, + "learning_rate": 0.000768477273584313, + "loss": 2.7465, + "step": 11008 + }, + { + "epoch": 0.32645375559707024, + "grad_norm": 0.13543222844600677, + "learning_rate": 0.0007684375796793365, + "loss": 2.7665, + "step": 11009 + }, + { + "epoch": 0.3264834089493817, + "grad_norm": 0.14821983873844147, + "learning_rate": 0.0007683978833972991, + "loss": 2.7365, + "step": 11010 + }, + { + "epoch": 0.3265130623016932, + "grad_norm": 0.13312844932079315, + "learning_rate": 0.0007683581847385523, + "loss": 2.7116, + "step": 11011 + }, + { + "epoch": 0.32654271565400467, + "grad_norm": 0.12225250899791718, + "learning_rate": 0.0007683184837034476, + "loss": 2.754, + "step": 11012 + }, + { + "epoch": 0.32657236900631614, + "grad_norm": 0.12070967257022858, + "learning_rate": 0.0007682787802923368, + "loss": 2.7349, + "step": 11013 + }, + { + "epoch": 0.3266020223586276, + "grad_norm": 0.13122807443141937, + "learning_rate": 0.0007682390745055714, + "loss": 2.7618, + "step": 11014 + }, + { + "epoch": 0.3266316757109391, + "grad_norm": 0.13155223429203033, + "learning_rate": 0.000768199366343503, + "loss": 2.772, + "step": 11015 + }, + { + "epoch": 0.3266613290632506, + "grad_norm": 0.12593971192836761, + "learning_rate": 0.0007681596558064829, + "loss": 2.7567, + "step": 11016 + }, + { + "epoch": 0.3266909824155621, + "grad_norm": 0.14955773949623108, + "learning_rate": 0.0007681199428948633, + "loss": 2.7691, + "step": 11017 + }, + { + "epoch": 0.3267206357678736, + "grad_norm": 0.15699225664138794, + "learning_rate": 0.0007680802276089954, + "loss": 2.7505, + "step": 11018 + }, + { + "epoch": 0.32675028912018506, + "grad_norm": 0.14197251200675964, + "learning_rate": 0.0007680405099492312, + "loss": 2.7722, + "step": 11019 + }, + { + "epoch": 0.32677994247249653, + "grad_norm": 0.13472750782966614, + "learning_rate": 0.0007680007899159222, + "loss": 2.7453, + "step": 11020 + }, + { + "epoch": 0.326809595824808, + "grad_norm": 0.1567811369895935, + "learning_rate": 0.0007679610675094202, + "loss": 2.7342, + "step": 11021 + }, + { + "epoch": 0.3268392491771195, + "grad_norm": 0.12813802063465118, + "learning_rate": 0.000767921342730077, + "loss": 2.7257, + "step": 11022 + }, + { + "epoch": 0.32686890252943096, + "grad_norm": 0.12382154166698456, + "learning_rate": 0.0007678816155782442, + "loss": 2.7343, + "step": 11023 + }, + { + "epoch": 0.32689855588174244, + "grad_norm": 0.15733270347118378, + "learning_rate": 0.0007678418860542738, + "loss": 2.7413, + "step": 11024 + }, + { + "epoch": 0.3269282092340539, + "grad_norm": 0.18121948838233948, + "learning_rate": 0.0007678021541585176, + "loss": 2.7495, + "step": 11025 + }, + { + "epoch": 0.3269578625863654, + "grad_norm": 0.20428243279457092, + "learning_rate": 0.0007677624198913273, + "loss": 2.7403, + "step": 11026 + }, + { + "epoch": 0.32698751593867686, + "grad_norm": 0.13961894810199738, + "learning_rate": 0.0007677226832530548, + "loss": 2.7415, + "step": 11027 + }, + { + "epoch": 0.32701716929098834, + "grad_norm": 0.1512463092803955, + "learning_rate": 0.0007676829442440521, + "loss": 2.765, + "step": 11028 + }, + { + "epoch": 0.3270468226432998, + "grad_norm": 0.1475515067577362, + "learning_rate": 0.0007676432028646707, + "loss": 2.7593, + "step": 11029 + }, + { + "epoch": 0.3270764759956113, + "grad_norm": 0.13077901303768158, + "learning_rate": 0.000767603459115263, + "loss": 2.7578, + "step": 11030 + }, + { + "epoch": 0.32710612934792277, + "grad_norm": 0.12773366272449493, + "learning_rate": 0.0007675637129961807, + "loss": 2.7129, + "step": 11031 + }, + { + "epoch": 0.32713578270023425, + "grad_norm": 0.1397845298051834, + "learning_rate": 0.0007675239645077758, + "loss": 2.7675, + "step": 11032 + }, + { + "epoch": 0.3271654360525457, + "grad_norm": 0.13605941832065582, + "learning_rate": 0.0007674842136504003, + "loss": 2.7412, + "step": 11033 + }, + { + "epoch": 0.3271950894048572, + "grad_norm": 0.1288934350013733, + "learning_rate": 0.0007674444604244062, + "loss": 2.7608, + "step": 11034 + }, + { + "epoch": 0.3272247427571687, + "grad_norm": 0.12241372466087341, + "learning_rate": 0.0007674047048301455, + "loss": 2.7413, + "step": 11035 + }, + { + "epoch": 0.32725439610948015, + "grad_norm": 0.12390866130590439, + "learning_rate": 0.00076736494686797, + "loss": 2.7621, + "step": 11036 + }, + { + "epoch": 0.3272840494617917, + "grad_norm": 0.10363132506608963, + "learning_rate": 0.0007673251865382323, + "loss": 2.7518, + "step": 11037 + }, + { + "epoch": 0.32731370281410316, + "grad_norm": 0.1147899478673935, + "learning_rate": 0.000767285423841284, + "loss": 2.7326, + "step": 11038 + }, + { + "epoch": 0.32734335616641463, + "grad_norm": 0.11770681291818619, + "learning_rate": 0.0007672456587774775, + "loss": 2.732, + "step": 11039 + }, + { + "epoch": 0.3273730095187261, + "grad_norm": 0.13662174344062805, + "learning_rate": 0.0007672058913471649, + "loss": 2.7293, + "step": 11040 + }, + { + "epoch": 0.3274026628710376, + "grad_norm": 0.14479081332683563, + "learning_rate": 0.0007671661215506981, + "loss": 2.7516, + "step": 11041 + }, + { + "epoch": 0.32743231622334906, + "grad_norm": 0.14236560463905334, + "learning_rate": 0.0007671263493884293, + "loss": 2.7627, + "step": 11042 + }, + { + "epoch": 0.32746196957566054, + "grad_norm": 0.13602101802825928, + "learning_rate": 0.0007670865748607112, + "loss": 2.7477, + "step": 11043 + }, + { + "epoch": 0.327491622927972, + "grad_norm": 0.12502636015415192, + "learning_rate": 0.0007670467979678955, + "loss": 2.7399, + "step": 11044 + }, + { + "epoch": 0.3275212762802835, + "grad_norm": 0.12474311143159866, + "learning_rate": 0.0007670070187103344, + "loss": 2.7535, + "step": 11045 + }, + { + "epoch": 0.32755092963259497, + "grad_norm": 0.14747503399848938, + "learning_rate": 0.0007669672370883804, + "loss": 2.7373, + "step": 11046 + }, + { + "epoch": 0.32758058298490644, + "grad_norm": 0.13599549233913422, + "learning_rate": 0.0007669274531023857, + "loss": 2.7549, + "step": 11047 + }, + { + "epoch": 0.3276102363372179, + "grad_norm": 0.13195441663265228, + "learning_rate": 0.0007668876667527027, + "loss": 2.7184, + "step": 11048 + }, + { + "epoch": 0.3276398896895294, + "grad_norm": 0.12315557152032852, + "learning_rate": 0.0007668478780396835, + "loss": 2.7547, + "step": 11049 + }, + { + "epoch": 0.32766954304184087, + "grad_norm": 0.11940643191337585, + "learning_rate": 0.0007668080869636805, + "loss": 2.7516, + "step": 11050 + }, + { + "epoch": 0.32769919639415235, + "grad_norm": 0.1122424304485321, + "learning_rate": 0.0007667682935250462, + "loss": 2.7479, + "step": 11051 + }, + { + "epoch": 0.3277288497464638, + "grad_norm": 0.1317882239818573, + "learning_rate": 0.0007667284977241328, + "loss": 2.7189, + "step": 11052 + }, + { + "epoch": 0.3277585030987753, + "grad_norm": 0.12703543901443481, + "learning_rate": 0.0007666886995612928, + "loss": 2.7438, + "step": 11053 + }, + { + "epoch": 0.3277881564510868, + "grad_norm": 0.14338859915733337, + "learning_rate": 0.0007666488990368786, + "loss": 2.7633, + "step": 11054 + }, + { + "epoch": 0.32781780980339825, + "grad_norm": 0.1464366763830185, + "learning_rate": 0.0007666090961512425, + "loss": 2.7411, + "step": 11055 + }, + { + "epoch": 0.32784746315570973, + "grad_norm": 0.14666153490543365, + "learning_rate": 0.0007665692909047373, + "loss": 2.7539, + "step": 11056 + }, + { + "epoch": 0.3278771165080212, + "grad_norm": 0.14154231548309326, + "learning_rate": 0.000766529483297715, + "loss": 2.7574, + "step": 11057 + }, + { + "epoch": 0.32790676986033274, + "grad_norm": 0.14417129755020142, + "learning_rate": 0.0007664896733305287, + "loss": 2.7621, + "step": 11058 + }, + { + "epoch": 0.3279364232126442, + "grad_norm": 0.15504977107048035, + "learning_rate": 0.0007664498610035303, + "loss": 2.7813, + "step": 11059 + }, + { + "epoch": 0.3279660765649557, + "grad_norm": 0.1619531661272049, + "learning_rate": 0.0007664100463170729, + "loss": 2.7667, + "step": 11060 + }, + { + "epoch": 0.32799572991726716, + "grad_norm": 0.16005587577819824, + "learning_rate": 0.0007663702292715087, + "loss": 2.7343, + "step": 11061 + }, + { + "epoch": 0.32802538326957864, + "grad_norm": 0.1652931421995163, + "learning_rate": 0.0007663304098671903, + "loss": 2.7643, + "step": 11062 + }, + { + "epoch": 0.3280550366218901, + "grad_norm": 0.11930812895298004, + "learning_rate": 0.0007662905881044705, + "loss": 2.7381, + "step": 11063 + }, + { + "epoch": 0.3280846899742016, + "grad_norm": 0.11940357089042664, + "learning_rate": 0.0007662507639837017, + "loss": 2.7306, + "step": 11064 + }, + { + "epoch": 0.32811434332651307, + "grad_norm": 0.126410573720932, + "learning_rate": 0.0007662109375052371, + "loss": 2.7504, + "step": 11065 + }, + { + "epoch": 0.32814399667882455, + "grad_norm": 0.13420799374580383, + "learning_rate": 0.0007661711086694286, + "loss": 2.7447, + "step": 11066 + }, + { + "epoch": 0.328173650031136, + "grad_norm": 0.12023168057203293, + "learning_rate": 0.0007661312774766293, + "loss": 2.7291, + "step": 11067 + }, + { + "epoch": 0.3282033033834475, + "grad_norm": 0.1060110554099083, + "learning_rate": 0.0007660914439271918, + "loss": 2.7403, + "step": 11068 + }, + { + "epoch": 0.328232956735759, + "grad_norm": 0.13429278135299683, + "learning_rate": 0.000766051608021469, + "loss": 2.7402, + "step": 11069 + }, + { + "epoch": 0.32826261008807045, + "grad_norm": 0.15310271084308624, + "learning_rate": 0.0007660117697598134, + "loss": 2.751, + "step": 11070 + }, + { + "epoch": 0.3282922634403819, + "grad_norm": 0.16842173039913177, + "learning_rate": 0.0007659719291425781, + "loss": 2.7591, + "step": 11071 + }, + { + "epoch": 0.3283219167926934, + "grad_norm": 0.17569595575332642, + "learning_rate": 0.0007659320861701156, + "loss": 2.7394, + "step": 11072 + }, + { + "epoch": 0.3283515701450049, + "grad_norm": 0.14365442097187042, + "learning_rate": 0.0007658922408427789, + "loss": 2.7035, + "step": 11073 + }, + { + "epoch": 0.32838122349731635, + "grad_norm": 0.1281736046075821, + "learning_rate": 0.0007658523931609207, + "loss": 2.7356, + "step": 11074 + }, + { + "epoch": 0.32841087684962783, + "grad_norm": 0.1651373654603958, + "learning_rate": 0.0007658125431248938, + "loss": 2.7624, + "step": 11075 + }, + { + "epoch": 0.3284405302019393, + "grad_norm": 0.1514696627855301, + "learning_rate": 0.0007657726907350515, + "loss": 2.7451, + "step": 11076 + }, + { + "epoch": 0.3284701835542508, + "grad_norm": 0.13940490782260895, + "learning_rate": 0.0007657328359917464, + "loss": 2.704, + "step": 11077 + }, + { + "epoch": 0.32849983690656226, + "grad_norm": 0.1393602341413498, + "learning_rate": 0.0007656929788953313, + "loss": 2.7613, + "step": 11078 + }, + { + "epoch": 0.3285294902588738, + "grad_norm": 0.1461511254310608, + "learning_rate": 0.0007656531194461593, + "loss": 2.7529, + "step": 11079 + }, + { + "epoch": 0.32855914361118527, + "grad_norm": 0.14430202543735504, + "learning_rate": 0.0007656132576445831, + "loss": 2.7671, + "step": 11080 + }, + { + "epoch": 0.32858879696349674, + "grad_norm": 0.1311318576335907, + "learning_rate": 0.0007655733934909562, + "loss": 2.7446, + "step": 11081 + }, + { + "epoch": 0.3286184503158082, + "grad_norm": 0.12996241450309753, + "learning_rate": 0.0007655335269856311, + "loss": 2.742, + "step": 11082 + }, + { + "epoch": 0.3286481036681197, + "grad_norm": 0.11563955247402191, + "learning_rate": 0.0007654936581289613, + "loss": 2.7489, + "step": 11083 + }, + { + "epoch": 0.32867775702043117, + "grad_norm": 0.1108771339058876, + "learning_rate": 0.0007654537869212994, + "loss": 2.7301, + "step": 11084 + }, + { + "epoch": 0.32870741037274265, + "grad_norm": 0.13029424846172333, + "learning_rate": 0.0007654139133629987, + "loss": 2.7848, + "step": 11085 + }, + { + "epoch": 0.3287370637250541, + "grad_norm": 0.11859948188066483, + "learning_rate": 0.0007653740374544123, + "loss": 2.7469, + "step": 11086 + }, + { + "epoch": 0.3287667170773656, + "grad_norm": 0.119631826877594, + "learning_rate": 0.0007653341591958931, + "loss": 2.7479, + "step": 11087 + }, + { + "epoch": 0.3287963704296771, + "grad_norm": 0.12381751835346222, + "learning_rate": 0.0007652942785877945, + "loss": 2.7482, + "step": 11088 + }, + { + "epoch": 0.32882602378198855, + "grad_norm": 0.12228722870349884, + "learning_rate": 0.0007652543956304694, + "loss": 2.773, + "step": 11089 + }, + { + "epoch": 0.32885567713430003, + "grad_norm": 0.11309994012117386, + "learning_rate": 0.0007652145103242712, + "loss": 2.7346, + "step": 11090 + }, + { + "epoch": 0.3288853304866115, + "grad_norm": 0.12073270976543427, + "learning_rate": 0.0007651746226695529, + "loss": 2.7251, + "step": 11091 + }, + { + "epoch": 0.328914983838923, + "grad_norm": 0.11433033645153046, + "learning_rate": 0.000765134732666668, + "loss": 2.7384, + "step": 11092 + }, + { + "epoch": 0.32894463719123446, + "grad_norm": 0.12066473811864853, + "learning_rate": 0.0007650948403159694, + "loss": 2.7624, + "step": 11093 + }, + { + "epoch": 0.32897429054354593, + "grad_norm": 0.16884420812129974, + "learning_rate": 0.0007650549456178104, + "loss": 2.7282, + "step": 11094 + }, + { + "epoch": 0.3290039438958574, + "grad_norm": 0.18624791502952576, + "learning_rate": 0.0007650150485725445, + "loss": 2.7338, + "step": 11095 + }, + { + "epoch": 0.3290335972481689, + "grad_norm": 0.1657288521528244, + "learning_rate": 0.0007649751491805248, + "loss": 2.7479, + "step": 11096 + }, + { + "epoch": 0.32906325060048036, + "grad_norm": 0.14784473180770874, + "learning_rate": 0.0007649352474421047, + "loss": 2.75, + "step": 11097 + }, + { + "epoch": 0.32909290395279184, + "grad_norm": 0.15768779814243317, + "learning_rate": 0.0007648953433576376, + "loss": 2.7371, + "step": 11098 + }, + { + "epoch": 0.32912255730510337, + "grad_norm": 0.17978975176811218, + "learning_rate": 0.0007648554369274765, + "loss": 2.7903, + "step": 11099 + }, + { + "epoch": 0.32915221065741485, + "grad_norm": 0.18475022912025452, + "learning_rate": 0.0007648155281519751, + "loss": 2.769, + "step": 11100 + }, + { + "epoch": 0.3291818640097263, + "grad_norm": 0.18153834342956543, + "learning_rate": 0.0007647756170314868, + "loss": 2.7405, + "step": 11101 + }, + { + "epoch": 0.3292115173620378, + "grad_norm": 0.18507319688796997, + "learning_rate": 0.0007647357035663651, + "loss": 2.7671, + "step": 11102 + }, + { + "epoch": 0.3292411707143493, + "grad_norm": 0.16976167261600494, + "learning_rate": 0.0007646957877569632, + "loss": 2.7665, + "step": 11103 + }, + { + "epoch": 0.32927082406666075, + "grad_norm": 0.16365082561969757, + "learning_rate": 0.0007646558696036348, + "loss": 2.761, + "step": 11104 + }, + { + "epoch": 0.3293004774189722, + "grad_norm": 0.14914420247077942, + "learning_rate": 0.000764615949106733, + "loss": 2.7481, + "step": 11105 + }, + { + "epoch": 0.3293301307712837, + "grad_norm": 0.1325032263994217, + "learning_rate": 0.0007645760262666117, + "loss": 2.7414, + "step": 11106 + }, + { + "epoch": 0.3293597841235952, + "grad_norm": 0.13321244716644287, + "learning_rate": 0.0007645361010836241, + "loss": 2.6854, + "step": 11107 + }, + { + "epoch": 0.32938943747590665, + "grad_norm": 0.1257922351360321, + "learning_rate": 0.0007644961735581241, + "loss": 2.7044, + "step": 11108 + }, + { + "epoch": 0.32941909082821813, + "grad_norm": 0.12718117237091064, + "learning_rate": 0.0007644562436904652, + "loss": 2.764, + "step": 11109 + }, + { + "epoch": 0.3294487441805296, + "grad_norm": 0.10811848193407059, + "learning_rate": 0.0007644163114810006, + "loss": 2.7263, + "step": 11110 + }, + { + "epoch": 0.3294783975328411, + "grad_norm": 0.12370259314775467, + "learning_rate": 0.0007643763769300842, + "loss": 2.7756, + "step": 11111 + }, + { + "epoch": 0.32950805088515256, + "grad_norm": 0.12085644900798798, + "learning_rate": 0.0007643364400380698, + "loss": 2.7572, + "step": 11112 + }, + { + "epoch": 0.32953770423746404, + "grad_norm": 0.1306702196598053, + "learning_rate": 0.0007642965008053107, + "loss": 2.7697, + "step": 11113 + }, + { + "epoch": 0.3295673575897755, + "grad_norm": 0.10683909803628922, + "learning_rate": 0.0007642565592321607, + "loss": 2.7285, + "step": 11114 + }, + { + "epoch": 0.329597010942087, + "grad_norm": 0.12119497358798981, + "learning_rate": 0.0007642166153189736, + "loss": 2.7971, + "step": 11115 + }, + { + "epoch": 0.32962666429439846, + "grad_norm": 0.11995162069797516, + "learning_rate": 0.000764176669066103, + "loss": 2.7537, + "step": 11116 + }, + { + "epoch": 0.32965631764670994, + "grad_norm": 0.11823147535324097, + "learning_rate": 0.0007641367204739027, + "loss": 2.7406, + "step": 11117 + }, + { + "epoch": 0.3296859709990214, + "grad_norm": 0.12888461351394653, + "learning_rate": 0.0007640967695427263, + "loss": 2.7299, + "step": 11118 + }, + { + "epoch": 0.3297156243513329, + "grad_norm": 0.12639553844928741, + "learning_rate": 0.0007640568162729277, + "loss": 2.7468, + "step": 11119 + }, + { + "epoch": 0.3297452777036444, + "grad_norm": 0.11957091838121414, + "learning_rate": 0.0007640168606648606, + "loss": 2.7734, + "step": 11120 + }, + { + "epoch": 0.3297749310559559, + "grad_norm": 0.12159255892038345, + "learning_rate": 0.000763976902718879, + "loss": 2.7729, + "step": 11121 + }, + { + "epoch": 0.3298045844082674, + "grad_norm": 0.13067777454853058, + "learning_rate": 0.0007639369424353366, + "loss": 2.7682, + "step": 11122 + }, + { + "epoch": 0.32983423776057885, + "grad_norm": 0.13159874081611633, + "learning_rate": 0.0007638969798145871, + "loss": 2.7223, + "step": 11123 + }, + { + "epoch": 0.32986389111289033, + "grad_norm": 0.12803593277931213, + "learning_rate": 0.0007638570148569847, + "loss": 2.7356, + "step": 11124 + }, + { + "epoch": 0.3298935444652018, + "grad_norm": 0.12980255484580994, + "learning_rate": 0.0007638170475628832, + "loss": 2.7382, + "step": 11125 + }, + { + "epoch": 0.3299231978175133, + "grad_norm": 0.15378957986831665, + "learning_rate": 0.0007637770779326364, + "loss": 2.7582, + "step": 11126 + }, + { + "epoch": 0.32995285116982476, + "grad_norm": 0.15844304859638214, + "learning_rate": 0.0007637371059665982, + "loss": 2.741, + "step": 11127 + }, + { + "epoch": 0.32998250452213623, + "grad_norm": 0.15398120880126953, + "learning_rate": 0.0007636971316651228, + "loss": 2.7788, + "step": 11128 + }, + { + "epoch": 0.3300121578744477, + "grad_norm": 0.16592426598072052, + "learning_rate": 0.000763657155028564, + "loss": 2.7754, + "step": 11129 + }, + { + "epoch": 0.3300418112267592, + "grad_norm": 0.16598336398601532, + "learning_rate": 0.0007636171760572759, + "loss": 2.7401, + "step": 11130 + }, + { + "epoch": 0.33007146457907066, + "grad_norm": 0.1526591032743454, + "learning_rate": 0.0007635771947516124, + "loss": 2.7578, + "step": 11131 + }, + { + "epoch": 0.33010111793138214, + "grad_norm": 0.14661116898059845, + "learning_rate": 0.0007635372111119276, + "loss": 2.7491, + "step": 11132 + }, + { + "epoch": 0.3301307712836936, + "grad_norm": 0.13705319166183472, + "learning_rate": 0.0007634972251385755, + "loss": 2.7438, + "step": 11133 + }, + { + "epoch": 0.3301604246360051, + "grad_norm": 0.1325734406709671, + "learning_rate": 0.0007634572368319101, + "loss": 2.7645, + "step": 11134 + }, + { + "epoch": 0.33019007798831657, + "grad_norm": 0.12712156772613525, + "learning_rate": 0.0007634172461922859, + "loss": 2.7532, + "step": 11135 + }, + { + "epoch": 0.33021973134062804, + "grad_norm": 0.13255208730697632, + "learning_rate": 0.0007633772532200568, + "loss": 2.7467, + "step": 11136 + }, + { + "epoch": 0.3302493846929395, + "grad_norm": 0.13047976791858673, + "learning_rate": 0.0007633372579155768, + "loss": 2.7358, + "step": 11137 + }, + { + "epoch": 0.330279038045251, + "grad_norm": 0.12084564566612244, + "learning_rate": 0.0007632972602792002, + "loss": 2.765, + "step": 11138 + }, + { + "epoch": 0.33030869139756247, + "grad_norm": 0.11749053746461868, + "learning_rate": 0.000763257260311281, + "loss": 2.7309, + "step": 11139 + }, + { + "epoch": 0.33033834474987395, + "grad_norm": 0.1185239851474762, + "learning_rate": 0.0007632172580121738, + "loss": 2.7362, + "step": 11140 + }, + { + "epoch": 0.3303679981021855, + "grad_norm": 0.13286776840686798, + "learning_rate": 0.0007631772533822325, + "loss": 2.7615, + "step": 11141 + }, + { + "epoch": 0.33039765145449695, + "grad_norm": 0.12163635343313217, + "learning_rate": 0.0007631372464218116, + "loss": 2.7369, + "step": 11142 + }, + { + "epoch": 0.33042730480680843, + "grad_norm": 0.11021443456411362, + "learning_rate": 0.000763097237131265, + "loss": 2.7958, + "step": 11143 + }, + { + "epoch": 0.3304569581591199, + "grad_norm": 0.13004693388938904, + "learning_rate": 0.0007630572255109474, + "loss": 2.741, + "step": 11144 + }, + { + "epoch": 0.3304866115114314, + "grad_norm": 0.130162313580513, + "learning_rate": 0.0007630172115612127, + "loss": 2.7732, + "step": 11145 + }, + { + "epoch": 0.33051626486374286, + "grad_norm": 0.13484738767147064, + "learning_rate": 0.0007629771952824155, + "loss": 2.7532, + "step": 11146 + }, + { + "epoch": 0.33054591821605434, + "grad_norm": 0.1584932506084442, + "learning_rate": 0.0007629371766749103, + "loss": 2.755, + "step": 11147 + }, + { + "epoch": 0.3305755715683658, + "grad_norm": 0.14771471917629242, + "learning_rate": 0.000762897155739051, + "loss": 2.7867, + "step": 11148 + }, + { + "epoch": 0.3306052249206773, + "grad_norm": 0.13858292996883392, + "learning_rate": 0.0007628571324751925, + "loss": 2.715, + "step": 11149 + }, + { + "epoch": 0.33063487827298876, + "grad_norm": 0.11474978923797607, + "learning_rate": 0.0007628171068836888, + "loss": 2.7553, + "step": 11150 + }, + { + "epoch": 0.33066453162530024, + "grad_norm": 0.12286955118179321, + "learning_rate": 0.0007627770789648945, + "loss": 2.7519, + "step": 11151 + }, + { + "epoch": 0.3306941849776117, + "grad_norm": 0.1428145319223404, + "learning_rate": 0.0007627370487191642, + "loss": 2.7136, + "step": 11152 + }, + { + "epoch": 0.3307238383299232, + "grad_norm": 0.13652539253234863, + "learning_rate": 0.0007626970161468521, + "loss": 2.7087, + "step": 11153 + }, + { + "epoch": 0.33075349168223467, + "grad_norm": 0.1397078037261963, + "learning_rate": 0.0007626569812483129, + "loss": 2.7418, + "step": 11154 + }, + { + "epoch": 0.33078314503454614, + "grad_norm": 0.16342611610889435, + "learning_rate": 0.0007626169440239011, + "loss": 2.7618, + "step": 11155 + }, + { + "epoch": 0.3308127983868576, + "grad_norm": 0.18393030762672424, + "learning_rate": 0.000762576904473971, + "loss": 2.7281, + "step": 11156 + }, + { + "epoch": 0.3308424517391691, + "grad_norm": 0.15388086438179016, + "learning_rate": 0.0007625368625988776, + "loss": 2.732, + "step": 11157 + }, + { + "epoch": 0.3308721050914806, + "grad_norm": 0.13604454696178436, + "learning_rate": 0.0007624968183989749, + "loss": 2.7855, + "step": 11158 + }, + { + "epoch": 0.33090175844379205, + "grad_norm": 0.13188962638378143, + "learning_rate": 0.000762456771874618, + "loss": 2.7528, + "step": 11159 + }, + { + "epoch": 0.3309314117961035, + "grad_norm": 0.1281210333108902, + "learning_rate": 0.0007624167230261614, + "loss": 2.7192, + "step": 11160 + }, + { + "epoch": 0.330961065148415, + "grad_norm": 0.14330415427684784, + "learning_rate": 0.0007623766718539596, + "loss": 2.7165, + "step": 11161 + }, + { + "epoch": 0.33099071850072653, + "grad_norm": 0.13482244312763214, + "learning_rate": 0.0007623366183583673, + "loss": 2.7439, + "step": 11162 + }, + { + "epoch": 0.331020371853038, + "grad_norm": 0.115603968501091, + "learning_rate": 0.0007622965625397393, + "loss": 2.7358, + "step": 11163 + }, + { + "epoch": 0.3310500252053495, + "grad_norm": 0.12920373678207397, + "learning_rate": 0.0007622565043984301, + "loss": 2.7426, + "step": 11164 + }, + { + "epoch": 0.33107967855766096, + "grad_norm": 0.14142033457756042, + "learning_rate": 0.0007622164439347945, + "loss": 2.7467, + "step": 11165 + }, + { + "epoch": 0.33110933190997244, + "grad_norm": 0.13080838322639465, + "learning_rate": 0.0007621763811491876, + "loss": 2.7151, + "step": 11166 + }, + { + "epoch": 0.3311389852622839, + "grad_norm": 0.13179224729537964, + "learning_rate": 0.0007621363160419634, + "loss": 2.7834, + "step": 11167 + }, + { + "epoch": 0.3311686386145954, + "grad_norm": 0.12738078832626343, + "learning_rate": 0.0007620962486134774, + "loss": 2.771, + "step": 11168 + }, + { + "epoch": 0.33119829196690687, + "grad_norm": 0.1312248408794403, + "learning_rate": 0.0007620561788640841, + "loss": 2.7447, + "step": 11169 + }, + { + "epoch": 0.33122794531921834, + "grad_norm": 0.13437679409980774, + "learning_rate": 0.0007620161067941384, + "loss": 2.7532, + "step": 11170 + }, + { + "epoch": 0.3312575986715298, + "grad_norm": 0.1247130036354065, + "learning_rate": 0.000761976032403995, + "loss": 2.7577, + "step": 11171 + }, + { + "epoch": 0.3312872520238413, + "grad_norm": 0.10996276140213013, + "learning_rate": 0.0007619359556940089, + "loss": 2.7707, + "step": 11172 + }, + { + "epoch": 0.33131690537615277, + "grad_norm": 0.12560437619686127, + "learning_rate": 0.000761895876664535, + "loss": 2.7488, + "step": 11173 + }, + { + "epoch": 0.33134655872846425, + "grad_norm": 0.1280205100774765, + "learning_rate": 0.0007618557953159282, + "loss": 2.7396, + "step": 11174 + }, + { + "epoch": 0.3313762120807757, + "grad_norm": 0.1433860957622528, + "learning_rate": 0.0007618157116485433, + "loss": 2.7384, + "step": 11175 + }, + { + "epoch": 0.3314058654330872, + "grad_norm": 0.1435166895389557, + "learning_rate": 0.0007617756256627353, + "loss": 2.7367, + "step": 11176 + }, + { + "epoch": 0.3314355187853987, + "grad_norm": 0.1431806981563568, + "learning_rate": 0.0007617355373588593, + "loss": 2.7739, + "step": 11177 + }, + { + "epoch": 0.33146517213771015, + "grad_norm": 0.13491596281528473, + "learning_rate": 0.0007616954467372698, + "loss": 2.7366, + "step": 11178 + }, + { + "epoch": 0.3314948254900216, + "grad_norm": 0.13042335212230682, + "learning_rate": 0.0007616553537983226, + "loss": 2.7475, + "step": 11179 + }, + { + "epoch": 0.3315244788423331, + "grad_norm": 0.1252770870923996, + "learning_rate": 0.0007616152585423724, + "loss": 2.7273, + "step": 11180 + }, + { + "epoch": 0.3315541321946446, + "grad_norm": 0.15556976199150085, + "learning_rate": 0.000761575160969774, + "loss": 2.7463, + "step": 11181 + }, + { + "epoch": 0.33158378554695606, + "grad_norm": 0.16855932772159576, + "learning_rate": 0.0007615350610808827, + "loss": 2.7299, + "step": 11182 + }, + { + "epoch": 0.3316134388992676, + "grad_norm": 0.13431823253631592, + "learning_rate": 0.0007614949588760535, + "loss": 2.7567, + "step": 11183 + }, + { + "epoch": 0.33164309225157906, + "grad_norm": 0.14778277277946472, + "learning_rate": 0.0007614548543556414, + "loss": 2.7442, + "step": 11184 + }, + { + "epoch": 0.33167274560389054, + "grad_norm": 0.1407804936170578, + "learning_rate": 0.0007614147475200019, + "loss": 2.7622, + "step": 11185 + }, + { + "epoch": 0.331702398956202, + "grad_norm": 0.13771720230579376, + "learning_rate": 0.00076137463836949, + "loss": 2.7432, + "step": 11186 + }, + { + "epoch": 0.3317320523085135, + "grad_norm": 0.12812772393226624, + "learning_rate": 0.0007613345269044607, + "loss": 2.7527, + "step": 11187 + }, + { + "epoch": 0.33176170566082497, + "grad_norm": 0.13750997185707092, + "learning_rate": 0.0007612944131252694, + "loss": 2.7609, + "step": 11188 + }, + { + "epoch": 0.33179135901313644, + "grad_norm": 0.12312763184309006, + "learning_rate": 0.0007612542970322711, + "loss": 2.748, + "step": 11189 + }, + { + "epoch": 0.3318210123654479, + "grad_norm": 0.14244762063026428, + "learning_rate": 0.0007612141786258212, + "loss": 2.7263, + "step": 11190 + }, + { + "epoch": 0.3318506657177594, + "grad_norm": 0.1468420922756195, + "learning_rate": 0.0007611740579062749, + "loss": 2.7472, + "step": 11191 + }, + { + "epoch": 0.3318803190700709, + "grad_norm": 0.1451784372329712, + "learning_rate": 0.0007611339348739876, + "loss": 2.737, + "step": 11192 + }, + { + "epoch": 0.33190997242238235, + "grad_norm": 0.13734303414821625, + "learning_rate": 0.0007610938095293143, + "loss": 2.7717, + "step": 11193 + }, + { + "epoch": 0.3319396257746938, + "grad_norm": 0.11670687049627304, + "learning_rate": 0.0007610536818726106, + "loss": 2.7565, + "step": 11194 + }, + { + "epoch": 0.3319692791270053, + "grad_norm": 0.12289182841777802, + "learning_rate": 0.0007610135519042316, + "loss": 2.7701, + "step": 11195 + }, + { + "epoch": 0.3319989324793168, + "grad_norm": 0.12158488482236862, + "learning_rate": 0.000760973419624533, + "loss": 2.7675, + "step": 11196 + }, + { + "epoch": 0.33202858583162825, + "grad_norm": 0.12015530467033386, + "learning_rate": 0.00076093328503387, + "loss": 2.7211, + "step": 11197 + }, + { + "epoch": 0.33205823918393973, + "grad_norm": 0.12615889310836792, + "learning_rate": 0.0007608931481325978, + "loss": 2.7774, + "step": 11198 + }, + { + "epoch": 0.3320878925362512, + "grad_norm": 0.13672606647014618, + "learning_rate": 0.000760853008921072, + "loss": 2.7175, + "step": 11199 + }, + { + "epoch": 0.3321175458885627, + "grad_norm": 0.15186631679534912, + "learning_rate": 0.000760812867399648, + "loss": 2.7311, + "step": 11200 + }, + { + "epoch": 0.33214719924087416, + "grad_norm": 0.1429232358932495, + "learning_rate": 0.0007607727235686815, + "loss": 2.7646, + "step": 11201 + }, + { + "epoch": 0.33217685259318563, + "grad_norm": 0.13880963623523712, + "learning_rate": 0.0007607325774285276, + "loss": 2.7184, + "step": 11202 + }, + { + "epoch": 0.33220650594549717, + "grad_norm": 0.1289006918668747, + "learning_rate": 0.0007606924289795421, + "loss": 2.7705, + "step": 11203 + }, + { + "epoch": 0.33223615929780864, + "grad_norm": 0.13851523399353027, + "learning_rate": 0.0007606522782220801, + "loss": 2.7408, + "step": 11204 + }, + { + "epoch": 0.3322658126501201, + "grad_norm": 0.13947688043117523, + "learning_rate": 0.0007606121251564978, + "loss": 2.7735, + "step": 11205 + }, + { + "epoch": 0.3322954660024316, + "grad_norm": 0.1260519176721573, + "learning_rate": 0.0007605719697831502, + "loss": 2.7414, + "step": 11206 + }, + { + "epoch": 0.33232511935474307, + "grad_norm": 0.11468972265720367, + "learning_rate": 0.0007605318121023932, + "loss": 2.7141, + "step": 11207 + }, + { + "epoch": 0.33235477270705455, + "grad_norm": 0.12049468606710434, + "learning_rate": 0.0007604916521145822, + "loss": 2.7457, + "step": 11208 + }, + { + "epoch": 0.332384426059366, + "grad_norm": 0.121476911008358, + "learning_rate": 0.0007604514898200729, + "loss": 2.7762, + "step": 11209 + }, + { + "epoch": 0.3324140794116775, + "grad_norm": 0.12522703409194946, + "learning_rate": 0.0007604113252192209, + "loss": 2.7441, + "step": 11210 + }, + { + "epoch": 0.332443732763989, + "grad_norm": 0.1125408187508583, + "learning_rate": 0.000760371158312382, + "loss": 2.7599, + "step": 11211 + }, + { + "epoch": 0.33247338611630045, + "grad_norm": 0.11919335275888443, + "learning_rate": 0.0007603309890999119, + "loss": 2.7545, + "step": 11212 + }, + { + "epoch": 0.3325030394686119, + "grad_norm": 0.12691621482372284, + "learning_rate": 0.0007602908175821661, + "loss": 2.7578, + "step": 11213 + }, + { + "epoch": 0.3325326928209234, + "grad_norm": 0.12030409276485443, + "learning_rate": 0.0007602506437595005, + "loss": 2.7717, + "step": 11214 + }, + { + "epoch": 0.3325623461732349, + "grad_norm": 0.12082437425851822, + "learning_rate": 0.0007602104676322707, + "loss": 2.7284, + "step": 11215 + }, + { + "epoch": 0.33259199952554636, + "grad_norm": 0.12141794711351395, + "learning_rate": 0.0007601702892008326, + "loss": 2.7337, + "step": 11216 + }, + { + "epoch": 0.33262165287785783, + "grad_norm": 0.11828526109457016, + "learning_rate": 0.0007601301084655417, + "loss": 2.7209, + "step": 11217 + }, + { + "epoch": 0.3326513062301693, + "grad_norm": 0.14053279161453247, + "learning_rate": 0.0007600899254267544, + "loss": 2.7417, + "step": 11218 + }, + { + "epoch": 0.3326809595824808, + "grad_norm": 0.1648961752653122, + "learning_rate": 0.0007600497400848258, + "loss": 2.7716, + "step": 11219 + }, + { + "epoch": 0.33271061293479226, + "grad_norm": 0.17766165733337402, + "learning_rate": 0.0007600095524401124, + "loss": 2.784, + "step": 11220 + }, + { + "epoch": 0.33274026628710374, + "grad_norm": 0.1627461463212967, + "learning_rate": 0.0007599693624929697, + "loss": 2.7283, + "step": 11221 + }, + { + "epoch": 0.3327699196394152, + "grad_norm": 0.12453081458806992, + "learning_rate": 0.0007599291702437537, + "loss": 2.7265, + "step": 11222 + }, + { + "epoch": 0.3327995729917267, + "grad_norm": 0.1360243409872055, + "learning_rate": 0.0007598889756928203, + "loss": 2.7596, + "step": 11223 + }, + { + "epoch": 0.3328292263440382, + "grad_norm": 0.1580236852169037, + "learning_rate": 0.0007598487788405253, + "loss": 2.7595, + "step": 11224 + }, + { + "epoch": 0.3328588796963497, + "grad_norm": 0.15110544860363007, + "learning_rate": 0.0007598085796872247, + "loss": 2.7736, + "step": 11225 + }, + { + "epoch": 0.3328885330486612, + "grad_norm": 0.1297559142112732, + "learning_rate": 0.0007597683782332747, + "loss": 2.713, + "step": 11226 + }, + { + "epoch": 0.33291818640097265, + "grad_norm": 0.122470922768116, + "learning_rate": 0.0007597281744790309, + "loss": 2.7726, + "step": 11227 + }, + { + "epoch": 0.3329478397532841, + "grad_norm": 0.1327618509531021, + "learning_rate": 0.0007596879684248499, + "loss": 2.7497, + "step": 11228 + }, + { + "epoch": 0.3329774931055956, + "grad_norm": 0.11671758443117142, + "learning_rate": 0.0007596477600710871, + "loss": 2.7281, + "step": 11229 + }, + { + "epoch": 0.3330071464579071, + "grad_norm": 0.13169361650943756, + "learning_rate": 0.0007596075494180988, + "loss": 2.7642, + "step": 11230 + }, + { + "epoch": 0.33303679981021855, + "grad_norm": 0.11428860574960709, + "learning_rate": 0.0007595673364662412, + "loss": 2.7581, + "step": 11231 + }, + { + "epoch": 0.33306645316253003, + "grad_norm": 0.121186763048172, + "learning_rate": 0.0007595271212158703, + "loss": 2.7596, + "step": 11232 + }, + { + "epoch": 0.3330961065148415, + "grad_norm": 0.11314690858125687, + "learning_rate": 0.0007594869036673422, + "loss": 2.7459, + "step": 11233 + }, + { + "epoch": 0.333125759867153, + "grad_norm": 0.11188143491744995, + "learning_rate": 0.0007594466838210129, + "loss": 2.7517, + "step": 11234 + }, + { + "epoch": 0.33315541321946446, + "grad_norm": 0.13404695689678192, + "learning_rate": 0.0007594064616772388, + "loss": 2.7398, + "step": 11235 + }, + { + "epoch": 0.33318506657177593, + "grad_norm": 0.13135254383087158, + "learning_rate": 0.0007593662372363759, + "loss": 2.7191, + "step": 11236 + }, + { + "epoch": 0.3332147199240874, + "grad_norm": 0.13094785809516907, + "learning_rate": 0.0007593260104987805, + "loss": 2.7015, + "step": 11237 + }, + { + "epoch": 0.3332443732763989, + "grad_norm": 0.12698344886302948, + "learning_rate": 0.0007592857814648086, + "loss": 2.7299, + "step": 11238 + }, + { + "epoch": 0.33327402662871036, + "grad_norm": 0.1335768699645996, + "learning_rate": 0.0007592455501348168, + "loss": 2.7142, + "step": 11239 + }, + { + "epoch": 0.33330367998102184, + "grad_norm": 0.13973957300186157, + "learning_rate": 0.0007592053165091611, + "loss": 2.743, + "step": 11240 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.14264318346977234, + "learning_rate": 0.0007591650805881978, + "loss": 2.7268, + "step": 11241 + }, + { + "epoch": 0.3333629866856448, + "grad_norm": 0.15155543386936188, + "learning_rate": 0.0007591248423722832, + "loss": 2.7257, + "step": 11242 + }, + { + "epoch": 0.33339264003795627, + "grad_norm": 0.1500030755996704, + "learning_rate": 0.0007590846018617737, + "loss": 2.7149, + "step": 11243 + }, + { + "epoch": 0.33342229339026774, + "grad_norm": 0.1294313222169876, + "learning_rate": 0.0007590443590570255, + "loss": 2.7055, + "step": 11244 + }, + { + "epoch": 0.3334519467425793, + "grad_norm": 0.11351586878299713, + "learning_rate": 0.0007590041139583952, + "loss": 2.7488, + "step": 11245 + }, + { + "epoch": 0.33348160009489075, + "grad_norm": 0.14426353573799133, + "learning_rate": 0.0007589638665662389, + "loss": 2.7555, + "step": 11246 + }, + { + "epoch": 0.3335112534472022, + "grad_norm": 0.16256414353847504, + "learning_rate": 0.000758923616880913, + "loss": 2.7637, + "step": 11247 + }, + { + "epoch": 0.3335409067995137, + "grad_norm": 0.16184371709823608, + "learning_rate": 0.0007588833649027742, + "loss": 2.7701, + "step": 11248 + }, + { + "epoch": 0.3335705601518252, + "grad_norm": 0.1805511713027954, + "learning_rate": 0.0007588431106321787, + "loss": 2.7749, + "step": 11249 + }, + { + "epoch": 0.33360021350413666, + "grad_norm": 0.16352735459804535, + "learning_rate": 0.0007588028540694831, + "loss": 2.7322, + "step": 11250 + }, + { + "epoch": 0.33362986685644813, + "grad_norm": 0.1253003478050232, + "learning_rate": 0.0007587625952150437, + "loss": 2.7134, + "step": 11251 + }, + { + "epoch": 0.3336595202087596, + "grad_norm": 0.13235639035701752, + "learning_rate": 0.000758722334069217, + "loss": 2.7146, + "step": 11252 + }, + { + "epoch": 0.3336891735610711, + "grad_norm": 0.11871129274368286, + "learning_rate": 0.0007586820706323599, + "loss": 2.7426, + "step": 11253 + }, + { + "epoch": 0.33371882691338256, + "grad_norm": 0.13702811300754547, + "learning_rate": 0.0007586418049048284, + "loss": 2.7492, + "step": 11254 + }, + { + "epoch": 0.33374848026569404, + "grad_norm": 0.14977243542671204, + "learning_rate": 0.0007586015368869797, + "loss": 2.771, + "step": 11255 + }, + { + "epoch": 0.3337781336180055, + "grad_norm": 0.11485752463340759, + "learning_rate": 0.0007585612665791697, + "loss": 2.7347, + "step": 11256 + }, + { + "epoch": 0.333807786970317, + "grad_norm": 0.11439801752567291, + "learning_rate": 0.0007585209939817552, + "loss": 2.7597, + "step": 11257 + }, + { + "epoch": 0.33383744032262846, + "grad_norm": 0.11821726709604263, + "learning_rate": 0.0007584807190950931, + "loss": 2.766, + "step": 11258 + }, + { + "epoch": 0.33386709367493994, + "grad_norm": 0.13666072487831116, + "learning_rate": 0.0007584404419195399, + "loss": 2.7266, + "step": 11259 + }, + { + "epoch": 0.3338967470272514, + "grad_norm": 0.1221875548362732, + "learning_rate": 0.0007584001624554522, + "loss": 2.7342, + "step": 11260 + }, + { + "epoch": 0.3339264003795629, + "grad_norm": 0.11863940209150314, + "learning_rate": 0.0007583598807031866, + "loss": 2.7303, + "step": 11261 + }, + { + "epoch": 0.33395605373187437, + "grad_norm": 0.12390550225973129, + "learning_rate": 0.0007583195966631, + "loss": 2.7776, + "step": 11262 + }, + { + "epoch": 0.33398570708418585, + "grad_norm": 0.14047358930110931, + "learning_rate": 0.000758279310335549, + "loss": 2.7747, + "step": 11263 + }, + { + "epoch": 0.3340153604364973, + "grad_norm": 0.15612904727458954, + "learning_rate": 0.0007582390217208905, + "loss": 2.7502, + "step": 11264 + }, + { + "epoch": 0.3340450137888088, + "grad_norm": 0.16190645098686218, + "learning_rate": 0.000758198730819481, + "loss": 2.7516, + "step": 11265 + }, + { + "epoch": 0.33407466714112033, + "grad_norm": 0.15997737646102905, + "learning_rate": 0.0007581584376316775, + "loss": 2.6991, + "step": 11266 + }, + { + "epoch": 0.3341043204934318, + "grad_norm": 0.17219485342502594, + "learning_rate": 0.0007581181421578367, + "loss": 2.7551, + "step": 11267 + }, + { + "epoch": 0.3341339738457433, + "grad_norm": 0.15576699376106262, + "learning_rate": 0.0007580778443983153, + "loss": 2.7231, + "step": 11268 + }, + { + "epoch": 0.33416362719805476, + "grad_norm": 0.14873754978179932, + "learning_rate": 0.0007580375443534704, + "loss": 2.7359, + "step": 11269 + }, + { + "epoch": 0.33419328055036623, + "grad_norm": 0.13997720181941986, + "learning_rate": 0.0007579972420236588, + "loss": 2.6728, + "step": 11270 + }, + { + "epoch": 0.3342229339026777, + "grad_norm": 0.15169040858745575, + "learning_rate": 0.0007579569374092372, + "loss": 2.7533, + "step": 11271 + }, + { + "epoch": 0.3342525872549892, + "grad_norm": 0.14316163957118988, + "learning_rate": 0.0007579166305105628, + "loss": 2.7372, + "step": 11272 + }, + { + "epoch": 0.33428224060730066, + "grad_norm": 0.13572506606578827, + "learning_rate": 0.0007578763213279924, + "loss": 2.7286, + "step": 11273 + }, + { + "epoch": 0.33431189395961214, + "grad_norm": 0.1354190856218338, + "learning_rate": 0.0007578360098618828, + "loss": 2.7691, + "step": 11274 + }, + { + "epoch": 0.3343415473119236, + "grad_norm": 0.13267189264297485, + "learning_rate": 0.000757795696112591, + "loss": 2.7691, + "step": 11275 + }, + { + "epoch": 0.3343712006642351, + "grad_norm": 0.17521484196186066, + "learning_rate": 0.0007577553800804742, + "loss": 2.7429, + "step": 11276 + }, + { + "epoch": 0.33440085401654657, + "grad_norm": 0.1761752814054489, + "learning_rate": 0.0007577150617658892, + "loss": 2.8016, + "step": 11277 + }, + { + "epoch": 0.33443050736885804, + "grad_norm": 0.15556150674819946, + "learning_rate": 0.0007576747411691931, + "loss": 2.7898, + "step": 11278 + }, + { + "epoch": 0.3344601607211695, + "grad_norm": 0.12243206799030304, + "learning_rate": 0.0007576344182907431, + "loss": 2.7507, + "step": 11279 + }, + { + "epoch": 0.334489814073481, + "grad_norm": 0.14097963273525238, + "learning_rate": 0.0007575940931308959, + "loss": 2.7292, + "step": 11280 + }, + { + "epoch": 0.33451946742579247, + "grad_norm": 0.11982238292694092, + "learning_rate": 0.0007575537656900087, + "loss": 2.7538, + "step": 11281 + }, + { + "epoch": 0.33454912077810395, + "grad_norm": 0.11969979107379913, + "learning_rate": 0.0007575134359684388, + "loss": 2.7329, + "step": 11282 + }, + { + "epoch": 0.3345787741304154, + "grad_norm": 0.12390291690826416, + "learning_rate": 0.0007574731039665434, + "loss": 2.7241, + "step": 11283 + }, + { + "epoch": 0.3346084274827269, + "grad_norm": 0.12887121737003326, + "learning_rate": 0.0007574327696846793, + "loss": 2.7579, + "step": 11284 + }, + { + "epoch": 0.3346380808350384, + "grad_norm": 0.12505818903446198, + "learning_rate": 0.0007573924331232038, + "loss": 2.7221, + "step": 11285 + }, + { + "epoch": 0.33466773418734985, + "grad_norm": 0.12535816431045532, + "learning_rate": 0.0007573520942824743, + "loss": 2.7242, + "step": 11286 + }, + { + "epoch": 0.3346973875396614, + "grad_norm": 0.12009931355714798, + "learning_rate": 0.0007573117531628477, + "loss": 2.7379, + "step": 11287 + }, + { + "epoch": 0.33472704089197286, + "grad_norm": 0.13246209919452667, + "learning_rate": 0.0007572714097646813, + "loss": 2.7322, + "step": 11288 + }, + { + "epoch": 0.33475669424428434, + "grad_norm": 0.12322743237018585, + "learning_rate": 0.0007572310640883327, + "loss": 2.7362, + "step": 11289 + }, + { + "epoch": 0.3347863475965958, + "grad_norm": 0.0998331680893898, + "learning_rate": 0.0007571907161341585, + "loss": 2.729, + "step": 11290 + }, + { + "epoch": 0.3348160009489073, + "grad_norm": 0.11309615522623062, + "learning_rate": 0.0007571503659025166, + "loss": 2.7336, + "step": 11291 + }, + { + "epoch": 0.33484565430121876, + "grad_norm": 0.11870445311069489, + "learning_rate": 0.0007571100133937639, + "loss": 2.7498, + "step": 11292 + }, + { + "epoch": 0.33487530765353024, + "grad_norm": 0.11923840641975403, + "learning_rate": 0.0007570696586082581, + "loss": 2.75, + "step": 11293 + }, + { + "epoch": 0.3349049610058417, + "grad_norm": 0.14409814774990082, + "learning_rate": 0.0007570293015463562, + "loss": 2.7504, + "step": 11294 + }, + { + "epoch": 0.3349346143581532, + "grad_norm": 0.16121777892112732, + "learning_rate": 0.0007569889422084158, + "loss": 2.73, + "step": 11295 + }, + { + "epoch": 0.33496426771046467, + "grad_norm": 0.151496022939682, + "learning_rate": 0.000756948580594794, + "loss": 2.7339, + "step": 11296 + }, + { + "epoch": 0.33499392106277615, + "grad_norm": 0.13666056096553802, + "learning_rate": 0.0007569082167058487, + "loss": 2.7432, + "step": 11297 + }, + { + "epoch": 0.3350235744150876, + "grad_norm": 0.15193091332912445, + "learning_rate": 0.0007568678505419368, + "loss": 2.7446, + "step": 11298 + }, + { + "epoch": 0.3350532277673991, + "grad_norm": 0.14525623619556427, + "learning_rate": 0.0007568274821034163, + "loss": 2.7308, + "step": 11299 + }, + { + "epoch": 0.3350828811197106, + "grad_norm": 0.1220298483967781, + "learning_rate": 0.0007567871113906442, + "loss": 2.7381, + "step": 11300 + }, + { + "epoch": 0.33511253447202205, + "grad_norm": 0.13797780871391296, + "learning_rate": 0.000756746738403978, + "loss": 2.7523, + "step": 11301 + }, + { + "epoch": 0.3351421878243335, + "grad_norm": 0.1263808161020279, + "learning_rate": 0.0007567063631437755, + "loss": 2.7653, + "step": 11302 + }, + { + "epoch": 0.335171841176645, + "grad_norm": 0.1358518898487091, + "learning_rate": 0.0007566659856103941, + "loss": 2.7438, + "step": 11303 + }, + { + "epoch": 0.3352014945289565, + "grad_norm": 0.12801028788089752, + "learning_rate": 0.0007566256058041913, + "loss": 2.7684, + "step": 11304 + }, + { + "epoch": 0.33523114788126795, + "grad_norm": 0.11823441088199615, + "learning_rate": 0.0007565852237255248, + "loss": 2.7361, + "step": 11305 + }, + { + "epoch": 0.33526080123357943, + "grad_norm": 0.11071038246154785, + "learning_rate": 0.000756544839374752, + "loss": 2.7591, + "step": 11306 + }, + { + "epoch": 0.33529045458589096, + "grad_norm": 0.132549449801445, + "learning_rate": 0.0007565044527522306, + "loss": 2.7508, + "step": 11307 + }, + { + "epoch": 0.33532010793820244, + "grad_norm": 0.1318301409482956, + "learning_rate": 0.0007564640638583183, + "loss": 2.7615, + "step": 11308 + }, + { + "epoch": 0.3353497612905139, + "grad_norm": 0.1360914260149002, + "learning_rate": 0.0007564236726933727, + "loss": 2.6982, + "step": 11309 + }, + { + "epoch": 0.3353794146428254, + "grad_norm": 0.1308961808681488, + "learning_rate": 0.0007563832792577514, + "loss": 2.7273, + "step": 11310 + }, + { + "epoch": 0.33540906799513687, + "grad_norm": 0.16988834738731384, + "learning_rate": 0.0007563428835518122, + "loss": 2.7474, + "step": 11311 + }, + { + "epoch": 0.33543872134744834, + "grad_norm": 0.19331516325473785, + "learning_rate": 0.0007563024855759128, + "loss": 2.7815, + "step": 11312 + }, + { + "epoch": 0.3354683746997598, + "grad_norm": 0.21640978753566742, + "learning_rate": 0.000756262085330411, + "loss": 2.7272, + "step": 11313 + }, + { + "epoch": 0.3354980280520713, + "grad_norm": 0.1742866039276123, + "learning_rate": 0.0007562216828156642, + "loss": 2.7165, + "step": 11314 + }, + { + "epoch": 0.33552768140438277, + "grad_norm": 0.14261692762374878, + "learning_rate": 0.0007561812780320305, + "loss": 2.7439, + "step": 11315 + }, + { + "epoch": 0.33555733475669425, + "grad_norm": 0.1436396688222885, + "learning_rate": 0.0007561408709798677, + "loss": 2.7432, + "step": 11316 + }, + { + "epoch": 0.3355869881090057, + "grad_norm": 0.15074501931667328, + "learning_rate": 0.0007561004616595335, + "loss": 2.726, + "step": 11317 + }, + { + "epoch": 0.3356166414613172, + "grad_norm": 0.1323728710412979, + "learning_rate": 0.0007560600500713856, + "loss": 2.7518, + "step": 11318 + }, + { + "epoch": 0.3356462948136287, + "grad_norm": 0.13928769528865814, + "learning_rate": 0.0007560196362157822, + "loss": 2.7533, + "step": 11319 + }, + { + "epoch": 0.33567594816594015, + "grad_norm": 0.14053519070148468, + "learning_rate": 0.0007559792200930809, + "loss": 2.741, + "step": 11320 + }, + { + "epoch": 0.33570560151825163, + "grad_norm": 0.15108130872249603, + "learning_rate": 0.0007559388017036397, + "loss": 2.7407, + "step": 11321 + }, + { + "epoch": 0.3357352548705631, + "grad_norm": 0.14037014544010162, + "learning_rate": 0.0007558983810478164, + "loss": 2.7426, + "step": 11322 + }, + { + "epoch": 0.3357649082228746, + "grad_norm": 0.13653609156608582, + "learning_rate": 0.000755857958125969, + "loss": 2.7712, + "step": 11323 + }, + { + "epoch": 0.33579456157518606, + "grad_norm": 0.12192559987306595, + "learning_rate": 0.0007558175329384556, + "loss": 2.7232, + "step": 11324 + }, + { + "epoch": 0.33582421492749753, + "grad_norm": 0.12526366114616394, + "learning_rate": 0.0007557771054856339, + "loss": 2.7233, + "step": 11325 + }, + { + "epoch": 0.335853868279809, + "grad_norm": 0.1099526435136795, + "learning_rate": 0.000755736675767862, + "loss": 2.7137, + "step": 11326 + }, + { + "epoch": 0.3358835216321205, + "grad_norm": 0.12094072997570038, + "learning_rate": 0.000755696243785498, + "loss": 2.6961, + "step": 11327 + }, + { + "epoch": 0.335913174984432, + "grad_norm": 0.12959030270576477, + "learning_rate": 0.0007556558095388999, + "loss": 2.7414, + "step": 11328 + }, + { + "epoch": 0.3359428283367435, + "grad_norm": 0.11862373352050781, + "learning_rate": 0.0007556153730284257, + "loss": 2.7245, + "step": 11329 + }, + { + "epoch": 0.33597248168905497, + "grad_norm": 0.109065942466259, + "learning_rate": 0.0007555749342544335, + "loss": 2.747, + "step": 11330 + }, + { + "epoch": 0.33600213504136645, + "grad_norm": 0.14932070672512054, + "learning_rate": 0.0007555344932172814, + "loss": 2.7512, + "step": 11331 + }, + { + "epoch": 0.3360317883936779, + "grad_norm": 0.1657954305410385, + "learning_rate": 0.0007554940499173275, + "loss": 2.7551, + "step": 11332 + }, + { + "epoch": 0.3360614417459894, + "grad_norm": 0.13012437522411346, + "learning_rate": 0.00075545360435493, + "loss": 2.6941, + "step": 11333 + }, + { + "epoch": 0.3360910950983009, + "grad_norm": 0.1316668838262558, + "learning_rate": 0.0007554131565304469, + "loss": 2.7481, + "step": 11334 + }, + { + "epoch": 0.33612074845061235, + "grad_norm": 0.14422868192195892, + "learning_rate": 0.0007553727064442365, + "loss": 2.7361, + "step": 11335 + }, + { + "epoch": 0.3361504018029238, + "grad_norm": 0.13929522037506104, + "learning_rate": 0.000755332254096657, + "loss": 2.7313, + "step": 11336 + }, + { + "epoch": 0.3361800551552353, + "grad_norm": 0.16273421049118042, + "learning_rate": 0.0007552917994880664, + "loss": 2.7308, + "step": 11337 + }, + { + "epoch": 0.3362097085075468, + "grad_norm": 0.17802491784095764, + "learning_rate": 0.0007552513426188233, + "loss": 2.7146, + "step": 11338 + }, + { + "epoch": 0.33623936185985825, + "grad_norm": 0.16755767166614532, + "learning_rate": 0.0007552108834892857, + "loss": 2.7651, + "step": 11339 + }, + { + "epoch": 0.33626901521216973, + "grad_norm": 0.1458098590373993, + "learning_rate": 0.0007551704220998117, + "loss": 2.7078, + "step": 11340 + }, + { + "epoch": 0.3362986685644812, + "grad_norm": 0.15437059104442596, + "learning_rate": 0.00075512995845076, + "loss": 2.6976, + "step": 11341 + }, + { + "epoch": 0.3363283219167927, + "grad_norm": 0.15703849494457245, + "learning_rate": 0.0007550894925424886, + "loss": 2.743, + "step": 11342 + }, + { + "epoch": 0.33635797526910416, + "grad_norm": 0.15732181072235107, + "learning_rate": 0.0007550490243753562, + "loss": 2.7657, + "step": 11343 + }, + { + "epoch": 0.33638762862141564, + "grad_norm": 0.15602236986160278, + "learning_rate": 0.0007550085539497207, + "loss": 2.7452, + "step": 11344 + }, + { + "epoch": 0.3364172819737271, + "grad_norm": 0.12805426120758057, + "learning_rate": 0.0007549680812659408, + "loss": 2.7453, + "step": 11345 + }, + { + "epoch": 0.3364469353260386, + "grad_norm": 0.13012245297431946, + "learning_rate": 0.0007549276063243747, + "loss": 2.7555, + "step": 11346 + }, + { + "epoch": 0.33647658867835006, + "grad_norm": 0.13542787730693817, + "learning_rate": 0.0007548871291253807, + "loss": 2.7661, + "step": 11347 + }, + { + "epoch": 0.33650624203066154, + "grad_norm": 0.13183633983135223, + "learning_rate": 0.0007548466496693177, + "loss": 2.7405, + "step": 11348 + }, + { + "epoch": 0.33653589538297307, + "grad_norm": 0.1284739375114441, + "learning_rate": 0.0007548061679565439, + "loss": 2.7952, + "step": 11349 + }, + { + "epoch": 0.33656554873528455, + "grad_norm": 0.13591165840625763, + "learning_rate": 0.0007547656839874176, + "loss": 2.7441, + "step": 11350 + }, + { + "epoch": 0.336595202087596, + "grad_norm": 0.1337164342403412, + "learning_rate": 0.0007547251977622976, + "loss": 2.738, + "step": 11351 + }, + { + "epoch": 0.3366248554399075, + "grad_norm": 0.11872263997793198, + "learning_rate": 0.0007546847092815421, + "loss": 2.7617, + "step": 11352 + }, + { + "epoch": 0.336654508792219, + "grad_norm": 0.1270075887441635, + "learning_rate": 0.0007546442185455096, + "loss": 2.7422, + "step": 11353 + }, + { + "epoch": 0.33668416214453045, + "grad_norm": 0.12525896728038788, + "learning_rate": 0.0007546037255545592, + "loss": 2.7366, + "step": 11354 + }, + { + "epoch": 0.33671381549684193, + "grad_norm": 0.113465815782547, + "learning_rate": 0.0007545632303090489, + "loss": 2.7564, + "step": 11355 + }, + { + "epoch": 0.3367434688491534, + "grad_norm": 0.11328913271427155, + "learning_rate": 0.0007545227328093376, + "loss": 2.7412, + "step": 11356 + }, + { + "epoch": 0.3367731222014649, + "grad_norm": 0.12458338588476181, + "learning_rate": 0.0007544822330557838, + "loss": 2.7894, + "step": 11357 + }, + { + "epoch": 0.33680277555377636, + "grad_norm": 0.12479937821626663, + "learning_rate": 0.0007544417310487462, + "loss": 2.7689, + "step": 11358 + }, + { + "epoch": 0.33683242890608783, + "grad_norm": 0.13585592806339264, + "learning_rate": 0.0007544012267885832, + "loss": 2.7542, + "step": 11359 + }, + { + "epoch": 0.3368620822583993, + "grad_norm": 0.13842745125293732, + "learning_rate": 0.0007543607202756537, + "loss": 2.7181, + "step": 11360 + }, + { + "epoch": 0.3368917356107108, + "grad_norm": 0.1333639919757843, + "learning_rate": 0.0007543202115103165, + "loss": 2.7485, + "step": 11361 + }, + { + "epoch": 0.33692138896302226, + "grad_norm": 0.1228293627500534, + "learning_rate": 0.0007542797004929301, + "loss": 2.7352, + "step": 11362 + }, + { + "epoch": 0.33695104231533374, + "grad_norm": 0.11968336999416351, + "learning_rate": 0.0007542391872238535, + "loss": 2.7487, + "step": 11363 + }, + { + "epoch": 0.3369806956676452, + "grad_norm": 0.13222186267375946, + "learning_rate": 0.000754198671703445, + "loss": 2.7615, + "step": 11364 + }, + { + "epoch": 0.3370103490199567, + "grad_norm": 0.14483752846717834, + "learning_rate": 0.0007541581539320637, + "loss": 2.7156, + "step": 11365 + }, + { + "epoch": 0.33704000237226817, + "grad_norm": 0.12270928919315338, + "learning_rate": 0.0007541176339100684, + "loss": 2.7512, + "step": 11366 + }, + { + "epoch": 0.33706965572457964, + "grad_norm": 0.12060964852571487, + "learning_rate": 0.0007540771116378177, + "loss": 2.728, + "step": 11367 + }, + { + "epoch": 0.3370993090768911, + "grad_norm": 0.11357779055833817, + "learning_rate": 0.0007540365871156707, + "loss": 2.7561, + "step": 11368 + }, + { + "epoch": 0.3371289624292026, + "grad_norm": 0.11258243024349213, + "learning_rate": 0.0007539960603439859, + "loss": 2.7137, + "step": 11369 + }, + { + "epoch": 0.3371586157815141, + "grad_norm": 0.10881025344133377, + "learning_rate": 0.0007539555313231226, + "loss": 2.7242, + "step": 11370 + }, + { + "epoch": 0.3371882691338256, + "grad_norm": 0.11624061316251755, + "learning_rate": 0.0007539150000534395, + "loss": 2.7576, + "step": 11371 + }, + { + "epoch": 0.3372179224861371, + "grad_norm": 0.14023643732070923, + "learning_rate": 0.0007538744665352953, + "loss": 2.7571, + "step": 11372 + }, + { + "epoch": 0.33724757583844855, + "grad_norm": 0.14717997610569, + "learning_rate": 0.0007538339307690492, + "loss": 2.7538, + "step": 11373 + }, + { + "epoch": 0.33727722919076003, + "grad_norm": 0.15138496458530426, + "learning_rate": 0.0007537933927550602, + "loss": 2.7421, + "step": 11374 + }, + { + "epoch": 0.3373068825430715, + "grad_norm": 0.18677546083927155, + "learning_rate": 0.000753752852493687, + "loss": 2.7303, + "step": 11375 + }, + { + "epoch": 0.337336535895383, + "grad_norm": 0.20397968590259552, + "learning_rate": 0.0007537123099852889, + "loss": 2.7799, + "step": 11376 + }, + { + "epoch": 0.33736618924769446, + "grad_norm": 0.1968134641647339, + "learning_rate": 0.0007536717652302245, + "loss": 2.7599, + "step": 11377 + }, + { + "epoch": 0.33739584260000594, + "grad_norm": 0.15825623273849487, + "learning_rate": 0.0007536312182288531, + "loss": 2.7412, + "step": 11378 + }, + { + "epoch": 0.3374254959523174, + "grad_norm": 0.14760692417621613, + "learning_rate": 0.000753590668981534, + "loss": 2.7843, + "step": 11379 + }, + { + "epoch": 0.3374551493046289, + "grad_norm": 0.1613079458475113, + "learning_rate": 0.0007535501174886257, + "loss": 2.7616, + "step": 11380 + }, + { + "epoch": 0.33748480265694036, + "grad_norm": 0.17195115983486176, + "learning_rate": 0.0007535095637504879, + "loss": 2.7476, + "step": 11381 + }, + { + "epoch": 0.33751445600925184, + "grad_norm": 0.1481531858444214, + "learning_rate": 0.0007534690077674791, + "loss": 2.7283, + "step": 11382 + }, + { + "epoch": 0.3375441093615633, + "grad_norm": 0.1640605479478836, + "learning_rate": 0.0007534284495399589, + "loss": 2.7673, + "step": 11383 + }, + { + "epoch": 0.3375737627138748, + "grad_norm": 0.1678307056427002, + "learning_rate": 0.0007533878890682861, + "loss": 2.7353, + "step": 11384 + }, + { + "epoch": 0.33760341606618627, + "grad_norm": 0.15409137308597565, + "learning_rate": 0.0007533473263528201, + "loss": 2.728, + "step": 11385 + }, + { + "epoch": 0.33763306941849774, + "grad_norm": 0.11787798255681992, + "learning_rate": 0.0007533067613939202, + "loss": 2.7473, + "step": 11386 + }, + { + "epoch": 0.3376627227708092, + "grad_norm": 0.14823363721370697, + "learning_rate": 0.0007532661941919455, + "loss": 2.7326, + "step": 11387 + }, + { + "epoch": 0.3376923761231207, + "grad_norm": 0.13858819007873535, + "learning_rate": 0.000753225624747255, + "loss": 2.7503, + "step": 11388 + }, + { + "epoch": 0.3377220294754322, + "grad_norm": 0.13064229488372803, + "learning_rate": 0.0007531850530602081, + "loss": 2.7481, + "step": 11389 + }, + { + "epoch": 0.33775168282774365, + "grad_norm": 0.12382073700428009, + "learning_rate": 0.0007531444791311641, + "loss": 2.7683, + "step": 11390 + }, + { + "epoch": 0.3377813361800552, + "grad_norm": 0.13611993193626404, + "learning_rate": 0.0007531039029604824, + "loss": 2.7057, + "step": 11391 + }, + { + "epoch": 0.33781098953236666, + "grad_norm": 0.11632544547319412, + "learning_rate": 0.0007530633245485221, + "loss": 2.7782, + "step": 11392 + }, + { + "epoch": 0.33784064288467813, + "grad_norm": 0.12313345819711685, + "learning_rate": 0.0007530227438956428, + "loss": 2.781, + "step": 11393 + }, + { + "epoch": 0.3378702962369896, + "grad_norm": 0.13703328371047974, + "learning_rate": 0.0007529821610022035, + "loss": 2.7689, + "step": 11394 + }, + { + "epoch": 0.3378999495893011, + "grad_norm": 0.13470500707626343, + "learning_rate": 0.0007529415758685638, + "loss": 2.7423, + "step": 11395 + }, + { + "epoch": 0.33792960294161256, + "grad_norm": 0.1292702853679657, + "learning_rate": 0.0007529009884950829, + "loss": 2.7502, + "step": 11396 + }, + { + "epoch": 0.33795925629392404, + "grad_norm": 0.12589704990386963, + "learning_rate": 0.0007528603988821205, + "loss": 2.7551, + "step": 11397 + }, + { + "epoch": 0.3379889096462355, + "grad_norm": 0.11859004199504852, + "learning_rate": 0.0007528198070300358, + "loss": 2.6979, + "step": 11398 + }, + { + "epoch": 0.338018562998547, + "grad_norm": 0.11976532638072968, + "learning_rate": 0.0007527792129391884, + "loss": 2.7817, + "step": 11399 + }, + { + "epoch": 0.33804821635085847, + "grad_norm": 0.12233015149831772, + "learning_rate": 0.0007527386166099375, + "loss": 2.7479, + "step": 11400 + }, + { + "epoch": 0.33807786970316994, + "grad_norm": 0.1137620136141777, + "learning_rate": 0.0007526980180426428, + "loss": 2.7523, + "step": 11401 + }, + { + "epoch": 0.3381075230554814, + "grad_norm": 0.1569630652666092, + "learning_rate": 0.0007526574172376639, + "loss": 2.7771, + "step": 11402 + }, + { + "epoch": 0.3381371764077929, + "grad_norm": 0.17867523431777954, + "learning_rate": 0.0007526168141953602, + "loss": 2.7135, + "step": 11403 + }, + { + "epoch": 0.33816682976010437, + "grad_norm": 0.1642547994852066, + "learning_rate": 0.0007525762089160912, + "loss": 2.7375, + "step": 11404 + }, + { + "epoch": 0.33819648311241585, + "grad_norm": 0.1350831687450409, + "learning_rate": 0.0007525356014002164, + "loss": 2.7577, + "step": 11405 + }, + { + "epoch": 0.3382261364647273, + "grad_norm": 0.13232752680778503, + "learning_rate": 0.0007524949916480956, + "loss": 2.7307, + "step": 11406 + }, + { + "epoch": 0.3382557898170388, + "grad_norm": 0.10902173817157745, + "learning_rate": 0.0007524543796600885, + "loss": 2.7351, + "step": 11407 + }, + { + "epoch": 0.3382854431693503, + "grad_norm": 0.13018879294395447, + "learning_rate": 0.0007524137654365543, + "loss": 2.7318, + "step": 11408 + }, + { + "epoch": 0.33831509652166175, + "grad_norm": 0.12159187346696854, + "learning_rate": 0.0007523731489778529, + "loss": 2.7437, + "step": 11409 + }, + { + "epoch": 0.3383447498739732, + "grad_norm": 0.12008479237556458, + "learning_rate": 0.000752332530284344, + "loss": 2.7509, + "step": 11410 + }, + { + "epoch": 0.33837440322628476, + "grad_norm": 0.11754812300205231, + "learning_rate": 0.0007522919093563872, + "loss": 2.7608, + "step": 11411 + }, + { + "epoch": 0.33840405657859624, + "grad_norm": 0.1178152859210968, + "learning_rate": 0.0007522512861943422, + "loss": 2.736, + "step": 11412 + }, + { + "epoch": 0.3384337099309077, + "grad_norm": 0.1280089020729065, + "learning_rate": 0.0007522106607985688, + "loss": 2.7374, + "step": 11413 + }, + { + "epoch": 0.3384633632832192, + "grad_norm": 0.11749778687953949, + "learning_rate": 0.0007521700331694268, + "loss": 2.7412, + "step": 11414 + }, + { + "epoch": 0.33849301663553066, + "grad_norm": 0.12424679100513458, + "learning_rate": 0.0007521294033072758, + "loss": 2.7088, + "step": 11415 + }, + { + "epoch": 0.33852266998784214, + "grad_norm": 0.13462220132350922, + "learning_rate": 0.0007520887712124758, + "loss": 2.7461, + "step": 11416 + }, + { + "epoch": 0.3385523233401536, + "grad_norm": 0.15355250239372253, + "learning_rate": 0.0007520481368853861, + "loss": 2.7336, + "step": 11417 + }, + { + "epoch": 0.3385819766924651, + "grad_norm": 0.14456598460674286, + "learning_rate": 0.0007520075003263671, + "loss": 2.793, + "step": 11418 + }, + { + "epoch": 0.33861163004477657, + "grad_norm": 0.13346849381923676, + "learning_rate": 0.0007519668615357784, + "loss": 2.7686, + "step": 11419 + }, + { + "epoch": 0.33864128339708804, + "grad_norm": 0.11877503991127014, + "learning_rate": 0.0007519262205139801, + "loss": 2.7181, + "step": 11420 + }, + { + "epoch": 0.3386709367493995, + "grad_norm": 0.12369485944509506, + "learning_rate": 0.0007518855772613316, + "loss": 2.7301, + "step": 11421 + }, + { + "epoch": 0.338700590101711, + "grad_norm": 0.15896478295326233, + "learning_rate": 0.0007518449317781932, + "loss": 2.748, + "step": 11422 + }, + { + "epoch": 0.3387302434540225, + "grad_norm": 0.16847079992294312, + "learning_rate": 0.0007518042840649246, + "loss": 2.7914, + "step": 11423 + }, + { + "epoch": 0.33875989680633395, + "grad_norm": 0.12714071571826935, + "learning_rate": 0.0007517636341218859, + "loss": 2.761, + "step": 11424 + }, + { + "epoch": 0.3387895501586454, + "grad_norm": 0.1472548395395279, + "learning_rate": 0.0007517229819494372, + "loss": 2.7659, + "step": 11425 + }, + { + "epoch": 0.3388192035109569, + "grad_norm": 0.1444348394870758, + "learning_rate": 0.0007516823275479379, + "loss": 2.7459, + "step": 11426 + }, + { + "epoch": 0.3388488568632684, + "grad_norm": 0.15682503581047058, + "learning_rate": 0.0007516416709177487, + "loss": 2.7526, + "step": 11427 + }, + { + "epoch": 0.33887851021557985, + "grad_norm": 0.13897274434566498, + "learning_rate": 0.0007516010120592291, + "loss": 2.7566, + "step": 11428 + }, + { + "epoch": 0.33890816356789133, + "grad_norm": 0.11940889805555344, + "learning_rate": 0.0007515603509727396, + "loss": 2.7607, + "step": 11429 + }, + { + "epoch": 0.3389378169202028, + "grad_norm": 0.13051699101924896, + "learning_rate": 0.0007515196876586397, + "loss": 2.7316, + "step": 11430 + }, + { + "epoch": 0.3389674702725143, + "grad_norm": 0.1660923808813095, + "learning_rate": 0.00075147902211729, + "loss": 2.7369, + "step": 11431 + }, + { + "epoch": 0.3389971236248258, + "grad_norm": 0.11419430375099182, + "learning_rate": 0.0007514383543490504, + "loss": 2.7352, + "step": 11432 + }, + { + "epoch": 0.3390267769771373, + "grad_norm": 0.11217288672924042, + "learning_rate": 0.0007513976843542809, + "loss": 2.7252, + "step": 11433 + }, + { + "epoch": 0.33905643032944877, + "grad_norm": 0.12679269909858704, + "learning_rate": 0.000751357012133342, + "loss": 2.7636, + "step": 11434 + }, + { + "epoch": 0.33908608368176024, + "grad_norm": 0.1212460845708847, + "learning_rate": 0.0007513163376865932, + "loss": 2.7575, + "step": 11435 + }, + { + "epoch": 0.3391157370340717, + "grad_norm": 0.12554770708084106, + "learning_rate": 0.0007512756610143954, + "loss": 2.7288, + "step": 11436 + }, + { + "epoch": 0.3391453903863832, + "grad_norm": 0.11372867971658707, + "learning_rate": 0.0007512349821171085, + "loss": 2.7202, + "step": 11437 + }, + { + "epoch": 0.33917504373869467, + "grad_norm": 0.11648118495941162, + "learning_rate": 0.0007511943009950925, + "loss": 2.7861, + "step": 11438 + }, + { + "epoch": 0.33920469709100615, + "grad_norm": 0.11048833280801773, + "learning_rate": 0.0007511536176487081, + "loss": 2.7547, + "step": 11439 + }, + { + "epoch": 0.3392343504433176, + "grad_norm": 0.12622222304344177, + "learning_rate": 0.0007511129320783151, + "loss": 2.7167, + "step": 11440 + }, + { + "epoch": 0.3392640037956291, + "grad_norm": 0.11772716045379639, + "learning_rate": 0.0007510722442842741, + "loss": 2.7371, + "step": 11441 + }, + { + "epoch": 0.3392936571479406, + "grad_norm": 0.1207929328083992, + "learning_rate": 0.0007510315542669453, + "loss": 2.7679, + "step": 11442 + }, + { + "epoch": 0.33932331050025205, + "grad_norm": 0.11548826843500137, + "learning_rate": 0.0007509908620266888, + "loss": 2.7201, + "step": 11443 + }, + { + "epoch": 0.3393529638525635, + "grad_norm": 0.11977103352546692, + "learning_rate": 0.0007509501675638652, + "loss": 2.7515, + "step": 11444 + }, + { + "epoch": 0.339382617204875, + "grad_norm": 0.14631088078022003, + "learning_rate": 0.000750909470878835, + "loss": 2.7598, + "step": 11445 + }, + { + "epoch": 0.3394122705571865, + "grad_norm": 0.15170250833034515, + "learning_rate": 0.0007508687719719583, + "loss": 2.7121, + "step": 11446 + }, + { + "epoch": 0.33944192390949796, + "grad_norm": 0.153772234916687, + "learning_rate": 0.0007508280708435955, + "loss": 2.7533, + "step": 11447 + }, + { + "epoch": 0.33947157726180943, + "grad_norm": 0.1498269885778427, + "learning_rate": 0.000750787367494107, + "loss": 2.7806, + "step": 11448 + }, + { + "epoch": 0.3395012306141209, + "grad_norm": 0.15806427597999573, + "learning_rate": 0.0007507466619238535, + "loss": 2.7453, + "step": 11449 + }, + { + "epoch": 0.3395308839664324, + "grad_norm": 0.16951945424079895, + "learning_rate": 0.0007507059541331951, + "loss": 2.7367, + "step": 11450 + }, + { + "epoch": 0.33956053731874386, + "grad_norm": 0.153299480676651, + "learning_rate": 0.0007506652441224924, + "loss": 2.7438, + "step": 11451 + }, + { + "epoch": 0.33959019067105534, + "grad_norm": 0.12747570872306824, + "learning_rate": 0.0007506245318921061, + "loss": 2.7599, + "step": 11452 + }, + { + "epoch": 0.33961984402336687, + "grad_norm": 0.144967719912529, + "learning_rate": 0.0007505838174423965, + "loss": 2.7419, + "step": 11453 + }, + { + "epoch": 0.33964949737567834, + "grad_norm": 0.14721223711967468, + "learning_rate": 0.0007505431007737242, + "loss": 2.7472, + "step": 11454 + }, + { + "epoch": 0.3396791507279898, + "grad_norm": 0.14063741266727448, + "learning_rate": 0.0007505023818864497, + "loss": 2.7548, + "step": 11455 + }, + { + "epoch": 0.3397088040803013, + "grad_norm": 0.1386261284351349, + "learning_rate": 0.0007504616607809336, + "loss": 2.7755, + "step": 11456 + }, + { + "epoch": 0.3397384574326128, + "grad_norm": 0.16542711853981018, + "learning_rate": 0.0007504209374575365, + "loss": 2.7574, + "step": 11457 + }, + { + "epoch": 0.33976811078492425, + "grad_norm": 0.16024532914161682, + "learning_rate": 0.000750380211916619, + "loss": 2.7419, + "step": 11458 + }, + { + "epoch": 0.3397977641372357, + "grad_norm": 0.1667451560497284, + "learning_rate": 0.0007503394841585419, + "loss": 2.7631, + "step": 11459 + }, + { + "epoch": 0.3398274174895472, + "grad_norm": 0.15245793759822845, + "learning_rate": 0.0007502987541836655, + "loss": 2.7259, + "step": 11460 + }, + { + "epoch": 0.3398570708418587, + "grad_norm": 0.14439664781093597, + "learning_rate": 0.0007502580219923508, + "loss": 2.7222, + "step": 11461 + }, + { + "epoch": 0.33988672419417015, + "grad_norm": 0.13781346380710602, + "learning_rate": 0.0007502172875849582, + "loss": 2.7466, + "step": 11462 + }, + { + "epoch": 0.33991637754648163, + "grad_norm": 0.13924828171730042, + "learning_rate": 0.0007501765509618488, + "loss": 2.7591, + "step": 11463 + }, + { + "epoch": 0.3399460308987931, + "grad_norm": 0.1235961839556694, + "learning_rate": 0.000750135812123383, + "loss": 2.7266, + "step": 11464 + }, + { + "epoch": 0.3399756842511046, + "grad_norm": 0.1284327656030655, + "learning_rate": 0.0007500950710699215, + "loss": 2.7104, + "step": 11465 + }, + { + "epoch": 0.34000533760341606, + "grad_norm": 0.1463146060705185, + "learning_rate": 0.0007500543278018255, + "loss": 2.7406, + "step": 11466 + }, + { + "epoch": 0.34003499095572753, + "grad_norm": 0.14946958422660828, + "learning_rate": 0.0007500135823194552, + "loss": 2.7167, + "step": 11467 + }, + { + "epoch": 0.340064644308039, + "grad_norm": 0.14068356156349182, + "learning_rate": 0.000749972834623172, + "loss": 2.7278, + "step": 11468 + }, + { + "epoch": 0.3400942976603505, + "grad_norm": 0.1509346067905426, + "learning_rate": 0.0007499320847133362, + "loss": 2.7305, + "step": 11469 + }, + { + "epoch": 0.34012395101266196, + "grad_norm": 0.1418251246213913, + "learning_rate": 0.0007498913325903091, + "loss": 2.7363, + "step": 11470 + }, + { + "epoch": 0.34015360436497344, + "grad_norm": 0.1312389373779297, + "learning_rate": 0.0007498505782544511, + "loss": 2.7314, + "step": 11471 + }, + { + "epoch": 0.3401832577172849, + "grad_norm": 0.11517508327960968, + "learning_rate": 0.0007498098217061235, + "loss": 2.7483, + "step": 11472 + }, + { + "epoch": 0.3402129110695964, + "grad_norm": 0.1415979266166687, + "learning_rate": 0.0007497690629456871, + "loss": 2.7776, + "step": 11473 + }, + { + "epoch": 0.3402425644219079, + "grad_norm": 0.12895117700099945, + "learning_rate": 0.0007497283019735027, + "loss": 2.722, + "step": 11474 + }, + { + "epoch": 0.3402722177742194, + "grad_norm": 0.12514707446098328, + "learning_rate": 0.0007496875387899314, + "loss": 2.7454, + "step": 11475 + }, + { + "epoch": 0.3403018711265309, + "grad_norm": 0.12287483364343643, + "learning_rate": 0.0007496467733953341, + "loss": 2.754, + "step": 11476 + }, + { + "epoch": 0.34033152447884235, + "grad_norm": 0.1329125016927719, + "learning_rate": 0.0007496060057900715, + "loss": 2.7549, + "step": 11477 + }, + { + "epoch": 0.3403611778311538, + "grad_norm": 0.14082665741443634, + "learning_rate": 0.0007495652359745051, + "loss": 2.7247, + "step": 11478 + }, + { + "epoch": 0.3403908311834653, + "grad_norm": 0.13278743624687195, + "learning_rate": 0.0007495244639489958, + "loss": 2.7198, + "step": 11479 + }, + { + "epoch": 0.3404204845357768, + "grad_norm": 0.11247274279594421, + "learning_rate": 0.0007494836897139045, + "loss": 2.7177, + "step": 11480 + }, + { + "epoch": 0.34045013788808826, + "grad_norm": 0.12022727727890015, + "learning_rate": 0.0007494429132695921, + "loss": 2.7594, + "step": 11481 + }, + { + "epoch": 0.34047979124039973, + "grad_norm": 0.14686256647109985, + "learning_rate": 0.0007494021346164199, + "loss": 2.7792, + "step": 11482 + }, + { + "epoch": 0.3405094445927112, + "grad_norm": 0.1574370115995407, + "learning_rate": 0.0007493613537547492, + "loss": 2.7433, + "step": 11483 + }, + { + "epoch": 0.3405390979450227, + "grad_norm": 0.16653650999069214, + "learning_rate": 0.0007493205706849408, + "loss": 2.701, + "step": 11484 + }, + { + "epoch": 0.34056875129733416, + "grad_norm": 0.14524082839488983, + "learning_rate": 0.000749279785407356, + "loss": 2.7328, + "step": 11485 + }, + { + "epoch": 0.34059840464964564, + "grad_norm": 0.1462962031364441, + "learning_rate": 0.0007492389979223558, + "loss": 2.686, + "step": 11486 + }, + { + "epoch": 0.3406280580019571, + "grad_norm": 0.12048440426588058, + "learning_rate": 0.0007491982082303016, + "loss": 2.7414, + "step": 11487 + }, + { + "epoch": 0.3406577113542686, + "grad_norm": 0.1457912176847458, + "learning_rate": 0.0007491574163315543, + "loss": 2.7264, + "step": 11488 + }, + { + "epoch": 0.34068736470658006, + "grad_norm": 0.1491687297821045, + "learning_rate": 0.0007491166222264755, + "loss": 2.701, + "step": 11489 + }, + { + "epoch": 0.34071701805889154, + "grad_norm": 0.13835622370243073, + "learning_rate": 0.0007490758259154263, + "loss": 2.6863, + "step": 11490 + }, + { + "epoch": 0.340746671411203, + "grad_norm": 0.12998579442501068, + "learning_rate": 0.0007490350273987678, + "loss": 2.7157, + "step": 11491 + }, + { + "epoch": 0.3407763247635145, + "grad_norm": 0.13249622285366058, + "learning_rate": 0.0007489942266768614, + "loss": 2.7616, + "step": 11492 + }, + { + "epoch": 0.34080597811582597, + "grad_norm": 0.14895747601985931, + "learning_rate": 0.0007489534237500684, + "loss": 2.7484, + "step": 11493 + }, + { + "epoch": 0.34083563146813745, + "grad_norm": 0.14649496972560883, + "learning_rate": 0.00074891261861875, + "loss": 2.7659, + "step": 11494 + }, + { + "epoch": 0.340865284820449, + "grad_norm": 0.15382538735866547, + "learning_rate": 0.0007488718112832678, + "loss": 2.71, + "step": 11495 + }, + { + "epoch": 0.34089493817276045, + "grad_norm": 0.14758864045143127, + "learning_rate": 0.0007488310017439829, + "loss": 2.7312, + "step": 11496 + }, + { + "epoch": 0.34092459152507193, + "grad_norm": 0.12519271671772003, + "learning_rate": 0.0007487901900012569, + "loss": 2.762, + "step": 11497 + }, + { + "epoch": 0.3409542448773834, + "grad_norm": 0.11472874879837036, + "learning_rate": 0.0007487493760554509, + "loss": 2.7452, + "step": 11498 + }, + { + "epoch": 0.3409838982296949, + "grad_norm": 0.12989866733551025, + "learning_rate": 0.0007487085599069265, + "loss": 2.7397, + "step": 11499 + }, + { + "epoch": 0.34101355158200636, + "grad_norm": 0.13321411609649658, + "learning_rate": 0.0007486677415560451, + "loss": 2.722, + "step": 11500 + }, + { + "epoch": 0.34104320493431783, + "grad_norm": 0.11567031592130661, + "learning_rate": 0.0007486269210031682, + "loss": 2.7094, + "step": 11501 + }, + { + "epoch": 0.3410728582866293, + "grad_norm": 0.1168857142329216, + "learning_rate": 0.0007485860982486572, + "loss": 2.7201, + "step": 11502 + }, + { + "epoch": 0.3411025116389408, + "grad_norm": 0.12149795889854431, + "learning_rate": 0.0007485452732928737, + "loss": 2.755, + "step": 11503 + }, + { + "epoch": 0.34113216499125226, + "grad_norm": 0.14888447523117065, + "learning_rate": 0.0007485044461361792, + "loss": 2.7309, + "step": 11504 + }, + { + "epoch": 0.34116181834356374, + "grad_norm": 0.140602245926857, + "learning_rate": 0.000748463616778935, + "loss": 2.751, + "step": 11505 + }, + { + "epoch": 0.3411914716958752, + "grad_norm": 0.1511814147233963, + "learning_rate": 0.0007484227852215028, + "loss": 2.7285, + "step": 11506 + }, + { + "epoch": 0.3412211250481867, + "grad_norm": 0.14595870673656464, + "learning_rate": 0.0007483819514642445, + "loss": 2.6968, + "step": 11507 + }, + { + "epoch": 0.34125077840049817, + "grad_norm": 0.13917118310928345, + "learning_rate": 0.0007483411155075211, + "loss": 2.7512, + "step": 11508 + }, + { + "epoch": 0.34128043175280964, + "grad_norm": 0.1350087672472, + "learning_rate": 0.0007483002773516946, + "loss": 2.6968, + "step": 11509 + }, + { + "epoch": 0.3413100851051211, + "grad_norm": 0.12310261279344559, + "learning_rate": 0.0007482594369971266, + "loss": 2.7282, + "step": 11510 + }, + { + "epoch": 0.3413397384574326, + "grad_norm": 0.11890719830989838, + "learning_rate": 0.0007482185944441785, + "loss": 2.721, + "step": 11511 + }, + { + "epoch": 0.34136939180974407, + "grad_norm": 0.11999014765024185, + "learning_rate": 0.0007481777496932123, + "loss": 2.7445, + "step": 11512 + }, + { + "epoch": 0.34139904516205555, + "grad_norm": 0.12341298907995224, + "learning_rate": 0.0007481369027445894, + "loss": 2.765, + "step": 11513 + }, + { + "epoch": 0.341428698514367, + "grad_norm": 0.14169861376285553, + "learning_rate": 0.0007480960535986716, + "loss": 2.7497, + "step": 11514 + }, + { + "epoch": 0.34145835186667856, + "grad_norm": 0.14248616993427277, + "learning_rate": 0.0007480552022558208, + "loss": 2.7107, + "step": 11515 + }, + { + "epoch": 0.34148800521899003, + "grad_norm": 0.13843005895614624, + "learning_rate": 0.0007480143487163986, + "loss": 2.7486, + "step": 11516 + }, + { + "epoch": 0.3415176585713015, + "grad_norm": 0.14419341087341309, + "learning_rate": 0.0007479734929807666, + "loss": 2.7441, + "step": 11517 + }, + { + "epoch": 0.341547311923613, + "grad_norm": 0.16983148455619812, + "learning_rate": 0.0007479326350492871, + "loss": 2.7211, + "step": 11518 + }, + { + "epoch": 0.34157696527592446, + "grad_norm": 0.16324347257614136, + "learning_rate": 0.0007478917749223213, + "loss": 2.7301, + "step": 11519 + }, + { + "epoch": 0.34160661862823594, + "grad_norm": 0.13286571204662323, + "learning_rate": 0.0007478509126002313, + "loss": 2.735, + "step": 11520 + }, + { + "epoch": 0.3416362719805474, + "grad_norm": 0.14100205898284912, + "learning_rate": 0.000747810048083379, + "loss": 2.7348, + "step": 11521 + }, + { + "epoch": 0.3416659253328589, + "grad_norm": 0.1014692559838295, + "learning_rate": 0.000747769181372126, + "loss": 2.7472, + "step": 11522 + }, + { + "epoch": 0.34169557868517036, + "grad_norm": 0.11229462176561356, + "learning_rate": 0.0007477283124668345, + "loss": 2.7315, + "step": 11523 + }, + { + "epoch": 0.34172523203748184, + "grad_norm": 0.11645331978797913, + "learning_rate": 0.0007476874413678663, + "loss": 2.7552, + "step": 11524 + }, + { + "epoch": 0.3417548853897933, + "grad_norm": 0.111895851790905, + "learning_rate": 0.0007476465680755832, + "loss": 2.7408, + "step": 11525 + }, + { + "epoch": 0.3417845387421048, + "grad_norm": 0.11536592990159988, + "learning_rate": 0.0007476056925903474, + "loss": 2.738, + "step": 11526 + }, + { + "epoch": 0.34181419209441627, + "grad_norm": 0.10862204432487488, + "learning_rate": 0.0007475648149125205, + "loss": 2.7377, + "step": 11527 + }, + { + "epoch": 0.34184384544672775, + "grad_norm": 0.12970778346061707, + "learning_rate": 0.0007475239350424649, + "loss": 2.7377, + "step": 11528 + }, + { + "epoch": 0.3418734987990392, + "grad_norm": 0.14017395675182343, + "learning_rate": 0.0007474830529805422, + "loss": 2.7493, + "step": 11529 + }, + { + "epoch": 0.3419031521513507, + "grad_norm": 0.15928737819194794, + "learning_rate": 0.0007474421687271147, + "loss": 2.7681, + "step": 11530 + }, + { + "epoch": 0.3419328055036622, + "grad_norm": 0.17657403647899628, + "learning_rate": 0.0007474012822825442, + "loss": 2.7761, + "step": 11531 + }, + { + "epoch": 0.34196245885597365, + "grad_norm": 0.16454118490219116, + "learning_rate": 0.0007473603936471928, + "loss": 2.7247, + "step": 11532 + }, + { + "epoch": 0.3419921122082851, + "grad_norm": 0.13364946842193604, + "learning_rate": 0.0007473195028214229, + "loss": 2.7405, + "step": 11533 + }, + { + "epoch": 0.3420217655605966, + "grad_norm": 0.14921292662620544, + "learning_rate": 0.0007472786098055962, + "loss": 2.7161, + "step": 11534 + }, + { + "epoch": 0.3420514189129081, + "grad_norm": 0.13922333717346191, + "learning_rate": 0.000747237714600075, + "loss": 2.7136, + "step": 11535 + }, + { + "epoch": 0.3420810722652196, + "grad_norm": 0.12876754999160767, + "learning_rate": 0.0007471968172052213, + "loss": 2.7432, + "step": 11536 + }, + { + "epoch": 0.3421107256175311, + "grad_norm": 0.1355116367340088, + "learning_rate": 0.0007471559176213976, + "loss": 2.7575, + "step": 11537 + }, + { + "epoch": 0.34214037896984256, + "grad_norm": 0.11660061776638031, + "learning_rate": 0.0007471150158489656, + "loss": 2.771, + "step": 11538 + }, + { + "epoch": 0.34217003232215404, + "grad_norm": 0.12898539006710052, + "learning_rate": 0.0007470741118882878, + "loss": 2.7283, + "step": 11539 + }, + { + "epoch": 0.3421996856744655, + "grad_norm": 0.13561968505382538, + "learning_rate": 0.0007470332057397262, + "loss": 2.7151, + "step": 11540 + }, + { + "epoch": 0.342229339026777, + "grad_norm": 0.12501783668994904, + "learning_rate": 0.0007469922974036434, + "loss": 2.725, + "step": 11541 + }, + { + "epoch": 0.34225899237908847, + "grad_norm": 0.13181675970554352, + "learning_rate": 0.0007469513868804012, + "loss": 2.7576, + "step": 11542 + }, + { + "epoch": 0.34228864573139994, + "grad_norm": 0.1240285187959671, + "learning_rate": 0.0007469104741703622, + "loss": 2.726, + "step": 11543 + }, + { + "epoch": 0.3423182990837114, + "grad_norm": 0.1375218778848648, + "learning_rate": 0.0007468695592738885, + "loss": 2.7468, + "step": 11544 + }, + { + "epoch": 0.3423479524360229, + "grad_norm": 0.14438986778259277, + "learning_rate": 0.0007468286421913426, + "loss": 2.7484, + "step": 11545 + }, + { + "epoch": 0.34237760578833437, + "grad_norm": 0.1127026379108429, + "learning_rate": 0.0007467877229230866, + "loss": 2.736, + "step": 11546 + }, + { + "epoch": 0.34240725914064585, + "grad_norm": 0.11132306605577469, + "learning_rate": 0.000746746801469483, + "loss": 2.7327, + "step": 11547 + }, + { + "epoch": 0.3424369124929573, + "grad_norm": 0.11285991221666336, + "learning_rate": 0.0007467058778308941, + "loss": 2.7097, + "step": 11548 + }, + { + "epoch": 0.3424665658452688, + "grad_norm": 0.12085546553134918, + "learning_rate": 0.0007466649520076823, + "loss": 2.7176, + "step": 11549 + }, + { + "epoch": 0.3424962191975803, + "grad_norm": 0.13101069629192352, + "learning_rate": 0.0007466240240002099, + "loss": 2.7321, + "step": 11550 + }, + { + "epoch": 0.34252587254989175, + "grad_norm": 0.13521158695220947, + "learning_rate": 0.0007465830938088397, + "loss": 2.7565, + "step": 11551 + }, + { + "epoch": 0.34255552590220323, + "grad_norm": 0.1297629028558731, + "learning_rate": 0.0007465421614339337, + "loss": 2.7298, + "step": 11552 + }, + { + "epoch": 0.3425851792545147, + "grad_norm": 0.12048610299825668, + "learning_rate": 0.0007465012268758545, + "loss": 2.7464, + "step": 11553 + }, + { + "epoch": 0.3426148326068262, + "grad_norm": 0.1152932271361351, + "learning_rate": 0.0007464602901349647, + "loss": 2.7372, + "step": 11554 + }, + { + "epoch": 0.34264448595913766, + "grad_norm": 0.10565918684005737, + "learning_rate": 0.0007464193512116266, + "loss": 2.7255, + "step": 11555 + }, + { + "epoch": 0.34267413931144913, + "grad_norm": 0.11503741890192032, + "learning_rate": 0.0007463784101062031, + "loss": 2.7581, + "step": 11556 + }, + { + "epoch": 0.34270379266376066, + "grad_norm": 0.13354481756687164, + "learning_rate": 0.0007463374668190563, + "loss": 2.7098, + "step": 11557 + }, + { + "epoch": 0.34273344601607214, + "grad_norm": 0.11714072525501251, + "learning_rate": 0.000746296521350549, + "loss": 2.7536, + "step": 11558 + }, + { + "epoch": 0.3427630993683836, + "grad_norm": 0.12806569039821625, + "learning_rate": 0.0007462555737010437, + "loss": 2.7588, + "step": 11559 + }, + { + "epoch": 0.3427927527206951, + "grad_norm": 0.12460825592279434, + "learning_rate": 0.0007462146238709031, + "loss": 2.7471, + "step": 11560 + }, + { + "epoch": 0.34282240607300657, + "grad_norm": 0.1328001469373703, + "learning_rate": 0.0007461736718604897, + "loss": 2.7387, + "step": 11561 + }, + { + "epoch": 0.34285205942531805, + "grad_norm": 0.1316227912902832, + "learning_rate": 0.0007461327176701662, + "loss": 2.7285, + "step": 11562 + }, + { + "epoch": 0.3428817127776295, + "grad_norm": 0.11724887788295746, + "learning_rate": 0.0007460917613002952, + "loss": 2.7466, + "step": 11563 + }, + { + "epoch": 0.342911366129941, + "grad_norm": 0.13569600880146027, + "learning_rate": 0.0007460508027512395, + "loss": 2.754, + "step": 11564 + }, + { + "epoch": 0.3429410194822525, + "grad_norm": 0.16060832142829895, + "learning_rate": 0.0007460098420233617, + "loss": 2.7655, + "step": 11565 + }, + { + "epoch": 0.34297067283456395, + "grad_norm": 0.198057159781456, + "learning_rate": 0.0007459688791170243, + "loss": 2.7405, + "step": 11566 + }, + { + "epoch": 0.3430003261868754, + "grad_norm": 0.18403877317905426, + "learning_rate": 0.0007459279140325905, + "loss": 2.7556, + "step": 11567 + }, + { + "epoch": 0.3430299795391869, + "grad_norm": 0.14517515897750854, + "learning_rate": 0.0007458869467704227, + "loss": 2.7748, + "step": 11568 + }, + { + "epoch": 0.3430596328914984, + "grad_norm": 0.15475071966648102, + "learning_rate": 0.0007458459773308837, + "loss": 2.7298, + "step": 11569 + }, + { + "epoch": 0.34308928624380985, + "grad_norm": 0.1317422240972519, + "learning_rate": 0.0007458050057143365, + "loss": 2.7333, + "step": 11570 + }, + { + "epoch": 0.34311893959612133, + "grad_norm": 0.16155679523944855, + "learning_rate": 0.0007457640319211438, + "loss": 2.7517, + "step": 11571 + }, + { + "epoch": 0.3431485929484328, + "grad_norm": 0.13188423216342926, + "learning_rate": 0.0007457230559516683, + "loss": 2.7331, + "step": 11572 + }, + { + "epoch": 0.3431782463007443, + "grad_norm": 0.14031200110912323, + "learning_rate": 0.000745682077806273, + "loss": 2.7328, + "step": 11573 + }, + { + "epoch": 0.34320789965305576, + "grad_norm": 0.12477949261665344, + "learning_rate": 0.0007456410974853208, + "loss": 2.7698, + "step": 11574 + }, + { + "epoch": 0.34323755300536724, + "grad_norm": 0.12798328697681427, + "learning_rate": 0.0007456001149891744, + "loss": 2.7393, + "step": 11575 + }, + { + "epoch": 0.3432672063576787, + "grad_norm": 0.13536816835403442, + "learning_rate": 0.000745559130318197, + "loss": 2.6899, + "step": 11576 + }, + { + "epoch": 0.3432968597099902, + "grad_norm": 0.15677326917648315, + "learning_rate": 0.0007455181434727513, + "loss": 2.7542, + "step": 11577 + }, + { + "epoch": 0.3433265130623017, + "grad_norm": 0.13704341650009155, + "learning_rate": 0.0007454771544532003, + "loss": 2.7087, + "step": 11578 + }, + { + "epoch": 0.3433561664146132, + "grad_norm": 0.11363730579614639, + "learning_rate": 0.0007454361632599068, + "loss": 2.7273, + "step": 11579 + }, + { + "epoch": 0.34338581976692467, + "grad_norm": 0.12844425439834595, + "learning_rate": 0.0007453951698932341, + "loss": 2.7645, + "step": 11580 + }, + { + "epoch": 0.34341547311923615, + "grad_norm": 0.11965674161911011, + "learning_rate": 0.000745354174353545, + "loss": 2.7251, + "step": 11581 + }, + { + "epoch": 0.3434451264715476, + "grad_norm": 0.11129657179117203, + "learning_rate": 0.0007453131766412026, + "loss": 2.6924, + "step": 11582 + }, + { + "epoch": 0.3434747798238591, + "grad_norm": 0.13746821880340576, + "learning_rate": 0.00074527217675657, + "loss": 2.7236, + "step": 11583 + }, + { + "epoch": 0.3435044331761706, + "grad_norm": 0.14000891149044037, + "learning_rate": 0.00074523117470001, + "loss": 2.7534, + "step": 11584 + }, + { + "epoch": 0.34353408652848205, + "grad_norm": 0.12991833686828613, + "learning_rate": 0.0007451901704718859, + "loss": 2.7345, + "step": 11585 + }, + { + "epoch": 0.34356373988079353, + "grad_norm": 0.12059993296861649, + "learning_rate": 0.0007451491640725607, + "loss": 2.7502, + "step": 11586 + }, + { + "epoch": 0.343593393233105, + "grad_norm": 0.13536198437213898, + "learning_rate": 0.0007451081555023976, + "loss": 2.7447, + "step": 11587 + }, + { + "epoch": 0.3436230465854165, + "grad_norm": 0.16571465134620667, + "learning_rate": 0.0007450671447617598, + "loss": 2.743, + "step": 11588 + }, + { + "epoch": 0.34365269993772796, + "grad_norm": 0.11894584447145462, + "learning_rate": 0.0007450261318510104, + "loss": 2.7556, + "step": 11589 + }, + { + "epoch": 0.34368235329003943, + "grad_norm": 0.11773558706045151, + "learning_rate": 0.0007449851167705125, + "loss": 2.7474, + "step": 11590 + }, + { + "epoch": 0.3437120066423509, + "grad_norm": 0.13049763441085815, + "learning_rate": 0.0007449440995206294, + "loss": 2.7289, + "step": 11591 + }, + { + "epoch": 0.3437416599946624, + "grad_norm": 0.14841139316558838, + "learning_rate": 0.0007449030801017241, + "loss": 2.7755, + "step": 11592 + }, + { + "epoch": 0.34377131334697386, + "grad_norm": 0.1444566696882248, + "learning_rate": 0.0007448620585141599, + "loss": 2.7634, + "step": 11593 + }, + { + "epoch": 0.34380096669928534, + "grad_norm": 0.13838376104831696, + "learning_rate": 0.0007448210347583004, + "loss": 2.7597, + "step": 11594 + }, + { + "epoch": 0.3438306200515968, + "grad_norm": 0.15268906950950623, + "learning_rate": 0.0007447800088345084, + "loss": 2.7333, + "step": 11595 + }, + { + "epoch": 0.3438602734039083, + "grad_norm": 0.15183483064174652, + "learning_rate": 0.0007447389807431476, + "loss": 2.7178, + "step": 11596 + }, + { + "epoch": 0.34388992675621977, + "grad_norm": 0.1305222511291504, + "learning_rate": 0.000744697950484581, + "loss": 2.7664, + "step": 11597 + }, + { + "epoch": 0.34391958010853124, + "grad_norm": 0.10947351902723312, + "learning_rate": 0.000744656918059172, + "loss": 2.7421, + "step": 11598 + }, + { + "epoch": 0.3439492334608428, + "grad_norm": 0.14783087372779846, + "learning_rate": 0.0007446158834672843, + "loss": 2.7184, + "step": 11599 + }, + { + "epoch": 0.34397888681315425, + "grad_norm": 0.14833664894104004, + "learning_rate": 0.0007445748467092806, + "loss": 2.7106, + "step": 11600 + }, + { + "epoch": 0.3440085401654657, + "grad_norm": 0.10897243767976761, + "learning_rate": 0.0007445338077855248, + "loss": 2.7634, + "step": 11601 + }, + { + "epoch": 0.3440381935177772, + "grad_norm": 0.14652405679225922, + "learning_rate": 0.0007444927666963801, + "loss": 2.74, + "step": 11602 + }, + { + "epoch": 0.3440678468700887, + "grad_norm": 0.15184517204761505, + "learning_rate": 0.0007444517234422101, + "loss": 2.7522, + "step": 11603 + }, + { + "epoch": 0.34409750022240015, + "grad_norm": 0.12962305545806885, + "learning_rate": 0.000744410678023378, + "loss": 2.7314, + "step": 11604 + }, + { + "epoch": 0.34412715357471163, + "grad_norm": 0.13282154500484467, + "learning_rate": 0.0007443696304402476, + "loss": 2.7599, + "step": 11605 + }, + { + "epoch": 0.3441568069270231, + "grad_norm": 0.1308126598596573, + "learning_rate": 0.000744328580693182, + "loss": 2.755, + "step": 11606 + }, + { + "epoch": 0.3441864602793346, + "grad_norm": 0.12556374073028564, + "learning_rate": 0.0007442875287825448, + "loss": 2.75, + "step": 11607 + }, + { + "epoch": 0.34421611363164606, + "grad_norm": 0.1250041127204895, + "learning_rate": 0.0007442464747086998, + "loss": 2.7466, + "step": 11608 + }, + { + "epoch": 0.34424576698395754, + "grad_norm": 0.11367888748645782, + "learning_rate": 0.0007442054184720101, + "loss": 2.746, + "step": 11609 + }, + { + "epoch": 0.344275420336269, + "grad_norm": 0.13400709629058838, + "learning_rate": 0.0007441643600728399, + "loss": 2.7324, + "step": 11610 + }, + { + "epoch": 0.3443050736885805, + "grad_norm": 0.14027878642082214, + "learning_rate": 0.000744123299511552, + "loss": 2.7463, + "step": 11611 + }, + { + "epoch": 0.34433472704089196, + "grad_norm": 0.1287236511707306, + "learning_rate": 0.0007440822367885105, + "loss": 2.702, + "step": 11612 + }, + { + "epoch": 0.34436438039320344, + "grad_norm": 0.14558455348014832, + "learning_rate": 0.0007440411719040789, + "loss": 2.7532, + "step": 11613 + }, + { + "epoch": 0.3443940337455149, + "grad_norm": 0.1549263447523117, + "learning_rate": 0.0007440001048586209, + "loss": 2.7338, + "step": 11614 + }, + { + "epoch": 0.3444236870978264, + "grad_norm": 0.1692039519548416, + "learning_rate": 0.0007439590356525, + "loss": 2.706, + "step": 11615 + }, + { + "epoch": 0.34445334045013787, + "grad_norm": 0.14327190816402435, + "learning_rate": 0.0007439179642860802, + "loss": 2.7252, + "step": 11616 + }, + { + "epoch": 0.34448299380244934, + "grad_norm": 0.14184792339801788, + "learning_rate": 0.0007438768907597246, + "loss": 2.7681, + "step": 11617 + }, + { + "epoch": 0.3445126471547608, + "grad_norm": 0.1227576956152916, + "learning_rate": 0.0007438358150737974, + "loss": 2.7683, + "step": 11618 + }, + { + "epoch": 0.34454230050707235, + "grad_norm": 0.1326429694890976, + "learning_rate": 0.0007437947372286622, + "loss": 2.7471, + "step": 11619 + }, + { + "epoch": 0.34457195385938383, + "grad_norm": 0.13787470757961273, + "learning_rate": 0.0007437536572246828, + "loss": 2.733, + "step": 11620 + }, + { + "epoch": 0.3446016072116953, + "grad_norm": 0.1219356581568718, + "learning_rate": 0.0007437125750622229, + "loss": 2.7054, + "step": 11621 + }, + { + "epoch": 0.3446312605640068, + "grad_norm": 0.1306290626525879, + "learning_rate": 0.0007436714907416465, + "loss": 2.7424, + "step": 11622 + }, + { + "epoch": 0.34466091391631826, + "grad_norm": 0.14334729313850403, + "learning_rate": 0.0007436304042633171, + "loss": 2.7122, + "step": 11623 + }, + { + "epoch": 0.34469056726862973, + "grad_norm": 0.13434968888759613, + "learning_rate": 0.0007435893156275985, + "loss": 2.7063, + "step": 11624 + }, + { + "epoch": 0.3447202206209412, + "grad_norm": 0.12256436049938202, + "learning_rate": 0.0007435482248348547, + "loss": 2.7202, + "step": 11625 + }, + { + "epoch": 0.3447498739732527, + "grad_norm": 0.13692384958267212, + "learning_rate": 0.0007435071318854497, + "loss": 2.7457, + "step": 11626 + }, + { + "epoch": 0.34477952732556416, + "grad_norm": 0.122810959815979, + "learning_rate": 0.0007434660367797474, + "loss": 2.7173, + "step": 11627 + }, + { + "epoch": 0.34480918067787564, + "grad_norm": 0.10425833612680435, + "learning_rate": 0.0007434249395181113, + "loss": 2.7503, + "step": 11628 + }, + { + "epoch": 0.3448388340301871, + "grad_norm": 0.12406135350465775, + "learning_rate": 0.0007433838401009056, + "loss": 2.7744, + "step": 11629 + }, + { + "epoch": 0.3448684873824986, + "grad_norm": 0.12415371835231781, + "learning_rate": 0.0007433427385284944, + "loss": 2.7422, + "step": 11630 + }, + { + "epoch": 0.34489814073481007, + "grad_norm": 0.1163254976272583, + "learning_rate": 0.0007433016348012411, + "loss": 2.7492, + "step": 11631 + }, + { + "epoch": 0.34492779408712154, + "grad_norm": 0.11328677088022232, + "learning_rate": 0.0007432605289195104, + "loss": 2.7637, + "step": 11632 + }, + { + "epoch": 0.344957447439433, + "grad_norm": 0.10762957483530045, + "learning_rate": 0.000743219420883666, + "loss": 2.7059, + "step": 11633 + }, + { + "epoch": 0.3449871007917445, + "grad_norm": 0.12043620645999908, + "learning_rate": 0.0007431783106940718, + "loss": 2.7464, + "step": 11634 + }, + { + "epoch": 0.34501675414405597, + "grad_norm": 0.12197861075401306, + "learning_rate": 0.0007431371983510917, + "loss": 2.7022, + "step": 11635 + }, + { + "epoch": 0.34504640749636745, + "grad_norm": 0.12164974212646484, + "learning_rate": 0.0007430960838550902, + "loss": 2.7442, + "step": 11636 + }, + { + "epoch": 0.3450760608486789, + "grad_norm": 0.11139117926359177, + "learning_rate": 0.0007430549672064311, + "loss": 2.7472, + "step": 11637 + }, + { + "epoch": 0.3451057142009904, + "grad_norm": 0.13854801654815674, + "learning_rate": 0.0007430138484054786, + "loss": 2.691, + "step": 11638 + }, + { + "epoch": 0.3451353675533019, + "grad_norm": 0.12621715664863586, + "learning_rate": 0.0007429727274525967, + "loss": 2.7589, + "step": 11639 + }, + { + "epoch": 0.3451650209056134, + "grad_norm": 0.1361386626958847, + "learning_rate": 0.0007429316043481496, + "loss": 2.7318, + "step": 11640 + }, + { + "epoch": 0.3451946742579249, + "grad_norm": 0.16064287722110748, + "learning_rate": 0.0007428904790925014, + "loss": 2.724, + "step": 11641 + }, + { + "epoch": 0.34522432761023636, + "grad_norm": 0.1469525247812271, + "learning_rate": 0.0007428493516860164, + "loss": 2.7124, + "step": 11642 + }, + { + "epoch": 0.34525398096254784, + "grad_norm": 0.12781429290771484, + "learning_rate": 0.0007428082221290586, + "loss": 2.7356, + "step": 11643 + }, + { + "epoch": 0.3452836343148593, + "grad_norm": 0.12949146330356598, + "learning_rate": 0.0007427670904219925, + "loss": 2.7645, + "step": 11644 + }, + { + "epoch": 0.3453132876671708, + "grad_norm": 0.1411316692829132, + "learning_rate": 0.000742725956565182, + "loss": 2.7268, + "step": 11645 + }, + { + "epoch": 0.34534294101948226, + "grad_norm": 0.15751710534095764, + "learning_rate": 0.0007426848205589915, + "loss": 2.7032, + "step": 11646 + }, + { + "epoch": 0.34537259437179374, + "grad_norm": 0.15995870530605316, + "learning_rate": 0.0007426436824037853, + "loss": 2.7144, + "step": 11647 + }, + { + "epoch": 0.3454022477241052, + "grad_norm": 0.1401594877243042, + "learning_rate": 0.0007426025420999275, + "loss": 2.7614, + "step": 11648 + }, + { + "epoch": 0.3454319010764167, + "grad_norm": 0.12614662945270538, + "learning_rate": 0.0007425613996477828, + "loss": 2.7231, + "step": 11649 + }, + { + "epoch": 0.34546155442872817, + "grad_norm": 0.1491500586271286, + "learning_rate": 0.000742520255047715, + "loss": 2.7192, + "step": 11650 + }, + { + "epoch": 0.34549120778103964, + "grad_norm": 0.13513724505901337, + "learning_rate": 0.0007424791083000888, + "loss": 2.7538, + "step": 11651 + }, + { + "epoch": 0.3455208611333511, + "grad_norm": 0.1335294544696808, + "learning_rate": 0.0007424379594052686, + "loss": 2.7324, + "step": 11652 + }, + { + "epoch": 0.3455505144856626, + "grad_norm": 0.13912558555603027, + "learning_rate": 0.0007423968083636185, + "loss": 2.7182, + "step": 11653 + }, + { + "epoch": 0.3455801678379741, + "grad_norm": 0.15734031796455383, + "learning_rate": 0.0007423556551755032, + "loss": 2.745, + "step": 11654 + }, + { + "epoch": 0.34560982119028555, + "grad_norm": 0.1914864480495453, + "learning_rate": 0.000742314499841287, + "loss": 2.7469, + "step": 11655 + }, + { + "epoch": 0.345639474542597, + "grad_norm": 0.19358257949352264, + "learning_rate": 0.0007422733423613342, + "loss": 2.7213, + "step": 11656 + }, + { + "epoch": 0.3456691278949085, + "grad_norm": 0.15653954446315765, + "learning_rate": 0.0007422321827360093, + "loss": 2.7402, + "step": 11657 + }, + { + "epoch": 0.34569878124722, + "grad_norm": 0.13584642112255096, + "learning_rate": 0.0007421910209656769, + "loss": 2.7302, + "step": 11658 + }, + { + "epoch": 0.34572843459953145, + "grad_norm": 0.15139691531658173, + "learning_rate": 0.0007421498570507015, + "loss": 2.7625, + "step": 11659 + }, + { + "epoch": 0.34575808795184293, + "grad_norm": 0.13735373318195343, + "learning_rate": 0.0007421086909914477, + "loss": 2.7385, + "step": 11660 + }, + { + "epoch": 0.34578774130415446, + "grad_norm": 0.12408334016799927, + "learning_rate": 0.0007420675227882796, + "loss": 2.7369, + "step": 11661 + }, + { + "epoch": 0.34581739465646594, + "grad_norm": 0.14363574981689453, + "learning_rate": 0.0007420263524415622, + "loss": 2.7259, + "step": 11662 + }, + { + "epoch": 0.3458470480087774, + "grad_norm": 0.1371779888868332, + "learning_rate": 0.0007419851799516597, + "loss": 2.7562, + "step": 11663 + }, + { + "epoch": 0.3458767013610889, + "grad_norm": 0.13013985753059387, + "learning_rate": 0.000741944005318937, + "loss": 2.7411, + "step": 11664 + }, + { + "epoch": 0.34590635471340037, + "grad_norm": 0.13714034855365753, + "learning_rate": 0.0007419028285437588, + "loss": 2.7142, + "step": 11665 + }, + { + "epoch": 0.34593600806571184, + "grad_norm": 0.13166430592536926, + "learning_rate": 0.0007418616496264892, + "loss": 2.7435, + "step": 11666 + }, + { + "epoch": 0.3459656614180233, + "grad_norm": 0.12346676737070084, + "learning_rate": 0.0007418204685674934, + "loss": 2.7486, + "step": 11667 + }, + { + "epoch": 0.3459953147703348, + "grad_norm": 0.11394096165895462, + "learning_rate": 0.0007417792853671358, + "loss": 2.7439, + "step": 11668 + }, + { + "epoch": 0.34602496812264627, + "grad_norm": 0.135704904794693, + "learning_rate": 0.000741738100025781, + "loss": 2.7471, + "step": 11669 + }, + { + "epoch": 0.34605462147495775, + "grad_norm": 0.10670213401317596, + "learning_rate": 0.0007416969125437941, + "loss": 2.7463, + "step": 11670 + }, + { + "epoch": 0.3460842748272692, + "grad_norm": 0.11511997133493423, + "learning_rate": 0.0007416557229215394, + "loss": 2.7209, + "step": 11671 + }, + { + "epoch": 0.3461139281795807, + "grad_norm": 0.10860351473093033, + "learning_rate": 0.0007416145311593818, + "loss": 2.6829, + "step": 11672 + }, + { + "epoch": 0.3461435815318922, + "grad_norm": 0.12202882021665573, + "learning_rate": 0.000741573337257686, + "loss": 2.7383, + "step": 11673 + }, + { + "epoch": 0.34617323488420365, + "grad_norm": 0.1144893616437912, + "learning_rate": 0.0007415321412168169, + "loss": 2.7538, + "step": 11674 + }, + { + "epoch": 0.3462028882365151, + "grad_norm": 0.121971994638443, + "learning_rate": 0.0007414909430371393, + "loss": 2.7422, + "step": 11675 + }, + { + "epoch": 0.3462325415888266, + "grad_norm": 0.12198992073535919, + "learning_rate": 0.0007414497427190177, + "loss": 2.7523, + "step": 11676 + }, + { + "epoch": 0.3462621949411381, + "grad_norm": 0.12051700055599213, + "learning_rate": 0.0007414085402628175, + "loss": 2.7622, + "step": 11677 + }, + { + "epoch": 0.34629184829344956, + "grad_norm": 0.13103623688220978, + "learning_rate": 0.000741367335668903, + "loss": 2.736, + "step": 11678 + }, + { + "epoch": 0.34632150164576103, + "grad_norm": 0.1671256273984909, + "learning_rate": 0.0007413261289376394, + "loss": 2.7504, + "step": 11679 + }, + { + "epoch": 0.3463511549980725, + "grad_norm": 0.15309111773967743, + "learning_rate": 0.0007412849200693915, + "loss": 2.7505, + "step": 11680 + }, + { + "epoch": 0.346380808350384, + "grad_norm": 0.11073590070009232, + "learning_rate": 0.0007412437090645242, + "loss": 2.7496, + "step": 11681 + }, + { + "epoch": 0.3464104617026955, + "grad_norm": 0.11255805939435959, + "learning_rate": 0.0007412024959234025, + "loss": 2.7499, + "step": 11682 + }, + { + "epoch": 0.346440115055007, + "grad_norm": 0.12239613384008408, + "learning_rate": 0.0007411612806463912, + "loss": 2.7391, + "step": 11683 + }, + { + "epoch": 0.34646976840731847, + "grad_norm": 0.13217368721961975, + "learning_rate": 0.0007411200632338555, + "loss": 2.7424, + "step": 11684 + }, + { + "epoch": 0.34649942175962994, + "grad_norm": 0.1289396435022354, + "learning_rate": 0.0007410788436861602, + "loss": 2.7388, + "step": 11685 + }, + { + "epoch": 0.3465290751119414, + "grad_norm": 0.1394442319869995, + "learning_rate": 0.0007410376220036703, + "loss": 2.739, + "step": 11686 + }, + { + "epoch": 0.3465587284642529, + "grad_norm": 0.16068221628665924, + "learning_rate": 0.000740996398186751, + "loss": 2.7526, + "step": 11687 + }, + { + "epoch": 0.3465883818165644, + "grad_norm": 0.15587696433067322, + "learning_rate": 0.0007409551722357672, + "loss": 2.7563, + "step": 11688 + }, + { + "epoch": 0.34661803516887585, + "grad_norm": 0.14910681545734406, + "learning_rate": 0.0007409139441510839, + "loss": 2.6992, + "step": 11689 + }, + { + "epoch": 0.3466476885211873, + "grad_norm": 0.14034585654735565, + "learning_rate": 0.0007408727139330663, + "loss": 2.7206, + "step": 11690 + }, + { + "epoch": 0.3466773418734988, + "grad_norm": 0.14132378995418549, + "learning_rate": 0.0007408314815820794, + "loss": 2.7407, + "step": 11691 + }, + { + "epoch": 0.3467069952258103, + "grad_norm": 0.1362958401441574, + "learning_rate": 0.0007407902470984887, + "loss": 2.7203, + "step": 11692 + }, + { + "epoch": 0.34673664857812175, + "grad_norm": 0.11711172014474869, + "learning_rate": 0.0007407490104826587, + "loss": 2.7442, + "step": 11693 + }, + { + "epoch": 0.34676630193043323, + "grad_norm": 0.1279473751783371, + "learning_rate": 0.000740707771734955, + "loss": 2.7392, + "step": 11694 + }, + { + "epoch": 0.3467959552827447, + "grad_norm": 0.15711276233196259, + "learning_rate": 0.0007406665308557426, + "loss": 2.7257, + "step": 11695 + }, + { + "epoch": 0.3468256086350562, + "grad_norm": 0.13288545608520508, + "learning_rate": 0.0007406252878453867, + "loss": 2.723, + "step": 11696 + }, + { + "epoch": 0.34685526198736766, + "grad_norm": 0.12226122617721558, + "learning_rate": 0.0007405840427042528, + "loss": 2.7379, + "step": 11697 + }, + { + "epoch": 0.34688491533967913, + "grad_norm": 0.13427947461605072, + "learning_rate": 0.0007405427954327059, + "loss": 2.7333, + "step": 11698 + }, + { + "epoch": 0.3469145686919906, + "grad_norm": 0.13602635264396667, + "learning_rate": 0.0007405015460311111, + "loss": 2.7017, + "step": 11699 + }, + { + "epoch": 0.3469442220443021, + "grad_norm": 0.15021906793117523, + "learning_rate": 0.0007404602944998339, + "loss": 2.7511, + "step": 11700 + }, + { + "epoch": 0.34697387539661356, + "grad_norm": 0.15956728160381317, + "learning_rate": 0.0007404190408392395, + "loss": 2.741, + "step": 11701 + }, + { + "epoch": 0.34700352874892504, + "grad_norm": 0.13181094825267792, + "learning_rate": 0.0007403777850496931, + "loss": 2.7569, + "step": 11702 + }, + { + "epoch": 0.34703318210123657, + "grad_norm": 0.1334334909915924, + "learning_rate": 0.0007403365271315604, + "loss": 2.7174, + "step": 11703 + }, + { + "epoch": 0.34706283545354805, + "grad_norm": 0.12573140859603882, + "learning_rate": 0.0007402952670852065, + "loss": 2.7727, + "step": 11704 + }, + { + "epoch": 0.3470924888058595, + "grad_norm": 0.1269982010126114, + "learning_rate": 0.0007402540049109966, + "loss": 2.7368, + "step": 11705 + }, + { + "epoch": 0.347122142158171, + "grad_norm": 0.13976801931858063, + "learning_rate": 0.0007402127406092962, + "loss": 2.7604, + "step": 11706 + }, + { + "epoch": 0.3471517955104825, + "grad_norm": 0.12982644140720367, + "learning_rate": 0.0007401714741804709, + "loss": 2.7135, + "step": 11707 + }, + { + "epoch": 0.34718144886279395, + "grad_norm": 0.12874150276184082, + "learning_rate": 0.0007401302056248859, + "loss": 2.7309, + "step": 11708 + }, + { + "epoch": 0.3472111022151054, + "grad_norm": 0.11511972546577454, + "learning_rate": 0.0007400889349429067, + "loss": 2.7066, + "step": 11709 + }, + { + "epoch": 0.3472407555674169, + "grad_norm": 0.11924783885478973, + "learning_rate": 0.0007400476621348989, + "loss": 2.7692, + "step": 11710 + }, + { + "epoch": 0.3472704089197284, + "grad_norm": 0.12064731121063232, + "learning_rate": 0.0007400063872012277, + "loss": 2.6905, + "step": 11711 + }, + { + "epoch": 0.34730006227203986, + "grad_norm": 0.12753324210643768, + "learning_rate": 0.0007399651101422588, + "loss": 2.7432, + "step": 11712 + }, + { + "epoch": 0.34732971562435133, + "grad_norm": 0.13940607011318207, + "learning_rate": 0.0007399238309583575, + "loss": 2.7616, + "step": 11713 + }, + { + "epoch": 0.3473593689766628, + "grad_norm": 0.13065747916698456, + "learning_rate": 0.0007398825496498898, + "loss": 2.7344, + "step": 11714 + }, + { + "epoch": 0.3473890223289743, + "grad_norm": 0.1226908415555954, + "learning_rate": 0.0007398412662172207, + "loss": 2.7681, + "step": 11715 + }, + { + "epoch": 0.34741867568128576, + "grad_norm": 0.14553393423557281, + "learning_rate": 0.0007397999806607161, + "loss": 2.7668, + "step": 11716 + }, + { + "epoch": 0.34744832903359724, + "grad_norm": 0.14506953954696655, + "learning_rate": 0.0007397586929807416, + "loss": 2.7232, + "step": 11717 + }, + { + "epoch": 0.3474779823859087, + "grad_norm": 0.1524776965379715, + "learning_rate": 0.0007397174031776626, + "loss": 2.7177, + "step": 11718 + }, + { + "epoch": 0.3475076357382202, + "grad_norm": 0.13748683035373688, + "learning_rate": 0.0007396761112518449, + "loss": 2.7151, + "step": 11719 + }, + { + "epoch": 0.34753728909053166, + "grad_norm": 0.12316663563251495, + "learning_rate": 0.0007396348172036542, + "loss": 2.7617, + "step": 11720 + }, + { + "epoch": 0.34756694244284314, + "grad_norm": 0.15090607106685638, + "learning_rate": 0.0007395935210334558, + "loss": 2.768, + "step": 11721 + }, + { + "epoch": 0.3475965957951546, + "grad_norm": 0.15412114560604095, + "learning_rate": 0.0007395522227416158, + "loss": 2.739, + "step": 11722 + }, + { + "epoch": 0.34762624914746615, + "grad_norm": 0.14019949734210968, + "learning_rate": 0.0007395109223284997, + "loss": 2.7184, + "step": 11723 + }, + { + "epoch": 0.3476559024997776, + "grad_norm": 0.1270146667957306, + "learning_rate": 0.0007394696197944733, + "loss": 2.7307, + "step": 11724 + }, + { + "epoch": 0.3476855558520891, + "grad_norm": 0.1371372491121292, + "learning_rate": 0.0007394283151399025, + "loss": 2.7449, + "step": 11725 + }, + { + "epoch": 0.3477152092044006, + "grad_norm": 0.12980183959007263, + "learning_rate": 0.0007393870083651526, + "loss": 2.7638, + "step": 11726 + }, + { + "epoch": 0.34774486255671205, + "grad_norm": 0.14110009372234344, + "learning_rate": 0.0007393456994705898, + "loss": 2.7747, + "step": 11727 + }, + { + "epoch": 0.34777451590902353, + "grad_norm": 0.12767313420772552, + "learning_rate": 0.0007393043884565795, + "loss": 2.7443, + "step": 11728 + }, + { + "epoch": 0.347804169261335, + "grad_norm": 0.11627896130084991, + "learning_rate": 0.000739263075323488, + "loss": 2.7283, + "step": 11729 + }, + { + "epoch": 0.3478338226136465, + "grad_norm": 0.12642519176006317, + "learning_rate": 0.000739221760071681, + "loss": 2.7418, + "step": 11730 + }, + { + "epoch": 0.34786347596595796, + "grad_norm": 0.13010574877262115, + "learning_rate": 0.0007391804427015241, + "loss": 2.755, + "step": 11731 + }, + { + "epoch": 0.34789312931826943, + "grad_norm": 0.13257978856563568, + "learning_rate": 0.0007391391232133834, + "loss": 2.7348, + "step": 11732 + }, + { + "epoch": 0.3479227826705809, + "grad_norm": 0.14075887203216553, + "learning_rate": 0.0007390978016076247, + "loss": 2.7541, + "step": 11733 + }, + { + "epoch": 0.3479524360228924, + "grad_norm": 0.1447548121213913, + "learning_rate": 0.0007390564778846137, + "loss": 2.7515, + "step": 11734 + }, + { + "epoch": 0.34798208937520386, + "grad_norm": 0.12632966041564941, + "learning_rate": 0.0007390151520447166, + "loss": 2.7618, + "step": 11735 + }, + { + "epoch": 0.34801174272751534, + "grad_norm": 0.12039291113615036, + "learning_rate": 0.0007389738240882996, + "loss": 2.7211, + "step": 11736 + }, + { + "epoch": 0.3480413960798268, + "grad_norm": 0.10683553665876389, + "learning_rate": 0.0007389324940157283, + "loss": 2.7544, + "step": 11737 + }, + { + "epoch": 0.3480710494321383, + "grad_norm": 0.12358329445123672, + "learning_rate": 0.0007388911618273685, + "loss": 2.7396, + "step": 11738 + }, + { + "epoch": 0.34810070278444977, + "grad_norm": 0.14636585116386414, + "learning_rate": 0.0007388498275235866, + "loss": 2.7504, + "step": 11739 + }, + { + "epoch": 0.34813035613676124, + "grad_norm": 0.16536985337734222, + "learning_rate": 0.0007388084911047485, + "loss": 2.7403, + "step": 11740 + }, + { + "epoch": 0.3481600094890727, + "grad_norm": 0.1726844608783722, + "learning_rate": 0.0007387671525712203, + "loss": 2.7353, + "step": 11741 + }, + { + "epoch": 0.3481896628413842, + "grad_norm": 0.16912122070789337, + "learning_rate": 0.0007387258119233679, + "loss": 2.7433, + "step": 11742 + }, + { + "epoch": 0.34821931619369567, + "grad_norm": 0.16521814465522766, + "learning_rate": 0.0007386844691615576, + "loss": 2.7268, + "step": 11743 + }, + { + "epoch": 0.3482489695460072, + "grad_norm": 0.14940764009952545, + "learning_rate": 0.0007386431242861551, + "loss": 2.7376, + "step": 11744 + }, + { + "epoch": 0.3482786228983187, + "grad_norm": 0.1196475476026535, + "learning_rate": 0.0007386017772975269, + "loss": 2.7174, + "step": 11745 + }, + { + "epoch": 0.34830827625063016, + "grad_norm": 0.12270255386829376, + "learning_rate": 0.0007385604281960389, + "loss": 2.7288, + "step": 11746 + }, + { + "epoch": 0.34833792960294163, + "grad_norm": 0.14370714128017426, + "learning_rate": 0.0007385190769820574, + "loss": 2.7383, + "step": 11747 + }, + { + "epoch": 0.3483675829552531, + "grad_norm": 0.12979711592197418, + "learning_rate": 0.0007384777236559487, + "loss": 2.7169, + "step": 11748 + }, + { + "epoch": 0.3483972363075646, + "grad_norm": 0.117529958486557, + "learning_rate": 0.0007384363682180786, + "loss": 2.7246, + "step": 11749 + }, + { + "epoch": 0.34842688965987606, + "grad_norm": 0.11782438308000565, + "learning_rate": 0.0007383950106688137, + "loss": 2.7088, + "step": 11750 + }, + { + "epoch": 0.34845654301218754, + "grad_norm": 0.13166023790836334, + "learning_rate": 0.00073835365100852, + "loss": 2.7515, + "step": 11751 + }, + { + "epoch": 0.348486196364499, + "grad_norm": 0.11523763835430145, + "learning_rate": 0.0007383122892375638, + "loss": 2.7348, + "step": 11752 + }, + { + "epoch": 0.3485158497168105, + "grad_norm": 0.1206141859292984, + "learning_rate": 0.0007382709253563114, + "loss": 2.7283, + "step": 11753 + }, + { + "epoch": 0.34854550306912196, + "grad_norm": 0.1396579146385193, + "learning_rate": 0.0007382295593651289, + "loss": 2.7177, + "step": 11754 + }, + { + "epoch": 0.34857515642143344, + "grad_norm": 0.13898569345474243, + "learning_rate": 0.000738188191264383, + "loss": 2.7611, + "step": 11755 + }, + { + "epoch": 0.3486048097737449, + "grad_norm": 0.1270507425069809, + "learning_rate": 0.0007381468210544397, + "loss": 2.7183, + "step": 11756 + }, + { + "epoch": 0.3486344631260564, + "grad_norm": 0.13116838037967682, + "learning_rate": 0.0007381054487356653, + "loss": 2.7117, + "step": 11757 + }, + { + "epoch": 0.34866411647836787, + "grad_norm": 0.1371908038854599, + "learning_rate": 0.0007380640743084265, + "loss": 2.7335, + "step": 11758 + }, + { + "epoch": 0.34869376983067935, + "grad_norm": 0.1361478716135025, + "learning_rate": 0.0007380226977730893, + "loss": 2.7313, + "step": 11759 + }, + { + "epoch": 0.3487234231829908, + "grad_norm": 0.15153513848781586, + "learning_rate": 0.0007379813191300202, + "loss": 2.7402, + "step": 11760 + }, + { + "epoch": 0.3487530765353023, + "grad_norm": 0.12001300603151321, + "learning_rate": 0.0007379399383795857, + "loss": 2.7592, + "step": 11761 + }, + { + "epoch": 0.3487827298876138, + "grad_norm": 0.1365043967962265, + "learning_rate": 0.0007378985555221523, + "loss": 2.7076, + "step": 11762 + }, + { + "epoch": 0.34881238323992525, + "grad_norm": 0.13768166303634644, + "learning_rate": 0.0007378571705580864, + "loss": 2.7402, + "step": 11763 + }, + { + "epoch": 0.3488420365922367, + "grad_norm": 0.15447571873664856, + "learning_rate": 0.0007378157834877543, + "loss": 2.7593, + "step": 11764 + }, + { + "epoch": 0.34887168994454826, + "grad_norm": 0.1631706804037094, + "learning_rate": 0.0007377743943115227, + "loss": 2.6956, + "step": 11765 + }, + { + "epoch": 0.34890134329685973, + "grad_norm": 0.15439005196094513, + "learning_rate": 0.0007377330030297579, + "loss": 2.7782, + "step": 11766 + }, + { + "epoch": 0.3489309966491712, + "grad_norm": 0.1351710855960846, + "learning_rate": 0.0007376916096428266, + "loss": 2.7202, + "step": 11767 + }, + { + "epoch": 0.3489606500014827, + "grad_norm": 0.15661993622779846, + "learning_rate": 0.0007376502141510956, + "loss": 2.7385, + "step": 11768 + }, + { + "epoch": 0.34899030335379416, + "grad_norm": 0.17037497460842133, + "learning_rate": 0.0007376088165549308, + "loss": 2.7462, + "step": 11769 + }, + { + "epoch": 0.34901995670610564, + "grad_norm": 0.15113238990306854, + "learning_rate": 0.0007375674168546993, + "loss": 2.7266, + "step": 11770 + }, + { + "epoch": 0.3490496100584171, + "grad_norm": 0.16685813665390015, + "learning_rate": 0.0007375260150507674, + "loss": 2.74, + "step": 11771 + }, + { + "epoch": 0.3490792634107286, + "grad_norm": 0.1444714516401291, + "learning_rate": 0.0007374846111435021, + "loss": 2.6869, + "step": 11772 + }, + { + "epoch": 0.34910891676304007, + "grad_norm": 0.15805207192897797, + "learning_rate": 0.0007374432051332695, + "loss": 2.7248, + "step": 11773 + }, + { + "epoch": 0.34913857011535154, + "grad_norm": 0.15040570497512817, + "learning_rate": 0.0007374017970204368, + "loss": 2.7125, + "step": 11774 + }, + { + "epoch": 0.349168223467663, + "grad_norm": 0.13969266414642334, + "learning_rate": 0.0007373603868053703, + "loss": 2.7097, + "step": 11775 + }, + { + "epoch": 0.3491978768199745, + "grad_norm": 0.1331205666065216, + "learning_rate": 0.000737318974488437, + "loss": 2.7198, + "step": 11776 + }, + { + "epoch": 0.34922753017228597, + "grad_norm": 0.13404549658298492, + "learning_rate": 0.0007372775600700032, + "loss": 2.7138, + "step": 11777 + }, + { + "epoch": 0.34925718352459745, + "grad_norm": 0.13791072368621826, + "learning_rate": 0.0007372361435504361, + "loss": 2.731, + "step": 11778 + }, + { + "epoch": 0.3492868368769089, + "grad_norm": 0.11916396766901016, + "learning_rate": 0.0007371947249301021, + "loss": 2.7095, + "step": 11779 + }, + { + "epoch": 0.3493164902292204, + "grad_norm": 0.1403256058692932, + "learning_rate": 0.0007371533042093682, + "loss": 2.7601, + "step": 11780 + }, + { + "epoch": 0.3493461435815319, + "grad_norm": 0.13103803992271423, + "learning_rate": 0.0007371118813886009, + "loss": 2.7255, + "step": 11781 + }, + { + "epoch": 0.34937579693384335, + "grad_norm": 0.11886154860258102, + "learning_rate": 0.0007370704564681673, + "loss": 2.7646, + "step": 11782 + }, + { + "epoch": 0.34940545028615483, + "grad_norm": 0.11866918206214905, + "learning_rate": 0.0007370290294484343, + "loss": 2.7389, + "step": 11783 + }, + { + "epoch": 0.3494351036384663, + "grad_norm": 0.15052169561386108, + "learning_rate": 0.0007369876003297684, + "loss": 2.7656, + "step": 11784 + }, + { + "epoch": 0.3494647569907778, + "grad_norm": 0.15689869225025177, + "learning_rate": 0.0007369461691125366, + "loss": 2.7437, + "step": 11785 + }, + { + "epoch": 0.3494944103430893, + "grad_norm": 0.15843260288238525, + "learning_rate": 0.0007369047357971057, + "loss": 2.7386, + "step": 11786 + }, + { + "epoch": 0.3495240636954008, + "grad_norm": 0.15711572766304016, + "learning_rate": 0.0007368633003838428, + "loss": 2.7525, + "step": 11787 + }, + { + "epoch": 0.34955371704771226, + "grad_norm": 0.13509629666805267, + "learning_rate": 0.0007368218628731148, + "loss": 2.7302, + "step": 11788 + }, + { + "epoch": 0.34958337040002374, + "grad_norm": 0.12003085762262344, + "learning_rate": 0.0007367804232652885, + "loss": 2.75, + "step": 11789 + }, + { + "epoch": 0.3496130237523352, + "grad_norm": 0.13759906589984894, + "learning_rate": 0.0007367389815607308, + "loss": 2.7367, + "step": 11790 + }, + { + "epoch": 0.3496426771046467, + "grad_norm": 0.12650689482688904, + "learning_rate": 0.000736697537759809, + "loss": 2.718, + "step": 11791 + }, + { + "epoch": 0.34967233045695817, + "grad_norm": 0.12315484881401062, + "learning_rate": 0.0007366560918628897, + "loss": 2.7365, + "step": 11792 + }, + { + "epoch": 0.34970198380926965, + "grad_norm": 0.1446608006954193, + "learning_rate": 0.0007366146438703402, + "loss": 2.7294, + "step": 11793 + }, + { + "epoch": 0.3497316371615811, + "grad_norm": 0.139958918094635, + "learning_rate": 0.0007365731937825273, + "loss": 2.765, + "step": 11794 + }, + { + "epoch": 0.3497612905138926, + "grad_norm": 0.1355694830417633, + "learning_rate": 0.0007365317415998181, + "loss": 2.7867, + "step": 11795 + }, + { + "epoch": 0.3497909438662041, + "grad_norm": 0.13976381719112396, + "learning_rate": 0.00073649028732258, + "loss": 2.7371, + "step": 11796 + }, + { + "epoch": 0.34982059721851555, + "grad_norm": 0.13824811577796936, + "learning_rate": 0.0007364488309511796, + "loss": 2.7525, + "step": 11797 + }, + { + "epoch": 0.349850250570827, + "grad_norm": 0.15420770645141602, + "learning_rate": 0.0007364073724859843, + "loss": 2.7498, + "step": 11798 + }, + { + "epoch": 0.3498799039231385, + "grad_norm": 0.11848225444555283, + "learning_rate": 0.000736365911927361, + "loss": 2.7118, + "step": 11799 + }, + { + "epoch": 0.34990955727545, + "grad_norm": 0.13195203244686127, + "learning_rate": 0.0007363244492756771, + "loss": 2.7331, + "step": 11800 + }, + { + "epoch": 0.34993921062776145, + "grad_norm": 0.12065096199512482, + "learning_rate": 0.0007362829845312997, + "loss": 2.7234, + "step": 11801 + }, + { + "epoch": 0.34996886398007293, + "grad_norm": 0.12261360138654709, + "learning_rate": 0.000736241517694596, + "loss": 2.7325, + "step": 11802 + }, + { + "epoch": 0.3499985173323844, + "grad_norm": 0.11911635845899582, + "learning_rate": 0.0007362000487659328, + "loss": 2.7545, + "step": 11803 + }, + { + "epoch": 0.3500281706846959, + "grad_norm": 0.12102721631526947, + "learning_rate": 0.0007361585777456779, + "loss": 2.7524, + "step": 11804 + }, + { + "epoch": 0.35005782403700736, + "grad_norm": 0.128990039229393, + "learning_rate": 0.0007361171046341979, + "loss": 2.7448, + "step": 11805 + }, + { + "epoch": 0.35008747738931884, + "grad_norm": 0.12267189472913742, + "learning_rate": 0.0007360756294318608, + "loss": 2.7493, + "step": 11806 + }, + { + "epoch": 0.35011713074163037, + "grad_norm": 0.11774863302707672, + "learning_rate": 0.0007360341521390332, + "loss": 2.7439, + "step": 11807 + }, + { + "epoch": 0.35014678409394184, + "grad_norm": 0.10923390835523605, + "learning_rate": 0.0007359926727560828, + "loss": 2.7518, + "step": 11808 + }, + { + "epoch": 0.3501764374462533, + "grad_norm": 0.11841530352830887, + "learning_rate": 0.0007359511912833768, + "loss": 2.7771, + "step": 11809 + }, + { + "epoch": 0.3502060907985648, + "grad_norm": 0.13272875547409058, + "learning_rate": 0.0007359097077212825, + "loss": 2.7449, + "step": 11810 + }, + { + "epoch": 0.35023574415087627, + "grad_norm": 0.14295311272144318, + "learning_rate": 0.0007358682220701671, + "loss": 2.751, + "step": 11811 + }, + { + "epoch": 0.35026539750318775, + "grad_norm": 0.13025425374507904, + "learning_rate": 0.0007358267343303983, + "loss": 2.6668, + "step": 11812 + }, + { + "epoch": 0.3502950508554992, + "grad_norm": 0.1245478019118309, + "learning_rate": 0.0007357852445023432, + "loss": 2.7339, + "step": 11813 + }, + { + "epoch": 0.3503247042078107, + "grad_norm": 0.13143660128116608, + "learning_rate": 0.0007357437525863693, + "loss": 2.7287, + "step": 11814 + }, + { + "epoch": 0.3503543575601222, + "grad_norm": 0.14661307632923126, + "learning_rate": 0.000735702258582844, + "loss": 2.723, + "step": 11815 + }, + { + "epoch": 0.35038401091243365, + "grad_norm": 0.16667115688323975, + "learning_rate": 0.0007356607624921347, + "loss": 2.7636, + "step": 11816 + }, + { + "epoch": 0.35041366426474513, + "grad_norm": 0.12728461623191833, + "learning_rate": 0.0007356192643146089, + "loss": 2.7428, + "step": 11817 + }, + { + "epoch": 0.3504433176170566, + "grad_norm": 0.11131129413843155, + "learning_rate": 0.0007355777640506341, + "loss": 2.7174, + "step": 11818 + }, + { + "epoch": 0.3504729709693681, + "grad_norm": 0.12733934819698334, + "learning_rate": 0.0007355362617005778, + "loss": 2.7507, + "step": 11819 + }, + { + "epoch": 0.35050262432167956, + "grad_norm": 0.129606232047081, + "learning_rate": 0.0007354947572648074, + "loss": 2.7476, + "step": 11820 + }, + { + "epoch": 0.35053227767399103, + "grad_norm": 0.11231893301010132, + "learning_rate": 0.0007354532507436906, + "loss": 2.7296, + "step": 11821 + }, + { + "epoch": 0.3505619310263025, + "grad_norm": 0.1288909763097763, + "learning_rate": 0.0007354117421375948, + "loss": 2.72, + "step": 11822 + }, + { + "epoch": 0.350591584378614, + "grad_norm": 0.12969985604286194, + "learning_rate": 0.0007353702314468878, + "loss": 2.7311, + "step": 11823 + }, + { + "epoch": 0.35062123773092546, + "grad_norm": 0.14424264430999756, + "learning_rate": 0.0007353287186719367, + "loss": 2.7666, + "step": 11824 + }, + { + "epoch": 0.35065089108323694, + "grad_norm": 0.1544671505689621, + "learning_rate": 0.0007352872038131095, + "loss": 2.7434, + "step": 11825 + }, + { + "epoch": 0.3506805444355484, + "grad_norm": 0.14621788263320923, + "learning_rate": 0.0007352456868707737, + "loss": 2.7638, + "step": 11826 + }, + { + "epoch": 0.35071019778785995, + "grad_norm": 0.14489920437335968, + "learning_rate": 0.000735204167845297, + "loss": 2.7289, + "step": 11827 + }, + { + "epoch": 0.3507398511401714, + "grad_norm": 0.1427588164806366, + "learning_rate": 0.000735162646737047, + "loss": 2.759, + "step": 11828 + }, + { + "epoch": 0.3507695044924829, + "grad_norm": 0.16208553314208984, + "learning_rate": 0.0007351211235463915, + "loss": 2.7255, + "step": 11829 + }, + { + "epoch": 0.3507991578447944, + "grad_norm": 0.1434612274169922, + "learning_rate": 0.000735079598273698, + "loss": 2.7172, + "step": 11830 + }, + { + "epoch": 0.35082881119710585, + "grad_norm": 0.16665972769260406, + "learning_rate": 0.0007350380709193342, + "loss": 2.7596, + "step": 11831 + }, + { + "epoch": 0.3508584645494173, + "grad_norm": 0.17405065894126892, + "learning_rate": 0.0007349965414836681, + "loss": 2.7324, + "step": 11832 + }, + { + "epoch": 0.3508881179017288, + "grad_norm": 0.14228208363056183, + "learning_rate": 0.0007349550099670672, + "loss": 2.7042, + "step": 11833 + }, + { + "epoch": 0.3509177712540403, + "grad_norm": 0.15057259798049927, + "learning_rate": 0.0007349134763698996, + "loss": 2.7262, + "step": 11834 + }, + { + "epoch": 0.35094742460635175, + "grad_norm": 0.14809919893741608, + "learning_rate": 0.0007348719406925326, + "loss": 2.7333, + "step": 11835 + }, + { + "epoch": 0.35097707795866323, + "grad_norm": 0.1501838117837906, + "learning_rate": 0.0007348304029353343, + "loss": 2.733, + "step": 11836 + }, + { + "epoch": 0.3510067313109747, + "grad_norm": 0.13747698068618774, + "learning_rate": 0.0007347888630986723, + "loss": 2.7279, + "step": 11837 + }, + { + "epoch": 0.3510363846632862, + "grad_norm": 0.1402643471956253, + "learning_rate": 0.0007347473211829149, + "loss": 2.7035, + "step": 11838 + }, + { + "epoch": 0.35106603801559766, + "grad_norm": 0.13058333098888397, + "learning_rate": 0.0007347057771884296, + "loss": 2.7656, + "step": 11839 + }, + { + "epoch": 0.35109569136790914, + "grad_norm": 0.10423155874013901, + "learning_rate": 0.0007346642311155844, + "loss": 2.702, + "step": 11840 + }, + { + "epoch": 0.3511253447202206, + "grad_norm": 0.13394922018051147, + "learning_rate": 0.000734622682964747, + "loss": 2.7378, + "step": 11841 + }, + { + "epoch": 0.3511549980725321, + "grad_norm": 0.1275050938129425, + "learning_rate": 0.0007345811327362856, + "loss": 2.7456, + "step": 11842 + }, + { + "epoch": 0.35118465142484356, + "grad_norm": 0.13112333416938782, + "learning_rate": 0.0007345395804305678, + "loss": 2.7268, + "step": 11843 + }, + { + "epoch": 0.35121430477715504, + "grad_norm": 0.1238178089261055, + "learning_rate": 0.0007344980260479621, + "loss": 2.7134, + "step": 11844 + }, + { + "epoch": 0.3512439581294665, + "grad_norm": 0.12478633224964142, + "learning_rate": 0.0007344564695888358, + "loss": 2.7365, + "step": 11845 + }, + { + "epoch": 0.351273611481778, + "grad_norm": 0.12818171083927155, + "learning_rate": 0.0007344149110535574, + "loss": 2.7084, + "step": 11846 + }, + { + "epoch": 0.35130326483408947, + "grad_norm": 0.1306224912405014, + "learning_rate": 0.0007343733504424947, + "loss": 2.7275, + "step": 11847 + }, + { + "epoch": 0.351332918186401, + "grad_norm": 0.15939544141292572, + "learning_rate": 0.0007343317877560158, + "loss": 2.7368, + "step": 11848 + }, + { + "epoch": 0.3513625715387125, + "grad_norm": 0.16758978366851807, + "learning_rate": 0.0007342902229944885, + "loss": 2.7308, + "step": 11849 + }, + { + "epoch": 0.35139222489102395, + "grad_norm": 0.19006314873695374, + "learning_rate": 0.0007342486561582812, + "loss": 2.7437, + "step": 11850 + }, + { + "epoch": 0.35142187824333543, + "grad_norm": 0.17640700936317444, + "learning_rate": 0.0007342070872477618, + "loss": 2.7421, + "step": 11851 + }, + { + "epoch": 0.3514515315956469, + "grad_norm": 0.13456571102142334, + "learning_rate": 0.0007341655162632986, + "loss": 2.7611, + "step": 11852 + }, + { + "epoch": 0.3514811849479584, + "grad_norm": 0.18057392537593842, + "learning_rate": 0.0007341239432052593, + "loss": 2.7267, + "step": 11853 + }, + { + "epoch": 0.35151083830026986, + "grad_norm": 0.17061223089694977, + "learning_rate": 0.0007340823680740125, + "loss": 2.7065, + "step": 11854 + }, + { + "epoch": 0.35154049165258133, + "grad_norm": 0.14394597709178925, + "learning_rate": 0.000734040790869926, + "loss": 2.7553, + "step": 11855 + }, + { + "epoch": 0.3515701450048928, + "grad_norm": 0.13914139568805695, + "learning_rate": 0.0007339992115933683, + "loss": 2.7307, + "step": 11856 + }, + { + "epoch": 0.3515997983572043, + "grad_norm": 0.13581392168998718, + "learning_rate": 0.0007339576302447073, + "loss": 2.7611, + "step": 11857 + }, + { + "epoch": 0.35162945170951576, + "grad_norm": 0.13079595565795898, + "learning_rate": 0.0007339160468243114, + "loss": 2.722, + "step": 11858 + }, + { + "epoch": 0.35165910506182724, + "grad_norm": 0.12409689277410507, + "learning_rate": 0.0007338744613325486, + "loss": 2.7692, + "step": 11859 + }, + { + "epoch": 0.3516887584141387, + "grad_norm": 0.12941820919513702, + "learning_rate": 0.0007338328737697873, + "loss": 2.7585, + "step": 11860 + }, + { + "epoch": 0.3517184117664502, + "grad_norm": 0.12964874505996704, + "learning_rate": 0.000733791284136396, + "loss": 2.7497, + "step": 11861 + }, + { + "epoch": 0.35174806511876167, + "grad_norm": 0.12145158648490906, + "learning_rate": 0.0007337496924327427, + "loss": 2.7384, + "step": 11862 + }, + { + "epoch": 0.35177771847107314, + "grad_norm": 0.11207936704158783, + "learning_rate": 0.0007337080986591955, + "loss": 2.7236, + "step": 11863 + }, + { + "epoch": 0.3518073718233846, + "grad_norm": 0.1307566910982132, + "learning_rate": 0.0007336665028161232, + "loss": 2.7522, + "step": 11864 + }, + { + "epoch": 0.3518370251756961, + "grad_norm": 0.13158145546913147, + "learning_rate": 0.0007336249049038938, + "loss": 2.7692, + "step": 11865 + }, + { + "epoch": 0.35186667852800757, + "grad_norm": 0.14432261884212494, + "learning_rate": 0.0007335833049228758, + "loss": 2.7183, + "step": 11866 + }, + { + "epoch": 0.35189633188031905, + "grad_norm": 0.12562626600265503, + "learning_rate": 0.0007335417028734377, + "loss": 2.7026, + "step": 11867 + }, + { + "epoch": 0.3519259852326305, + "grad_norm": 0.11999448388814926, + "learning_rate": 0.0007335000987559474, + "loss": 2.7628, + "step": 11868 + }, + { + "epoch": 0.35195563858494205, + "grad_norm": 0.11002993583679199, + "learning_rate": 0.0007334584925707739, + "loss": 2.7086, + "step": 11869 + }, + { + "epoch": 0.35198529193725353, + "grad_norm": 0.12318123877048492, + "learning_rate": 0.0007334168843182852, + "loss": 2.7314, + "step": 11870 + }, + { + "epoch": 0.352014945289565, + "grad_norm": 0.107077457010746, + "learning_rate": 0.0007333752739988501, + "loss": 2.7288, + "step": 11871 + }, + { + "epoch": 0.3520445986418765, + "grad_norm": 0.11709243059158325, + "learning_rate": 0.0007333336616128369, + "loss": 2.7443, + "step": 11872 + }, + { + "epoch": 0.35207425199418796, + "grad_norm": 0.12752245366573334, + "learning_rate": 0.000733292047160614, + "loss": 2.745, + "step": 11873 + }, + { + "epoch": 0.35210390534649944, + "grad_norm": 0.12744638323783875, + "learning_rate": 0.00073325043064255, + "loss": 2.726, + "step": 11874 + }, + { + "epoch": 0.3521335586988109, + "grad_norm": 0.12832924723625183, + "learning_rate": 0.0007332088120590133, + "loss": 2.7412, + "step": 11875 + }, + { + "epoch": 0.3521632120511224, + "grad_norm": 0.130411297082901, + "learning_rate": 0.0007331671914103725, + "loss": 2.7232, + "step": 11876 + }, + { + "epoch": 0.35219286540343386, + "grad_norm": 0.1207573339343071, + "learning_rate": 0.0007331255686969964, + "loss": 2.7248, + "step": 11877 + }, + { + "epoch": 0.35222251875574534, + "grad_norm": 0.1237306222319603, + "learning_rate": 0.0007330839439192533, + "loss": 2.7765, + "step": 11878 + }, + { + "epoch": 0.3522521721080568, + "grad_norm": 0.12529374659061432, + "learning_rate": 0.0007330423170775119, + "loss": 2.777, + "step": 11879 + }, + { + "epoch": 0.3522818254603683, + "grad_norm": 0.14763493835926056, + "learning_rate": 0.0007330006881721407, + "loss": 2.7304, + "step": 11880 + }, + { + "epoch": 0.35231147881267977, + "grad_norm": 0.1627679467201233, + "learning_rate": 0.0007329590572035086, + "loss": 2.6949, + "step": 11881 + }, + { + "epoch": 0.35234113216499124, + "grad_norm": 0.14832600951194763, + "learning_rate": 0.0007329174241719838, + "loss": 2.7349, + "step": 11882 + }, + { + "epoch": 0.3523707855173027, + "grad_norm": 0.12507598102092743, + "learning_rate": 0.0007328757890779354, + "loss": 2.7232, + "step": 11883 + }, + { + "epoch": 0.3524004388696142, + "grad_norm": 0.12443380802869797, + "learning_rate": 0.0007328341519217317, + "loss": 2.7635, + "step": 11884 + }, + { + "epoch": 0.3524300922219257, + "grad_norm": 0.1443873792886734, + "learning_rate": 0.0007327925127037418, + "loss": 2.7485, + "step": 11885 + }, + { + "epoch": 0.35245974557423715, + "grad_norm": 0.1327812522649765, + "learning_rate": 0.0007327508714243343, + "loss": 2.7383, + "step": 11886 + }, + { + "epoch": 0.3524893989265486, + "grad_norm": 0.13194559514522552, + "learning_rate": 0.0007327092280838777, + "loss": 2.7059, + "step": 11887 + }, + { + "epoch": 0.3525190522788601, + "grad_norm": 0.1350773572921753, + "learning_rate": 0.0007326675826827411, + "loss": 2.7689, + "step": 11888 + }, + { + "epoch": 0.3525487056311716, + "grad_norm": 0.13360419869422913, + "learning_rate": 0.0007326259352212931, + "loss": 2.7092, + "step": 11889 + }, + { + "epoch": 0.3525783589834831, + "grad_norm": 0.12611795961856842, + "learning_rate": 0.0007325842856999024, + "loss": 2.7144, + "step": 11890 + }, + { + "epoch": 0.3526080123357946, + "grad_norm": 0.11658024042844772, + "learning_rate": 0.0007325426341189381, + "loss": 2.7501, + "step": 11891 + }, + { + "epoch": 0.35263766568810606, + "grad_norm": 0.1359262317419052, + "learning_rate": 0.0007325009804787687, + "loss": 2.7397, + "step": 11892 + }, + { + "epoch": 0.35266731904041754, + "grad_norm": 0.14135776460170746, + "learning_rate": 0.0007324593247797633, + "loss": 2.7369, + "step": 11893 + }, + { + "epoch": 0.352696972392729, + "grad_norm": 0.11965388059616089, + "learning_rate": 0.0007324176670222907, + "loss": 2.733, + "step": 11894 + }, + { + "epoch": 0.3527266257450405, + "grad_norm": 0.13550519943237305, + "learning_rate": 0.0007323760072067197, + "loss": 2.7431, + "step": 11895 + }, + { + "epoch": 0.35275627909735197, + "grad_norm": 0.14087934792041779, + "learning_rate": 0.0007323343453334192, + "loss": 2.7275, + "step": 11896 + }, + { + "epoch": 0.35278593244966344, + "grad_norm": 0.14460958540439606, + "learning_rate": 0.0007322926814027582, + "loss": 2.7292, + "step": 11897 + }, + { + "epoch": 0.3528155858019749, + "grad_norm": 0.15262310206890106, + "learning_rate": 0.0007322510154151058, + "loss": 2.7295, + "step": 11898 + }, + { + "epoch": 0.3528452391542864, + "grad_norm": 0.14415183663368225, + "learning_rate": 0.0007322093473708307, + "loss": 2.7398, + "step": 11899 + }, + { + "epoch": 0.35287489250659787, + "grad_norm": 0.15147297084331512, + "learning_rate": 0.000732167677270302, + "loss": 2.7317, + "step": 11900 + }, + { + "epoch": 0.35290454585890935, + "grad_norm": 0.1531604677438736, + "learning_rate": 0.0007321260051138886, + "loss": 2.6941, + "step": 11901 + }, + { + "epoch": 0.3529341992112208, + "grad_norm": 0.1270693689584732, + "learning_rate": 0.0007320843309019596, + "loss": 2.7259, + "step": 11902 + }, + { + "epoch": 0.3529638525635323, + "grad_norm": 0.12805528938770294, + "learning_rate": 0.000732042654634884, + "loss": 2.7192, + "step": 11903 + }, + { + "epoch": 0.3529935059158438, + "grad_norm": 0.11838417500257492, + "learning_rate": 0.0007320009763130309, + "loss": 2.7263, + "step": 11904 + }, + { + "epoch": 0.35302315926815525, + "grad_norm": 0.12471920251846313, + "learning_rate": 0.0007319592959367694, + "loss": 2.7217, + "step": 11905 + }, + { + "epoch": 0.3530528126204667, + "grad_norm": 0.13754984736442566, + "learning_rate": 0.0007319176135064685, + "loss": 2.7223, + "step": 11906 + }, + { + "epoch": 0.3530824659727782, + "grad_norm": 0.1189827024936676, + "learning_rate": 0.0007318759290224973, + "loss": 2.7456, + "step": 11907 + }, + { + "epoch": 0.3531121193250897, + "grad_norm": 0.13239875435829163, + "learning_rate": 0.0007318342424852248, + "loss": 2.6975, + "step": 11908 + }, + { + "epoch": 0.35314177267740116, + "grad_norm": 0.11338957399129868, + "learning_rate": 0.0007317925538950203, + "loss": 2.7682, + "step": 11909 + }, + { + "epoch": 0.35317142602971263, + "grad_norm": 0.12142884731292725, + "learning_rate": 0.0007317508632522532, + "loss": 2.7419, + "step": 11910 + }, + { + "epoch": 0.35320107938202416, + "grad_norm": 0.1275559663772583, + "learning_rate": 0.0007317091705572922, + "loss": 2.7546, + "step": 11911 + }, + { + "epoch": 0.35323073273433564, + "grad_norm": 0.12426324933767319, + "learning_rate": 0.0007316674758105069, + "loss": 2.7558, + "step": 11912 + }, + { + "epoch": 0.3532603860866471, + "grad_norm": 0.12874457240104675, + "learning_rate": 0.0007316257790122661, + "loss": 2.7662, + "step": 11913 + }, + { + "epoch": 0.3532900394389586, + "grad_norm": 0.12483725696802139, + "learning_rate": 0.0007315840801629394, + "loss": 2.7336, + "step": 11914 + }, + { + "epoch": 0.35331969279127007, + "grad_norm": 0.11655602604150772, + "learning_rate": 0.000731542379262896, + "loss": 2.7479, + "step": 11915 + }, + { + "epoch": 0.35334934614358154, + "grad_norm": 0.1191834807395935, + "learning_rate": 0.000731500676312505, + "loss": 2.7386, + "step": 11916 + }, + { + "epoch": 0.353378999495893, + "grad_norm": 0.11619916558265686, + "learning_rate": 0.0007314589713121358, + "loss": 2.7343, + "step": 11917 + }, + { + "epoch": 0.3534086528482045, + "grad_norm": 0.14545521140098572, + "learning_rate": 0.0007314172642621577, + "loss": 2.7088, + "step": 11918 + }, + { + "epoch": 0.353438306200516, + "grad_norm": 0.15586303174495697, + "learning_rate": 0.00073137555516294, + "loss": 2.7205, + "step": 11919 + }, + { + "epoch": 0.35346795955282745, + "grad_norm": 0.16692928969860077, + "learning_rate": 0.000731333844014852, + "loss": 2.7505, + "step": 11920 + }, + { + "epoch": 0.3534976129051389, + "grad_norm": 0.18135294318199158, + "learning_rate": 0.000731292130818263, + "loss": 2.7641, + "step": 11921 + }, + { + "epoch": 0.3535272662574504, + "grad_norm": 0.18793043494224548, + "learning_rate": 0.0007312504155735426, + "loss": 2.732, + "step": 11922 + }, + { + "epoch": 0.3535569196097619, + "grad_norm": 0.15327559411525726, + "learning_rate": 0.0007312086982810602, + "loss": 2.7235, + "step": 11923 + }, + { + "epoch": 0.35358657296207335, + "grad_norm": 0.12436482310295105, + "learning_rate": 0.0007311669789411848, + "loss": 2.7424, + "step": 11924 + }, + { + "epoch": 0.35361622631438483, + "grad_norm": 0.1416776329278946, + "learning_rate": 0.0007311252575542864, + "loss": 2.7338, + "step": 11925 + }, + { + "epoch": 0.3536458796666963, + "grad_norm": 0.12644270062446594, + "learning_rate": 0.0007310835341207341, + "loss": 2.7125, + "step": 11926 + }, + { + "epoch": 0.3536755330190078, + "grad_norm": 0.1284215897321701, + "learning_rate": 0.0007310418086408974, + "loss": 2.7324, + "step": 11927 + }, + { + "epoch": 0.35370518637131926, + "grad_norm": 0.11883822828531265, + "learning_rate": 0.0007310000811151457, + "loss": 2.7192, + "step": 11928 + }, + { + "epoch": 0.35373483972363073, + "grad_norm": 0.1413528174161911, + "learning_rate": 0.0007309583515438488, + "loss": 2.7653, + "step": 11929 + }, + { + "epoch": 0.3537644930759422, + "grad_norm": 0.13132640719413757, + "learning_rate": 0.000730916619927376, + "loss": 2.7424, + "step": 11930 + }, + { + "epoch": 0.35379414642825374, + "grad_norm": 0.14438267052173615, + "learning_rate": 0.0007308748862660969, + "loss": 2.6962, + "step": 11931 + }, + { + "epoch": 0.3538237997805652, + "grad_norm": 0.11629179120063782, + "learning_rate": 0.0007308331505603812, + "loss": 2.7331, + "step": 11932 + }, + { + "epoch": 0.3538534531328767, + "grad_norm": 0.11213048547506332, + "learning_rate": 0.0007307914128105982, + "loss": 2.7347, + "step": 11933 + }, + { + "epoch": 0.35388310648518817, + "grad_norm": 0.1243257001042366, + "learning_rate": 0.0007307496730171175, + "loss": 2.7119, + "step": 11934 + }, + { + "epoch": 0.35391275983749965, + "grad_norm": 0.15512299537658691, + "learning_rate": 0.0007307079311803089, + "loss": 2.7282, + "step": 11935 + }, + { + "epoch": 0.3539424131898111, + "grad_norm": 0.15610754489898682, + "learning_rate": 0.000730666187300542, + "loss": 2.7082, + "step": 11936 + }, + { + "epoch": 0.3539720665421226, + "grad_norm": 0.13960431516170502, + "learning_rate": 0.0007306244413781865, + "loss": 2.6913, + "step": 11937 + }, + { + "epoch": 0.3540017198944341, + "grad_norm": 0.12199179828166962, + "learning_rate": 0.0007305826934136119, + "loss": 2.7638, + "step": 11938 + }, + { + "epoch": 0.35403137324674555, + "grad_norm": 0.1423315852880478, + "learning_rate": 0.0007305409434071881, + "loss": 2.7642, + "step": 11939 + }, + { + "epoch": 0.354061026599057, + "grad_norm": 0.1675296127796173, + "learning_rate": 0.0007304991913592846, + "loss": 2.7531, + "step": 11940 + }, + { + "epoch": 0.3540906799513685, + "grad_norm": 0.18597370386123657, + "learning_rate": 0.000730457437270271, + "loss": 2.7623, + "step": 11941 + }, + { + "epoch": 0.35412033330368, + "grad_norm": 0.15988242626190186, + "learning_rate": 0.0007304156811405174, + "loss": 2.7456, + "step": 11942 + }, + { + "epoch": 0.35414998665599146, + "grad_norm": 0.13292329013347626, + "learning_rate": 0.0007303739229703936, + "loss": 2.7218, + "step": 11943 + }, + { + "epoch": 0.35417964000830293, + "grad_norm": 0.13662397861480713, + "learning_rate": 0.0007303321627602688, + "loss": 2.7219, + "step": 11944 + }, + { + "epoch": 0.3542092933606144, + "grad_norm": 0.12715528905391693, + "learning_rate": 0.0007302904005105134, + "loss": 2.7381, + "step": 11945 + }, + { + "epoch": 0.3542389467129259, + "grad_norm": 0.13401088118553162, + "learning_rate": 0.0007302486362214969, + "loss": 2.7175, + "step": 11946 + }, + { + "epoch": 0.35426860006523736, + "grad_norm": 0.13589726388454437, + "learning_rate": 0.0007302068698935891, + "loss": 2.7323, + "step": 11947 + }, + { + "epoch": 0.35429825341754884, + "grad_norm": 0.12552766501903534, + "learning_rate": 0.0007301651015271602, + "loss": 2.7485, + "step": 11948 + }, + { + "epoch": 0.3543279067698603, + "grad_norm": 0.117206871509552, + "learning_rate": 0.0007301233311225797, + "loss": 2.7174, + "step": 11949 + }, + { + "epoch": 0.3543575601221718, + "grad_norm": 0.1147477924823761, + "learning_rate": 0.0007300815586802175, + "loss": 2.702, + "step": 11950 + }, + { + "epoch": 0.35438721347448326, + "grad_norm": 0.11034055054187775, + "learning_rate": 0.0007300397842004437, + "loss": 2.7121, + "step": 11951 + }, + { + "epoch": 0.3544168668267948, + "grad_norm": 0.12990854680538177, + "learning_rate": 0.0007299980076836281, + "loss": 2.7608, + "step": 11952 + }, + { + "epoch": 0.3544465201791063, + "grad_norm": 0.12405852228403091, + "learning_rate": 0.0007299562291301407, + "loss": 2.7506, + "step": 11953 + }, + { + "epoch": 0.35447617353141775, + "grad_norm": 0.11630050092935562, + "learning_rate": 0.0007299144485403514, + "loss": 2.7604, + "step": 11954 + }, + { + "epoch": 0.3545058268837292, + "grad_norm": 0.12082349509000778, + "learning_rate": 0.0007298726659146302, + "loss": 2.7206, + "step": 11955 + }, + { + "epoch": 0.3545354802360407, + "grad_norm": 0.10972116887569427, + "learning_rate": 0.000729830881253347, + "loss": 2.7188, + "step": 11956 + }, + { + "epoch": 0.3545651335883522, + "grad_norm": 0.10886742919683456, + "learning_rate": 0.0007297890945568719, + "loss": 2.7055, + "step": 11957 + }, + { + "epoch": 0.35459478694066365, + "grad_norm": 0.11678656190633774, + "learning_rate": 0.000729747305825575, + "loss": 2.7582, + "step": 11958 + }, + { + "epoch": 0.35462444029297513, + "grad_norm": 0.11017078161239624, + "learning_rate": 0.0007297055150598263, + "loss": 2.7375, + "step": 11959 + }, + { + "epoch": 0.3546540936452866, + "grad_norm": 0.1244874820113182, + "learning_rate": 0.0007296637222599958, + "loss": 2.7331, + "step": 11960 + }, + { + "epoch": 0.3546837469975981, + "grad_norm": 0.10706159472465515, + "learning_rate": 0.0007296219274264536, + "loss": 2.7236, + "step": 11961 + }, + { + "epoch": 0.35471340034990956, + "grad_norm": 0.1043957993388176, + "learning_rate": 0.0007295801305595698, + "loss": 2.7054, + "step": 11962 + }, + { + "epoch": 0.35474305370222103, + "grad_norm": 0.11995387822389603, + "learning_rate": 0.0007295383316597146, + "loss": 2.7581, + "step": 11963 + }, + { + "epoch": 0.3547727070545325, + "grad_norm": 0.12181586027145386, + "learning_rate": 0.0007294965307272581, + "loss": 2.7149, + "step": 11964 + }, + { + "epoch": 0.354802360406844, + "grad_norm": 0.12636584043502808, + "learning_rate": 0.0007294547277625705, + "loss": 2.7429, + "step": 11965 + }, + { + "epoch": 0.35483201375915546, + "grad_norm": 0.14300386607646942, + "learning_rate": 0.0007294129227660218, + "loss": 2.7333, + "step": 11966 + }, + { + "epoch": 0.35486166711146694, + "grad_norm": 0.14541861414909363, + "learning_rate": 0.0007293711157379821, + "loss": 2.7451, + "step": 11967 + }, + { + "epoch": 0.3548913204637784, + "grad_norm": 0.16005945205688477, + "learning_rate": 0.000729329306678822, + "loss": 2.7427, + "step": 11968 + }, + { + "epoch": 0.3549209738160899, + "grad_norm": 0.19954651594161987, + "learning_rate": 0.0007292874955889115, + "loss": 2.7521, + "step": 11969 + }, + { + "epoch": 0.35495062716840137, + "grad_norm": 0.21320021152496338, + "learning_rate": 0.0007292456824686209, + "loss": 2.7625, + "step": 11970 + }, + { + "epoch": 0.35498028052071284, + "grad_norm": 0.1811579465866089, + "learning_rate": 0.0007292038673183203, + "loss": 2.727, + "step": 11971 + }, + { + "epoch": 0.3550099338730243, + "grad_norm": 0.16929341852664948, + "learning_rate": 0.0007291620501383803, + "loss": 2.7391, + "step": 11972 + }, + { + "epoch": 0.35503958722533585, + "grad_norm": 0.15900550782680511, + "learning_rate": 0.0007291202309291708, + "loss": 2.7265, + "step": 11973 + }, + { + "epoch": 0.3550692405776473, + "grad_norm": 0.13913649320602417, + "learning_rate": 0.0007290784096910624, + "loss": 2.7214, + "step": 11974 + }, + { + "epoch": 0.3550988939299588, + "grad_norm": 0.1680924892425537, + "learning_rate": 0.0007290365864244255, + "loss": 2.7529, + "step": 11975 + }, + { + "epoch": 0.3551285472822703, + "grad_norm": 0.15294329822063446, + "learning_rate": 0.0007289947611296303, + "loss": 2.7482, + "step": 11976 + }, + { + "epoch": 0.35515820063458176, + "grad_norm": 0.1492927372455597, + "learning_rate": 0.000728952933807047, + "loss": 2.7381, + "step": 11977 + }, + { + "epoch": 0.35518785398689323, + "grad_norm": 0.1459653675556183, + "learning_rate": 0.0007289111044570464, + "loss": 2.7161, + "step": 11978 + }, + { + "epoch": 0.3552175073392047, + "grad_norm": 0.13338960707187653, + "learning_rate": 0.0007288692730799985, + "loss": 2.6928, + "step": 11979 + }, + { + "epoch": 0.3552471606915162, + "grad_norm": 0.11803990602493286, + "learning_rate": 0.0007288274396762738, + "loss": 2.744, + "step": 11980 + }, + { + "epoch": 0.35527681404382766, + "grad_norm": 0.12049505859613419, + "learning_rate": 0.0007287856042462431, + "loss": 2.703, + "step": 11981 + }, + { + "epoch": 0.35530646739613914, + "grad_norm": 0.12591513991355896, + "learning_rate": 0.0007287437667902766, + "loss": 2.7126, + "step": 11982 + }, + { + "epoch": 0.3553361207484506, + "grad_norm": 0.10577275604009628, + "learning_rate": 0.0007287019273087447, + "loss": 2.6999, + "step": 11983 + }, + { + "epoch": 0.3553657741007621, + "grad_norm": 0.1209072694182396, + "learning_rate": 0.0007286600858020178, + "loss": 2.7011, + "step": 11984 + }, + { + "epoch": 0.35539542745307356, + "grad_norm": 0.11791937053203583, + "learning_rate": 0.0007286182422704668, + "loss": 2.7186, + "step": 11985 + }, + { + "epoch": 0.35542508080538504, + "grad_norm": 0.10612578690052032, + "learning_rate": 0.0007285763967144619, + "loss": 2.7275, + "step": 11986 + }, + { + "epoch": 0.3554547341576965, + "grad_norm": 0.10636695474386215, + "learning_rate": 0.000728534549134374, + "loss": 2.69, + "step": 11987 + }, + { + "epoch": 0.355484387510008, + "grad_norm": 0.11506319791078568, + "learning_rate": 0.0007284926995305732, + "loss": 2.6939, + "step": 11988 + }, + { + "epoch": 0.35551404086231947, + "grad_norm": 0.1071230098605156, + "learning_rate": 0.0007284508479034304, + "loss": 2.7368, + "step": 11989 + }, + { + "epoch": 0.35554369421463095, + "grad_norm": 0.10678832978010178, + "learning_rate": 0.0007284089942533162, + "loss": 2.7263, + "step": 11990 + }, + { + "epoch": 0.3555733475669424, + "grad_norm": 0.10275827348232269, + "learning_rate": 0.0007283671385806012, + "loss": 2.7596, + "step": 11991 + }, + { + "epoch": 0.3556030009192539, + "grad_norm": 0.09892785549163818, + "learning_rate": 0.0007283252808856557, + "loss": 2.7653, + "step": 11992 + }, + { + "epoch": 0.3556326542715654, + "grad_norm": 0.10505636036396027, + "learning_rate": 0.0007282834211688509, + "loss": 2.7322, + "step": 11993 + }, + { + "epoch": 0.3556623076238769, + "grad_norm": 0.10027005523443222, + "learning_rate": 0.0007282415594305571, + "loss": 2.7296, + "step": 11994 + }, + { + "epoch": 0.3556919609761884, + "grad_norm": 0.11981172114610672, + "learning_rate": 0.0007281996956711452, + "loss": 2.7355, + "step": 11995 + }, + { + "epoch": 0.35572161432849986, + "grad_norm": 0.1281871497631073, + "learning_rate": 0.0007281578298909858, + "loss": 2.7336, + "step": 11996 + }, + { + "epoch": 0.35575126768081133, + "grad_norm": 0.11134229600429535, + "learning_rate": 0.0007281159620904496, + "loss": 2.7271, + "step": 11997 + }, + { + "epoch": 0.3557809210331228, + "grad_norm": 0.11218168586492538, + "learning_rate": 0.0007280740922699075, + "loss": 2.7516, + "step": 11998 + }, + { + "epoch": 0.3558105743854343, + "grad_norm": 0.1270991712808609, + "learning_rate": 0.00072803222042973, + "loss": 2.7277, + "step": 11999 + }, + { + "epoch": 0.35584022773774576, + "grad_norm": 0.13145703077316284, + "learning_rate": 0.0007279903465702882, + "loss": 2.7451, + "step": 12000 + }, + { + "epoch": 0.35586988109005724, + "grad_norm": 0.13901643455028534, + "learning_rate": 0.0007279484706919527, + "loss": 2.7276, + "step": 12001 + }, + { + "epoch": 0.3558995344423687, + "grad_norm": 0.14850202202796936, + "learning_rate": 0.0007279065927950943, + "loss": 2.7416, + "step": 12002 + }, + { + "epoch": 0.3559291877946802, + "grad_norm": 0.1616215854883194, + "learning_rate": 0.0007278647128800841, + "loss": 2.7229, + "step": 12003 + }, + { + "epoch": 0.35595884114699167, + "grad_norm": 0.173272505402565, + "learning_rate": 0.0007278228309472927, + "loss": 2.7285, + "step": 12004 + }, + { + "epoch": 0.35598849449930314, + "grad_norm": 0.1690787374973297, + "learning_rate": 0.0007277809469970908, + "loss": 2.7143, + "step": 12005 + }, + { + "epoch": 0.3560181478516146, + "grad_norm": 0.11949393898248672, + "learning_rate": 0.0007277390610298496, + "loss": 2.7334, + "step": 12006 + }, + { + "epoch": 0.3560478012039261, + "grad_norm": 0.13423855602741241, + "learning_rate": 0.0007276971730459401, + "loss": 2.7288, + "step": 12007 + }, + { + "epoch": 0.35607745455623757, + "grad_norm": 0.12979912757873535, + "learning_rate": 0.0007276552830457329, + "loss": 2.7357, + "step": 12008 + }, + { + "epoch": 0.35610710790854905, + "grad_norm": 0.13011404871940613, + "learning_rate": 0.0007276133910295992, + "loss": 2.7443, + "step": 12009 + }, + { + "epoch": 0.3561367612608605, + "grad_norm": 0.13436616957187653, + "learning_rate": 0.0007275714969979097, + "loss": 2.7589, + "step": 12010 + }, + { + "epoch": 0.356166414613172, + "grad_norm": 0.14634643495082855, + "learning_rate": 0.0007275296009510357, + "loss": 2.7344, + "step": 12011 + }, + { + "epoch": 0.3561960679654835, + "grad_norm": 0.14189429581165314, + "learning_rate": 0.0007274877028893478, + "loss": 2.7579, + "step": 12012 + }, + { + "epoch": 0.35622572131779495, + "grad_norm": 0.14887219667434692, + "learning_rate": 0.0007274458028132173, + "loss": 2.7633, + "step": 12013 + }, + { + "epoch": 0.35625537467010643, + "grad_norm": 0.16256758570671082, + "learning_rate": 0.0007274039007230154, + "loss": 2.7387, + "step": 12014 + }, + { + "epoch": 0.35628502802241796, + "grad_norm": 0.14271429181098938, + "learning_rate": 0.0007273619966191128, + "loss": 2.713, + "step": 12015 + }, + { + "epoch": 0.35631468137472944, + "grad_norm": 0.15748347342014313, + "learning_rate": 0.0007273200905018808, + "loss": 2.7058, + "step": 12016 + }, + { + "epoch": 0.3563443347270409, + "grad_norm": 0.1556997448205948, + "learning_rate": 0.0007272781823716902, + "loss": 2.7349, + "step": 12017 + }, + { + "epoch": 0.3563739880793524, + "grad_norm": 0.15257970988750458, + "learning_rate": 0.0007272362722289122, + "loss": 2.7188, + "step": 12018 + }, + { + "epoch": 0.35640364143166386, + "grad_norm": 0.14440283179283142, + "learning_rate": 0.0007271943600739183, + "loss": 2.7335, + "step": 12019 + }, + { + "epoch": 0.35643329478397534, + "grad_norm": 0.15392787754535675, + "learning_rate": 0.0007271524459070792, + "loss": 2.7342, + "step": 12020 + }, + { + "epoch": 0.3564629481362868, + "grad_norm": 0.1665259450674057, + "learning_rate": 0.0007271105297287662, + "loss": 2.7537, + "step": 12021 + }, + { + "epoch": 0.3564926014885983, + "grad_norm": 0.14041779935359955, + "learning_rate": 0.0007270686115393504, + "loss": 2.7275, + "step": 12022 + }, + { + "epoch": 0.35652225484090977, + "grad_norm": 0.126630499958992, + "learning_rate": 0.0007270266913392032, + "loss": 2.7355, + "step": 12023 + }, + { + "epoch": 0.35655190819322125, + "grad_norm": 0.11261830478906631, + "learning_rate": 0.0007269847691286955, + "loss": 2.7349, + "step": 12024 + }, + { + "epoch": 0.3565815615455327, + "grad_norm": 0.13961833715438843, + "learning_rate": 0.0007269428449081988, + "loss": 2.762, + "step": 12025 + }, + { + "epoch": 0.3566112148978442, + "grad_norm": 0.12086772173643112, + "learning_rate": 0.0007269009186780844, + "loss": 2.7429, + "step": 12026 + }, + { + "epoch": 0.3566408682501557, + "grad_norm": 0.1282101273536682, + "learning_rate": 0.0007268589904387233, + "loss": 2.7375, + "step": 12027 + }, + { + "epoch": 0.35667052160246715, + "grad_norm": 0.10739406943321228, + "learning_rate": 0.0007268170601904869, + "loss": 2.7531, + "step": 12028 + }, + { + "epoch": 0.3567001749547786, + "grad_norm": 0.11540783196687698, + "learning_rate": 0.0007267751279337464, + "loss": 2.6981, + "step": 12029 + }, + { + "epoch": 0.3567298283070901, + "grad_norm": 0.12245550751686096, + "learning_rate": 0.0007267331936688734, + "loss": 2.7455, + "step": 12030 + }, + { + "epoch": 0.3567594816594016, + "grad_norm": 0.13435576856136322, + "learning_rate": 0.000726691257396239, + "loss": 2.761, + "step": 12031 + }, + { + "epoch": 0.35678913501171305, + "grad_norm": 0.11963916569948196, + "learning_rate": 0.0007266493191162145, + "loss": 2.7196, + "step": 12032 + }, + { + "epoch": 0.35681878836402453, + "grad_norm": 0.130268394947052, + "learning_rate": 0.0007266073788291714, + "loss": 2.7106, + "step": 12033 + }, + { + "epoch": 0.356848441716336, + "grad_norm": 0.1492094248533249, + "learning_rate": 0.0007265654365354811, + "loss": 2.7492, + "step": 12034 + }, + { + "epoch": 0.3568780950686475, + "grad_norm": 0.15140628814697266, + "learning_rate": 0.000726523492235515, + "loss": 2.7652, + "step": 12035 + }, + { + "epoch": 0.356907748420959, + "grad_norm": 0.14408142864704132, + "learning_rate": 0.0007264815459296445, + "loss": 2.7329, + "step": 12036 + }, + { + "epoch": 0.3569374017732705, + "grad_norm": 0.12990428507328033, + "learning_rate": 0.0007264395976182411, + "loss": 2.7408, + "step": 12037 + }, + { + "epoch": 0.35696705512558197, + "grad_norm": 0.13721047341823578, + "learning_rate": 0.0007263976473016761, + "loss": 2.7321, + "step": 12038 + }, + { + "epoch": 0.35699670847789344, + "grad_norm": 0.13783109188079834, + "learning_rate": 0.0007263556949803209, + "loss": 2.7466, + "step": 12039 + }, + { + "epoch": 0.3570263618302049, + "grad_norm": 0.11975836753845215, + "learning_rate": 0.0007263137406545475, + "loss": 2.7449, + "step": 12040 + }, + { + "epoch": 0.3570560151825164, + "grad_norm": 0.10881700366735458, + "learning_rate": 0.0007262717843247269, + "loss": 2.7091, + "step": 12041 + }, + { + "epoch": 0.35708566853482787, + "grad_norm": 0.12731419503688812, + "learning_rate": 0.0007262298259912309, + "loss": 2.7413, + "step": 12042 + }, + { + "epoch": 0.35711532188713935, + "grad_norm": 0.12487759441137314, + "learning_rate": 0.0007261878656544308, + "loss": 2.7143, + "step": 12043 + }, + { + "epoch": 0.3571449752394508, + "grad_norm": 0.11093836277723312, + "learning_rate": 0.0007261459033146984, + "loss": 2.7007, + "step": 12044 + }, + { + "epoch": 0.3571746285917623, + "grad_norm": 0.11988628655672073, + "learning_rate": 0.0007261039389724052, + "loss": 2.7055, + "step": 12045 + }, + { + "epoch": 0.3572042819440738, + "grad_norm": 0.12460164725780487, + "learning_rate": 0.0007260619726279229, + "loss": 2.6901, + "step": 12046 + }, + { + "epoch": 0.35723393529638525, + "grad_norm": 0.11342618614435196, + "learning_rate": 0.000726020004281623, + "loss": 2.7183, + "step": 12047 + }, + { + "epoch": 0.35726358864869673, + "grad_norm": 0.11265796422958374, + "learning_rate": 0.0007259780339338771, + "loss": 2.7391, + "step": 12048 + }, + { + "epoch": 0.3572932420010082, + "grad_norm": 0.10957556962966919, + "learning_rate": 0.000725936061585057, + "loss": 2.7051, + "step": 12049 + }, + { + "epoch": 0.3573228953533197, + "grad_norm": 0.12219816446304321, + "learning_rate": 0.0007258940872355342, + "loss": 2.7489, + "step": 12050 + }, + { + "epoch": 0.35735254870563116, + "grad_norm": 0.14076019823551178, + "learning_rate": 0.0007258521108856804, + "loss": 2.7318, + "step": 12051 + }, + { + "epoch": 0.35738220205794263, + "grad_norm": 0.15326543152332306, + "learning_rate": 0.0007258101325358677, + "loss": 2.749, + "step": 12052 + }, + { + "epoch": 0.3574118554102541, + "grad_norm": 0.1183115765452385, + "learning_rate": 0.0007257681521864673, + "loss": 2.734, + "step": 12053 + }, + { + "epoch": 0.3574415087625656, + "grad_norm": 0.13289161026477814, + "learning_rate": 0.0007257261698378512, + "loss": 2.7647, + "step": 12054 + }, + { + "epoch": 0.35747116211487706, + "grad_norm": 0.15609996020793915, + "learning_rate": 0.0007256841854903912, + "loss": 2.7393, + "step": 12055 + }, + { + "epoch": 0.3575008154671886, + "grad_norm": 0.14848709106445312, + "learning_rate": 0.0007256421991444588, + "loss": 2.7239, + "step": 12056 + }, + { + "epoch": 0.35753046881950007, + "grad_norm": 0.12823446094989777, + "learning_rate": 0.0007256002108004261, + "loss": 2.7186, + "step": 12057 + }, + { + "epoch": 0.35756012217181155, + "grad_norm": 0.13030605018138885, + "learning_rate": 0.000725558220458665, + "loss": 2.7554, + "step": 12058 + }, + { + "epoch": 0.357589775524123, + "grad_norm": 0.13948331773281097, + "learning_rate": 0.0007255162281195468, + "loss": 2.7625, + "step": 12059 + }, + { + "epoch": 0.3576194288764345, + "grad_norm": 0.162497416138649, + "learning_rate": 0.0007254742337834439, + "loss": 2.751, + "step": 12060 + }, + { + "epoch": 0.357649082228746, + "grad_norm": 0.12805712223052979, + "learning_rate": 0.000725432237450728, + "loss": 2.7055, + "step": 12061 + }, + { + "epoch": 0.35767873558105745, + "grad_norm": 0.11090312153100967, + "learning_rate": 0.0007253902391217708, + "loss": 2.7357, + "step": 12062 + }, + { + "epoch": 0.3577083889333689, + "grad_norm": 0.140515998005867, + "learning_rate": 0.0007253482387969444, + "loss": 2.7138, + "step": 12063 + }, + { + "epoch": 0.3577380422856804, + "grad_norm": 0.12869149446487427, + "learning_rate": 0.0007253062364766206, + "loss": 2.777, + "step": 12064 + }, + { + "epoch": 0.3577676956379919, + "grad_norm": 0.12315283715724945, + "learning_rate": 0.0007252642321611716, + "loss": 2.7257, + "step": 12065 + }, + { + "epoch": 0.35779734899030335, + "grad_norm": 0.13573192059993744, + "learning_rate": 0.0007252222258509689, + "loss": 2.7454, + "step": 12066 + }, + { + "epoch": 0.35782700234261483, + "grad_norm": 0.13469760119915009, + "learning_rate": 0.0007251802175463848, + "loss": 2.7199, + "step": 12067 + }, + { + "epoch": 0.3578566556949263, + "grad_norm": 0.1565553843975067, + "learning_rate": 0.0007251382072477914, + "loss": 2.7459, + "step": 12068 + }, + { + "epoch": 0.3578863090472378, + "grad_norm": 0.1626085489988327, + "learning_rate": 0.0007250961949555604, + "loss": 2.7098, + "step": 12069 + }, + { + "epoch": 0.35791596239954926, + "grad_norm": 0.11586328595876694, + "learning_rate": 0.0007250541806700639, + "loss": 2.7152, + "step": 12070 + }, + { + "epoch": 0.35794561575186074, + "grad_norm": 0.12545675039291382, + "learning_rate": 0.000725012164391674, + "loss": 2.7523, + "step": 12071 + }, + { + "epoch": 0.3579752691041722, + "grad_norm": 0.14325031638145447, + "learning_rate": 0.000724970146120763, + "loss": 2.732, + "step": 12072 + }, + { + "epoch": 0.3580049224564837, + "grad_norm": 0.12580527365207672, + "learning_rate": 0.0007249281258577025, + "loss": 2.7691, + "step": 12073 + }, + { + "epoch": 0.35803457580879516, + "grad_norm": 0.13788142800331116, + "learning_rate": 0.000724886103602865, + "loss": 2.744, + "step": 12074 + }, + { + "epoch": 0.35806422916110664, + "grad_norm": 0.14086464047431946, + "learning_rate": 0.0007248440793566223, + "loss": 2.7419, + "step": 12075 + }, + { + "epoch": 0.3580938825134181, + "grad_norm": 0.14673568308353424, + "learning_rate": 0.0007248020531193468, + "loss": 2.733, + "step": 12076 + }, + { + "epoch": 0.35812353586572965, + "grad_norm": 0.17506934702396393, + "learning_rate": 0.0007247600248914104, + "loss": 2.7672, + "step": 12077 + }, + { + "epoch": 0.3581531892180411, + "grad_norm": 0.16496191918849945, + "learning_rate": 0.0007247179946731854, + "loss": 2.7498, + "step": 12078 + }, + { + "epoch": 0.3581828425703526, + "grad_norm": 0.12993121147155762, + "learning_rate": 0.0007246759624650442, + "loss": 2.7363, + "step": 12079 + }, + { + "epoch": 0.3582124959226641, + "grad_norm": 0.12378107011318207, + "learning_rate": 0.0007246339282673586, + "loss": 2.7151, + "step": 12080 + }, + { + "epoch": 0.35824214927497555, + "grad_norm": 0.12670618295669556, + "learning_rate": 0.0007245918920805011, + "loss": 2.7272, + "step": 12081 + }, + { + "epoch": 0.35827180262728703, + "grad_norm": 0.1403469741344452, + "learning_rate": 0.0007245498539048438, + "loss": 2.7413, + "step": 12082 + }, + { + "epoch": 0.3583014559795985, + "grad_norm": 0.13338927924633026, + "learning_rate": 0.0007245078137407588, + "loss": 2.7362, + "step": 12083 + }, + { + "epoch": 0.35833110933191, + "grad_norm": 0.12658263742923737, + "learning_rate": 0.0007244657715886189, + "loss": 2.73, + "step": 12084 + }, + { + "epoch": 0.35836076268422146, + "grad_norm": 0.10766766965389252, + "learning_rate": 0.0007244237274487959, + "loss": 2.7071, + "step": 12085 + }, + { + "epoch": 0.35839041603653293, + "grad_norm": 0.1031351387500763, + "learning_rate": 0.0007243816813216624, + "loss": 2.7457, + "step": 12086 + }, + { + "epoch": 0.3584200693888444, + "grad_norm": 0.11225811392068863, + "learning_rate": 0.0007243396332075905, + "loss": 2.6674, + "step": 12087 + }, + { + "epoch": 0.3584497227411559, + "grad_norm": 0.10824230313301086, + "learning_rate": 0.0007242975831069526, + "loss": 2.763, + "step": 12088 + }, + { + "epoch": 0.35847937609346736, + "grad_norm": 0.11001870781183243, + "learning_rate": 0.0007242555310201211, + "loss": 2.7451, + "step": 12089 + }, + { + "epoch": 0.35850902944577884, + "grad_norm": 0.10889191180467606, + "learning_rate": 0.0007242134769474684, + "loss": 2.7262, + "step": 12090 + }, + { + "epoch": 0.3585386827980903, + "grad_norm": 0.10890766233205795, + "learning_rate": 0.0007241714208893671, + "loss": 2.7399, + "step": 12091 + }, + { + "epoch": 0.3585683361504018, + "grad_norm": 0.11280041188001633, + "learning_rate": 0.0007241293628461891, + "loss": 2.7242, + "step": 12092 + }, + { + "epoch": 0.35859798950271327, + "grad_norm": 0.11816404014825821, + "learning_rate": 0.0007240873028183071, + "loss": 2.7373, + "step": 12093 + }, + { + "epoch": 0.35862764285502474, + "grad_norm": 0.1448861062526703, + "learning_rate": 0.0007240452408060938, + "loss": 2.7117, + "step": 12094 + }, + { + "epoch": 0.3586572962073362, + "grad_norm": 0.1404109001159668, + "learning_rate": 0.0007240031768099214, + "loss": 2.7411, + "step": 12095 + }, + { + "epoch": 0.3586869495596477, + "grad_norm": 0.15450988709926605, + "learning_rate": 0.0007239611108301623, + "loss": 2.741, + "step": 12096 + }, + { + "epoch": 0.35871660291195917, + "grad_norm": 0.15711258351802826, + "learning_rate": 0.0007239190428671891, + "loss": 2.7634, + "step": 12097 + }, + { + "epoch": 0.3587462562642707, + "grad_norm": 0.1532723605632782, + "learning_rate": 0.0007238769729213744, + "loss": 2.7473, + "step": 12098 + }, + { + "epoch": 0.3587759096165822, + "grad_norm": 0.14710184931755066, + "learning_rate": 0.0007238349009930907, + "loss": 2.7444, + "step": 12099 + }, + { + "epoch": 0.35880556296889365, + "grad_norm": 0.144199937582016, + "learning_rate": 0.0007237928270827104, + "loss": 2.7219, + "step": 12100 + }, + { + "epoch": 0.35883521632120513, + "grad_norm": 0.13066363334655762, + "learning_rate": 0.0007237507511906062, + "loss": 2.7479, + "step": 12101 + }, + { + "epoch": 0.3588648696735166, + "grad_norm": 0.1337839663028717, + "learning_rate": 0.0007237086733171509, + "loss": 2.7512, + "step": 12102 + }, + { + "epoch": 0.3588945230258281, + "grad_norm": 0.1401173621416092, + "learning_rate": 0.0007236665934627169, + "loss": 2.7245, + "step": 12103 + }, + { + "epoch": 0.35892417637813956, + "grad_norm": 0.1396978199481964, + "learning_rate": 0.0007236245116276766, + "loss": 2.7157, + "step": 12104 + }, + { + "epoch": 0.35895382973045104, + "grad_norm": 0.14490289986133575, + "learning_rate": 0.000723582427812403, + "loss": 2.751, + "step": 12105 + }, + { + "epoch": 0.3589834830827625, + "grad_norm": 0.11954499781131744, + "learning_rate": 0.0007235403420172686, + "loss": 2.6969, + "step": 12106 + }, + { + "epoch": 0.359013136435074, + "grad_norm": 0.1294870674610138, + "learning_rate": 0.0007234982542426463, + "loss": 2.7286, + "step": 12107 + }, + { + "epoch": 0.35904278978738546, + "grad_norm": 0.14079318940639496, + "learning_rate": 0.0007234561644889084, + "loss": 2.7346, + "step": 12108 + }, + { + "epoch": 0.35907244313969694, + "grad_norm": 0.15726038813591003, + "learning_rate": 0.0007234140727564276, + "loss": 2.7308, + "step": 12109 + }, + { + "epoch": 0.3591020964920084, + "grad_norm": 0.15550287067890167, + "learning_rate": 0.0007233719790455771, + "loss": 2.7248, + "step": 12110 + }, + { + "epoch": 0.3591317498443199, + "grad_norm": 0.14141491055488586, + "learning_rate": 0.0007233298833567293, + "loss": 2.7338, + "step": 12111 + }, + { + "epoch": 0.35916140319663137, + "grad_norm": 0.14732994139194489, + "learning_rate": 0.0007232877856902572, + "loss": 2.7196, + "step": 12112 + }, + { + "epoch": 0.35919105654894284, + "grad_norm": 0.1404992938041687, + "learning_rate": 0.0007232456860465333, + "loss": 2.7691, + "step": 12113 + }, + { + "epoch": 0.3592207099012543, + "grad_norm": 0.1275038719177246, + "learning_rate": 0.0007232035844259306, + "loss": 2.7422, + "step": 12114 + }, + { + "epoch": 0.3592503632535658, + "grad_norm": 0.13592153787612915, + "learning_rate": 0.0007231614808288217, + "loss": 2.7197, + "step": 12115 + }, + { + "epoch": 0.3592800166058773, + "grad_norm": 0.11789213865995407, + "learning_rate": 0.0007231193752555797, + "loss": 2.7006, + "step": 12116 + }, + { + "epoch": 0.35930966995818875, + "grad_norm": 0.1328938603401184, + "learning_rate": 0.0007230772677065773, + "loss": 2.7541, + "step": 12117 + }, + { + "epoch": 0.3593393233105002, + "grad_norm": 0.13572978973388672, + "learning_rate": 0.0007230351581821874, + "loss": 2.7302, + "step": 12118 + }, + { + "epoch": 0.35936897666281176, + "grad_norm": 0.11258600652217865, + "learning_rate": 0.000722993046682783, + "loss": 2.7459, + "step": 12119 + }, + { + "epoch": 0.35939863001512323, + "grad_norm": 0.12055370956659317, + "learning_rate": 0.0007229509332087367, + "loss": 2.746, + "step": 12120 + }, + { + "epoch": 0.3594282833674347, + "grad_norm": 0.1277129352092743, + "learning_rate": 0.0007229088177604218, + "loss": 2.6913, + "step": 12121 + }, + { + "epoch": 0.3594579367197462, + "grad_norm": 0.12630857527256012, + "learning_rate": 0.000722866700338211, + "loss": 2.7155, + "step": 12122 + }, + { + "epoch": 0.35948759007205766, + "grad_norm": 0.12987284362316132, + "learning_rate": 0.0007228245809424772, + "loss": 2.7381, + "step": 12123 + }, + { + "epoch": 0.35951724342436914, + "grad_norm": 0.12669526040554047, + "learning_rate": 0.0007227824595735936, + "loss": 2.6894, + "step": 12124 + }, + { + "epoch": 0.3595468967766806, + "grad_norm": 0.13112910091876984, + "learning_rate": 0.0007227403362319332, + "loss": 2.7266, + "step": 12125 + }, + { + "epoch": 0.3595765501289921, + "grad_norm": 0.12270580232143402, + "learning_rate": 0.0007226982109178686, + "loss": 2.7646, + "step": 12126 + }, + { + "epoch": 0.35960620348130357, + "grad_norm": 0.12348950654268265, + "learning_rate": 0.0007226560836317733, + "loss": 2.73, + "step": 12127 + }, + { + "epoch": 0.35963585683361504, + "grad_norm": 0.13869106769561768, + "learning_rate": 0.0007226139543740201, + "loss": 2.7289, + "step": 12128 + }, + { + "epoch": 0.3596655101859265, + "grad_norm": 0.16354186832904816, + "learning_rate": 0.0007225718231449822, + "loss": 2.7103, + "step": 12129 + }, + { + "epoch": 0.359695163538238, + "grad_norm": 0.16292719542980194, + "learning_rate": 0.0007225296899450325, + "loss": 2.7157, + "step": 12130 + }, + { + "epoch": 0.35972481689054947, + "grad_norm": 0.17383718490600586, + "learning_rate": 0.0007224875547745443, + "loss": 2.7328, + "step": 12131 + }, + { + "epoch": 0.35975447024286095, + "grad_norm": 0.19992201030254364, + "learning_rate": 0.0007224454176338906, + "loss": 2.7154, + "step": 12132 + }, + { + "epoch": 0.3597841235951724, + "grad_norm": 0.16886110603809357, + "learning_rate": 0.0007224032785234445, + "loss": 2.7581, + "step": 12133 + }, + { + "epoch": 0.3598137769474839, + "grad_norm": 0.13925451040267944, + "learning_rate": 0.0007223611374435792, + "loss": 2.7589, + "step": 12134 + }, + { + "epoch": 0.3598434302997954, + "grad_norm": 0.14814916253089905, + "learning_rate": 0.0007223189943946677, + "loss": 2.722, + "step": 12135 + }, + { + "epoch": 0.35987308365210685, + "grad_norm": 0.1477784961462021, + "learning_rate": 0.0007222768493770836, + "loss": 2.7339, + "step": 12136 + }, + { + "epoch": 0.3599027370044183, + "grad_norm": 0.15583768486976624, + "learning_rate": 0.0007222347023911997, + "loss": 2.7133, + "step": 12137 + }, + { + "epoch": 0.3599323903567298, + "grad_norm": 0.1357085108757019, + "learning_rate": 0.0007221925534373894, + "loss": 2.7019, + "step": 12138 + }, + { + "epoch": 0.3599620437090413, + "grad_norm": 0.13432087004184723, + "learning_rate": 0.0007221504025160259, + "loss": 2.7313, + "step": 12139 + }, + { + "epoch": 0.3599916970613528, + "grad_norm": 0.1486063152551651, + "learning_rate": 0.0007221082496274827, + "loss": 2.7183, + "step": 12140 + }, + { + "epoch": 0.3600213504136643, + "grad_norm": 0.12953123450279236, + "learning_rate": 0.0007220660947721325, + "loss": 2.6813, + "step": 12141 + }, + { + "epoch": 0.36005100376597576, + "grad_norm": 0.12444160133600235, + "learning_rate": 0.0007220239379503489, + "loss": 2.7379, + "step": 12142 + }, + { + "epoch": 0.36008065711828724, + "grad_norm": 0.15655258297920227, + "learning_rate": 0.0007219817791625054, + "loss": 2.7095, + "step": 12143 + }, + { + "epoch": 0.3601103104705987, + "grad_norm": 0.116304412484169, + "learning_rate": 0.0007219396184089751, + "loss": 2.7451, + "step": 12144 + }, + { + "epoch": 0.3601399638229102, + "grad_norm": 0.11753694713115692, + "learning_rate": 0.0007218974556901315, + "loss": 2.7246, + "step": 12145 + }, + { + "epoch": 0.36016961717522167, + "grad_norm": 0.14243881404399872, + "learning_rate": 0.0007218552910063476, + "loss": 2.7553, + "step": 12146 + }, + { + "epoch": 0.36019927052753314, + "grad_norm": 0.13159269094467163, + "learning_rate": 0.0007218131243579971, + "loss": 2.7507, + "step": 12147 + }, + { + "epoch": 0.3602289238798446, + "grad_norm": 0.12398159503936768, + "learning_rate": 0.0007217709557454532, + "loss": 2.744, + "step": 12148 + }, + { + "epoch": 0.3602585772321561, + "grad_norm": 0.13052241504192352, + "learning_rate": 0.0007217287851690896, + "loss": 2.7498, + "step": 12149 + }, + { + "epoch": 0.3602882305844676, + "grad_norm": 0.11367776989936829, + "learning_rate": 0.0007216866126292796, + "loss": 2.7334, + "step": 12150 + }, + { + "epoch": 0.36031788393677905, + "grad_norm": 0.12849920988082886, + "learning_rate": 0.0007216444381263965, + "loss": 2.7265, + "step": 12151 + }, + { + "epoch": 0.3603475372890905, + "grad_norm": 0.1270260065793991, + "learning_rate": 0.0007216022616608138, + "loss": 2.7259, + "step": 12152 + }, + { + "epoch": 0.360377190641402, + "grad_norm": 0.1181061789393425, + "learning_rate": 0.000721560083232905, + "loss": 2.7749, + "step": 12153 + }, + { + "epoch": 0.3604068439937135, + "grad_norm": 0.1423341929912567, + "learning_rate": 0.0007215179028430437, + "loss": 2.7165, + "step": 12154 + }, + { + "epoch": 0.36043649734602495, + "grad_norm": 0.13323499262332916, + "learning_rate": 0.0007214757204916034, + "loss": 2.7154, + "step": 12155 + }, + { + "epoch": 0.36046615069833643, + "grad_norm": 0.13733305037021637, + "learning_rate": 0.0007214335361789574, + "loss": 2.7592, + "step": 12156 + }, + { + "epoch": 0.3604958040506479, + "grad_norm": 0.12483492493629456, + "learning_rate": 0.0007213913499054796, + "loss": 2.7455, + "step": 12157 + }, + { + "epoch": 0.3605254574029594, + "grad_norm": 0.1280902922153473, + "learning_rate": 0.0007213491616715434, + "loss": 2.7295, + "step": 12158 + }, + { + "epoch": 0.36055511075527086, + "grad_norm": 0.14324012398719788, + "learning_rate": 0.0007213069714775224, + "loss": 2.7549, + "step": 12159 + }, + { + "epoch": 0.3605847641075824, + "grad_norm": 0.12002971023321152, + "learning_rate": 0.0007212647793237901, + "loss": 2.7348, + "step": 12160 + }, + { + "epoch": 0.36061441745989387, + "grad_norm": 0.13478879630565643, + "learning_rate": 0.0007212225852107201, + "loss": 2.7376, + "step": 12161 + }, + { + "epoch": 0.36064407081220534, + "grad_norm": 0.14393998682498932, + "learning_rate": 0.0007211803891386863, + "loss": 2.7122, + "step": 12162 + }, + { + "epoch": 0.3606737241645168, + "grad_norm": 0.1448613852262497, + "learning_rate": 0.0007211381911080621, + "loss": 2.7711, + "step": 12163 + }, + { + "epoch": 0.3607033775168283, + "grad_norm": 0.13608676195144653, + "learning_rate": 0.0007210959911192215, + "loss": 2.7682, + "step": 12164 + }, + { + "epoch": 0.36073303086913977, + "grad_norm": 0.1309652030467987, + "learning_rate": 0.0007210537891725376, + "loss": 2.7244, + "step": 12165 + }, + { + "epoch": 0.36076268422145125, + "grad_norm": 0.12823453545570374, + "learning_rate": 0.0007210115852683846, + "loss": 2.752, + "step": 12166 + }, + { + "epoch": 0.3607923375737627, + "grad_norm": 0.1337040215730667, + "learning_rate": 0.0007209693794071361, + "loss": 2.777, + "step": 12167 + }, + { + "epoch": 0.3608219909260742, + "grad_norm": 0.16820433735847473, + "learning_rate": 0.0007209271715891657, + "loss": 2.7175, + "step": 12168 + }, + { + "epoch": 0.3608516442783857, + "grad_norm": 0.15145935118198395, + "learning_rate": 0.0007208849618148475, + "loss": 2.6896, + "step": 12169 + }, + { + "epoch": 0.36088129763069715, + "grad_norm": 0.12337685376405716, + "learning_rate": 0.0007208427500845549, + "loss": 2.7097, + "step": 12170 + }, + { + "epoch": 0.3609109509830086, + "grad_norm": 0.14210093021392822, + "learning_rate": 0.0007208005363986619, + "loss": 2.7602, + "step": 12171 + }, + { + "epoch": 0.3609406043353201, + "grad_norm": 0.1280192881822586, + "learning_rate": 0.0007207583207575422, + "loss": 2.7346, + "step": 12172 + }, + { + "epoch": 0.3609702576876316, + "grad_norm": 0.1262795478105545, + "learning_rate": 0.0007207161031615697, + "loss": 2.7136, + "step": 12173 + }, + { + "epoch": 0.36099991103994306, + "grad_norm": 0.15667946636676788, + "learning_rate": 0.0007206738836111182, + "loss": 2.7479, + "step": 12174 + }, + { + "epoch": 0.36102956439225453, + "grad_norm": 0.12445162236690521, + "learning_rate": 0.0007206316621065615, + "loss": 2.6795, + "step": 12175 + }, + { + "epoch": 0.361059217744566, + "grad_norm": 0.12261410802602768, + "learning_rate": 0.0007205894386482736, + "loss": 2.7388, + "step": 12176 + }, + { + "epoch": 0.3610888710968775, + "grad_norm": 0.11647891998291016, + "learning_rate": 0.0007205472132366285, + "loss": 2.7128, + "step": 12177 + }, + { + "epoch": 0.36111852444918896, + "grad_norm": 0.10968206822872162, + "learning_rate": 0.000720504985872, + "loss": 2.7451, + "step": 12178 + }, + { + "epoch": 0.36114817780150044, + "grad_norm": 0.12546034157276154, + "learning_rate": 0.0007204627565547619, + "loss": 2.7213, + "step": 12179 + }, + { + "epoch": 0.3611778311538119, + "grad_norm": 0.12213389575481415, + "learning_rate": 0.000720420525285288, + "loss": 2.7031, + "step": 12180 + }, + { + "epoch": 0.36120748450612344, + "grad_norm": 0.12031414359807968, + "learning_rate": 0.0007203782920639528, + "loss": 2.7165, + "step": 12181 + }, + { + "epoch": 0.3612371378584349, + "grad_norm": 0.1290195733308792, + "learning_rate": 0.00072033605689113, + "loss": 2.7326, + "step": 12182 + }, + { + "epoch": 0.3612667912107464, + "grad_norm": 0.13426505029201508, + "learning_rate": 0.0007202938197671936, + "loss": 2.778, + "step": 12183 + }, + { + "epoch": 0.3612964445630579, + "grad_norm": 0.14874456822872162, + "learning_rate": 0.0007202515806925175, + "loss": 2.7249, + "step": 12184 + }, + { + "epoch": 0.36132609791536935, + "grad_norm": 0.1534627228975296, + "learning_rate": 0.000720209339667476, + "loss": 2.7202, + "step": 12185 + }, + { + "epoch": 0.3613557512676808, + "grad_norm": 0.13606733083724976, + "learning_rate": 0.0007201670966924429, + "loss": 2.7025, + "step": 12186 + }, + { + "epoch": 0.3613854046199923, + "grad_norm": 0.137178435921669, + "learning_rate": 0.0007201248517677922, + "loss": 2.7168, + "step": 12187 + }, + { + "epoch": 0.3614150579723038, + "grad_norm": 0.13542145490646362, + "learning_rate": 0.0007200826048938985, + "loss": 2.706, + "step": 12188 + }, + { + "epoch": 0.36144471132461525, + "grad_norm": 0.11995203793048859, + "learning_rate": 0.0007200403560711353, + "loss": 2.7009, + "step": 12189 + }, + { + "epoch": 0.36147436467692673, + "grad_norm": 0.12105458974838257, + "learning_rate": 0.000719998105299877, + "loss": 2.7389, + "step": 12190 + }, + { + "epoch": 0.3615040180292382, + "grad_norm": 0.1343785673379898, + "learning_rate": 0.0007199558525804978, + "loss": 2.7437, + "step": 12191 + }, + { + "epoch": 0.3615336713815497, + "grad_norm": 0.14561328291893005, + "learning_rate": 0.0007199135979133718, + "loss": 2.7257, + "step": 12192 + }, + { + "epoch": 0.36156332473386116, + "grad_norm": 0.14013060927391052, + "learning_rate": 0.000719871341298873, + "loss": 2.714, + "step": 12193 + }, + { + "epoch": 0.36159297808617263, + "grad_norm": 0.14161163568496704, + "learning_rate": 0.0007198290827373758, + "loss": 2.7373, + "step": 12194 + }, + { + "epoch": 0.3616226314384841, + "grad_norm": 0.13359427452087402, + "learning_rate": 0.0007197868222292543, + "loss": 2.7322, + "step": 12195 + }, + { + "epoch": 0.3616522847907956, + "grad_norm": 0.12263129651546478, + "learning_rate": 0.0007197445597748828, + "loss": 2.6744, + "step": 12196 + }, + { + "epoch": 0.36168193814310706, + "grad_norm": 0.11811463534832001, + "learning_rate": 0.0007197022953746355, + "loss": 2.7568, + "step": 12197 + }, + { + "epoch": 0.36171159149541854, + "grad_norm": 0.12331145256757736, + "learning_rate": 0.0007196600290288867, + "loss": 2.7406, + "step": 12198 + }, + { + "epoch": 0.36174124484773, + "grad_norm": 0.12105703353881836, + "learning_rate": 0.0007196177607380106, + "loss": 2.7641, + "step": 12199 + }, + { + "epoch": 0.3617708982000415, + "grad_norm": 0.12463566660881042, + "learning_rate": 0.0007195754905023816, + "loss": 2.7504, + "step": 12200 + }, + { + "epoch": 0.36180055155235297, + "grad_norm": 0.1329115480184555, + "learning_rate": 0.0007195332183223739, + "loss": 2.6931, + "step": 12201 + }, + { + "epoch": 0.3618302049046645, + "grad_norm": 0.1378113329410553, + "learning_rate": 0.0007194909441983619, + "loss": 2.7081, + "step": 12202 + }, + { + "epoch": 0.361859858256976, + "grad_norm": 0.13326294720172882, + "learning_rate": 0.0007194486681307198, + "loss": 2.7457, + "step": 12203 + }, + { + "epoch": 0.36188951160928745, + "grad_norm": 0.12985269725322723, + "learning_rate": 0.0007194063901198222, + "loss": 2.6922, + "step": 12204 + }, + { + "epoch": 0.3619191649615989, + "grad_norm": 0.1431734561920166, + "learning_rate": 0.0007193641101660434, + "loss": 2.7566, + "step": 12205 + }, + { + "epoch": 0.3619488183139104, + "grad_norm": 0.1444506198167801, + "learning_rate": 0.0007193218282697576, + "loss": 2.7599, + "step": 12206 + }, + { + "epoch": 0.3619784716662219, + "grad_norm": 0.13476064801216125, + "learning_rate": 0.0007192795444313394, + "loss": 2.7527, + "step": 12207 + }, + { + "epoch": 0.36200812501853336, + "grad_norm": 0.13825160264968872, + "learning_rate": 0.0007192372586511632, + "loss": 2.7405, + "step": 12208 + }, + { + "epoch": 0.36203777837084483, + "grad_norm": 0.13824082911014557, + "learning_rate": 0.0007191949709296035, + "loss": 2.7409, + "step": 12209 + }, + { + "epoch": 0.3620674317231563, + "grad_norm": 0.13287034630775452, + "learning_rate": 0.0007191526812670347, + "loss": 2.7255, + "step": 12210 + }, + { + "epoch": 0.3620970850754678, + "grad_norm": 0.1245664581656456, + "learning_rate": 0.0007191103896638313, + "loss": 2.7059, + "step": 12211 + }, + { + "epoch": 0.36212673842777926, + "grad_norm": 0.14032712578773499, + "learning_rate": 0.0007190680961203676, + "loss": 2.7206, + "step": 12212 + }, + { + "epoch": 0.36215639178009074, + "grad_norm": 0.11429654806852341, + "learning_rate": 0.0007190258006370185, + "loss": 2.7413, + "step": 12213 + }, + { + "epoch": 0.3621860451324022, + "grad_norm": 0.12035113573074341, + "learning_rate": 0.0007189835032141582, + "loss": 2.7099, + "step": 12214 + }, + { + "epoch": 0.3622156984847137, + "grad_norm": 0.12885695695877075, + "learning_rate": 0.0007189412038521616, + "loss": 2.7472, + "step": 12215 + }, + { + "epoch": 0.36224535183702516, + "grad_norm": 0.1434810608625412, + "learning_rate": 0.000718898902551403, + "loss": 2.7167, + "step": 12216 + }, + { + "epoch": 0.36227500518933664, + "grad_norm": 0.13141492009162903, + "learning_rate": 0.000718856599312257, + "loss": 2.7068, + "step": 12217 + }, + { + "epoch": 0.3623046585416481, + "grad_norm": 0.11492954194545746, + "learning_rate": 0.0007188142941350982, + "loss": 2.7256, + "step": 12218 + }, + { + "epoch": 0.3623343118939596, + "grad_norm": 0.12969720363616943, + "learning_rate": 0.0007187719870203012, + "loss": 2.7806, + "step": 12219 + }, + { + "epoch": 0.36236396524627107, + "grad_norm": 0.12937641143798828, + "learning_rate": 0.0007187296779682409, + "loss": 2.7263, + "step": 12220 + }, + { + "epoch": 0.36239361859858255, + "grad_norm": 0.12778040766716003, + "learning_rate": 0.0007186873669792918, + "loss": 2.7509, + "step": 12221 + }, + { + "epoch": 0.362423271950894, + "grad_norm": 0.13519373536109924, + "learning_rate": 0.0007186450540538283, + "loss": 2.7236, + "step": 12222 + }, + { + "epoch": 0.36245292530320555, + "grad_norm": 0.12182097136974335, + "learning_rate": 0.0007186027391922254, + "loss": 2.735, + "step": 12223 + }, + { + "epoch": 0.36248257865551703, + "grad_norm": 0.13359655439853668, + "learning_rate": 0.0007185604223948577, + "loss": 2.7285, + "step": 12224 + }, + { + "epoch": 0.3625122320078285, + "grad_norm": 0.155584454536438, + "learning_rate": 0.0007185181036620999, + "loss": 2.7447, + "step": 12225 + }, + { + "epoch": 0.36254188536014, + "grad_norm": 0.14115875959396362, + "learning_rate": 0.0007184757829943269, + "loss": 2.7372, + "step": 12226 + }, + { + "epoch": 0.36257153871245146, + "grad_norm": 0.11865316331386566, + "learning_rate": 0.0007184334603919134, + "loss": 2.7459, + "step": 12227 + }, + { + "epoch": 0.36260119206476293, + "grad_norm": 0.10640785098075867, + "learning_rate": 0.0007183911358552339, + "loss": 2.7262, + "step": 12228 + }, + { + "epoch": 0.3626308454170744, + "grad_norm": 0.11677844822406769, + "learning_rate": 0.0007183488093846635, + "loss": 2.7443, + "step": 12229 + }, + { + "epoch": 0.3626604987693859, + "grad_norm": 0.12397952377796173, + "learning_rate": 0.000718306480980577, + "loss": 2.7337, + "step": 12230 + }, + { + "epoch": 0.36269015212169736, + "grad_norm": 0.12261880189180374, + "learning_rate": 0.0007182641506433491, + "loss": 2.7314, + "step": 12231 + }, + { + "epoch": 0.36271980547400884, + "grad_norm": 0.11923516541719437, + "learning_rate": 0.0007182218183733547, + "loss": 2.723, + "step": 12232 + }, + { + "epoch": 0.3627494588263203, + "grad_norm": 0.1192641481757164, + "learning_rate": 0.0007181794841709686, + "loss": 2.7278, + "step": 12233 + }, + { + "epoch": 0.3627791121786318, + "grad_norm": 0.12734541296958923, + "learning_rate": 0.0007181371480365656, + "loss": 2.6933, + "step": 12234 + }, + { + "epoch": 0.36280876553094327, + "grad_norm": 0.15252825617790222, + "learning_rate": 0.000718094809970521, + "loss": 2.7464, + "step": 12235 + }, + { + "epoch": 0.36283841888325474, + "grad_norm": 0.1738680601119995, + "learning_rate": 0.0007180524699732091, + "loss": 2.7468, + "step": 12236 + }, + { + "epoch": 0.3628680722355662, + "grad_norm": 0.1807592213153839, + "learning_rate": 0.0007180101280450053, + "loss": 2.7229, + "step": 12237 + }, + { + "epoch": 0.3628977255878777, + "grad_norm": 0.17794956266880035, + "learning_rate": 0.0007179677841862844, + "loss": 2.7267, + "step": 12238 + }, + { + "epoch": 0.36292737894018917, + "grad_norm": 0.13859708607196808, + "learning_rate": 0.0007179254383974213, + "loss": 2.7327, + "step": 12239 + }, + { + "epoch": 0.36295703229250065, + "grad_norm": 0.12032020837068558, + "learning_rate": 0.0007178830906787911, + "loss": 2.7188, + "step": 12240 + }, + { + "epoch": 0.3629866856448121, + "grad_norm": 0.1275831162929535, + "learning_rate": 0.0007178407410307687, + "loss": 2.7403, + "step": 12241 + }, + { + "epoch": 0.3630163389971236, + "grad_norm": 0.13658304512500763, + "learning_rate": 0.0007177983894537292, + "loss": 2.7294, + "step": 12242 + }, + { + "epoch": 0.3630459923494351, + "grad_norm": 0.12282347679138184, + "learning_rate": 0.0007177560359480477, + "loss": 2.7464, + "step": 12243 + }, + { + "epoch": 0.3630756457017466, + "grad_norm": 0.11894871294498444, + "learning_rate": 0.0007177136805140989, + "loss": 2.748, + "step": 12244 + }, + { + "epoch": 0.3631052990540581, + "grad_norm": 0.11819849908351898, + "learning_rate": 0.000717671323152258, + "loss": 2.703, + "step": 12245 + }, + { + "epoch": 0.36313495240636956, + "grad_norm": 0.11778835952281952, + "learning_rate": 0.0007176289638629003, + "loss": 2.7249, + "step": 12246 + }, + { + "epoch": 0.36316460575868104, + "grad_norm": 0.1341368854045868, + "learning_rate": 0.0007175866026464009, + "loss": 2.7674, + "step": 12247 + }, + { + "epoch": 0.3631942591109925, + "grad_norm": 0.12874452769756317, + "learning_rate": 0.0007175442395031347, + "loss": 2.7498, + "step": 12248 + }, + { + "epoch": 0.363223912463304, + "grad_norm": 0.12538279592990875, + "learning_rate": 0.000717501874433477, + "loss": 2.7476, + "step": 12249 + }, + { + "epoch": 0.36325356581561546, + "grad_norm": 0.12550193071365356, + "learning_rate": 0.0007174595074378028, + "loss": 2.7374, + "step": 12250 + }, + { + "epoch": 0.36328321916792694, + "grad_norm": 0.1218676045536995, + "learning_rate": 0.0007174171385164872, + "loss": 2.7532, + "step": 12251 + }, + { + "epoch": 0.3633128725202384, + "grad_norm": 0.1282687783241272, + "learning_rate": 0.0007173747676699055, + "loss": 2.7344, + "step": 12252 + }, + { + "epoch": 0.3633425258725499, + "grad_norm": 0.13393136858940125, + "learning_rate": 0.0007173323948984331, + "loss": 2.7216, + "step": 12253 + }, + { + "epoch": 0.36337217922486137, + "grad_norm": 0.12630486488342285, + "learning_rate": 0.0007172900202024451, + "loss": 2.7705, + "step": 12254 + }, + { + "epoch": 0.36340183257717285, + "grad_norm": 0.12834656238555908, + "learning_rate": 0.0007172476435823165, + "loss": 2.7531, + "step": 12255 + }, + { + "epoch": 0.3634314859294843, + "grad_norm": 0.13471142947673798, + "learning_rate": 0.0007172052650384228, + "loss": 2.7404, + "step": 12256 + }, + { + "epoch": 0.3634611392817958, + "grad_norm": 0.15280836820602417, + "learning_rate": 0.0007171628845711391, + "loss": 2.7697, + "step": 12257 + }, + { + "epoch": 0.3634907926341073, + "grad_norm": 0.17391982674598694, + "learning_rate": 0.0007171205021808408, + "loss": 2.719, + "step": 12258 + }, + { + "epoch": 0.36352044598641875, + "grad_norm": 0.14071545004844666, + "learning_rate": 0.0007170781178679034, + "loss": 2.7189, + "step": 12259 + }, + { + "epoch": 0.3635500993387302, + "grad_norm": 0.14355795085430145, + "learning_rate": 0.0007170357316327018, + "loss": 2.7354, + "step": 12260 + }, + { + "epoch": 0.3635797526910417, + "grad_norm": 0.121808722615242, + "learning_rate": 0.0007169933434756115, + "loss": 2.7063, + "step": 12261 + }, + { + "epoch": 0.3636094060433532, + "grad_norm": 0.142170250415802, + "learning_rate": 0.000716950953397008, + "loss": 2.7209, + "step": 12262 + }, + { + "epoch": 0.36363905939566465, + "grad_norm": 0.16073103249073029, + "learning_rate": 0.0007169085613972666, + "loss": 2.7366, + "step": 12263 + }, + { + "epoch": 0.3636687127479762, + "grad_norm": 0.16718633472919464, + "learning_rate": 0.0007168661674767626, + "loss": 2.7301, + "step": 12264 + }, + { + "epoch": 0.36369836610028766, + "grad_norm": 0.12862493097782135, + "learning_rate": 0.0007168237716358714, + "loss": 2.7039, + "step": 12265 + }, + { + "epoch": 0.36372801945259914, + "grad_norm": 0.11846119165420532, + "learning_rate": 0.0007167813738749686, + "loss": 2.7379, + "step": 12266 + }, + { + "epoch": 0.3637576728049106, + "grad_norm": 0.13465633988380432, + "learning_rate": 0.0007167389741944294, + "loss": 2.7545, + "step": 12267 + }, + { + "epoch": 0.3637873261572221, + "grad_norm": 0.13251346349716187, + "learning_rate": 0.0007166965725946297, + "loss": 2.6551, + "step": 12268 + }, + { + "epoch": 0.36381697950953357, + "grad_norm": 0.12686625123023987, + "learning_rate": 0.0007166541690759443, + "loss": 2.6818, + "step": 12269 + }, + { + "epoch": 0.36384663286184504, + "grad_norm": 0.1194608137011528, + "learning_rate": 0.0007166117636387492, + "loss": 2.7039, + "step": 12270 + }, + { + "epoch": 0.3638762862141565, + "grad_norm": 0.13103614747524261, + "learning_rate": 0.0007165693562834197, + "loss": 2.7286, + "step": 12271 + }, + { + "epoch": 0.363905939566468, + "grad_norm": 0.13397730886936188, + "learning_rate": 0.0007165269470103314, + "loss": 2.718, + "step": 12272 + }, + { + "epoch": 0.36393559291877947, + "grad_norm": 0.12285067141056061, + "learning_rate": 0.0007164845358198597, + "loss": 2.7529, + "step": 12273 + }, + { + "epoch": 0.36396524627109095, + "grad_norm": 0.12178794294595718, + "learning_rate": 0.0007164421227123805, + "loss": 2.7187, + "step": 12274 + }, + { + "epoch": 0.3639948996234024, + "grad_norm": 0.10883449018001556, + "learning_rate": 0.000716399707688269, + "loss": 2.7502, + "step": 12275 + }, + { + "epoch": 0.3640245529757139, + "grad_norm": 0.13057760894298553, + "learning_rate": 0.0007163572907479011, + "loss": 2.7468, + "step": 12276 + }, + { + "epoch": 0.3640542063280254, + "grad_norm": 0.15423697233200073, + "learning_rate": 0.000716314871891652, + "loss": 2.7318, + "step": 12277 + }, + { + "epoch": 0.36408385968033685, + "grad_norm": 0.15617400407791138, + "learning_rate": 0.0007162724511198977, + "loss": 2.7382, + "step": 12278 + }, + { + "epoch": 0.36411351303264833, + "grad_norm": 0.15285003185272217, + "learning_rate": 0.0007162300284330137, + "loss": 2.764, + "step": 12279 + }, + { + "epoch": 0.3641431663849598, + "grad_norm": 0.14460107684135437, + "learning_rate": 0.0007161876038313757, + "loss": 2.712, + "step": 12280 + }, + { + "epoch": 0.3641728197372713, + "grad_norm": 0.13539138436317444, + "learning_rate": 0.0007161451773153595, + "loss": 2.7071, + "step": 12281 + }, + { + "epoch": 0.36420247308958276, + "grad_norm": 0.11749988794326782, + "learning_rate": 0.0007161027488853405, + "loss": 2.7141, + "step": 12282 + }, + { + "epoch": 0.36423212644189423, + "grad_norm": 0.1444021761417389, + "learning_rate": 0.0007160603185416945, + "loss": 2.7194, + "step": 12283 + }, + { + "epoch": 0.3642617797942057, + "grad_norm": 0.15525545179843903, + "learning_rate": 0.0007160178862847975, + "loss": 2.754, + "step": 12284 + }, + { + "epoch": 0.36429143314651724, + "grad_norm": 0.12531360983848572, + "learning_rate": 0.0007159754521150249, + "loss": 2.7475, + "step": 12285 + }, + { + "epoch": 0.3643210864988287, + "grad_norm": 0.11079832166433334, + "learning_rate": 0.0007159330160327527, + "loss": 2.7313, + "step": 12286 + }, + { + "epoch": 0.3643507398511402, + "grad_norm": 0.13190223276615143, + "learning_rate": 0.0007158905780383566, + "loss": 2.7084, + "step": 12287 + }, + { + "epoch": 0.36438039320345167, + "grad_norm": 0.15103794634342194, + "learning_rate": 0.0007158481381322122, + "loss": 2.7534, + "step": 12288 + }, + { + "epoch": 0.36441004655576315, + "grad_norm": 0.1630031019449234, + "learning_rate": 0.0007158056963146956, + "loss": 2.7455, + "step": 12289 + }, + { + "epoch": 0.3644396999080746, + "grad_norm": 0.16297921538352966, + "learning_rate": 0.0007157632525861823, + "loss": 2.718, + "step": 12290 + }, + { + "epoch": 0.3644693532603861, + "grad_norm": 0.14092490077018738, + "learning_rate": 0.0007157208069470487, + "loss": 2.7401, + "step": 12291 + }, + { + "epoch": 0.3644990066126976, + "grad_norm": 0.14507775008678436, + "learning_rate": 0.0007156783593976701, + "loss": 2.7348, + "step": 12292 + }, + { + "epoch": 0.36452865996500905, + "grad_norm": 0.1362941414117813, + "learning_rate": 0.0007156359099384227, + "loss": 2.7476, + "step": 12293 + }, + { + "epoch": 0.3645583133173205, + "grad_norm": 0.11938997358083725, + "learning_rate": 0.0007155934585696824, + "loss": 2.7331, + "step": 12294 + }, + { + "epoch": 0.364587966669632, + "grad_norm": 0.14246393740177155, + "learning_rate": 0.0007155510052918248, + "loss": 2.7397, + "step": 12295 + }, + { + "epoch": 0.3646176200219435, + "grad_norm": 0.12957853078842163, + "learning_rate": 0.0007155085501052261, + "loss": 2.7419, + "step": 12296 + }, + { + "epoch": 0.36464727337425495, + "grad_norm": 0.12968306243419647, + "learning_rate": 0.0007154660930102624, + "loss": 2.7337, + "step": 12297 + }, + { + "epoch": 0.36467692672656643, + "grad_norm": 0.13025733828544617, + "learning_rate": 0.0007154236340073093, + "loss": 2.7152, + "step": 12298 + }, + { + "epoch": 0.3647065800788779, + "grad_norm": 0.13508982956409454, + "learning_rate": 0.0007153811730967428, + "loss": 2.7446, + "step": 12299 + }, + { + "epoch": 0.3647362334311894, + "grad_norm": 0.13886786997318268, + "learning_rate": 0.0007153387102789392, + "loss": 2.7629, + "step": 12300 + }, + { + "epoch": 0.36476588678350086, + "grad_norm": 0.12479151785373688, + "learning_rate": 0.0007152962455542744, + "loss": 2.7008, + "step": 12301 + }, + { + "epoch": 0.36479554013581234, + "grad_norm": 0.1422238051891327, + "learning_rate": 0.0007152537789231244, + "loss": 2.7503, + "step": 12302 + }, + { + "epoch": 0.3648251934881238, + "grad_norm": 0.15359464287757874, + "learning_rate": 0.0007152113103858652, + "loss": 2.7307, + "step": 12303 + }, + { + "epoch": 0.3648548468404353, + "grad_norm": 0.15113964676856995, + "learning_rate": 0.0007151688399428728, + "loss": 2.7454, + "step": 12304 + }, + { + "epoch": 0.36488450019274676, + "grad_norm": 0.1589363068342209, + "learning_rate": 0.0007151263675945236, + "loss": 2.7461, + "step": 12305 + }, + { + "epoch": 0.3649141535450583, + "grad_norm": 0.1450381875038147, + "learning_rate": 0.0007150838933411934, + "loss": 2.7563, + "step": 12306 + }, + { + "epoch": 0.36494380689736977, + "grad_norm": 0.12184616178274155, + "learning_rate": 0.0007150414171832583, + "loss": 2.7166, + "step": 12307 + }, + { + "epoch": 0.36497346024968125, + "grad_norm": 0.14854951202869415, + "learning_rate": 0.0007149989391210947, + "loss": 2.7647, + "step": 12308 + }, + { + "epoch": 0.3650031136019927, + "grad_norm": 0.11691510677337646, + "learning_rate": 0.0007149564591550784, + "loss": 2.7124, + "step": 12309 + }, + { + "epoch": 0.3650327669543042, + "grad_norm": 0.12451616674661636, + "learning_rate": 0.000714913977285586, + "loss": 2.7429, + "step": 12310 + }, + { + "epoch": 0.3650624203066157, + "grad_norm": 0.15086974203586578, + "learning_rate": 0.0007148714935129932, + "loss": 2.7509, + "step": 12311 + }, + { + "epoch": 0.36509207365892715, + "grad_norm": 0.12558132410049438, + "learning_rate": 0.0007148290078376765, + "loss": 2.712, + "step": 12312 + }, + { + "epoch": 0.36512172701123863, + "grad_norm": 0.11622264981269836, + "learning_rate": 0.0007147865202600121, + "loss": 2.7196, + "step": 12313 + }, + { + "epoch": 0.3651513803635501, + "grad_norm": 0.11678853631019592, + "learning_rate": 0.0007147440307803763, + "loss": 2.7331, + "step": 12314 + }, + { + "epoch": 0.3651810337158616, + "grad_norm": 0.1283048391342163, + "learning_rate": 0.0007147015393991451, + "loss": 2.7354, + "step": 12315 + }, + { + "epoch": 0.36521068706817306, + "grad_norm": 0.1271449625492096, + "learning_rate": 0.000714659046116695, + "loss": 2.7184, + "step": 12316 + }, + { + "epoch": 0.36524034042048453, + "grad_norm": 0.11432906985282898, + "learning_rate": 0.0007146165509334021, + "loss": 2.7279, + "step": 12317 + }, + { + "epoch": 0.365269993772796, + "grad_norm": 0.11436846107244492, + "learning_rate": 0.0007145740538496429, + "loss": 2.7297, + "step": 12318 + }, + { + "epoch": 0.3652996471251075, + "grad_norm": 0.10751200467348099, + "learning_rate": 0.0007145315548657937, + "loss": 2.7078, + "step": 12319 + }, + { + "epoch": 0.36532930047741896, + "grad_norm": 0.11625833064317703, + "learning_rate": 0.0007144890539822306, + "loss": 2.7326, + "step": 12320 + }, + { + "epoch": 0.36535895382973044, + "grad_norm": 0.13661283254623413, + "learning_rate": 0.0007144465511993302, + "loss": 2.735, + "step": 12321 + }, + { + "epoch": 0.3653886071820419, + "grad_norm": 0.12101878970861435, + "learning_rate": 0.0007144040465174686, + "loss": 2.7356, + "step": 12322 + }, + { + "epoch": 0.3654182605343534, + "grad_norm": 0.12582211196422577, + "learning_rate": 0.0007143615399370226, + "loss": 2.7001, + "step": 12323 + }, + { + "epoch": 0.36544791388666487, + "grad_norm": 0.11345137655735016, + "learning_rate": 0.0007143190314583683, + "loss": 2.7268, + "step": 12324 + }, + { + "epoch": 0.36547756723897634, + "grad_norm": 0.11909169703722, + "learning_rate": 0.0007142765210818822, + "loss": 2.7375, + "step": 12325 + }, + { + "epoch": 0.3655072205912878, + "grad_norm": 0.12398187071084976, + "learning_rate": 0.0007142340088079406, + "loss": 2.7166, + "step": 12326 + }, + { + "epoch": 0.36553687394359935, + "grad_norm": 0.13132166862487793, + "learning_rate": 0.0007141914946369203, + "loss": 2.7104, + "step": 12327 + }, + { + "epoch": 0.3655665272959108, + "grad_norm": 0.13321645557880402, + "learning_rate": 0.0007141489785691973, + "loss": 2.7023, + "step": 12328 + }, + { + "epoch": 0.3655961806482223, + "grad_norm": 0.14437128603458405, + "learning_rate": 0.0007141064606051484, + "loss": 2.7139, + "step": 12329 + }, + { + "epoch": 0.3656258340005338, + "grad_norm": 0.14333783090114594, + "learning_rate": 0.0007140639407451502, + "loss": 2.738, + "step": 12330 + }, + { + "epoch": 0.36565548735284525, + "grad_norm": 0.13603575527668, + "learning_rate": 0.0007140214189895789, + "loss": 2.7081, + "step": 12331 + }, + { + "epoch": 0.36568514070515673, + "grad_norm": 0.17713198065757751, + "learning_rate": 0.0007139788953388113, + "loss": 2.7284, + "step": 12332 + }, + { + "epoch": 0.3657147940574682, + "grad_norm": 0.17397384345531464, + "learning_rate": 0.0007139363697932238, + "loss": 2.7197, + "step": 12333 + }, + { + "epoch": 0.3657444474097797, + "grad_norm": 0.14619947969913483, + "learning_rate": 0.0007138938423531931, + "loss": 2.7059, + "step": 12334 + }, + { + "epoch": 0.36577410076209116, + "grad_norm": 0.17177176475524902, + "learning_rate": 0.0007138513130190957, + "loss": 2.7558, + "step": 12335 + }, + { + "epoch": 0.36580375411440264, + "grad_norm": 0.16184653341770172, + "learning_rate": 0.0007138087817913081, + "loss": 2.705, + "step": 12336 + }, + { + "epoch": 0.3658334074667141, + "grad_norm": 0.1644158959388733, + "learning_rate": 0.0007137662486702072, + "loss": 2.7519, + "step": 12337 + }, + { + "epoch": 0.3658630608190256, + "grad_norm": 0.1593019962310791, + "learning_rate": 0.0007137237136561693, + "loss": 2.7082, + "step": 12338 + }, + { + "epoch": 0.36589271417133706, + "grad_norm": 0.13049902021884918, + "learning_rate": 0.0007136811767495712, + "loss": 2.6935, + "step": 12339 + }, + { + "epoch": 0.36592236752364854, + "grad_norm": 0.1316152960062027, + "learning_rate": 0.0007136386379507898, + "loss": 2.6733, + "step": 12340 + }, + { + "epoch": 0.36595202087596, + "grad_norm": 0.13088373839855194, + "learning_rate": 0.0007135960972602015, + "loss": 2.7508, + "step": 12341 + }, + { + "epoch": 0.3659816742282715, + "grad_norm": 0.11042144894599915, + "learning_rate": 0.0007135535546781831, + "loss": 2.7324, + "step": 12342 + }, + { + "epoch": 0.36601132758058297, + "grad_norm": 0.1079246774315834, + "learning_rate": 0.0007135110102051112, + "loss": 2.7369, + "step": 12343 + }, + { + "epoch": 0.36604098093289444, + "grad_norm": 0.10404667258262634, + "learning_rate": 0.0007134684638413629, + "loss": 2.7201, + "step": 12344 + }, + { + "epoch": 0.3660706342852059, + "grad_norm": 0.10955584794282913, + "learning_rate": 0.0007134259155873145, + "loss": 2.7638, + "step": 12345 + }, + { + "epoch": 0.3661002876375174, + "grad_norm": 0.11164165288209915, + "learning_rate": 0.0007133833654433431, + "loss": 2.7275, + "step": 12346 + }, + { + "epoch": 0.3661299409898289, + "grad_norm": 0.11675162613391876, + "learning_rate": 0.0007133408134098254, + "loss": 2.7115, + "step": 12347 + }, + { + "epoch": 0.3661595943421404, + "grad_norm": 0.10670459270477295, + "learning_rate": 0.000713298259487138, + "loss": 2.7147, + "step": 12348 + }, + { + "epoch": 0.3661892476944519, + "grad_norm": 0.1318971961736679, + "learning_rate": 0.000713255703675658, + "loss": 2.7301, + "step": 12349 + }, + { + "epoch": 0.36621890104676336, + "grad_norm": 0.14033836126327515, + "learning_rate": 0.0007132131459757622, + "loss": 2.7656, + "step": 12350 + }, + { + "epoch": 0.36624855439907483, + "grad_norm": 0.14859788119792938, + "learning_rate": 0.0007131705863878272, + "loss": 2.7593, + "step": 12351 + }, + { + "epoch": 0.3662782077513863, + "grad_norm": 0.14919765293598175, + "learning_rate": 0.0007131280249122304, + "loss": 2.7462, + "step": 12352 + }, + { + "epoch": 0.3663078611036978, + "grad_norm": 0.15022100508213043, + "learning_rate": 0.0007130854615493481, + "loss": 2.7293, + "step": 12353 + }, + { + "epoch": 0.36633751445600926, + "grad_norm": 0.14394259452819824, + "learning_rate": 0.0007130428962995577, + "loss": 2.7154, + "step": 12354 + }, + { + "epoch": 0.36636716780832074, + "grad_norm": 0.1557081639766693, + "learning_rate": 0.0007130003291632355, + "loss": 2.7241, + "step": 12355 + }, + { + "epoch": 0.3663968211606322, + "grad_norm": 0.16272377967834473, + "learning_rate": 0.0007129577601407591, + "loss": 2.7606, + "step": 12356 + }, + { + "epoch": 0.3664264745129437, + "grad_norm": 0.14258964359760284, + "learning_rate": 0.0007129151892325052, + "loss": 2.7299, + "step": 12357 + }, + { + "epoch": 0.36645612786525517, + "grad_norm": 0.1261425018310547, + "learning_rate": 0.0007128726164388506, + "loss": 2.7167, + "step": 12358 + }, + { + "epoch": 0.36648578121756664, + "grad_norm": 0.1163162961602211, + "learning_rate": 0.0007128300417601725, + "loss": 2.716, + "step": 12359 + }, + { + "epoch": 0.3665154345698781, + "grad_norm": 0.1389615833759308, + "learning_rate": 0.0007127874651968479, + "loss": 2.7618, + "step": 12360 + }, + { + "epoch": 0.3665450879221896, + "grad_norm": 0.123108871281147, + "learning_rate": 0.0007127448867492536, + "loss": 2.7093, + "step": 12361 + }, + { + "epoch": 0.36657474127450107, + "grad_norm": 0.1500365436077118, + "learning_rate": 0.0007127023064177671, + "loss": 2.7735, + "step": 12362 + }, + { + "epoch": 0.36660439462681255, + "grad_norm": 0.18250426650047302, + "learning_rate": 0.0007126597242027651, + "loss": 2.725, + "step": 12363 + }, + { + "epoch": 0.366634047979124, + "grad_norm": 0.16167744994163513, + "learning_rate": 0.0007126171401046245, + "loss": 2.7382, + "step": 12364 + }, + { + "epoch": 0.3666637013314355, + "grad_norm": 0.1388954520225525, + "learning_rate": 0.0007125745541237228, + "loss": 2.7095, + "step": 12365 + }, + { + "epoch": 0.366693354683747, + "grad_norm": 0.1213189959526062, + "learning_rate": 0.000712531966260437, + "loss": 2.7296, + "step": 12366 + }, + { + "epoch": 0.36672300803605845, + "grad_norm": 0.1244550570845604, + "learning_rate": 0.000712489376515144, + "loss": 2.7179, + "step": 12367 + }, + { + "epoch": 0.36675266138837, + "grad_norm": 0.11705221980810165, + "learning_rate": 0.0007124467848882212, + "loss": 2.7437, + "step": 12368 + }, + { + "epoch": 0.36678231474068146, + "grad_norm": 0.11786428838968277, + "learning_rate": 0.0007124041913800456, + "loss": 2.684, + "step": 12369 + }, + { + "epoch": 0.36681196809299293, + "grad_norm": 0.12220023572444916, + "learning_rate": 0.0007123615959909945, + "loss": 2.7537, + "step": 12370 + }, + { + "epoch": 0.3668416214453044, + "grad_norm": 0.12774774432182312, + "learning_rate": 0.0007123189987214449, + "loss": 2.7402, + "step": 12371 + }, + { + "epoch": 0.3668712747976159, + "grad_norm": 0.10912548005580902, + "learning_rate": 0.0007122763995717743, + "loss": 2.7263, + "step": 12372 + }, + { + "epoch": 0.36690092814992736, + "grad_norm": 0.11821442097425461, + "learning_rate": 0.0007122337985423596, + "loss": 2.7433, + "step": 12373 + }, + { + "epoch": 0.36693058150223884, + "grad_norm": 0.12263084203004837, + "learning_rate": 0.0007121911956335782, + "loss": 2.7171, + "step": 12374 + }, + { + "epoch": 0.3669602348545503, + "grad_norm": 0.1121813952922821, + "learning_rate": 0.0007121485908458074, + "loss": 2.726, + "step": 12375 + }, + { + "epoch": 0.3669898882068618, + "grad_norm": 0.12024734914302826, + "learning_rate": 0.0007121059841794242, + "loss": 2.7447, + "step": 12376 + }, + { + "epoch": 0.36701954155917327, + "grad_norm": 0.14500939846038818, + "learning_rate": 0.0007120633756348064, + "loss": 2.7436, + "step": 12377 + }, + { + "epoch": 0.36704919491148474, + "grad_norm": 0.16832447052001953, + "learning_rate": 0.0007120207652123308, + "loss": 2.7494, + "step": 12378 + }, + { + "epoch": 0.3670788482637962, + "grad_norm": 0.16716401278972626, + "learning_rate": 0.0007119781529123751, + "loss": 2.7308, + "step": 12379 + }, + { + "epoch": 0.3671085016161077, + "grad_norm": 0.12648248672485352, + "learning_rate": 0.0007119355387353164, + "loss": 2.7467, + "step": 12380 + }, + { + "epoch": 0.3671381549684192, + "grad_norm": 0.1343744844198227, + "learning_rate": 0.0007118929226815321, + "loss": 2.7486, + "step": 12381 + }, + { + "epoch": 0.36716780832073065, + "grad_norm": 0.13720166683197021, + "learning_rate": 0.0007118503047513996, + "loss": 2.6825, + "step": 12382 + }, + { + "epoch": 0.3671974616730421, + "grad_norm": 0.13671964406967163, + "learning_rate": 0.0007118076849452964, + "loss": 2.7065, + "step": 12383 + }, + { + "epoch": 0.3672271150253536, + "grad_norm": 0.13009297847747803, + "learning_rate": 0.0007117650632635996, + "loss": 2.7247, + "step": 12384 + }, + { + "epoch": 0.3672567683776651, + "grad_norm": 0.1295720338821411, + "learning_rate": 0.000711722439706687, + "loss": 2.7194, + "step": 12385 + }, + { + "epoch": 0.36728642172997655, + "grad_norm": 0.1307729333639145, + "learning_rate": 0.0007116798142749358, + "loss": 2.6905, + "step": 12386 + }, + { + "epoch": 0.36731607508228803, + "grad_norm": 0.1297069787979126, + "learning_rate": 0.0007116371869687233, + "loss": 2.6889, + "step": 12387 + }, + { + "epoch": 0.3673457284345995, + "grad_norm": 0.13908345997333527, + "learning_rate": 0.0007115945577884274, + "loss": 2.7313, + "step": 12388 + }, + { + "epoch": 0.36737538178691104, + "grad_norm": 0.1210528165102005, + "learning_rate": 0.0007115519267344252, + "loss": 2.7184, + "step": 12389 + }, + { + "epoch": 0.3674050351392225, + "grad_norm": 0.10648688673973083, + "learning_rate": 0.0007115092938070947, + "loss": 2.7328, + "step": 12390 + }, + { + "epoch": 0.367434688491534, + "grad_norm": 0.12178285419940948, + "learning_rate": 0.0007114666590068129, + "loss": 2.7279, + "step": 12391 + }, + { + "epoch": 0.36746434184384547, + "grad_norm": 0.12575390934944153, + "learning_rate": 0.0007114240223339575, + "loss": 2.7322, + "step": 12392 + }, + { + "epoch": 0.36749399519615694, + "grad_norm": 0.12090524286031723, + "learning_rate": 0.000711381383788906, + "loss": 2.7066, + "step": 12393 + }, + { + "epoch": 0.3675236485484684, + "grad_norm": 0.11410340666770935, + "learning_rate": 0.0007113387433720363, + "loss": 2.7219, + "step": 12394 + }, + { + "epoch": 0.3675533019007799, + "grad_norm": 0.12817710638046265, + "learning_rate": 0.0007112961010837256, + "loss": 2.6791, + "step": 12395 + }, + { + "epoch": 0.36758295525309137, + "grad_norm": 0.12851829826831818, + "learning_rate": 0.0007112534569243519, + "loss": 2.7613, + "step": 12396 + }, + { + "epoch": 0.36761260860540285, + "grad_norm": 0.12886084616184235, + "learning_rate": 0.0007112108108942922, + "loss": 2.6932, + "step": 12397 + }, + { + "epoch": 0.3676422619577143, + "grad_norm": 0.1138191744685173, + "learning_rate": 0.0007111681629939249, + "loss": 2.6916, + "step": 12398 + }, + { + "epoch": 0.3676719153100258, + "grad_norm": 0.12269245088100433, + "learning_rate": 0.000711125513223627, + "loss": 2.7201, + "step": 12399 + }, + { + "epoch": 0.3677015686623373, + "grad_norm": 0.11676427721977234, + "learning_rate": 0.0007110828615837765, + "loss": 2.7305, + "step": 12400 + }, + { + "epoch": 0.36773122201464875, + "grad_norm": 0.1353369504213333, + "learning_rate": 0.000711040208074751, + "loss": 2.7175, + "step": 12401 + }, + { + "epoch": 0.3677608753669602, + "grad_norm": 0.1450187712907791, + "learning_rate": 0.0007109975526969283, + "loss": 2.7648, + "step": 12402 + }, + { + "epoch": 0.3677905287192717, + "grad_norm": 0.17104552686214447, + "learning_rate": 0.0007109548954506859, + "loss": 2.7473, + "step": 12403 + }, + { + "epoch": 0.3678201820715832, + "grad_norm": 0.18147365748882294, + "learning_rate": 0.0007109122363364019, + "loss": 2.7295, + "step": 12404 + }, + { + "epoch": 0.36784983542389466, + "grad_norm": 0.16282273828983307, + "learning_rate": 0.0007108695753544537, + "loss": 2.744, + "step": 12405 + }, + { + "epoch": 0.36787948877620613, + "grad_norm": 0.14548926055431366, + "learning_rate": 0.0007108269125052194, + "loss": 2.7629, + "step": 12406 + }, + { + "epoch": 0.3679091421285176, + "grad_norm": 0.1410132795572281, + "learning_rate": 0.0007107842477890764, + "loss": 2.7499, + "step": 12407 + }, + { + "epoch": 0.3679387954808291, + "grad_norm": 0.14207392930984497, + "learning_rate": 0.0007107415812064028, + "loss": 2.7071, + "step": 12408 + }, + { + "epoch": 0.36796844883314056, + "grad_norm": 0.15383705496788025, + "learning_rate": 0.0007106989127575763, + "loss": 2.7217, + "step": 12409 + }, + { + "epoch": 0.3679981021854521, + "grad_norm": 0.10550551116466522, + "learning_rate": 0.0007106562424429748, + "loss": 2.739, + "step": 12410 + }, + { + "epoch": 0.36802775553776357, + "grad_norm": 0.1295812875032425, + "learning_rate": 0.000710613570262976, + "loss": 2.721, + "step": 12411 + }, + { + "epoch": 0.36805740889007504, + "grad_norm": 0.13546979427337646, + "learning_rate": 0.000710570896217958, + "loss": 2.7142, + "step": 12412 + }, + { + "epoch": 0.3680870622423865, + "grad_norm": 0.1342867761850357, + "learning_rate": 0.0007105282203082985, + "loss": 2.7439, + "step": 12413 + }, + { + "epoch": 0.368116715594698, + "grad_norm": 0.1458349972963333, + "learning_rate": 0.0007104855425343755, + "loss": 2.7252, + "step": 12414 + }, + { + "epoch": 0.3681463689470095, + "grad_norm": 0.12944960594177246, + "learning_rate": 0.0007104428628965668, + "loss": 2.6859, + "step": 12415 + }, + { + "epoch": 0.36817602229932095, + "grad_norm": 0.13512134552001953, + "learning_rate": 0.0007104001813952506, + "loss": 2.672, + "step": 12416 + }, + { + "epoch": 0.3682056756516324, + "grad_norm": 0.11626730114221573, + "learning_rate": 0.0007103574980308046, + "loss": 2.7571, + "step": 12417 + }, + { + "epoch": 0.3682353290039439, + "grad_norm": 0.13395196199417114, + "learning_rate": 0.000710314812803607, + "loss": 2.7193, + "step": 12418 + }, + { + "epoch": 0.3682649823562554, + "grad_norm": 0.14980289340019226, + "learning_rate": 0.0007102721257140353, + "loss": 2.7222, + "step": 12419 + }, + { + "epoch": 0.36829463570856685, + "grad_norm": 0.12503093481063843, + "learning_rate": 0.0007102294367624681, + "loss": 2.7497, + "step": 12420 + }, + { + "epoch": 0.36832428906087833, + "grad_norm": 0.13463205099105835, + "learning_rate": 0.000710186745949283, + "loss": 2.7379, + "step": 12421 + }, + { + "epoch": 0.3683539424131898, + "grad_norm": 0.12706296145915985, + "learning_rate": 0.0007101440532748583, + "loss": 2.747, + "step": 12422 + }, + { + "epoch": 0.3683835957655013, + "grad_norm": 0.13554178178310394, + "learning_rate": 0.0007101013587395719, + "loss": 2.7459, + "step": 12423 + }, + { + "epoch": 0.36841324911781276, + "grad_norm": 0.14189249277114868, + "learning_rate": 0.000710058662343802, + "loss": 2.7158, + "step": 12424 + }, + { + "epoch": 0.36844290247012423, + "grad_norm": 0.13632529973983765, + "learning_rate": 0.0007100159640879265, + "loss": 2.7349, + "step": 12425 + }, + { + "epoch": 0.3684725558224357, + "grad_norm": 0.1294902265071869, + "learning_rate": 0.0007099732639723234, + "loss": 2.7171, + "step": 12426 + }, + { + "epoch": 0.3685022091747472, + "grad_norm": 0.12757354974746704, + "learning_rate": 0.0007099305619973713, + "loss": 2.7337, + "step": 12427 + }, + { + "epoch": 0.36853186252705866, + "grad_norm": 0.11984514445066452, + "learning_rate": 0.0007098878581634479, + "loss": 2.714, + "step": 12428 + }, + { + "epoch": 0.36856151587937014, + "grad_norm": 0.11965781450271606, + "learning_rate": 0.0007098451524709315, + "loss": 2.7563, + "step": 12429 + }, + { + "epoch": 0.3685911692316816, + "grad_norm": 0.12208306044340134, + "learning_rate": 0.0007098024449202003, + "loss": 2.7435, + "step": 12430 + }, + { + "epoch": 0.36862082258399315, + "grad_norm": 0.1319429874420166, + "learning_rate": 0.0007097597355116324, + "loss": 2.7329, + "step": 12431 + }, + { + "epoch": 0.3686504759363046, + "grad_norm": 0.15012384951114655, + "learning_rate": 0.000709717024245606, + "loss": 2.7183, + "step": 12432 + }, + { + "epoch": 0.3686801292886161, + "grad_norm": 0.15266114473342896, + "learning_rate": 0.0007096743111224995, + "loss": 2.7068, + "step": 12433 + }, + { + "epoch": 0.3687097826409276, + "grad_norm": 0.13352257013320923, + "learning_rate": 0.0007096315961426908, + "loss": 2.7299, + "step": 12434 + }, + { + "epoch": 0.36873943599323905, + "grad_norm": 0.1374281793832779, + "learning_rate": 0.0007095888793065585, + "loss": 2.699, + "step": 12435 + }, + { + "epoch": 0.3687690893455505, + "grad_norm": 0.15415962040424347, + "learning_rate": 0.0007095461606144805, + "loss": 2.7135, + "step": 12436 + }, + { + "epoch": 0.368798742697862, + "grad_norm": 0.11880998313426971, + "learning_rate": 0.0007095034400668354, + "loss": 2.7067, + "step": 12437 + }, + { + "epoch": 0.3688283960501735, + "grad_norm": 0.11377248167991638, + "learning_rate": 0.0007094607176640014, + "loss": 2.7091, + "step": 12438 + }, + { + "epoch": 0.36885804940248496, + "grad_norm": 0.1340646892786026, + "learning_rate": 0.0007094179934063567, + "loss": 2.7411, + "step": 12439 + }, + { + "epoch": 0.36888770275479643, + "grad_norm": 0.1652401089668274, + "learning_rate": 0.0007093752672942799, + "loss": 2.6937, + "step": 12440 + }, + { + "epoch": 0.3689173561071079, + "grad_norm": 0.1431603878736496, + "learning_rate": 0.000709332539328149, + "loss": 2.7182, + "step": 12441 + }, + { + "epoch": 0.3689470094594194, + "grad_norm": 0.1408611685037613, + "learning_rate": 0.0007092898095083426, + "loss": 2.7107, + "step": 12442 + }, + { + "epoch": 0.36897666281173086, + "grad_norm": 0.14631077647209167, + "learning_rate": 0.000709247077835239, + "loss": 2.738, + "step": 12443 + }, + { + "epoch": 0.36900631616404234, + "grad_norm": 0.11129459738731384, + "learning_rate": 0.0007092043443092166, + "loss": 2.7401, + "step": 12444 + }, + { + "epoch": 0.3690359695163538, + "grad_norm": 0.1321832537651062, + "learning_rate": 0.0007091616089306539, + "loss": 2.7281, + "step": 12445 + }, + { + "epoch": 0.3690656228686653, + "grad_norm": 0.13818776607513428, + "learning_rate": 0.000709118871699929, + "loss": 2.7284, + "step": 12446 + }, + { + "epoch": 0.36909527622097676, + "grad_norm": 0.13095252215862274, + "learning_rate": 0.0007090761326174208, + "loss": 2.6853, + "step": 12447 + }, + { + "epoch": 0.36912492957328824, + "grad_norm": 0.11423029005527496, + "learning_rate": 0.0007090333916835076, + "loss": 2.7035, + "step": 12448 + }, + { + "epoch": 0.3691545829255997, + "grad_norm": 0.12187691777944565, + "learning_rate": 0.0007089906488985677, + "loss": 2.7409, + "step": 12449 + }, + { + "epoch": 0.3691842362779112, + "grad_norm": 0.13166332244873047, + "learning_rate": 0.00070894790426298, + "loss": 2.7926, + "step": 12450 + }, + { + "epoch": 0.36921388963022267, + "grad_norm": 0.14605063199996948, + "learning_rate": 0.0007089051577771225, + "loss": 2.7697, + "step": 12451 + }, + { + "epoch": 0.3692435429825342, + "grad_norm": 0.15971562266349792, + "learning_rate": 0.0007088624094413739, + "loss": 2.7082, + "step": 12452 + }, + { + "epoch": 0.3692731963348457, + "grad_norm": 0.1603928655385971, + "learning_rate": 0.0007088196592561129, + "loss": 2.7259, + "step": 12453 + }, + { + "epoch": 0.36930284968715715, + "grad_norm": 0.17575077712535858, + "learning_rate": 0.000708776907221718, + "loss": 2.7518, + "step": 12454 + }, + { + "epoch": 0.36933250303946863, + "grad_norm": 0.1525023877620697, + "learning_rate": 0.0007087341533385678, + "loss": 2.7033, + "step": 12455 + }, + { + "epoch": 0.3693621563917801, + "grad_norm": 0.13927264511585236, + "learning_rate": 0.0007086913976070408, + "loss": 2.7211, + "step": 12456 + }, + { + "epoch": 0.3693918097440916, + "grad_norm": 0.12260644882917404, + "learning_rate": 0.0007086486400275158, + "loss": 2.6985, + "step": 12457 + }, + { + "epoch": 0.36942146309640306, + "grad_norm": 0.13167086243629456, + "learning_rate": 0.000708605880600371, + "loss": 2.7619, + "step": 12458 + }, + { + "epoch": 0.36945111644871453, + "grad_norm": 0.12962591648101807, + "learning_rate": 0.0007085631193259853, + "loss": 2.713, + "step": 12459 + }, + { + "epoch": 0.369480769801026, + "grad_norm": 0.11780183017253876, + "learning_rate": 0.0007085203562047376, + "loss": 2.7431, + "step": 12460 + }, + { + "epoch": 0.3695104231533375, + "grad_norm": 0.13922402262687683, + "learning_rate": 0.0007084775912370064, + "loss": 2.6901, + "step": 12461 + }, + { + "epoch": 0.36954007650564896, + "grad_norm": 0.12543992698192596, + "learning_rate": 0.0007084348244231702, + "loss": 2.7095, + "step": 12462 + }, + { + "epoch": 0.36956972985796044, + "grad_norm": 0.11736532300710678, + "learning_rate": 0.0007083920557636079, + "loss": 2.7046, + "step": 12463 + }, + { + "epoch": 0.3695993832102719, + "grad_norm": 0.12004970759153366, + "learning_rate": 0.0007083492852586981, + "loss": 2.7399, + "step": 12464 + }, + { + "epoch": 0.3696290365625834, + "grad_norm": 0.13014096021652222, + "learning_rate": 0.0007083065129088196, + "loss": 2.731, + "step": 12465 + }, + { + "epoch": 0.36965868991489487, + "grad_norm": 0.13697364926338196, + "learning_rate": 0.0007082637387143514, + "loss": 2.7606, + "step": 12466 + }, + { + "epoch": 0.36968834326720634, + "grad_norm": 0.13445179164409637, + "learning_rate": 0.0007082209626756718, + "loss": 2.7253, + "step": 12467 + }, + { + "epoch": 0.3697179966195178, + "grad_norm": 0.13683848083019257, + "learning_rate": 0.00070817818479316, + "loss": 2.7027, + "step": 12468 + }, + { + "epoch": 0.3697476499718293, + "grad_norm": 0.12204822897911072, + "learning_rate": 0.0007081354050671946, + "loss": 2.7207, + "step": 12469 + }, + { + "epoch": 0.36977730332414077, + "grad_norm": 0.11747731268405914, + "learning_rate": 0.0007080926234981544, + "loss": 2.7553, + "step": 12470 + }, + { + "epoch": 0.36980695667645225, + "grad_norm": 0.12657059729099274, + "learning_rate": 0.0007080498400864183, + "loss": 2.738, + "step": 12471 + }, + { + "epoch": 0.3698366100287638, + "grad_norm": 0.15540897846221924, + "learning_rate": 0.0007080070548323652, + "loss": 2.7344, + "step": 12472 + }, + { + "epoch": 0.36986626338107526, + "grad_norm": 0.17107746005058289, + "learning_rate": 0.0007079642677363739, + "loss": 2.6907, + "step": 12473 + }, + { + "epoch": 0.36989591673338673, + "grad_norm": 0.1420113444328308, + "learning_rate": 0.0007079214787988233, + "loss": 2.6952, + "step": 12474 + }, + { + "epoch": 0.3699255700856982, + "grad_norm": 0.12312294542789459, + "learning_rate": 0.0007078786880200923, + "loss": 2.7601, + "step": 12475 + }, + { + "epoch": 0.3699552234380097, + "grad_norm": 0.1370398998260498, + "learning_rate": 0.0007078358954005599, + "loss": 2.7042, + "step": 12476 + }, + { + "epoch": 0.36998487679032116, + "grad_norm": 0.11981916427612305, + "learning_rate": 0.0007077931009406049, + "loss": 2.7424, + "step": 12477 + }, + { + "epoch": 0.37001453014263264, + "grad_norm": 0.13097409904003143, + "learning_rate": 0.0007077503046406064, + "loss": 2.7431, + "step": 12478 + }, + { + "epoch": 0.3700441834949441, + "grad_norm": 0.11709114909172058, + "learning_rate": 0.0007077075065009433, + "loss": 2.7296, + "step": 12479 + }, + { + "epoch": 0.3700738368472556, + "grad_norm": 0.12843815982341766, + "learning_rate": 0.0007076647065219944, + "loss": 2.75, + "step": 12480 + }, + { + "epoch": 0.37010349019956706, + "grad_norm": 0.1030416190624237, + "learning_rate": 0.0007076219047041392, + "loss": 2.7229, + "step": 12481 + }, + { + "epoch": 0.37013314355187854, + "grad_norm": 0.12359046936035156, + "learning_rate": 0.0007075791010477562, + "loss": 2.7473, + "step": 12482 + }, + { + "epoch": 0.37016279690419, + "grad_norm": 0.12928788363933563, + "learning_rate": 0.0007075362955532246, + "loss": 2.6996, + "step": 12483 + }, + { + "epoch": 0.3701924502565015, + "grad_norm": 0.13275253772735596, + "learning_rate": 0.0007074934882209234, + "loss": 2.7189, + "step": 12484 + }, + { + "epoch": 0.37022210360881297, + "grad_norm": 0.13535906374454498, + "learning_rate": 0.0007074506790512319, + "loss": 2.7339, + "step": 12485 + }, + { + "epoch": 0.37025175696112445, + "grad_norm": 0.14852283895015717, + "learning_rate": 0.000707407868044529, + "loss": 2.7605, + "step": 12486 + }, + { + "epoch": 0.3702814103134359, + "grad_norm": 0.15063054859638214, + "learning_rate": 0.0007073650552011938, + "loss": 2.7348, + "step": 12487 + }, + { + "epoch": 0.3703110636657474, + "grad_norm": 0.1423327475786209, + "learning_rate": 0.0007073222405216056, + "loss": 2.7142, + "step": 12488 + }, + { + "epoch": 0.3703407170180589, + "grad_norm": 0.13453353941440582, + "learning_rate": 0.0007072794240061432, + "loss": 2.7311, + "step": 12489 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 0.13262531161308289, + "learning_rate": 0.0007072366056551859, + "loss": 2.7181, + "step": 12490 + }, + { + "epoch": 0.3704000237226818, + "grad_norm": 0.15318794548511505, + "learning_rate": 0.000707193785469113, + "loss": 2.7321, + "step": 12491 + }, + { + "epoch": 0.3704296770749933, + "grad_norm": 0.1381818652153015, + "learning_rate": 0.0007071509634483035, + "loss": 2.753, + "step": 12492 + }, + { + "epoch": 0.37045933042730483, + "grad_norm": 0.15820527076721191, + "learning_rate": 0.0007071081395931365, + "loss": 2.7204, + "step": 12493 + }, + { + "epoch": 0.3704889837796163, + "grad_norm": 0.14226600527763367, + "learning_rate": 0.0007070653139039918, + "loss": 2.7224, + "step": 12494 + }, + { + "epoch": 0.3705186371319278, + "grad_norm": 0.12886346876621246, + "learning_rate": 0.0007070224863812479, + "loss": 2.7364, + "step": 12495 + }, + { + "epoch": 0.37054829048423926, + "grad_norm": 0.13501185178756714, + "learning_rate": 0.0007069796570252845, + "loss": 2.712, + "step": 12496 + }, + { + "epoch": 0.37057794383655074, + "grad_norm": 0.13873343169689178, + "learning_rate": 0.0007069368258364804, + "loss": 2.7285, + "step": 12497 + }, + { + "epoch": 0.3706075971888622, + "grad_norm": 0.15086351335048676, + "learning_rate": 0.0007068939928152153, + "loss": 2.7284, + "step": 12498 + }, + { + "epoch": 0.3706372505411737, + "grad_norm": 0.13955897092819214, + "learning_rate": 0.0007068511579618686, + "loss": 2.7133, + "step": 12499 + }, + { + "epoch": 0.37066690389348517, + "grad_norm": 0.13354992866516113, + "learning_rate": 0.0007068083212768192, + "loss": 2.7327, + "step": 12500 + }, + { + "epoch": 0.37069655724579664, + "grad_norm": 0.1458911895751953, + "learning_rate": 0.0007067654827604468, + "loss": 2.7027, + "step": 12501 + }, + { + "epoch": 0.3707262105981081, + "grad_norm": 0.15200024843215942, + "learning_rate": 0.0007067226424131304, + "loss": 2.6964, + "step": 12502 + }, + { + "epoch": 0.3707558639504196, + "grad_norm": 0.1298360526561737, + "learning_rate": 0.0007066798002352495, + "loss": 2.7284, + "step": 12503 + }, + { + "epoch": 0.37078551730273107, + "grad_norm": 0.12227027118206024, + "learning_rate": 0.0007066369562271836, + "loss": 2.7164, + "step": 12504 + }, + { + "epoch": 0.37081517065504255, + "grad_norm": 0.13860197365283966, + "learning_rate": 0.000706594110389312, + "loss": 2.7215, + "step": 12505 + }, + { + "epoch": 0.370844824007354, + "grad_norm": 0.14285556972026825, + "learning_rate": 0.000706551262722014, + "loss": 2.6991, + "step": 12506 + }, + { + "epoch": 0.3708744773596655, + "grad_norm": 0.1283874362707138, + "learning_rate": 0.0007065084132256692, + "loss": 2.7196, + "step": 12507 + }, + { + "epoch": 0.370904130711977, + "grad_norm": 0.11224745959043503, + "learning_rate": 0.0007064655619006568, + "loss": 2.7386, + "step": 12508 + }, + { + "epoch": 0.37093378406428845, + "grad_norm": 0.12775127589702606, + "learning_rate": 0.0007064227087473564, + "loss": 2.7191, + "step": 12509 + }, + { + "epoch": 0.37096343741659993, + "grad_norm": 0.1486961990594864, + "learning_rate": 0.0007063798537661477, + "loss": 2.7404, + "step": 12510 + }, + { + "epoch": 0.3709930907689114, + "grad_norm": 0.12034523487091064, + "learning_rate": 0.0007063369969574099, + "loss": 2.6766, + "step": 12511 + }, + { + "epoch": 0.3710227441212229, + "grad_norm": 0.1252773553133011, + "learning_rate": 0.0007062941383215224, + "loss": 2.7104, + "step": 12512 + }, + { + "epoch": 0.37105239747353436, + "grad_norm": 0.12706738710403442, + "learning_rate": 0.0007062512778588651, + "loss": 2.7589, + "step": 12513 + }, + { + "epoch": 0.3710820508258459, + "grad_norm": 0.12916284799575806, + "learning_rate": 0.0007062084155698173, + "loss": 2.7166, + "step": 12514 + }, + { + "epoch": 0.37111170417815736, + "grad_norm": 0.15647663176059723, + "learning_rate": 0.0007061655514547585, + "loss": 2.7178, + "step": 12515 + }, + { + "epoch": 0.37114135753046884, + "grad_norm": 0.15221883356571198, + "learning_rate": 0.0007061226855140685, + "loss": 2.6778, + "step": 12516 + }, + { + "epoch": 0.3711710108827803, + "grad_norm": 0.1459415853023529, + "learning_rate": 0.0007060798177481266, + "loss": 2.7321, + "step": 12517 + }, + { + "epoch": 0.3712006642350918, + "grad_norm": 0.13426370918750763, + "learning_rate": 0.0007060369481573126, + "loss": 2.6867, + "step": 12518 + }, + { + "epoch": 0.37123031758740327, + "grad_norm": 0.13721974194049835, + "learning_rate": 0.000705994076742006, + "loss": 2.7419, + "step": 12519 + }, + { + "epoch": 0.37125997093971475, + "grad_norm": 0.16215740144252777, + "learning_rate": 0.0007059512035025865, + "loss": 2.7474, + "step": 12520 + }, + { + "epoch": 0.3712896242920262, + "grad_norm": 0.1627911925315857, + "learning_rate": 0.0007059083284394338, + "loss": 2.7116, + "step": 12521 + }, + { + "epoch": 0.3713192776443377, + "grad_norm": 0.14297448098659515, + "learning_rate": 0.0007058654515529276, + "loss": 2.7228, + "step": 12522 + }, + { + "epoch": 0.3713489309966492, + "grad_norm": 0.11998233944177628, + "learning_rate": 0.0007058225728434472, + "loss": 2.7124, + "step": 12523 + }, + { + "epoch": 0.37137858434896065, + "grad_norm": 0.13038583099842072, + "learning_rate": 0.0007057796923113727, + "loss": 2.7192, + "step": 12524 + }, + { + "epoch": 0.3714082377012721, + "grad_norm": 0.11632760614156723, + "learning_rate": 0.0007057368099570838, + "loss": 2.7232, + "step": 12525 + }, + { + "epoch": 0.3714378910535836, + "grad_norm": 0.11987238377332687, + "learning_rate": 0.0007056939257809602, + "loss": 2.7349, + "step": 12526 + }, + { + "epoch": 0.3714675444058951, + "grad_norm": 0.1150493323802948, + "learning_rate": 0.0007056510397833815, + "loss": 2.7272, + "step": 12527 + }, + { + "epoch": 0.37149719775820655, + "grad_norm": 0.11205830425024033, + "learning_rate": 0.0007056081519647275, + "loss": 2.7546, + "step": 12528 + }, + { + "epoch": 0.37152685111051803, + "grad_norm": 0.12283888459205627, + "learning_rate": 0.000705565262325378, + "loss": 2.7093, + "step": 12529 + }, + { + "epoch": 0.3715565044628295, + "grad_norm": 0.12259940803050995, + "learning_rate": 0.0007055223708657128, + "loss": 2.7328, + "step": 12530 + }, + { + "epoch": 0.371586157815141, + "grad_norm": 0.10942542552947998, + "learning_rate": 0.0007054794775861118, + "loss": 2.7106, + "step": 12531 + }, + { + "epoch": 0.37161581116745246, + "grad_norm": 0.1261480301618576, + "learning_rate": 0.0007054365824869549, + "loss": 2.707, + "step": 12532 + }, + { + "epoch": 0.37164546451976394, + "grad_norm": 0.11255743354558945, + "learning_rate": 0.0007053936855686216, + "loss": 2.739, + "step": 12533 + }, + { + "epoch": 0.3716751178720754, + "grad_norm": 0.10953392088413239, + "learning_rate": 0.0007053507868314919, + "loss": 2.7252, + "step": 12534 + }, + { + "epoch": 0.37170477122438694, + "grad_norm": 0.11551303416490555, + "learning_rate": 0.0007053078862759459, + "loss": 2.7207, + "step": 12535 + }, + { + "epoch": 0.3717344245766984, + "grad_norm": 0.1255711019039154, + "learning_rate": 0.0007052649839023632, + "loss": 2.7122, + "step": 12536 + }, + { + "epoch": 0.3717640779290099, + "grad_norm": 0.15169602632522583, + "learning_rate": 0.0007052220797111239, + "loss": 2.6982, + "step": 12537 + }, + { + "epoch": 0.37179373128132137, + "grad_norm": 0.15189549326896667, + "learning_rate": 0.0007051791737026079, + "loss": 2.7526, + "step": 12538 + }, + { + "epoch": 0.37182338463363285, + "grad_norm": 0.14482225477695465, + "learning_rate": 0.000705136265877195, + "loss": 2.7326, + "step": 12539 + }, + { + "epoch": 0.3718530379859443, + "grad_norm": 0.1443067491054535, + "learning_rate": 0.0007050933562352654, + "loss": 2.741, + "step": 12540 + }, + { + "epoch": 0.3718826913382558, + "grad_norm": 0.1335591971874237, + "learning_rate": 0.0007050504447771988, + "loss": 2.7096, + "step": 12541 + }, + { + "epoch": 0.3719123446905673, + "grad_norm": 0.15114225447177887, + "learning_rate": 0.0007050075315033753, + "loss": 2.7011, + "step": 12542 + }, + { + "epoch": 0.37194199804287875, + "grad_norm": 0.13764984905719757, + "learning_rate": 0.000704964616414175, + "loss": 2.7269, + "step": 12543 + }, + { + "epoch": 0.37197165139519023, + "grad_norm": 0.11573442071676254, + "learning_rate": 0.0007049216995099779, + "loss": 2.6973, + "step": 12544 + }, + { + "epoch": 0.3720013047475017, + "grad_norm": 0.12560071051120758, + "learning_rate": 0.0007048787807911637, + "loss": 2.7168, + "step": 12545 + }, + { + "epoch": 0.3720309580998132, + "grad_norm": 0.12783312797546387, + "learning_rate": 0.000704835860258113, + "loss": 2.7037, + "step": 12546 + }, + { + "epoch": 0.37206061145212466, + "grad_norm": 0.12529438734054565, + "learning_rate": 0.0007047929379112055, + "loss": 2.7489, + "step": 12547 + }, + { + "epoch": 0.37209026480443613, + "grad_norm": 0.12623706459999084, + "learning_rate": 0.0007047500137508214, + "loss": 2.719, + "step": 12548 + }, + { + "epoch": 0.3721199181567476, + "grad_norm": 0.12576304376125336, + "learning_rate": 0.0007047070877773407, + "loss": 2.7071, + "step": 12549 + }, + { + "epoch": 0.3721495715090591, + "grad_norm": 0.12263263761997223, + "learning_rate": 0.0007046641599911437, + "loss": 2.7314, + "step": 12550 + }, + { + "epoch": 0.37217922486137056, + "grad_norm": 0.13505497574806213, + "learning_rate": 0.0007046212303926105, + "loss": 2.715, + "step": 12551 + }, + { + "epoch": 0.37220887821368204, + "grad_norm": 0.1398712694644928, + "learning_rate": 0.000704578298982121, + "loss": 2.6863, + "step": 12552 + }, + { + "epoch": 0.3722385315659935, + "grad_norm": 0.14152587950229645, + "learning_rate": 0.0007045353657600558, + "loss": 2.7346, + "step": 12553 + }, + { + "epoch": 0.372268184918305, + "grad_norm": 0.15102553367614746, + "learning_rate": 0.0007044924307267945, + "loss": 2.7363, + "step": 12554 + }, + { + "epoch": 0.37229783827061647, + "grad_norm": 0.15353603661060333, + "learning_rate": 0.0007044494938827178, + "loss": 2.7178, + "step": 12555 + }, + { + "epoch": 0.372327491622928, + "grad_norm": 0.14716100692749023, + "learning_rate": 0.0007044065552282057, + "loss": 2.7205, + "step": 12556 + }, + { + "epoch": 0.3723571449752395, + "grad_norm": 0.12453668564558029, + "learning_rate": 0.0007043636147636385, + "loss": 2.6912, + "step": 12557 + }, + { + "epoch": 0.37238679832755095, + "grad_norm": 0.12850739061832428, + "learning_rate": 0.0007043206724893965, + "loss": 2.7311, + "step": 12558 + }, + { + "epoch": 0.3724164516798624, + "grad_norm": 0.11562662571668625, + "learning_rate": 0.0007042777284058599, + "loss": 2.6834, + "step": 12559 + }, + { + "epoch": 0.3724461050321739, + "grad_norm": 0.10411367565393448, + "learning_rate": 0.0007042347825134088, + "loss": 2.7211, + "step": 12560 + }, + { + "epoch": 0.3724757583844854, + "grad_norm": 0.11437302082777023, + "learning_rate": 0.0007041918348124236, + "loss": 2.7361, + "step": 12561 + }, + { + "epoch": 0.37250541173679685, + "grad_norm": 0.13164910674095154, + "learning_rate": 0.0007041488853032847, + "loss": 2.7439, + "step": 12562 + }, + { + "epoch": 0.37253506508910833, + "grad_norm": 0.1340339183807373, + "learning_rate": 0.0007041059339863724, + "loss": 2.7164, + "step": 12563 + }, + { + "epoch": 0.3725647184414198, + "grad_norm": 0.12882310152053833, + "learning_rate": 0.0007040629808620671, + "loss": 2.7138, + "step": 12564 + }, + { + "epoch": 0.3725943717937313, + "grad_norm": 0.12467227876186371, + "learning_rate": 0.000704020025930749, + "loss": 2.757, + "step": 12565 + }, + { + "epoch": 0.37262402514604276, + "grad_norm": 0.11870566010475159, + "learning_rate": 0.0007039770691927986, + "loss": 2.7084, + "step": 12566 + }, + { + "epoch": 0.37265367849835423, + "grad_norm": 0.12309780716896057, + "learning_rate": 0.0007039341106485961, + "loss": 2.728, + "step": 12567 + }, + { + "epoch": 0.3726833318506657, + "grad_norm": 0.12467373162508011, + "learning_rate": 0.0007038911502985221, + "loss": 2.7124, + "step": 12568 + }, + { + "epoch": 0.3727129852029772, + "grad_norm": 0.12379449605941772, + "learning_rate": 0.000703848188142957, + "loss": 2.7414, + "step": 12569 + }, + { + "epoch": 0.37274263855528866, + "grad_norm": 0.1300731599330902, + "learning_rate": 0.0007038052241822814, + "loss": 2.7788, + "step": 12570 + }, + { + "epoch": 0.37277229190760014, + "grad_norm": 0.1219983845949173, + "learning_rate": 0.0007037622584168754, + "loss": 2.7341, + "step": 12571 + }, + { + "epoch": 0.3728019452599116, + "grad_norm": 0.1408536732196808, + "learning_rate": 0.0007037192908471195, + "loss": 2.7017, + "step": 12572 + }, + { + "epoch": 0.3728315986122231, + "grad_norm": 0.13073407113552094, + "learning_rate": 0.0007036763214733944, + "loss": 2.7205, + "step": 12573 + }, + { + "epoch": 0.37286125196453457, + "grad_norm": 0.13957469165325165, + "learning_rate": 0.0007036333502960804, + "loss": 2.7323, + "step": 12574 + }, + { + "epoch": 0.37289090531684604, + "grad_norm": 0.13493983447551727, + "learning_rate": 0.0007035903773155584, + "loss": 2.7154, + "step": 12575 + }, + { + "epoch": 0.3729205586691576, + "grad_norm": 0.13808052241802216, + "learning_rate": 0.0007035474025322086, + "loss": 2.7568, + "step": 12576 + }, + { + "epoch": 0.37295021202146905, + "grad_norm": 0.1457509547472, + "learning_rate": 0.0007035044259464115, + "loss": 2.7242, + "step": 12577 + }, + { + "epoch": 0.37297986537378053, + "grad_norm": 0.15533186495304108, + "learning_rate": 0.0007034614475585478, + "loss": 2.7211, + "step": 12578 + }, + { + "epoch": 0.373009518726092, + "grad_norm": 0.15279780328273773, + "learning_rate": 0.000703418467368998, + "loss": 2.7317, + "step": 12579 + }, + { + "epoch": 0.3730391720784035, + "grad_norm": 0.14406628906726837, + "learning_rate": 0.0007033754853781429, + "loss": 2.763, + "step": 12580 + }, + { + "epoch": 0.37306882543071496, + "grad_norm": 0.1310415416955948, + "learning_rate": 0.0007033325015863628, + "loss": 2.7429, + "step": 12581 + }, + { + "epoch": 0.37309847878302643, + "grad_norm": 0.13686516880989075, + "learning_rate": 0.0007032895159940387, + "loss": 2.719, + "step": 12582 + }, + { + "epoch": 0.3731281321353379, + "grad_norm": 0.13079924881458282, + "learning_rate": 0.0007032465286015508, + "loss": 2.7403, + "step": 12583 + }, + { + "epoch": 0.3731577854876494, + "grad_norm": 0.12914717197418213, + "learning_rate": 0.0007032035394092802, + "loss": 2.6853, + "step": 12584 + }, + { + "epoch": 0.37318743883996086, + "grad_norm": 0.13517878949642181, + "learning_rate": 0.0007031605484176073, + "loss": 2.7503, + "step": 12585 + }, + { + "epoch": 0.37321709219227234, + "grad_norm": 0.12176869064569473, + "learning_rate": 0.0007031175556269129, + "loss": 2.7346, + "step": 12586 + }, + { + "epoch": 0.3732467455445838, + "grad_norm": 0.13182470202445984, + "learning_rate": 0.0007030745610375777, + "loss": 2.7285, + "step": 12587 + }, + { + "epoch": 0.3732763988968953, + "grad_norm": 0.12617675960063934, + "learning_rate": 0.0007030315646499823, + "loss": 2.7401, + "step": 12588 + }, + { + "epoch": 0.37330605224920677, + "grad_norm": 0.11606504768133163, + "learning_rate": 0.0007029885664645076, + "loss": 2.7233, + "step": 12589 + }, + { + "epoch": 0.37333570560151824, + "grad_norm": 0.13096439838409424, + "learning_rate": 0.0007029455664815343, + "loss": 2.724, + "step": 12590 + }, + { + "epoch": 0.3733653589538297, + "grad_norm": 0.11372532695531845, + "learning_rate": 0.0007029025647014432, + "loss": 2.7189, + "step": 12591 + }, + { + "epoch": 0.3733950123061412, + "grad_norm": 0.12162693589925766, + "learning_rate": 0.0007028595611246151, + "loss": 2.7174, + "step": 12592 + }, + { + "epoch": 0.37342466565845267, + "grad_norm": 0.14495514333248138, + "learning_rate": 0.0007028165557514308, + "loss": 2.7213, + "step": 12593 + }, + { + "epoch": 0.37345431901076415, + "grad_norm": 0.1401660442352295, + "learning_rate": 0.0007027735485822708, + "loss": 2.7412, + "step": 12594 + }, + { + "epoch": 0.3734839723630756, + "grad_norm": 0.13070768117904663, + "learning_rate": 0.0007027305396175165, + "loss": 2.7187, + "step": 12595 + }, + { + "epoch": 0.3735136257153871, + "grad_norm": 0.145598366856575, + "learning_rate": 0.0007026875288575484, + "loss": 2.7033, + "step": 12596 + }, + { + "epoch": 0.37354327906769863, + "grad_norm": 0.1517559438943863, + "learning_rate": 0.0007026445163027475, + "loss": 2.7318, + "step": 12597 + }, + { + "epoch": 0.3735729324200101, + "grad_norm": 0.15601342916488647, + "learning_rate": 0.0007026015019534946, + "loss": 2.7421, + "step": 12598 + }, + { + "epoch": 0.3736025857723216, + "grad_norm": 0.15405617654323578, + "learning_rate": 0.0007025584858101706, + "loss": 2.7378, + "step": 12599 + }, + { + "epoch": 0.37363223912463306, + "grad_norm": 0.13221168518066406, + "learning_rate": 0.0007025154678731563, + "loss": 2.751, + "step": 12600 + }, + { + "epoch": 0.37366189247694453, + "grad_norm": 0.14679254591464996, + "learning_rate": 0.0007024724481428328, + "loss": 2.7457, + "step": 12601 + }, + { + "epoch": 0.373691545829256, + "grad_norm": 0.13967359066009521, + "learning_rate": 0.0007024294266195812, + "loss": 2.7455, + "step": 12602 + }, + { + "epoch": 0.3737211991815675, + "grad_norm": 0.1337060183286667, + "learning_rate": 0.0007023864033037822, + "loss": 2.708, + "step": 12603 + }, + { + "epoch": 0.37375085253387896, + "grad_norm": 0.14035777747631073, + "learning_rate": 0.0007023433781958168, + "loss": 2.7347, + "step": 12604 + }, + { + "epoch": 0.37378050588619044, + "grad_norm": 0.1450587809085846, + "learning_rate": 0.0007023003512960661, + "loss": 2.7491, + "step": 12605 + }, + { + "epoch": 0.3738101592385019, + "grad_norm": 0.12653720378875732, + "learning_rate": 0.0007022573226049112, + "loss": 2.7367, + "step": 12606 + }, + { + "epoch": 0.3738398125908134, + "grad_norm": 0.13066977262496948, + "learning_rate": 0.0007022142921227328, + "loss": 2.6845, + "step": 12607 + }, + { + "epoch": 0.37386946594312487, + "grad_norm": 0.14924392104148865, + "learning_rate": 0.0007021712598499122, + "loss": 2.7124, + "step": 12608 + }, + { + "epoch": 0.37389911929543634, + "grad_norm": 0.13591599464416504, + "learning_rate": 0.0007021282257868304, + "loss": 2.7267, + "step": 12609 + }, + { + "epoch": 0.3739287726477478, + "grad_norm": 0.10865943878889084, + "learning_rate": 0.0007020851899338685, + "loss": 2.7063, + "step": 12610 + }, + { + "epoch": 0.3739584260000593, + "grad_norm": 0.13246755301952362, + "learning_rate": 0.0007020421522914074, + "loss": 2.7522, + "step": 12611 + }, + { + "epoch": 0.3739880793523708, + "grad_norm": 0.12195175141096115, + "learning_rate": 0.0007019991128598285, + "loss": 2.6998, + "step": 12612 + }, + { + "epoch": 0.37401773270468225, + "grad_norm": 0.1240866631269455, + "learning_rate": 0.0007019560716395128, + "loss": 2.704, + "step": 12613 + }, + { + "epoch": 0.3740473860569937, + "grad_norm": 0.14548823237419128, + "learning_rate": 0.0007019130286308414, + "loss": 2.7548, + "step": 12614 + }, + { + "epoch": 0.3740770394093052, + "grad_norm": 0.13017919659614563, + "learning_rate": 0.0007018699838341955, + "loss": 2.7303, + "step": 12615 + }, + { + "epoch": 0.3741066927616167, + "grad_norm": 0.11951955407857895, + "learning_rate": 0.0007018269372499562, + "loss": 2.7287, + "step": 12616 + }, + { + "epoch": 0.37413634611392815, + "grad_norm": 0.1121336817741394, + "learning_rate": 0.0007017838888785046, + "loss": 2.7666, + "step": 12617 + }, + { + "epoch": 0.3741659994662397, + "grad_norm": 0.11350733786821365, + "learning_rate": 0.0007017408387202222, + "loss": 2.7362, + "step": 12618 + }, + { + "epoch": 0.37419565281855116, + "grad_norm": 0.12222937494516373, + "learning_rate": 0.0007016977867754899, + "loss": 2.7263, + "step": 12619 + }, + { + "epoch": 0.37422530617086264, + "grad_norm": 0.1178736463189125, + "learning_rate": 0.0007016547330446892, + "loss": 2.6852, + "step": 12620 + }, + { + "epoch": 0.3742549595231741, + "grad_norm": 0.11688503623008728, + "learning_rate": 0.0007016116775282012, + "loss": 2.7406, + "step": 12621 + }, + { + "epoch": 0.3742846128754856, + "grad_norm": 0.16118386387825012, + "learning_rate": 0.0007015686202264072, + "loss": 2.7402, + "step": 12622 + }, + { + "epoch": 0.37431426622779707, + "grad_norm": 0.16509820520877838, + "learning_rate": 0.0007015255611396885, + "loss": 2.7401, + "step": 12623 + }, + { + "epoch": 0.37434391958010854, + "grad_norm": 0.15247191488742828, + "learning_rate": 0.0007014825002684262, + "loss": 2.7082, + "step": 12624 + }, + { + "epoch": 0.37437357293242, + "grad_norm": 0.15244756639003754, + "learning_rate": 0.000701439437613002, + "loss": 2.7402, + "step": 12625 + }, + { + "epoch": 0.3744032262847315, + "grad_norm": 0.15330897271633148, + "learning_rate": 0.000701396373173797, + "loss": 2.7297, + "step": 12626 + }, + { + "epoch": 0.37443287963704297, + "grad_norm": 0.1617363691329956, + "learning_rate": 0.0007013533069511923, + "loss": 2.7483, + "step": 12627 + }, + { + "epoch": 0.37446253298935445, + "grad_norm": 0.1495230793952942, + "learning_rate": 0.0007013102389455696, + "loss": 2.7675, + "step": 12628 + }, + { + "epoch": 0.3744921863416659, + "grad_norm": 0.14334626495838165, + "learning_rate": 0.0007012671691573102, + "loss": 2.7237, + "step": 12629 + }, + { + "epoch": 0.3745218396939774, + "grad_norm": 0.12153774499893188, + "learning_rate": 0.0007012240975867956, + "loss": 2.7205, + "step": 12630 + }, + { + "epoch": 0.3745514930462889, + "grad_norm": 0.12860336899757385, + "learning_rate": 0.000701181024234407, + "loss": 2.7291, + "step": 12631 + }, + { + "epoch": 0.37458114639860035, + "grad_norm": 0.12954995036125183, + "learning_rate": 0.0007011379491005258, + "loss": 2.7373, + "step": 12632 + }, + { + "epoch": 0.3746107997509118, + "grad_norm": 0.14004768431186676, + "learning_rate": 0.0007010948721855336, + "loss": 2.7572, + "step": 12633 + }, + { + "epoch": 0.3746404531032233, + "grad_norm": 0.12767045199871063, + "learning_rate": 0.0007010517934898118, + "loss": 2.6946, + "step": 12634 + }, + { + "epoch": 0.3746701064555348, + "grad_norm": 0.13016878068447113, + "learning_rate": 0.0007010087130137419, + "loss": 2.7255, + "step": 12635 + }, + { + "epoch": 0.37469975980784626, + "grad_norm": 0.1294214129447937, + "learning_rate": 0.0007009656307577054, + "loss": 2.7376, + "step": 12636 + }, + { + "epoch": 0.37472941316015773, + "grad_norm": 0.12173018604516983, + "learning_rate": 0.0007009225467220836, + "loss": 2.7401, + "step": 12637 + }, + { + "epoch": 0.3747590665124692, + "grad_norm": 0.11640110611915588, + "learning_rate": 0.0007008794609072582, + "loss": 2.7138, + "step": 12638 + }, + { + "epoch": 0.37478871986478074, + "grad_norm": 0.12413650006055832, + "learning_rate": 0.0007008363733136106, + "loss": 2.7079, + "step": 12639 + }, + { + "epoch": 0.3748183732170922, + "grad_norm": 0.13187529146671295, + "learning_rate": 0.0007007932839415226, + "loss": 2.7654, + "step": 12640 + }, + { + "epoch": 0.3748480265694037, + "grad_norm": 0.13696180284023285, + "learning_rate": 0.0007007501927913755, + "loss": 2.7253, + "step": 12641 + }, + { + "epoch": 0.37487767992171517, + "grad_norm": 0.11741674691438675, + "learning_rate": 0.0007007070998635511, + "loss": 2.7374, + "step": 12642 + }, + { + "epoch": 0.37490733327402664, + "grad_norm": 0.1167658120393753, + "learning_rate": 0.0007006640051584308, + "loss": 2.7216, + "step": 12643 + }, + { + "epoch": 0.3749369866263381, + "grad_norm": 0.12177925556898117, + "learning_rate": 0.0007006209086763963, + "loss": 2.7067, + "step": 12644 + }, + { + "epoch": 0.3749666399786496, + "grad_norm": 0.12606573104858398, + "learning_rate": 0.0007005778104178292, + "loss": 2.7166, + "step": 12645 + }, + { + "epoch": 0.3749962933309611, + "grad_norm": 0.13261686265468597, + "learning_rate": 0.0007005347103831112, + "loss": 2.7087, + "step": 12646 + }, + { + "epoch": 0.37502594668327255, + "grad_norm": 0.14470739662647247, + "learning_rate": 0.0007004916085726239, + "loss": 2.7356, + "step": 12647 + }, + { + "epoch": 0.375055600035584, + "grad_norm": 0.15577974915504456, + "learning_rate": 0.0007004485049867489, + "loss": 2.7165, + "step": 12648 + }, + { + "epoch": 0.3750852533878955, + "grad_norm": 0.12433440238237381, + "learning_rate": 0.0007004053996258682, + "loss": 2.714, + "step": 12649 + }, + { + "epoch": 0.375114906740207, + "grad_norm": 0.12406335771083832, + "learning_rate": 0.0007003622924903631, + "loss": 2.7473, + "step": 12650 + }, + { + "epoch": 0.37514456009251845, + "grad_norm": 0.13499099016189575, + "learning_rate": 0.0007003191835806155, + "loss": 2.7478, + "step": 12651 + }, + { + "epoch": 0.37517421344482993, + "grad_norm": 0.15706953406333923, + "learning_rate": 0.0007002760728970072, + "loss": 2.7424, + "step": 12652 + }, + { + "epoch": 0.3752038667971414, + "grad_norm": 0.16288653016090393, + "learning_rate": 0.0007002329604399199, + "loss": 2.7377, + "step": 12653 + }, + { + "epoch": 0.3752335201494529, + "grad_norm": 0.16267307102680206, + "learning_rate": 0.0007001898462097354, + "loss": 2.7047, + "step": 12654 + }, + { + "epoch": 0.37526317350176436, + "grad_norm": 0.14833475649356842, + "learning_rate": 0.0007001467302068354, + "loss": 2.7286, + "step": 12655 + }, + { + "epoch": 0.37529282685407583, + "grad_norm": 0.14506322145462036, + "learning_rate": 0.0007001036124316018, + "loss": 2.7172, + "step": 12656 + }, + { + "epoch": 0.3753224802063873, + "grad_norm": 0.1260361671447754, + "learning_rate": 0.0007000604928844163, + "loss": 2.717, + "step": 12657 + }, + { + "epoch": 0.3753521335586988, + "grad_norm": 0.1131824180483818, + "learning_rate": 0.0007000173715656608, + "loss": 2.7083, + "step": 12658 + }, + { + "epoch": 0.37538178691101026, + "grad_norm": 0.1462029665708542, + "learning_rate": 0.0006999742484757172, + "loss": 2.7341, + "step": 12659 + }, + { + "epoch": 0.3754114402633218, + "grad_norm": 0.12319092452526093, + "learning_rate": 0.0006999311236149672, + "loss": 2.717, + "step": 12660 + }, + { + "epoch": 0.37544109361563327, + "grad_norm": 0.12060490250587463, + "learning_rate": 0.000699887996983793, + "loss": 2.7004, + "step": 12661 + }, + { + "epoch": 0.37547074696794475, + "grad_norm": 0.11160559207201004, + "learning_rate": 0.0006998448685825761, + "loss": 2.701, + "step": 12662 + }, + { + "epoch": 0.3755004003202562, + "grad_norm": 0.1243051066994667, + "learning_rate": 0.0006998017384116987, + "loss": 2.7249, + "step": 12663 + }, + { + "epoch": 0.3755300536725677, + "grad_norm": 0.10849954932928085, + "learning_rate": 0.0006997586064715426, + "loss": 2.7163, + "step": 12664 + }, + { + "epoch": 0.3755597070248792, + "grad_norm": 0.11909657716751099, + "learning_rate": 0.0006997154727624895, + "loss": 2.6913, + "step": 12665 + }, + { + "epoch": 0.37558936037719065, + "grad_norm": 0.10730371624231339, + "learning_rate": 0.0006996723372849218, + "loss": 2.7374, + "step": 12666 + }, + { + "epoch": 0.3756190137295021, + "grad_norm": 0.11278577148914337, + "learning_rate": 0.0006996292000392213, + "loss": 2.717, + "step": 12667 + }, + { + "epoch": 0.3756486670818136, + "grad_norm": 0.11285204440355301, + "learning_rate": 0.00069958606102577, + "loss": 2.7431, + "step": 12668 + }, + { + "epoch": 0.3756783204341251, + "grad_norm": 0.12241268157958984, + "learning_rate": 0.0006995429202449498, + "loss": 2.6975, + "step": 12669 + }, + { + "epoch": 0.37570797378643656, + "grad_norm": 0.12253400683403015, + "learning_rate": 0.0006994997776971428, + "loss": 2.7347, + "step": 12670 + }, + { + "epoch": 0.37573762713874803, + "grad_norm": 0.1226869449019432, + "learning_rate": 0.000699456633382731, + "loss": 2.6903, + "step": 12671 + }, + { + "epoch": 0.3757672804910595, + "grad_norm": 0.11235985159873962, + "learning_rate": 0.0006994134873020965, + "loss": 2.735, + "step": 12672 + }, + { + "epoch": 0.375796933843371, + "grad_norm": 0.12288224697113037, + "learning_rate": 0.0006993703394556214, + "loss": 2.74, + "step": 12673 + }, + { + "epoch": 0.37582658719568246, + "grad_norm": 0.1064370647072792, + "learning_rate": 0.0006993271898436877, + "loss": 2.7432, + "step": 12674 + }, + { + "epoch": 0.37585624054799394, + "grad_norm": 0.10641288012266159, + "learning_rate": 0.0006992840384666774, + "loss": 2.6974, + "step": 12675 + }, + { + "epoch": 0.3758858939003054, + "grad_norm": 0.11094536632299423, + "learning_rate": 0.0006992408853249729, + "loss": 2.7398, + "step": 12676 + }, + { + "epoch": 0.3759155472526169, + "grad_norm": 0.12160826474428177, + "learning_rate": 0.000699197730418956, + "loss": 2.7305, + "step": 12677 + }, + { + "epoch": 0.37594520060492836, + "grad_norm": 0.14865729212760925, + "learning_rate": 0.000699154573749009, + "loss": 2.7259, + "step": 12678 + }, + { + "epoch": 0.37597485395723984, + "grad_norm": 0.14491286873817444, + "learning_rate": 0.0006991114153155143, + "loss": 2.7224, + "step": 12679 + }, + { + "epoch": 0.3760045073095514, + "grad_norm": 0.1462675780057907, + "learning_rate": 0.0006990682551188536, + "loss": 2.7337, + "step": 12680 + }, + { + "epoch": 0.37603416066186285, + "grad_norm": 0.15873201191425323, + "learning_rate": 0.0006990250931594096, + "loss": 2.7364, + "step": 12681 + }, + { + "epoch": 0.3760638140141743, + "grad_norm": 0.16311022639274597, + "learning_rate": 0.0006989819294375639, + "loss": 2.7154, + "step": 12682 + }, + { + "epoch": 0.3760934673664858, + "grad_norm": 0.13279783725738525, + "learning_rate": 0.0006989387639536993, + "loss": 2.7241, + "step": 12683 + }, + { + "epoch": 0.3761231207187973, + "grad_norm": 0.14496731758117676, + "learning_rate": 0.0006988955967081977, + "loss": 2.7482, + "step": 12684 + }, + { + "epoch": 0.37615277407110875, + "grad_norm": 0.13980041444301605, + "learning_rate": 0.0006988524277014415, + "loss": 2.7264, + "step": 12685 + }, + { + "epoch": 0.37618242742342023, + "grad_norm": 0.11894337832927704, + "learning_rate": 0.0006988092569338128, + "loss": 2.7109, + "step": 12686 + }, + { + "epoch": 0.3762120807757317, + "grad_norm": 0.1437745839357376, + "learning_rate": 0.0006987660844056941, + "loss": 2.7594, + "step": 12687 + }, + { + "epoch": 0.3762417341280432, + "grad_norm": 0.13068579137325287, + "learning_rate": 0.0006987229101174676, + "loss": 2.7271, + "step": 12688 + }, + { + "epoch": 0.37627138748035466, + "grad_norm": 0.1633061021566391, + "learning_rate": 0.0006986797340695157, + "loss": 2.7333, + "step": 12689 + }, + { + "epoch": 0.37630104083266613, + "grad_norm": 0.17793743312358856, + "learning_rate": 0.0006986365562622205, + "loss": 2.7798, + "step": 12690 + }, + { + "epoch": 0.3763306941849776, + "grad_norm": 0.16587622463703156, + "learning_rate": 0.0006985933766959645, + "loss": 2.764, + "step": 12691 + }, + { + "epoch": 0.3763603475372891, + "grad_norm": 0.1620694100856781, + "learning_rate": 0.0006985501953711302, + "loss": 2.7476, + "step": 12692 + }, + { + "epoch": 0.37639000088960056, + "grad_norm": 0.14311102032661438, + "learning_rate": 0.0006985070122880998, + "loss": 2.7072, + "step": 12693 + }, + { + "epoch": 0.37641965424191204, + "grad_norm": 0.11928102374076843, + "learning_rate": 0.0006984638274472556, + "loss": 2.7229, + "step": 12694 + }, + { + "epoch": 0.3764493075942235, + "grad_norm": 0.13668613135814667, + "learning_rate": 0.0006984206408489804, + "loss": 2.7027, + "step": 12695 + }, + { + "epoch": 0.376478960946535, + "grad_norm": 0.13268819451332092, + "learning_rate": 0.0006983774524936563, + "loss": 2.7491, + "step": 12696 + }, + { + "epoch": 0.37650861429884647, + "grad_norm": 0.11727308481931686, + "learning_rate": 0.0006983342623816655, + "loss": 2.6858, + "step": 12697 + }, + { + "epoch": 0.37653826765115794, + "grad_norm": 0.1163465678691864, + "learning_rate": 0.000698291070513391, + "loss": 2.7552, + "step": 12698 + }, + { + "epoch": 0.3765679210034694, + "grad_norm": 0.12332498282194138, + "learning_rate": 0.0006982478768892151, + "loss": 2.7675, + "step": 12699 + }, + { + "epoch": 0.3765975743557809, + "grad_norm": 0.1288984715938568, + "learning_rate": 0.00069820468150952, + "loss": 2.729, + "step": 12700 + }, + { + "epoch": 0.3766272277080924, + "grad_norm": 0.13408607244491577, + "learning_rate": 0.0006981614843746888, + "loss": 2.7287, + "step": 12701 + }, + { + "epoch": 0.3766568810604039, + "grad_norm": 0.11892440915107727, + "learning_rate": 0.0006981182854851034, + "loss": 2.6913, + "step": 12702 + }, + { + "epoch": 0.3766865344127154, + "grad_norm": 0.12271569669246674, + "learning_rate": 0.0006980750848411465, + "loss": 2.7297, + "step": 12703 + }, + { + "epoch": 0.37671618776502686, + "grad_norm": 0.12767237424850464, + "learning_rate": 0.0006980318824432008, + "loss": 2.7294, + "step": 12704 + }, + { + "epoch": 0.37674584111733833, + "grad_norm": 0.12186078727245331, + "learning_rate": 0.0006979886782916487, + "loss": 2.7328, + "step": 12705 + }, + { + "epoch": 0.3767754944696498, + "grad_norm": 0.11439251154661179, + "learning_rate": 0.000697945472386873, + "loss": 2.7229, + "step": 12706 + }, + { + "epoch": 0.3768051478219613, + "grad_norm": 0.12162469327449799, + "learning_rate": 0.0006979022647292562, + "loss": 2.7007, + "step": 12707 + }, + { + "epoch": 0.37683480117427276, + "grad_norm": 0.11490723490715027, + "learning_rate": 0.0006978590553191808, + "loss": 2.7112, + "step": 12708 + }, + { + "epoch": 0.37686445452658424, + "grad_norm": 0.10684516280889511, + "learning_rate": 0.0006978158441570295, + "loss": 2.711, + "step": 12709 + }, + { + "epoch": 0.3768941078788957, + "grad_norm": 0.10877878218889236, + "learning_rate": 0.0006977726312431849, + "loss": 2.7411, + "step": 12710 + }, + { + "epoch": 0.3769237612312072, + "grad_norm": 0.12449733167886734, + "learning_rate": 0.0006977294165780298, + "loss": 2.7023, + "step": 12711 + }, + { + "epoch": 0.37695341458351866, + "grad_norm": 0.1302075982093811, + "learning_rate": 0.0006976862001619467, + "loss": 2.705, + "step": 12712 + }, + { + "epoch": 0.37698306793583014, + "grad_norm": 0.13848410546779633, + "learning_rate": 0.0006976429819953183, + "loss": 2.726, + "step": 12713 + }, + { + "epoch": 0.3770127212881416, + "grad_norm": 0.1562938243150711, + "learning_rate": 0.0006975997620785276, + "loss": 2.7168, + "step": 12714 + }, + { + "epoch": 0.3770423746404531, + "grad_norm": 0.14723488688468933, + "learning_rate": 0.0006975565404119569, + "loss": 2.7284, + "step": 12715 + }, + { + "epoch": 0.37707202799276457, + "grad_norm": 0.15095369517803192, + "learning_rate": 0.0006975133169959892, + "loss": 2.7459, + "step": 12716 + }, + { + "epoch": 0.37710168134507605, + "grad_norm": 0.1604151427745819, + "learning_rate": 0.0006974700918310072, + "loss": 2.7402, + "step": 12717 + }, + { + "epoch": 0.3771313346973875, + "grad_norm": 0.14874407649040222, + "learning_rate": 0.0006974268649173936, + "loss": 2.7011, + "step": 12718 + }, + { + "epoch": 0.377160988049699, + "grad_norm": 0.14488936960697174, + "learning_rate": 0.0006973836362555311, + "loss": 2.7096, + "step": 12719 + }, + { + "epoch": 0.3771906414020105, + "grad_norm": 0.12292920053005219, + "learning_rate": 0.0006973404058458028, + "loss": 2.6907, + "step": 12720 + }, + { + "epoch": 0.37722029475432195, + "grad_norm": 0.14200550317764282, + "learning_rate": 0.0006972971736885912, + "loss": 2.7185, + "step": 12721 + }, + { + "epoch": 0.3772499481066335, + "grad_norm": 0.11867328733205795, + "learning_rate": 0.0006972539397842795, + "loss": 2.7595, + "step": 12722 + }, + { + "epoch": 0.37727960145894496, + "grad_norm": 0.12857891619205475, + "learning_rate": 0.00069721070413325, + "loss": 2.7035, + "step": 12723 + }, + { + "epoch": 0.37730925481125643, + "grad_norm": 0.14225994050502777, + "learning_rate": 0.000697167466735886, + "loss": 2.7564, + "step": 12724 + }, + { + "epoch": 0.3773389081635679, + "grad_norm": 0.13737179338932037, + "learning_rate": 0.0006971242275925704, + "loss": 2.7138, + "step": 12725 + }, + { + "epoch": 0.3773685615158794, + "grad_norm": 0.12735293805599213, + "learning_rate": 0.0006970809867036856, + "loss": 2.7292, + "step": 12726 + }, + { + "epoch": 0.37739821486819086, + "grad_norm": 0.10438914597034454, + "learning_rate": 0.000697037744069615, + "loss": 2.7393, + "step": 12727 + }, + { + "epoch": 0.37742786822050234, + "grad_norm": 0.12442982941865921, + "learning_rate": 0.0006969944996907416, + "loss": 2.7475, + "step": 12728 + }, + { + "epoch": 0.3774575215728138, + "grad_norm": 0.13994663953781128, + "learning_rate": 0.0006969512535674479, + "loss": 2.7352, + "step": 12729 + }, + { + "epoch": 0.3774871749251253, + "grad_norm": 0.14506720006465912, + "learning_rate": 0.0006969080057001168, + "loss": 2.7233, + "step": 12730 + }, + { + "epoch": 0.37751682827743677, + "grad_norm": 0.13781991600990295, + "learning_rate": 0.0006968647560891317, + "loss": 2.7229, + "step": 12731 + }, + { + "epoch": 0.37754648162974824, + "grad_norm": 0.11728507280349731, + "learning_rate": 0.0006968215047348753, + "loss": 2.6986, + "step": 12732 + }, + { + "epoch": 0.3775761349820597, + "grad_norm": 0.11808162182569504, + "learning_rate": 0.0006967782516377309, + "loss": 2.7254, + "step": 12733 + }, + { + "epoch": 0.3776057883343712, + "grad_norm": 0.11992878466844559, + "learning_rate": 0.0006967349967980813, + "loss": 2.7199, + "step": 12734 + }, + { + "epoch": 0.37763544168668267, + "grad_norm": 0.11703130602836609, + "learning_rate": 0.0006966917402163093, + "loss": 2.6935, + "step": 12735 + }, + { + "epoch": 0.37766509503899415, + "grad_norm": 0.12185662984848022, + "learning_rate": 0.0006966484818927983, + "loss": 2.7264, + "step": 12736 + }, + { + "epoch": 0.3776947483913056, + "grad_norm": 0.1217501312494278, + "learning_rate": 0.0006966052218279313, + "loss": 2.7249, + "step": 12737 + }, + { + "epoch": 0.3777244017436171, + "grad_norm": 0.12986773252487183, + "learning_rate": 0.0006965619600220912, + "loss": 2.6968, + "step": 12738 + }, + { + "epoch": 0.3777540550959286, + "grad_norm": 0.1581636220216751, + "learning_rate": 0.0006965186964756614, + "loss": 2.7364, + "step": 12739 + }, + { + "epoch": 0.37778370844824005, + "grad_norm": 0.15969263017177582, + "learning_rate": 0.0006964754311890247, + "loss": 2.7283, + "step": 12740 + }, + { + "epoch": 0.37781336180055153, + "grad_norm": 0.1299559623003006, + "learning_rate": 0.0006964321641625643, + "loss": 2.7308, + "step": 12741 + }, + { + "epoch": 0.377843015152863, + "grad_norm": 0.11876168102025986, + "learning_rate": 0.0006963888953966633, + "loss": 2.7234, + "step": 12742 + }, + { + "epoch": 0.37787266850517454, + "grad_norm": 0.12325271964073181, + "learning_rate": 0.0006963456248917049, + "loss": 2.7257, + "step": 12743 + }, + { + "epoch": 0.377902321857486, + "grad_norm": 0.12919601798057556, + "learning_rate": 0.0006963023526480725, + "loss": 2.7049, + "step": 12744 + }, + { + "epoch": 0.3779319752097975, + "grad_norm": 0.13106495141983032, + "learning_rate": 0.000696259078666149, + "loss": 2.702, + "step": 12745 + }, + { + "epoch": 0.37796162856210896, + "grad_norm": 0.11971733719110489, + "learning_rate": 0.0006962158029463175, + "loss": 2.7268, + "step": 12746 + }, + { + "epoch": 0.37799128191442044, + "grad_norm": 0.13132765889167786, + "learning_rate": 0.0006961725254889616, + "loss": 2.7356, + "step": 12747 + }, + { + "epoch": 0.3780209352667319, + "grad_norm": 0.1237090677022934, + "learning_rate": 0.0006961292462944643, + "loss": 2.7336, + "step": 12748 + }, + { + "epoch": 0.3780505886190434, + "grad_norm": 0.12564058601856232, + "learning_rate": 0.0006960859653632088, + "loss": 2.7065, + "step": 12749 + }, + { + "epoch": 0.37808024197135487, + "grad_norm": 0.13549591600894928, + "learning_rate": 0.0006960426826955784, + "loss": 2.742, + "step": 12750 + }, + { + "epoch": 0.37810989532366635, + "grad_norm": 0.13783937692642212, + "learning_rate": 0.0006959993982919564, + "loss": 2.7488, + "step": 12751 + }, + { + "epoch": 0.3781395486759778, + "grad_norm": 0.13064925372600555, + "learning_rate": 0.000695956112152726, + "loss": 2.7474, + "step": 12752 + }, + { + "epoch": 0.3781692020282893, + "grad_norm": 0.1260281503200531, + "learning_rate": 0.0006959128242782708, + "loss": 2.7501, + "step": 12753 + }, + { + "epoch": 0.3781988553806008, + "grad_norm": 0.14092497527599335, + "learning_rate": 0.0006958695346689737, + "loss": 2.7061, + "step": 12754 + }, + { + "epoch": 0.37822850873291225, + "grad_norm": 0.12672582268714905, + "learning_rate": 0.0006958262433252183, + "loss": 2.7159, + "step": 12755 + }, + { + "epoch": 0.3782581620852237, + "grad_norm": 0.12163841724395752, + "learning_rate": 0.000695782950247388, + "loss": 2.7178, + "step": 12756 + }, + { + "epoch": 0.3782878154375352, + "grad_norm": 0.12010959535837173, + "learning_rate": 0.0006957396554358661, + "loss": 2.6919, + "step": 12757 + }, + { + "epoch": 0.3783174687898467, + "grad_norm": 0.12649326026439667, + "learning_rate": 0.0006956963588910358, + "loss": 2.7102, + "step": 12758 + }, + { + "epoch": 0.37834712214215815, + "grad_norm": 0.13661080598831177, + "learning_rate": 0.0006956530606132807, + "loss": 2.6944, + "step": 12759 + }, + { + "epoch": 0.37837677549446963, + "grad_norm": 0.12646399438381195, + "learning_rate": 0.0006956097606029842, + "loss": 2.7391, + "step": 12760 + }, + { + "epoch": 0.3784064288467811, + "grad_norm": 0.12036751210689545, + "learning_rate": 0.0006955664588605298, + "loss": 2.7304, + "step": 12761 + }, + { + "epoch": 0.3784360821990926, + "grad_norm": 0.15058010816574097, + "learning_rate": 0.0006955231553863006, + "loss": 2.7662, + "step": 12762 + }, + { + "epoch": 0.37846573555140406, + "grad_norm": 0.14967265725135803, + "learning_rate": 0.0006954798501806803, + "loss": 2.7203, + "step": 12763 + }, + { + "epoch": 0.3784953889037156, + "grad_norm": 0.13157759606838226, + "learning_rate": 0.0006954365432440526, + "loss": 2.7621, + "step": 12764 + }, + { + "epoch": 0.37852504225602707, + "grad_norm": 0.14153504371643066, + "learning_rate": 0.0006953932345768006, + "loss": 2.7017, + "step": 12765 + }, + { + "epoch": 0.37855469560833854, + "grad_norm": 0.15816091001033783, + "learning_rate": 0.0006953499241793082, + "loss": 2.7273, + "step": 12766 + }, + { + "epoch": 0.37858434896065, + "grad_norm": 0.15036040544509888, + "learning_rate": 0.0006953066120519584, + "loss": 2.693, + "step": 12767 + }, + { + "epoch": 0.3786140023129615, + "grad_norm": 0.14210449159145355, + "learning_rate": 0.0006952632981951351, + "loss": 2.6806, + "step": 12768 + }, + { + "epoch": 0.37864365566527297, + "grad_norm": 0.12127391248941422, + "learning_rate": 0.0006952199826092218, + "loss": 2.7175, + "step": 12769 + }, + { + "epoch": 0.37867330901758445, + "grad_norm": 0.12246640026569366, + "learning_rate": 0.0006951766652946021, + "loss": 2.7638, + "step": 12770 + }, + { + "epoch": 0.3787029623698959, + "grad_norm": 0.14939726889133453, + "learning_rate": 0.0006951333462516595, + "loss": 2.7082, + "step": 12771 + }, + { + "epoch": 0.3787326157222074, + "grad_norm": 0.13529126346111298, + "learning_rate": 0.0006950900254807777, + "loss": 2.7108, + "step": 12772 + }, + { + "epoch": 0.3787622690745189, + "grad_norm": 0.11405704915523529, + "learning_rate": 0.00069504670298234, + "loss": 2.7681, + "step": 12773 + }, + { + "epoch": 0.37879192242683035, + "grad_norm": 0.13129916787147522, + "learning_rate": 0.0006950033787567304, + "loss": 2.7662, + "step": 12774 + }, + { + "epoch": 0.37882157577914183, + "grad_norm": 0.1406305730342865, + "learning_rate": 0.0006949600528043324, + "loss": 2.7319, + "step": 12775 + }, + { + "epoch": 0.3788512291314533, + "grad_norm": 0.1427960991859436, + "learning_rate": 0.0006949167251255297, + "loss": 2.7515, + "step": 12776 + }, + { + "epoch": 0.3788808824837648, + "grad_norm": 0.14271925389766693, + "learning_rate": 0.000694873395720706, + "loss": 2.7292, + "step": 12777 + }, + { + "epoch": 0.37891053583607626, + "grad_norm": 0.1205030083656311, + "learning_rate": 0.0006948300645902448, + "loss": 2.6999, + "step": 12778 + }, + { + "epoch": 0.37894018918838773, + "grad_norm": 0.13405513763427734, + "learning_rate": 0.0006947867317345301, + "loss": 2.7194, + "step": 12779 + }, + { + "epoch": 0.3789698425406992, + "grad_norm": 0.12631598114967346, + "learning_rate": 0.0006947433971539454, + "loss": 2.7026, + "step": 12780 + }, + { + "epoch": 0.3789994958930107, + "grad_norm": 0.12179791182279587, + "learning_rate": 0.0006947000608488743, + "loss": 2.7163, + "step": 12781 + }, + { + "epoch": 0.37902914924532216, + "grad_norm": 0.12425199896097183, + "learning_rate": 0.0006946567228197009, + "loss": 2.7199, + "step": 12782 + }, + { + "epoch": 0.37905880259763364, + "grad_norm": 0.11604645103216171, + "learning_rate": 0.0006946133830668089, + "loss": 2.7417, + "step": 12783 + }, + { + "epoch": 0.37908845594994517, + "grad_norm": 0.11396399885416031, + "learning_rate": 0.0006945700415905819, + "loss": 2.7328, + "step": 12784 + }, + { + "epoch": 0.37911810930225665, + "grad_norm": 0.12496102601289749, + "learning_rate": 0.0006945266983914038, + "loss": 2.7441, + "step": 12785 + }, + { + "epoch": 0.3791477626545681, + "grad_norm": 0.13324901461601257, + "learning_rate": 0.0006944833534696582, + "loss": 2.6829, + "step": 12786 + }, + { + "epoch": 0.3791774160068796, + "grad_norm": 0.13897858560085297, + "learning_rate": 0.0006944400068257294, + "loss": 2.7122, + "step": 12787 + }, + { + "epoch": 0.3792070693591911, + "grad_norm": 0.1438245177268982, + "learning_rate": 0.0006943966584600007, + "loss": 2.7123, + "step": 12788 + }, + { + "epoch": 0.37923672271150255, + "grad_norm": 0.12459775060415268, + "learning_rate": 0.0006943533083728565, + "loss": 2.7483, + "step": 12789 + }, + { + "epoch": 0.379266376063814, + "grad_norm": 0.13520143926143646, + "learning_rate": 0.0006943099565646802, + "loss": 2.7632, + "step": 12790 + }, + { + "epoch": 0.3792960294161255, + "grad_norm": 0.12312193959951401, + "learning_rate": 0.000694266603035856, + "loss": 2.6867, + "step": 12791 + }, + { + "epoch": 0.379325682768437, + "grad_norm": 0.12353810667991638, + "learning_rate": 0.0006942232477867676, + "loss": 2.7363, + "step": 12792 + }, + { + "epoch": 0.37935533612074845, + "grad_norm": 0.13749895989894867, + "learning_rate": 0.000694179890817799, + "loss": 2.7159, + "step": 12793 + }, + { + "epoch": 0.37938498947305993, + "grad_norm": 0.1325719654560089, + "learning_rate": 0.0006941365321293342, + "loss": 2.7114, + "step": 12794 + }, + { + "epoch": 0.3794146428253714, + "grad_norm": 0.13816867768764496, + "learning_rate": 0.0006940931717217572, + "loss": 2.7614, + "step": 12795 + }, + { + "epoch": 0.3794442961776829, + "grad_norm": 0.14223086833953857, + "learning_rate": 0.0006940498095954516, + "loss": 2.7229, + "step": 12796 + }, + { + "epoch": 0.37947394952999436, + "grad_norm": 0.15597975254058838, + "learning_rate": 0.0006940064457508018, + "loss": 2.7307, + "step": 12797 + }, + { + "epoch": 0.37950360288230583, + "grad_norm": 0.1325666755437851, + "learning_rate": 0.0006939630801881915, + "loss": 2.7307, + "step": 12798 + }, + { + "epoch": 0.3795332562346173, + "grad_norm": 0.1197928860783577, + "learning_rate": 0.0006939197129080051, + "loss": 2.7378, + "step": 12799 + }, + { + "epoch": 0.3795629095869288, + "grad_norm": 0.13638320565223694, + "learning_rate": 0.0006938763439106261, + "loss": 2.6867, + "step": 12800 + }, + { + "epoch": 0.37959256293924026, + "grad_norm": 0.1349364072084427, + "learning_rate": 0.0006938329731964387, + "loss": 2.74, + "step": 12801 + }, + { + "epoch": 0.37962221629155174, + "grad_norm": 0.13965436816215515, + "learning_rate": 0.000693789600765827, + "loss": 2.6877, + "step": 12802 + }, + { + "epoch": 0.3796518696438632, + "grad_norm": 0.14566698670387268, + "learning_rate": 0.0006937462266191754, + "loss": 2.7432, + "step": 12803 + }, + { + "epoch": 0.3796815229961747, + "grad_norm": 0.12951518595218658, + "learning_rate": 0.0006937028507568678, + "loss": 2.7448, + "step": 12804 + }, + { + "epoch": 0.3797111763484862, + "grad_norm": 0.1405699998140335, + "learning_rate": 0.000693659473179288, + "loss": 2.7385, + "step": 12805 + }, + { + "epoch": 0.3797408297007977, + "grad_norm": 0.14271864295005798, + "learning_rate": 0.0006936160938868204, + "loss": 2.6914, + "step": 12806 + }, + { + "epoch": 0.3797704830531092, + "grad_norm": 0.16274194419384003, + "learning_rate": 0.0006935727128798488, + "loss": 2.734, + "step": 12807 + }, + { + "epoch": 0.37980013640542065, + "grad_norm": 0.14786793291568756, + "learning_rate": 0.0006935293301587579, + "loss": 2.7254, + "step": 12808 + }, + { + "epoch": 0.37982978975773213, + "grad_norm": 0.1538117378950119, + "learning_rate": 0.0006934859457239314, + "loss": 2.7051, + "step": 12809 + }, + { + "epoch": 0.3798594431100436, + "grad_norm": 0.1275675892829895, + "learning_rate": 0.0006934425595757538, + "loss": 2.738, + "step": 12810 + }, + { + "epoch": 0.3798890964623551, + "grad_norm": 0.13179881870746613, + "learning_rate": 0.000693399171714609, + "loss": 2.7213, + "step": 12811 + }, + { + "epoch": 0.37991874981466656, + "grad_norm": 0.14211304485797882, + "learning_rate": 0.0006933557821408815, + "loss": 2.7411, + "step": 12812 + }, + { + "epoch": 0.37994840316697803, + "grad_norm": 0.1009565070271492, + "learning_rate": 0.0006933123908549552, + "loss": 2.7287, + "step": 12813 + }, + { + "epoch": 0.3799780565192895, + "grad_norm": 0.11741671711206436, + "learning_rate": 0.0006932689978572144, + "loss": 2.7179, + "step": 12814 + }, + { + "epoch": 0.380007709871601, + "grad_norm": 0.13246871531009674, + "learning_rate": 0.0006932256031480438, + "loss": 2.7218, + "step": 12815 + }, + { + "epoch": 0.38003736322391246, + "grad_norm": 0.12471163272857666, + "learning_rate": 0.0006931822067278271, + "loss": 2.7209, + "step": 12816 + }, + { + "epoch": 0.38006701657622394, + "grad_norm": 0.11985888332128525, + "learning_rate": 0.0006931388085969488, + "loss": 2.7162, + "step": 12817 + }, + { + "epoch": 0.3800966699285354, + "grad_norm": 0.13047318160533905, + "learning_rate": 0.0006930954087557931, + "loss": 2.7145, + "step": 12818 + }, + { + "epoch": 0.3801263232808469, + "grad_norm": 0.13447116315364838, + "learning_rate": 0.0006930520072047446, + "loss": 2.7343, + "step": 12819 + }, + { + "epoch": 0.38015597663315837, + "grad_norm": 0.1254173368215561, + "learning_rate": 0.0006930086039441873, + "loss": 2.7098, + "step": 12820 + }, + { + "epoch": 0.38018562998546984, + "grad_norm": 0.11737705767154694, + "learning_rate": 0.0006929651989745057, + "loss": 2.7239, + "step": 12821 + }, + { + "epoch": 0.3802152833377813, + "grad_norm": 0.13942793011665344, + "learning_rate": 0.0006929217922960842, + "loss": 2.7774, + "step": 12822 + }, + { + "epoch": 0.3802449366900928, + "grad_norm": 0.1550246775150299, + "learning_rate": 0.0006928783839093071, + "loss": 2.7374, + "step": 12823 + }, + { + "epoch": 0.38027459004240427, + "grad_norm": 0.14521972835063934, + "learning_rate": 0.0006928349738145588, + "loss": 2.7042, + "step": 12824 + }, + { + "epoch": 0.38030424339471575, + "grad_norm": 0.15072101354599, + "learning_rate": 0.0006927915620122235, + "loss": 2.6959, + "step": 12825 + }, + { + "epoch": 0.3803338967470273, + "grad_norm": 0.13435155153274536, + "learning_rate": 0.0006927481485026861, + "loss": 2.7051, + "step": 12826 + }, + { + "epoch": 0.38036355009933875, + "grad_norm": 0.15359735488891602, + "learning_rate": 0.0006927047332863308, + "loss": 2.7152, + "step": 12827 + }, + { + "epoch": 0.38039320345165023, + "grad_norm": 0.15619532763957977, + "learning_rate": 0.0006926613163635419, + "loss": 2.775, + "step": 12828 + }, + { + "epoch": 0.3804228568039617, + "grad_norm": 0.1568424552679062, + "learning_rate": 0.0006926178977347039, + "loss": 2.7165, + "step": 12829 + }, + { + "epoch": 0.3804525101562732, + "grad_norm": 0.12497957050800323, + "learning_rate": 0.0006925744774002015, + "loss": 2.7297, + "step": 12830 + }, + { + "epoch": 0.38048216350858466, + "grad_norm": 0.12029893696308136, + "learning_rate": 0.000692531055360419, + "loss": 2.7371, + "step": 12831 + }, + { + "epoch": 0.38051181686089613, + "grad_norm": 0.15276826918125153, + "learning_rate": 0.0006924876316157409, + "loss": 2.7129, + "step": 12832 + }, + { + "epoch": 0.3805414702132076, + "grad_norm": 0.15697474777698517, + "learning_rate": 0.0006924442061665518, + "loss": 2.7334, + "step": 12833 + }, + { + "epoch": 0.3805711235655191, + "grad_norm": 0.13851433992385864, + "learning_rate": 0.0006924007790132362, + "loss": 2.7287, + "step": 12834 + }, + { + "epoch": 0.38060077691783056, + "grad_norm": 0.11823670566082001, + "learning_rate": 0.0006923573501561786, + "loss": 2.729, + "step": 12835 + }, + { + "epoch": 0.38063043027014204, + "grad_norm": 0.14187537133693695, + "learning_rate": 0.0006923139195957639, + "loss": 2.7408, + "step": 12836 + }, + { + "epoch": 0.3806600836224535, + "grad_norm": 0.13387545943260193, + "learning_rate": 0.0006922704873323763, + "loss": 2.7411, + "step": 12837 + }, + { + "epoch": 0.380689736974765, + "grad_norm": 0.1167004406452179, + "learning_rate": 0.0006922270533664006, + "loss": 2.757, + "step": 12838 + }, + { + "epoch": 0.38071939032707647, + "grad_norm": 0.1308061182498932, + "learning_rate": 0.0006921836176982211, + "loss": 2.7506, + "step": 12839 + }, + { + "epoch": 0.38074904367938794, + "grad_norm": 0.12248872965574265, + "learning_rate": 0.0006921401803282228, + "loss": 2.728, + "step": 12840 + }, + { + "epoch": 0.3807786970316994, + "grad_norm": 0.14453691244125366, + "learning_rate": 0.0006920967412567903, + "loss": 2.7069, + "step": 12841 + }, + { + "epoch": 0.3808083503840109, + "grad_norm": 0.13205765187740326, + "learning_rate": 0.0006920533004843082, + "loss": 2.6922, + "step": 12842 + }, + { + "epoch": 0.3808380037363224, + "grad_norm": 0.12383001297712326, + "learning_rate": 0.0006920098580111611, + "loss": 2.7, + "step": 12843 + }, + { + "epoch": 0.38086765708863385, + "grad_norm": 0.1137649416923523, + "learning_rate": 0.0006919664138377339, + "loss": 2.7146, + "step": 12844 + }, + { + "epoch": 0.3808973104409453, + "grad_norm": 0.11512015014886856, + "learning_rate": 0.0006919229679644109, + "loss": 2.7159, + "step": 12845 + }, + { + "epoch": 0.3809269637932568, + "grad_norm": 0.10677679628133774, + "learning_rate": 0.0006918795203915771, + "loss": 2.7067, + "step": 12846 + }, + { + "epoch": 0.38095661714556833, + "grad_norm": 0.12454652041196823, + "learning_rate": 0.0006918360711196173, + "loss": 2.7185, + "step": 12847 + }, + { + "epoch": 0.3809862704978798, + "grad_norm": 0.138529971241951, + "learning_rate": 0.0006917926201489163, + "loss": 2.7157, + "step": 12848 + }, + { + "epoch": 0.3810159238501913, + "grad_norm": 0.12261178344488144, + "learning_rate": 0.0006917491674798586, + "loss": 2.7128, + "step": 12849 + }, + { + "epoch": 0.38104557720250276, + "grad_norm": 0.10763964056968689, + "learning_rate": 0.000691705713112829, + "loss": 2.7301, + "step": 12850 + }, + { + "epoch": 0.38107523055481424, + "grad_norm": 0.13611772656440735, + "learning_rate": 0.0006916622570482125, + "loss": 2.7422, + "step": 12851 + }, + { + "epoch": 0.3811048839071257, + "grad_norm": 0.12194377183914185, + "learning_rate": 0.0006916187992863939, + "loss": 2.767, + "step": 12852 + }, + { + "epoch": 0.3811345372594372, + "grad_norm": 0.11986289918422699, + "learning_rate": 0.0006915753398277578, + "loss": 2.7385, + "step": 12853 + }, + { + "epoch": 0.38116419061174867, + "grad_norm": 0.14011266827583313, + "learning_rate": 0.0006915318786726893, + "loss": 2.745, + "step": 12854 + }, + { + "epoch": 0.38119384396406014, + "grad_norm": 0.1445234715938568, + "learning_rate": 0.0006914884158215731, + "loss": 2.7035, + "step": 12855 + }, + { + "epoch": 0.3812234973163716, + "grad_norm": 0.1415102630853653, + "learning_rate": 0.000691444951274794, + "loss": 2.7041, + "step": 12856 + }, + { + "epoch": 0.3812531506686831, + "grad_norm": 0.18381451070308685, + "learning_rate": 0.0006914014850327372, + "loss": 2.7757, + "step": 12857 + }, + { + "epoch": 0.38128280402099457, + "grad_norm": 0.1674243062734604, + "learning_rate": 0.0006913580170957871, + "loss": 2.7372, + "step": 12858 + }, + { + "epoch": 0.38131245737330605, + "grad_norm": 0.13686063885688782, + "learning_rate": 0.0006913145474643292, + "loss": 2.7225, + "step": 12859 + }, + { + "epoch": 0.3813421107256175, + "grad_norm": 0.13418324291706085, + "learning_rate": 0.000691271076138748, + "loss": 2.7207, + "step": 12860 + }, + { + "epoch": 0.381371764077929, + "grad_norm": 0.11913938075304031, + "learning_rate": 0.0006912276031194286, + "loss": 2.7012, + "step": 12861 + }, + { + "epoch": 0.3814014174302405, + "grad_norm": 0.12427688390016556, + "learning_rate": 0.000691184128406756, + "loss": 2.7367, + "step": 12862 + }, + { + "epoch": 0.38143107078255195, + "grad_norm": 0.11711578071117401, + "learning_rate": 0.0006911406520011151, + "loss": 2.7166, + "step": 12863 + }, + { + "epoch": 0.3814607241348634, + "grad_norm": 0.11017776280641556, + "learning_rate": 0.000691097173902891, + "loss": 2.7375, + "step": 12864 + }, + { + "epoch": 0.3814903774871749, + "grad_norm": 0.1340661644935608, + "learning_rate": 0.0006910536941124684, + "loss": 2.739, + "step": 12865 + }, + { + "epoch": 0.3815200308394864, + "grad_norm": 0.1331225484609604, + "learning_rate": 0.0006910102126302328, + "loss": 2.7225, + "step": 12866 + }, + { + "epoch": 0.38154968419179786, + "grad_norm": 0.13923856616020203, + "learning_rate": 0.0006909667294565688, + "loss": 2.727, + "step": 12867 + }, + { + "epoch": 0.3815793375441094, + "grad_norm": 0.1280628740787506, + "learning_rate": 0.0006909232445918617, + "loss": 2.7236, + "step": 12868 + }, + { + "epoch": 0.38160899089642086, + "grad_norm": 0.10927939414978027, + "learning_rate": 0.0006908797580364965, + "loss": 2.7388, + "step": 12869 + }, + { + "epoch": 0.38163864424873234, + "grad_norm": 0.11832068115472794, + "learning_rate": 0.0006908362697908584, + "loss": 2.7106, + "step": 12870 + }, + { + "epoch": 0.3816682976010438, + "grad_norm": 0.1279928982257843, + "learning_rate": 0.0006907927798553322, + "loss": 2.7218, + "step": 12871 + }, + { + "epoch": 0.3816979509533553, + "grad_norm": 0.14401978254318237, + "learning_rate": 0.0006907492882303032, + "loss": 2.7544, + "step": 12872 + }, + { + "epoch": 0.38172760430566677, + "grad_norm": 0.14339536428451538, + "learning_rate": 0.0006907057949161565, + "loss": 2.7083, + "step": 12873 + }, + { + "epoch": 0.38175725765797824, + "grad_norm": 0.13890548050403595, + "learning_rate": 0.0006906622999132774, + "loss": 2.7265, + "step": 12874 + }, + { + "epoch": 0.3817869110102897, + "grad_norm": 0.14702339470386505, + "learning_rate": 0.0006906188032220509, + "loss": 2.7095, + "step": 12875 + }, + { + "epoch": 0.3818165643626012, + "grad_norm": 0.14007407426834106, + "learning_rate": 0.000690575304842862, + "loss": 2.7131, + "step": 12876 + }, + { + "epoch": 0.3818462177149127, + "grad_norm": 0.1288229525089264, + "learning_rate": 0.0006905318047760961, + "loss": 2.7096, + "step": 12877 + }, + { + "epoch": 0.38187587106722415, + "grad_norm": 0.13100439310073853, + "learning_rate": 0.0006904883030221384, + "loss": 2.7301, + "step": 12878 + }, + { + "epoch": 0.3819055244195356, + "grad_norm": 0.13401705026626587, + "learning_rate": 0.0006904447995813741, + "loss": 2.7188, + "step": 12879 + }, + { + "epoch": 0.3819351777718471, + "grad_norm": 0.122246153652668, + "learning_rate": 0.0006904012944541885, + "loss": 2.7518, + "step": 12880 + }, + { + "epoch": 0.3819648311241586, + "grad_norm": 0.12126285582780838, + "learning_rate": 0.0006903577876409666, + "loss": 2.7073, + "step": 12881 + }, + { + "epoch": 0.38199448447647005, + "grad_norm": 0.14840206503868103, + "learning_rate": 0.0006903142791420939, + "loss": 2.7217, + "step": 12882 + }, + { + "epoch": 0.38202413782878153, + "grad_norm": 0.1522854119539261, + "learning_rate": 0.0006902707689579555, + "loss": 2.7486, + "step": 12883 + }, + { + "epoch": 0.382053791181093, + "grad_norm": 0.1235598549246788, + "learning_rate": 0.000690227257088937, + "loss": 2.7327, + "step": 12884 + }, + { + "epoch": 0.3820834445334045, + "grad_norm": 0.13461163640022278, + "learning_rate": 0.0006901837435354231, + "loss": 2.6867, + "step": 12885 + }, + { + "epoch": 0.38211309788571596, + "grad_norm": 0.1598176509141922, + "learning_rate": 0.0006901402282977998, + "loss": 2.7451, + "step": 12886 + }, + { + "epoch": 0.38214275123802743, + "grad_norm": 0.17174804210662842, + "learning_rate": 0.0006900967113764522, + "loss": 2.7337, + "step": 12887 + }, + { + "epoch": 0.38217240459033897, + "grad_norm": 0.13894756138324738, + "learning_rate": 0.0006900531927717655, + "loss": 2.7185, + "step": 12888 + }, + { + "epoch": 0.38220205794265044, + "grad_norm": 0.12191062420606613, + "learning_rate": 0.000690009672484125, + "loss": 2.7494, + "step": 12889 + }, + { + "epoch": 0.3822317112949619, + "grad_norm": 0.14218363165855408, + "learning_rate": 0.0006899661505139164, + "loss": 2.7187, + "step": 12890 + }, + { + "epoch": 0.3822613646472734, + "grad_norm": 0.1237211674451828, + "learning_rate": 0.0006899226268615249, + "loss": 2.7426, + "step": 12891 + }, + { + "epoch": 0.38229101799958487, + "grad_norm": 0.12433943897485733, + "learning_rate": 0.0006898791015273359, + "loss": 2.7268, + "step": 12892 + }, + { + "epoch": 0.38232067135189635, + "grad_norm": 0.13120800256729126, + "learning_rate": 0.0006898355745117349, + "loss": 2.7243, + "step": 12893 + }, + { + "epoch": 0.3823503247042078, + "grad_norm": 0.15537360310554504, + "learning_rate": 0.0006897920458151074, + "loss": 2.7239, + "step": 12894 + }, + { + "epoch": 0.3823799780565193, + "grad_norm": 0.16806477308273315, + "learning_rate": 0.0006897485154378386, + "loss": 2.7324, + "step": 12895 + }, + { + "epoch": 0.3824096314088308, + "grad_norm": 0.19033652544021606, + "learning_rate": 0.0006897049833803142, + "loss": 2.7345, + "step": 12896 + }, + { + "epoch": 0.38243928476114225, + "grad_norm": 0.18881423771381378, + "learning_rate": 0.0006896614496429195, + "loss": 2.7198, + "step": 12897 + }, + { + "epoch": 0.3824689381134537, + "grad_norm": 0.14734075963497162, + "learning_rate": 0.0006896179142260403, + "loss": 2.6687, + "step": 12898 + }, + { + "epoch": 0.3824985914657652, + "grad_norm": 0.14852973818778992, + "learning_rate": 0.0006895743771300618, + "loss": 2.7352, + "step": 12899 + }, + { + "epoch": 0.3825282448180767, + "grad_norm": 0.13861678540706635, + "learning_rate": 0.0006895308383553697, + "loss": 2.7251, + "step": 12900 + }, + { + "epoch": 0.38255789817038816, + "grad_norm": 0.14589883387088776, + "learning_rate": 0.0006894872979023494, + "loss": 2.7374, + "step": 12901 + }, + { + "epoch": 0.38258755152269963, + "grad_norm": 0.14041230082511902, + "learning_rate": 0.0006894437557713866, + "loss": 2.7268, + "step": 12902 + }, + { + "epoch": 0.3826172048750111, + "grad_norm": 0.12376583367586136, + "learning_rate": 0.0006894002119628669, + "loss": 2.7093, + "step": 12903 + }, + { + "epoch": 0.3826468582273226, + "grad_norm": 0.1304221898317337, + "learning_rate": 0.0006893566664771758, + "loss": 2.7628, + "step": 12904 + }, + { + "epoch": 0.38267651157963406, + "grad_norm": 0.12295905500650406, + "learning_rate": 0.0006893131193146987, + "loss": 2.7431, + "step": 12905 + }, + { + "epoch": 0.38270616493194554, + "grad_norm": 0.11899597197771072, + "learning_rate": 0.0006892695704758217, + "loss": 2.7141, + "step": 12906 + }, + { + "epoch": 0.382735818284257, + "grad_norm": 0.12262549996376038, + "learning_rate": 0.0006892260199609301, + "loss": 2.7479, + "step": 12907 + }, + { + "epoch": 0.3827654716365685, + "grad_norm": 0.12914086878299713, + "learning_rate": 0.0006891824677704097, + "loss": 2.7051, + "step": 12908 + }, + { + "epoch": 0.38279512498888, + "grad_norm": 0.13002602756023407, + "learning_rate": 0.0006891389139046459, + "loss": 2.6704, + "step": 12909 + }, + { + "epoch": 0.3828247783411915, + "grad_norm": 0.10837966203689575, + "learning_rate": 0.0006890953583640246, + "loss": 2.6934, + "step": 12910 + }, + { + "epoch": 0.382854431693503, + "grad_norm": 0.1292126178741455, + "learning_rate": 0.0006890518011489314, + "loss": 2.7348, + "step": 12911 + }, + { + "epoch": 0.38288408504581445, + "grad_norm": 0.12889789044857025, + "learning_rate": 0.0006890082422597521, + "loss": 2.6883, + "step": 12912 + }, + { + "epoch": 0.3829137383981259, + "grad_norm": 0.15545713901519775, + "learning_rate": 0.0006889646816968725, + "loss": 2.6938, + "step": 12913 + }, + { + "epoch": 0.3829433917504374, + "grad_norm": 0.12798455357551575, + "learning_rate": 0.000688921119460678, + "loss": 2.7101, + "step": 12914 + }, + { + "epoch": 0.3829730451027489, + "grad_norm": 0.10550249367952347, + "learning_rate": 0.0006888775555515547, + "loss": 2.7186, + "step": 12915 + }, + { + "epoch": 0.38300269845506035, + "grad_norm": 0.11918656527996063, + "learning_rate": 0.0006888339899698881, + "loss": 2.7382, + "step": 12916 + }, + { + "epoch": 0.38303235180737183, + "grad_norm": 0.11775262653827667, + "learning_rate": 0.0006887904227160642, + "loss": 2.7289, + "step": 12917 + }, + { + "epoch": 0.3830620051596833, + "grad_norm": 0.11393467336893082, + "learning_rate": 0.0006887468537904686, + "loss": 2.7512, + "step": 12918 + }, + { + "epoch": 0.3830916585119948, + "grad_norm": 0.11414462327957153, + "learning_rate": 0.0006887032831934874, + "loss": 2.7276, + "step": 12919 + }, + { + "epoch": 0.38312131186430626, + "grad_norm": 0.10143986344337463, + "learning_rate": 0.0006886597109255062, + "loss": 2.6949, + "step": 12920 + }, + { + "epoch": 0.38315096521661773, + "grad_norm": 0.12985391914844513, + "learning_rate": 0.0006886161369869107, + "loss": 2.7078, + "step": 12921 + }, + { + "epoch": 0.3831806185689292, + "grad_norm": 0.12332696467638016, + "learning_rate": 0.0006885725613780871, + "loss": 2.7155, + "step": 12922 + }, + { + "epoch": 0.3832102719212407, + "grad_norm": 0.12391290813684464, + "learning_rate": 0.000688528984099421, + "loss": 2.7155, + "step": 12923 + }, + { + "epoch": 0.38323992527355216, + "grad_norm": 0.12643221020698547, + "learning_rate": 0.0006884854051512984, + "loss": 2.7149, + "step": 12924 + }, + { + "epoch": 0.38326957862586364, + "grad_norm": 0.11938101053237915, + "learning_rate": 0.0006884418245341052, + "loss": 2.738, + "step": 12925 + }, + { + "epoch": 0.3832992319781751, + "grad_norm": 0.12646862864494324, + "learning_rate": 0.0006883982422482273, + "loss": 2.7234, + "step": 12926 + }, + { + "epoch": 0.3833288853304866, + "grad_norm": 0.1428930014371872, + "learning_rate": 0.0006883546582940506, + "loss": 2.722, + "step": 12927 + }, + { + "epoch": 0.38335853868279807, + "grad_norm": 0.1425311267375946, + "learning_rate": 0.0006883110726719612, + "loss": 2.7025, + "step": 12928 + }, + { + "epoch": 0.38338819203510954, + "grad_norm": 0.15031377971172333, + "learning_rate": 0.0006882674853823448, + "loss": 2.7125, + "step": 12929 + }, + { + "epoch": 0.3834178453874211, + "grad_norm": 0.16315588355064392, + "learning_rate": 0.0006882238964255875, + "loss": 2.7521, + "step": 12930 + }, + { + "epoch": 0.38344749873973255, + "grad_norm": 0.16104954481124878, + "learning_rate": 0.0006881803058020752, + "loss": 2.7293, + "step": 12931 + }, + { + "epoch": 0.383477152092044, + "grad_norm": 0.15303947031497955, + "learning_rate": 0.0006881367135121942, + "loss": 2.7362, + "step": 12932 + }, + { + "epoch": 0.3835068054443555, + "grad_norm": 0.13208258152008057, + "learning_rate": 0.0006880931195563303, + "loss": 2.7309, + "step": 12933 + }, + { + "epoch": 0.383536458796667, + "grad_norm": 0.12323262542486191, + "learning_rate": 0.0006880495239348694, + "loss": 2.731, + "step": 12934 + }, + { + "epoch": 0.38356611214897846, + "grad_norm": 0.11848576366901398, + "learning_rate": 0.0006880059266481977, + "loss": 2.7164, + "step": 12935 + }, + { + "epoch": 0.38359576550128993, + "grad_norm": 0.13295067846775055, + "learning_rate": 0.0006879623276967013, + "loss": 2.7174, + "step": 12936 + }, + { + "epoch": 0.3836254188536014, + "grad_norm": 0.1411026567220688, + "learning_rate": 0.0006879187270807663, + "loss": 2.7092, + "step": 12937 + }, + { + "epoch": 0.3836550722059129, + "grad_norm": 0.13139061629772186, + "learning_rate": 0.0006878751248007787, + "loss": 2.7408, + "step": 12938 + }, + { + "epoch": 0.38368472555822436, + "grad_norm": 0.1330370157957077, + "learning_rate": 0.0006878315208571244, + "loss": 2.74, + "step": 12939 + }, + { + "epoch": 0.38371437891053584, + "grad_norm": 0.1413426548242569, + "learning_rate": 0.0006877879152501899, + "loss": 2.6767, + "step": 12940 + }, + { + "epoch": 0.3837440322628473, + "grad_norm": 0.12494640797376633, + "learning_rate": 0.0006877443079803614, + "loss": 2.7178, + "step": 12941 + }, + { + "epoch": 0.3837736856151588, + "grad_norm": 0.1137661337852478, + "learning_rate": 0.0006877006990480246, + "loss": 2.7353, + "step": 12942 + }, + { + "epoch": 0.38380333896747026, + "grad_norm": 0.11167572438716888, + "learning_rate": 0.0006876570884535657, + "loss": 2.7311, + "step": 12943 + }, + { + "epoch": 0.38383299231978174, + "grad_norm": 0.12153338640928268, + "learning_rate": 0.0006876134761973713, + "loss": 2.7244, + "step": 12944 + }, + { + "epoch": 0.3838626456720932, + "grad_norm": 0.12156278640031815, + "learning_rate": 0.0006875698622798274, + "loss": 2.7226, + "step": 12945 + }, + { + "epoch": 0.3838922990244047, + "grad_norm": 0.11606106162071228, + "learning_rate": 0.0006875262467013201, + "loss": 2.7177, + "step": 12946 + }, + { + "epoch": 0.38392195237671617, + "grad_norm": 0.11413488537073135, + "learning_rate": 0.0006874826294622357, + "loss": 2.7428, + "step": 12947 + }, + { + "epoch": 0.38395160572902765, + "grad_norm": 0.11745594441890717, + "learning_rate": 0.0006874390105629604, + "loss": 2.7102, + "step": 12948 + }, + { + "epoch": 0.3839812590813391, + "grad_norm": 0.11531510949134827, + "learning_rate": 0.0006873953900038805, + "loss": 2.7262, + "step": 12949 + }, + { + "epoch": 0.3840109124336506, + "grad_norm": 0.1277354657649994, + "learning_rate": 0.0006873517677853823, + "loss": 2.7127, + "step": 12950 + }, + { + "epoch": 0.38404056578596213, + "grad_norm": 0.14385628700256348, + "learning_rate": 0.0006873081439078521, + "loss": 2.7031, + "step": 12951 + }, + { + "epoch": 0.3840702191382736, + "grad_norm": 0.12188845872879028, + "learning_rate": 0.000687264518371676, + "loss": 2.6917, + "step": 12952 + }, + { + "epoch": 0.3840998724905851, + "grad_norm": 0.11093120276927948, + "learning_rate": 0.0006872208911772405, + "loss": 2.7252, + "step": 12953 + }, + { + "epoch": 0.38412952584289656, + "grad_norm": 0.13881495594978333, + "learning_rate": 0.0006871772623249319, + "loss": 2.7432, + "step": 12954 + }, + { + "epoch": 0.38415917919520803, + "grad_norm": 0.13519614934921265, + "learning_rate": 0.0006871336318151365, + "loss": 2.7218, + "step": 12955 + }, + { + "epoch": 0.3841888325475195, + "grad_norm": 0.1308978945016861, + "learning_rate": 0.0006870899996482405, + "loss": 2.7117, + "step": 12956 + }, + { + "epoch": 0.384218485899831, + "grad_norm": 0.11526849865913391, + "learning_rate": 0.0006870463658246306, + "loss": 2.7417, + "step": 12957 + }, + { + "epoch": 0.38424813925214246, + "grad_norm": 0.1253276914358139, + "learning_rate": 0.0006870027303446931, + "loss": 2.7209, + "step": 12958 + }, + { + "epoch": 0.38427779260445394, + "grad_norm": 0.12601757049560547, + "learning_rate": 0.0006869590932088143, + "loss": 2.7272, + "step": 12959 + }, + { + "epoch": 0.3843074459567654, + "grad_norm": 0.11652388423681259, + "learning_rate": 0.0006869154544173805, + "loss": 2.7326, + "step": 12960 + }, + { + "epoch": 0.3843370993090769, + "grad_norm": 0.13006985187530518, + "learning_rate": 0.0006868718139707784, + "loss": 2.7322, + "step": 12961 + }, + { + "epoch": 0.38436675266138837, + "grad_norm": 0.13385406136512756, + "learning_rate": 0.0006868281718693943, + "loss": 2.7326, + "step": 12962 + }, + { + "epoch": 0.38439640601369984, + "grad_norm": 0.14497657120227814, + "learning_rate": 0.0006867845281136145, + "loss": 2.6997, + "step": 12963 + }, + { + "epoch": 0.3844260593660113, + "grad_norm": 0.14063437283039093, + "learning_rate": 0.0006867408827038259, + "loss": 2.7501, + "step": 12964 + }, + { + "epoch": 0.3844557127183228, + "grad_norm": 0.11625295132398605, + "learning_rate": 0.0006866972356404145, + "loss": 2.7101, + "step": 12965 + }, + { + "epoch": 0.38448536607063427, + "grad_norm": 0.1348906308412552, + "learning_rate": 0.0006866535869237671, + "loss": 2.7505, + "step": 12966 + }, + { + "epoch": 0.38451501942294575, + "grad_norm": 0.14656437933444977, + "learning_rate": 0.0006866099365542703, + "loss": 2.731, + "step": 12967 + }, + { + "epoch": 0.3845446727752572, + "grad_norm": 0.14549319446086884, + "learning_rate": 0.0006865662845323104, + "loss": 2.7013, + "step": 12968 + }, + { + "epoch": 0.3845743261275687, + "grad_norm": 0.1581173986196518, + "learning_rate": 0.0006865226308582739, + "loss": 2.6785, + "step": 12969 + }, + { + "epoch": 0.3846039794798802, + "grad_norm": 0.1956157684326172, + "learning_rate": 0.0006864789755325476, + "loss": 2.7243, + "step": 12970 + }, + { + "epoch": 0.38463363283219165, + "grad_norm": 0.20351986587047577, + "learning_rate": 0.0006864353185555179, + "loss": 2.7367, + "step": 12971 + }, + { + "epoch": 0.3846632861845032, + "grad_norm": 0.15136204659938812, + "learning_rate": 0.0006863916599275714, + "loss": 2.7167, + "step": 12972 + }, + { + "epoch": 0.38469293953681466, + "grad_norm": 0.13827191293239594, + "learning_rate": 0.000686347999649095, + "loss": 2.7157, + "step": 12973 + }, + { + "epoch": 0.38472259288912614, + "grad_norm": 0.13188602030277252, + "learning_rate": 0.000686304337720475, + "loss": 2.742, + "step": 12974 + }, + { + "epoch": 0.3847522462414376, + "grad_norm": 0.13457369804382324, + "learning_rate": 0.0006862606741420979, + "loss": 2.702, + "step": 12975 + }, + { + "epoch": 0.3847818995937491, + "grad_norm": 0.14318786561489105, + "learning_rate": 0.0006862170089143506, + "loss": 2.7571, + "step": 12976 + }, + { + "epoch": 0.38481155294606056, + "grad_norm": 0.1340368241071701, + "learning_rate": 0.0006861733420376199, + "loss": 2.7046, + "step": 12977 + }, + { + "epoch": 0.38484120629837204, + "grad_norm": 0.13195830583572388, + "learning_rate": 0.0006861296735122923, + "loss": 2.7342, + "step": 12978 + }, + { + "epoch": 0.3848708596506835, + "grad_norm": 0.1233573779463768, + "learning_rate": 0.0006860860033387544, + "loss": 2.7338, + "step": 12979 + }, + { + "epoch": 0.384900513002995, + "grad_norm": 0.12968304753303528, + "learning_rate": 0.0006860423315173932, + "loss": 2.6937, + "step": 12980 + }, + { + "epoch": 0.38493016635530647, + "grad_norm": 0.1112135648727417, + "learning_rate": 0.0006859986580485948, + "loss": 2.726, + "step": 12981 + }, + { + "epoch": 0.38495981970761795, + "grad_norm": 0.11403866112232208, + "learning_rate": 0.0006859549829327466, + "loss": 2.6993, + "step": 12982 + }, + { + "epoch": 0.3849894730599294, + "grad_norm": 0.13170407712459564, + "learning_rate": 0.0006859113061702351, + "loss": 2.7261, + "step": 12983 + }, + { + "epoch": 0.3850191264122409, + "grad_norm": 0.11850818991661072, + "learning_rate": 0.0006858676277614472, + "loss": 2.7018, + "step": 12984 + }, + { + "epoch": 0.3850487797645524, + "grad_norm": 0.11831782758235931, + "learning_rate": 0.0006858239477067694, + "loss": 2.7587, + "step": 12985 + }, + { + "epoch": 0.38507843311686385, + "grad_norm": 0.10669045895338058, + "learning_rate": 0.0006857802660065886, + "loss": 2.7189, + "step": 12986 + }, + { + "epoch": 0.3851080864691753, + "grad_norm": 0.10477401316165924, + "learning_rate": 0.0006857365826612918, + "loss": 2.7089, + "step": 12987 + }, + { + "epoch": 0.3851377398214868, + "grad_norm": 0.10046359896659851, + "learning_rate": 0.0006856928976712656, + "loss": 2.7331, + "step": 12988 + }, + { + "epoch": 0.3851673931737983, + "grad_norm": 0.12153317034244537, + "learning_rate": 0.0006856492110368969, + "loss": 2.7465, + "step": 12989 + }, + { + "epoch": 0.38519704652610975, + "grad_norm": 0.14300483465194702, + "learning_rate": 0.0006856055227585726, + "loss": 2.7016, + "step": 12990 + }, + { + "epoch": 0.38522669987842123, + "grad_norm": 0.15040792524814606, + "learning_rate": 0.0006855618328366795, + "loss": 2.7264, + "step": 12991 + }, + { + "epoch": 0.38525635323073276, + "grad_norm": 0.13618919253349304, + "learning_rate": 0.0006855181412716045, + "loss": 2.7321, + "step": 12992 + }, + { + "epoch": 0.38528600658304424, + "grad_norm": 0.11511016637086868, + "learning_rate": 0.0006854744480637346, + "loss": 2.7331, + "step": 12993 + }, + { + "epoch": 0.3853156599353557, + "grad_norm": 0.12653358280658722, + "learning_rate": 0.0006854307532134566, + "loss": 2.7492, + "step": 12994 + }, + { + "epoch": 0.3853453132876672, + "grad_norm": 0.1297275871038437, + "learning_rate": 0.0006853870567211574, + "loss": 2.7041, + "step": 12995 + }, + { + "epoch": 0.38537496663997867, + "grad_norm": 0.12894266843795776, + "learning_rate": 0.0006853433585872241, + "loss": 2.7219, + "step": 12996 + }, + { + "epoch": 0.38540461999229014, + "grad_norm": 0.1458778977394104, + "learning_rate": 0.0006852996588120434, + "loss": 2.7123, + "step": 12997 + }, + { + "epoch": 0.3854342733446016, + "grad_norm": 0.13113068044185638, + "learning_rate": 0.0006852559573960026, + "loss": 2.7206, + "step": 12998 + }, + { + "epoch": 0.3854639266969131, + "grad_norm": 0.1401958018541336, + "learning_rate": 0.0006852122543394883, + "loss": 2.7264, + "step": 12999 + }, + { + "epoch": 0.38549358004922457, + "grad_norm": 0.1424298882484436, + "learning_rate": 0.0006851685496428877, + "loss": 2.6982, + "step": 13000 + }, + { + "epoch": 0.38552323340153605, + "grad_norm": 0.13529230654239655, + "learning_rate": 0.0006851248433065879, + "loss": 2.669, + "step": 13001 + }, + { + "epoch": 0.3855528867538475, + "grad_norm": 0.1270466446876526, + "learning_rate": 0.000685081135330976, + "loss": 2.7408, + "step": 13002 + }, + { + "epoch": 0.385582540106159, + "grad_norm": 0.12116456031799316, + "learning_rate": 0.0006850374257164387, + "loss": 2.7443, + "step": 13003 + }, + { + "epoch": 0.3856121934584705, + "grad_norm": 0.13722549378871918, + "learning_rate": 0.0006849937144633632, + "loss": 2.7118, + "step": 13004 + }, + { + "epoch": 0.38564184681078195, + "grad_norm": 0.14002349972724915, + "learning_rate": 0.0006849500015721366, + "loss": 2.7058, + "step": 13005 + }, + { + "epoch": 0.38567150016309343, + "grad_norm": 0.13290689885616302, + "learning_rate": 0.0006849062870431462, + "loss": 2.7147, + "step": 13006 + }, + { + "epoch": 0.3857011535154049, + "grad_norm": 0.13101807236671448, + "learning_rate": 0.0006848625708767787, + "loss": 2.6774, + "step": 13007 + }, + { + "epoch": 0.3857308068677164, + "grad_norm": 0.10714814066886902, + "learning_rate": 0.0006848188530734214, + "loss": 2.7249, + "step": 13008 + }, + { + "epoch": 0.38576046022002786, + "grad_norm": 0.12050303816795349, + "learning_rate": 0.0006847751336334614, + "loss": 2.73, + "step": 13009 + }, + { + "epoch": 0.38579011357233933, + "grad_norm": 0.11799520999193192, + "learning_rate": 0.0006847314125572859, + "loss": 2.7085, + "step": 13010 + }, + { + "epoch": 0.3858197669246508, + "grad_norm": 0.11164230108261108, + "learning_rate": 0.000684687689845282, + "loss": 2.7124, + "step": 13011 + }, + { + "epoch": 0.3858494202769623, + "grad_norm": 0.12230734527111053, + "learning_rate": 0.0006846439654978371, + "loss": 2.7333, + "step": 13012 + }, + { + "epoch": 0.3858790736292738, + "grad_norm": 0.13725419342517853, + "learning_rate": 0.0006846002395153382, + "loss": 2.713, + "step": 13013 + }, + { + "epoch": 0.3859087269815853, + "grad_norm": 0.13247770071029663, + "learning_rate": 0.0006845565118981723, + "loss": 2.6972, + "step": 13014 + }, + { + "epoch": 0.38593838033389677, + "grad_norm": 0.12275976687669754, + "learning_rate": 0.0006845127826467268, + "loss": 2.7255, + "step": 13015 + }, + { + "epoch": 0.38596803368620825, + "grad_norm": 0.11683648079633713, + "learning_rate": 0.000684469051761389, + "loss": 2.7233, + "step": 13016 + }, + { + "epoch": 0.3859976870385197, + "grad_norm": 0.13890184462070465, + "learning_rate": 0.0006844253192425463, + "loss": 2.6966, + "step": 13017 + }, + { + "epoch": 0.3860273403908312, + "grad_norm": 0.13424107432365417, + "learning_rate": 0.0006843815850905854, + "loss": 2.701, + "step": 13018 + }, + { + "epoch": 0.3860569937431427, + "grad_norm": 0.13644753396511078, + "learning_rate": 0.0006843378493058941, + "loss": 2.6891, + "step": 13019 + }, + { + "epoch": 0.38608664709545415, + "grad_norm": 0.11968366801738739, + "learning_rate": 0.0006842941118888593, + "loss": 2.7502, + "step": 13020 + }, + { + "epoch": 0.3861163004477656, + "grad_norm": 0.10617060214281082, + "learning_rate": 0.0006842503728398686, + "loss": 2.7389, + "step": 13021 + }, + { + "epoch": 0.3861459538000771, + "grad_norm": 0.12712335586547852, + "learning_rate": 0.0006842066321593094, + "loss": 2.741, + "step": 13022 + }, + { + "epoch": 0.3861756071523886, + "grad_norm": 0.1318669617176056, + "learning_rate": 0.0006841628898475688, + "loss": 2.7303, + "step": 13023 + }, + { + "epoch": 0.38620526050470005, + "grad_norm": 0.13696661591529846, + "learning_rate": 0.0006841191459050341, + "loss": 2.7136, + "step": 13024 + }, + { + "epoch": 0.38623491385701153, + "grad_norm": 0.13831433653831482, + "learning_rate": 0.0006840754003320928, + "loss": 2.7168, + "step": 13025 + }, + { + "epoch": 0.386264567209323, + "grad_norm": 0.13354158401489258, + "learning_rate": 0.0006840316531291321, + "loss": 2.7115, + "step": 13026 + }, + { + "epoch": 0.3862942205616345, + "grad_norm": 0.13634078204631805, + "learning_rate": 0.0006839879042965398, + "loss": 2.7039, + "step": 13027 + }, + { + "epoch": 0.38632387391394596, + "grad_norm": 0.13619129359722137, + "learning_rate": 0.0006839441538347029, + "loss": 2.7525, + "step": 13028 + }, + { + "epoch": 0.38635352726625743, + "grad_norm": 0.12954768538475037, + "learning_rate": 0.0006839004017440089, + "loss": 2.7515, + "step": 13029 + }, + { + "epoch": 0.3863831806185689, + "grad_norm": 0.11561917513608932, + "learning_rate": 0.0006838566480248453, + "loss": 2.7039, + "step": 13030 + }, + { + "epoch": 0.3864128339708804, + "grad_norm": 0.12091466784477234, + "learning_rate": 0.0006838128926775995, + "loss": 2.7129, + "step": 13031 + }, + { + "epoch": 0.38644248732319186, + "grad_norm": 0.12350019067525864, + "learning_rate": 0.0006837691357026589, + "loss": 2.7156, + "step": 13032 + }, + { + "epoch": 0.38647214067550334, + "grad_norm": 0.12483509629964828, + "learning_rate": 0.0006837253771004113, + "loss": 2.7334, + "step": 13033 + }, + { + "epoch": 0.38650179402781487, + "grad_norm": 0.13316166400909424, + "learning_rate": 0.0006836816168712438, + "loss": 2.7521, + "step": 13034 + }, + { + "epoch": 0.38653144738012635, + "grad_norm": 0.1409156769514084, + "learning_rate": 0.000683637855015544, + "loss": 2.7268, + "step": 13035 + }, + { + "epoch": 0.3865611007324378, + "grad_norm": 0.1391027420759201, + "learning_rate": 0.0006835940915336996, + "loss": 2.7257, + "step": 13036 + }, + { + "epoch": 0.3865907540847493, + "grad_norm": 0.13776810467243195, + "learning_rate": 0.000683550326426098, + "loss": 2.7242, + "step": 13037 + }, + { + "epoch": 0.3866204074370608, + "grad_norm": 0.13392773270606995, + "learning_rate": 0.0006835065596931265, + "loss": 2.6987, + "step": 13038 + }, + { + "epoch": 0.38665006078937225, + "grad_norm": 0.12503597140312195, + "learning_rate": 0.0006834627913351733, + "loss": 2.7179, + "step": 13039 + }, + { + "epoch": 0.38667971414168373, + "grad_norm": 0.11961612850427628, + "learning_rate": 0.0006834190213526254, + "loss": 2.7295, + "step": 13040 + }, + { + "epoch": 0.3867093674939952, + "grad_norm": 0.11139488220214844, + "learning_rate": 0.0006833752497458705, + "loss": 2.6745, + "step": 13041 + }, + { + "epoch": 0.3867390208463067, + "grad_norm": 0.1148519217967987, + "learning_rate": 0.0006833314765152963, + "loss": 2.7254, + "step": 13042 + }, + { + "epoch": 0.38676867419861816, + "grad_norm": 0.12059742212295532, + "learning_rate": 0.0006832877016612906, + "loss": 2.7206, + "step": 13043 + }, + { + "epoch": 0.38679832755092963, + "grad_norm": 0.12542380392551422, + "learning_rate": 0.0006832439251842408, + "loss": 2.7041, + "step": 13044 + }, + { + "epoch": 0.3868279809032411, + "grad_norm": 0.13712556660175323, + "learning_rate": 0.0006832001470845346, + "loss": 2.7417, + "step": 13045 + }, + { + "epoch": 0.3868576342555526, + "grad_norm": 0.13103002309799194, + "learning_rate": 0.0006831563673625594, + "loss": 2.7233, + "step": 13046 + }, + { + "epoch": 0.38688728760786406, + "grad_norm": 0.15134039521217346, + "learning_rate": 0.0006831125860187033, + "loss": 2.7184, + "step": 13047 + }, + { + "epoch": 0.38691694096017554, + "grad_norm": 0.16813232004642487, + "learning_rate": 0.0006830688030533538, + "loss": 2.7067, + "step": 13048 + }, + { + "epoch": 0.386946594312487, + "grad_norm": 0.15163956582546234, + "learning_rate": 0.0006830250184668987, + "loss": 2.695, + "step": 13049 + }, + { + "epoch": 0.3869762476647985, + "grad_norm": 0.13577815890312195, + "learning_rate": 0.0006829812322597256, + "loss": 2.6973, + "step": 13050 + }, + { + "epoch": 0.38700590101710997, + "grad_norm": 0.13284888863563538, + "learning_rate": 0.0006829374444322222, + "loss": 2.7465, + "step": 13051 + }, + { + "epoch": 0.38703555436942144, + "grad_norm": 0.15022335946559906, + "learning_rate": 0.0006828936549847765, + "loss": 2.726, + "step": 13052 + }, + { + "epoch": 0.3870652077217329, + "grad_norm": 0.14905427396297455, + "learning_rate": 0.0006828498639177758, + "loss": 2.7234, + "step": 13053 + }, + { + "epoch": 0.3870948610740444, + "grad_norm": 0.12093857675790787, + "learning_rate": 0.0006828060712316084, + "loss": 2.7168, + "step": 13054 + }, + { + "epoch": 0.3871245144263559, + "grad_norm": 0.12764708697795868, + "learning_rate": 0.0006827622769266619, + "loss": 2.7221, + "step": 13055 + }, + { + "epoch": 0.3871541677786674, + "grad_norm": 0.11826811730861664, + "learning_rate": 0.000682718481003324, + "loss": 2.7222, + "step": 13056 + }, + { + "epoch": 0.3871838211309789, + "grad_norm": 0.12553952634334564, + "learning_rate": 0.0006826746834619826, + "loss": 2.6889, + "step": 13057 + }, + { + "epoch": 0.38721347448329035, + "grad_norm": 0.13007137179374695, + "learning_rate": 0.0006826308843030255, + "loss": 2.7151, + "step": 13058 + }, + { + "epoch": 0.38724312783560183, + "grad_norm": 0.11448562890291214, + "learning_rate": 0.0006825870835268404, + "loss": 2.7301, + "step": 13059 + }, + { + "epoch": 0.3872727811879133, + "grad_norm": 0.10427949577569962, + "learning_rate": 0.0006825432811338157, + "loss": 2.7225, + "step": 13060 + }, + { + "epoch": 0.3873024345402248, + "grad_norm": 0.12013264745473862, + "learning_rate": 0.0006824994771243387, + "loss": 2.6992, + "step": 13061 + }, + { + "epoch": 0.38733208789253626, + "grad_norm": 0.11301819235086441, + "learning_rate": 0.0006824556714987975, + "loss": 2.734, + "step": 13062 + }, + { + "epoch": 0.38736174124484773, + "grad_norm": 0.1275477409362793, + "learning_rate": 0.00068241186425758, + "loss": 2.6928, + "step": 13063 + }, + { + "epoch": 0.3873913945971592, + "grad_norm": 0.13560807704925537, + "learning_rate": 0.0006823680554010742, + "loss": 2.6836, + "step": 13064 + }, + { + "epoch": 0.3874210479494707, + "grad_norm": 0.1439528614282608, + "learning_rate": 0.0006823242449296678, + "loss": 2.7269, + "step": 13065 + }, + { + "epoch": 0.38745070130178216, + "grad_norm": 0.14245297014713287, + "learning_rate": 0.0006822804328437491, + "loss": 2.7321, + "step": 13066 + }, + { + "epoch": 0.38748035465409364, + "grad_norm": 0.13256779313087463, + "learning_rate": 0.0006822366191437058, + "loss": 2.732, + "step": 13067 + }, + { + "epoch": 0.3875100080064051, + "grad_norm": 0.14619702100753784, + "learning_rate": 0.000682192803829926, + "loss": 2.7259, + "step": 13068 + }, + { + "epoch": 0.3875396613587166, + "grad_norm": 0.17168155312538147, + "learning_rate": 0.0006821489869027976, + "loss": 2.7161, + "step": 13069 + }, + { + "epoch": 0.38756931471102807, + "grad_norm": 0.15659043192863464, + "learning_rate": 0.0006821051683627087, + "loss": 2.6871, + "step": 13070 + }, + { + "epoch": 0.38759896806333954, + "grad_norm": 0.11609113216400146, + "learning_rate": 0.0006820613482100473, + "loss": 2.7303, + "step": 13071 + }, + { + "epoch": 0.387628621415651, + "grad_norm": 0.12600655853748322, + "learning_rate": 0.0006820175264452013, + "loss": 2.7359, + "step": 13072 + }, + { + "epoch": 0.3876582747679625, + "grad_norm": 0.14270345866680145, + "learning_rate": 0.000681973703068559, + "loss": 2.7275, + "step": 13073 + }, + { + "epoch": 0.387687928120274, + "grad_norm": 0.11081366240978241, + "learning_rate": 0.0006819298780805082, + "loss": 2.7188, + "step": 13074 + }, + { + "epoch": 0.38771758147258545, + "grad_norm": 0.12260586023330688, + "learning_rate": 0.0006818860514814371, + "loss": 2.7072, + "step": 13075 + }, + { + "epoch": 0.387747234824897, + "grad_norm": 0.13048714399337769, + "learning_rate": 0.0006818422232717339, + "loss": 2.6773, + "step": 13076 + }, + { + "epoch": 0.38777688817720846, + "grad_norm": 0.12128271907567978, + "learning_rate": 0.0006817983934517866, + "loss": 2.7274, + "step": 13077 + }, + { + "epoch": 0.38780654152951993, + "grad_norm": 0.13280680775642395, + "learning_rate": 0.0006817545620219833, + "loss": 2.7164, + "step": 13078 + }, + { + "epoch": 0.3878361948818314, + "grad_norm": 0.14843136072158813, + "learning_rate": 0.0006817107289827121, + "loss": 2.7171, + "step": 13079 + }, + { + "epoch": 0.3878658482341429, + "grad_norm": 0.15429770946502686, + "learning_rate": 0.0006816668943343612, + "loss": 2.7391, + "step": 13080 + }, + { + "epoch": 0.38789550158645436, + "grad_norm": 0.14266334474086761, + "learning_rate": 0.0006816230580773188, + "loss": 2.7704, + "step": 13081 + }, + { + "epoch": 0.38792515493876584, + "grad_norm": 0.11532007902860641, + "learning_rate": 0.0006815792202119731, + "loss": 2.6875, + "step": 13082 + }, + { + "epoch": 0.3879548082910773, + "grad_norm": 0.1195472851395607, + "learning_rate": 0.0006815353807387121, + "loss": 2.7382, + "step": 13083 + }, + { + "epoch": 0.3879844616433888, + "grad_norm": 0.12443836778402328, + "learning_rate": 0.0006814915396579244, + "loss": 2.702, + "step": 13084 + }, + { + "epoch": 0.38801411499570027, + "grad_norm": 0.12525169551372528, + "learning_rate": 0.0006814476969699976, + "loss": 2.7426, + "step": 13085 + }, + { + "epoch": 0.38804376834801174, + "grad_norm": 0.10645999014377594, + "learning_rate": 0.0006814038526753205, + "loss": 2.6624, + "step": 13086 + }, + { + "epoch": 0.3880734217003232, + "grad_norm": 0.12678374350070953, + "learning_rate": 0.0006813600067742811, + "loss": 2.7238, + "step": 13087 + }, + { + "epoch": 0.3881030750526347, + "grad_norm": 0.1348036229610443, + "learning_rate": 0.0006813161592672678, + "loss": 2.7109, + "step": 13088 + }, + { + "epoch": 0.38813272840494617, + "grad_norm": 0.12653961777687073, + "learning_rate": 0.0006812723101546687, + "loss": 2.7122, + "step": 13089 + }, + { + "epoch": 0.38816238175725765, + "grad_norm": 0.12373470515012741, + "learning_rate": 0.0006812284594368723, + "loss": 2.7276, + "step": 13090 + }, + { + "epoch": 0.3881920351095691, + "grad_norm": 0.12193761020898819, + "learning_rate": 0.0006811846071142667, + "loss": 2.7288, + "step": 13091 + }, + { + "epoch": 0.3882216884618806, + "grad_norm": 0.12665078043937683, + "learning_rate": 0.0006811407531872402, + "loss": 2.7343, + "step": 13092 + }, + { + "epoch": 0.3882513418141921, + "grad_norm": 0.13959072530269623, + "learning_rate": 0.0006810968976561814, + "loss": 2.7392, + "step": 13093 + }, + { + "epoch": 0.38828099516650355, + "grad_norm": 0.14047516882419586, + "learning_rate": 0.0006810530405214785, + "loss": 2.7303, + "step": 13094 + }, + { + "epoch": 0.388310648518815, + "grad_norm": 0.12444615364074707, + "learning_rate": 0.0006810091817835197, + "loss": 2.7119, + "step": 13095 + }, + { + "epoch": 0.3883403018711265, + "grad_norm": 0.11827793717384338, + "learning_rate": 0.0006809653214426936, + "loss": 2.7109, + "step": 13096 + }, + { + "epoch": 0.38836995522343803, + "grad_norm": 0.12610064446926117, + "learning_rate": 0.0006809214594993884, + "loss": 2.7053, + "step": 13097 + }, + { + "epoch": 0.3883996085757495, + "grad_norm": 0.11103426665067673, + "learning_rate": 0.0006808775959539928, + "loss": 2.7005, + "step": 13098 + }, + { + "epoch": 0.388429261928061, + "grad_norm": 0.1302085518836975, + "learning_rate": 0.0006808337308068951, + "loss": 2.7268, + "step": 13099 + }, + { + "epoch": 0.38845891528037246, + "grad_norm": 0.13980309665203094, + "learning_rate": 0.0006807898640584834, + "loss": 2.7157, + "step": 13100 + }, + { + "epoch": 0.38848856863268394, + "grad_norm": 0.15380506217479706, + "learning_rate": 0.0006807459957091466, + "loss": 2.7159, + "step": 13101 + }, + { + "epoch": 0.3885182219849954, + "grad_norm": 0.14019443094730377, + "learning_rate": 0.0006807021257592729, + "loss": 2.7319, + "step": 13102 + }, + { + "epoch": 0.3885478753373069, + "grad_norm": 0.12048006802797318, + "learning_rate": 0.000680658254209251, + "loss": 2.7274, + "step": 13103 + }, + { + "epoch": 0.38857752868961837, + "grad_norm": 0.12776042520999908, + "learning_rate": 0.0006806143810594692, + "loss": 2.7752, + "step": 13104 + }, + { + "epoch": 0.38860718204192984, + "grad_norm": 0.1277250200510025, + "learning_rate": 0.0006805705063103161, + "loss": 2.7007, + "step": 13105 + }, + { + "epoch": 0.3886368353942413, + "grad_norm": 0.11492712795734406, + "learning_rate": 0.00068052662996218, + "loss": 2.7483, + "step": 13106 + }, + { + "epoch": 0.3886664887465528, + "grad_norm": 0.11874565482139587, + "learning_rate": 0.0006804827520154496, + "loss": 2.7255, + "step": 13107 + }, + { + "epoch": 0.3886961420988643, + "grad_norm": 0.11215430498123169, + "learning_rate": 0.0006804388724705136, + "loss": 2.7518, + "step": 13108 + }, + { + "epoch": 0.38872579545117575, + "grad_norm": 0.11950819939374924, + "learning_rate": 0.0006803949913277603, + "loss": 2.6806, + "step": 13109 + }, + { + "epoch": 0.3887554488034872, + "grad_norm": 0.1297244429588318, + "learning_rate": 0.0006803511085875785, + "loss": 2.7103, + "step": 13110 + }, + { + "epoch": 0.3887851021557987, + "grad_norm": 0.1462193876504898, + "learning_rate": 0.0006803072242503567, + "loss": 2.7171, + "step": 13111 + }, + { + "epoch": 0.3888147555081102, + "grad_norm": 0.1446661651134491, + "learning_rate": 0.0006802633383164833, + "loss": 2.7152, + "step": 13112 + }, + { + "epoch": 0.38884440886042165, + "grad_norm": 0.12815245985984802, + "learning_rate": 0.0006802194507863472, + "loss": 2.6974, + "step": 13113 + }, + { + "epoch": 0.38887406221273313, + "grad_norm": 0.15607792139053345, + "learning_rate": 0.0006801755616603369, + "loss": 2.7192, + "step": 13114 + }, + { + "epoch": 0.3889037155650446, + "grad_norm": 0.17392219603061676, + "learning_rate": 0.0006801316709388412, + "loss": 2.7474, + "step": 13115 + }, + { + "epoch": 0.3889333689173561, + "grad_norm": 0.17436955869197845, + "learning_rate": 0.0006800877786222486, + "loss": 2.7257, + "step": 13116 + }, + { + "epoch": 0.3889630222696676, + "grad_norm": 0.16330966353416443, + "learning_rate": 0.0006800438847109476, + "loss": 2.7515, + "step": 13117 + }, + { + "epoch": 0.3889926756219791, + "grad_norm": 0.15369081497192383, + "learning_rate": 0.0006799999892053273, + "loss": 2.7103, + "step": 13118 + }, + { + "epoch": 0.38902232897429057, + "grad_norm": 0.13018642365932465, + "learning_rate": 0.000679956092105776, + "loss": 2.7198, + "step": 13119 + }, + { + "epoch": 0.38905198232660204, + "grad_norm": 0.11883573979139328, + "learning_rate": 0.0006799121934126829, + "loss": 2.6923, + "step": 13120 + }, + { + "epoch": 0.3890816356789135, + "grad_norm": 0.142393097281456, + "learning_rate": 0.0006798682931264363, + "loss": 2.7092, + "step": 13121 + }, + { + "epoch": 0.389111289031225, + "grad_norm": 0.11732804775238037, + "learning_rate": 0.0006798243912474251, + "loss": 2.7165, + "step": 13122 + }, + { + "epoch": 0.38914094238353647, + "grad_norm": 0.14384303987026215, + "learning_rate": 0.000679780487776038, + "loss": 2.7082, + "step": 13123 + }, + { + "epoch": 0.38917059573584795, + "grad_norm": 0.12810039520263672, + "learning_rate": 0.0006797365827126638, + "loss": 2.7254, + "step": 13124 + }, + { + "epoch": 0.3892002490881594, + "grad_norm": 0.12710407376289368, + "learning_rate": 0.0006796926760576914, + "loss": 2.7579, + "step": 13125 + }, + { + "epoch": 0.3892299024404709, + "grad_norm": 0.14025631546974182, + "learning_rate": 0.0006796487678115095, + "loss": 2.709, + "step": 13126 + }, + { + "epoch": 0.3892595557927824, + "grad_norm": 0.13319487869739532, + "learning_rate": 0.000679604857974507, + "loss": 2.6944, + "step": 13127 + }, + { + "epoch": 0.38928920914509385, + "grad_norm": 0.14062823355197906, + "learning_rate": 0.0006795609465470724, + "loss": 2.7367, + "step": 13128 + }, + { + "epoch": 0.3893188624974053, + "grad_norm": 0.13010287284851074, + "learning_rate": 0.000679517033529595, + "loss": 2.7243, + "step": 13129 + }, + { + "epoch": 0.3893485158497168, + "grad_norm": 0.12981560826301575, + "learning_rate": 0.0006794731189224634, + "loss": 2.6892, + "step": 13130 + }, + { + "epoch": 0.3893781692020283, + "grad_norm": 0.10378110408782959, + "learning_rate": 0.0006794292027260667, + "loss": 2.7312, + "step": 13131 + }, + { + "epoch": 0.38940782255433976, + "grad_norm": 0.13005562126636505, + "learning_rate": 0.0006793852849407933, + "loss": 2.7504, + "step": 13132 + }, + { + "epoch": 0.38943747590665123, + "grad_norm": 0.13019965589046478, + "learning_rate": 0.0006793413655670327, + "loss": 2.7274, + "step": 13133 + }, + { + "epoch": 0.3894671292589627, + "grad_norm": 0.1134149357676506, + "learning_rate": 0.0006792974446051732, + "loss": 2.7011, + "step": 13134 + }, + { + "epoch": 0.3894967826112742, + "grad_norm": 0.11457659304141998, + "learning_rate": 0.0006792535220556044, + "loss": 2.6919, + "step": 13135 + }, + { + "epoch": 0.38952643596358566, + "grad_norm": 0.12048882991075516, + "learning_rate": 0.0006792095979187147, + "loss": 2.7195, + "step": 13136 + }, + { + "epoch": 0.38955608931589714, + "grad_norm": 0.1476888507604599, + "learning_rate": 0.0006791656721948932, + "loss": 2.6984, + "step": 13137 + }, + { + "epoch": 0.38958574266820867, + "grad_norm": 0.13247014582157135, + "learning_rate": 0.000679121744884529, + "loss": 2.7332, + "step": 13138 + }, + { + "epoch": 0.38961539602052014, + "grad_norm": 0.11625489592552185, + "learning_rate": 0.000679077815988011, + "loss": 2.7301, + "step": 13139 + }, + { + "epoch": 0.3896450493728316, + "grad_norm": 0.13588346540927887, + "learning_rate": 0.0006790338855057282, + "loss": 2.7094, + "step": 13140 + }, + { + "epoch": 0.3896747027251431, + "grad_norm": 0.14701703190803528, + "learning_rate": 0.0006789899534380697, + "loss": 2.7199, + "step": 13141 + }, + { + "epoch": 0.3897043560774546, + "grad_norm": 0.14220179617404938, + "learning_rate": 0.0006789460197854242, + "loss": 2.7453, + "step": 13142 + }, + { + "epoch": 0.38973400942976605, + "grad_norm": 0.1336800903081894, + "learning_rate": 0.0006789020845481813, + "loss": 2.709, + "step": 13143 + }, + { + "epoch": 0.3897636627820775, + "grad_norm": 0.11271325498819351, + "learning_rate": 0.0006788581477267295, + "loss": 2.7247, + "step": 13144 + }, + { + "epoch": 0.389793316134389, + "grad_norm": 0.11312626302242279, + "learning_rate": 0.0006788142093214582, + "loss": 2.721, + "step": 13145 + }, + { + "epoch": 0.3898229694867005, + "grad_norm": 0.12976695597171783, + "learning_rate": 0.0006787702693327563, + "loss": 2.7113, + "step": 13146 + }, + { + "epoch": 0.38985262283901195, + "grad_norm": 0.11155091226100922, + "learning_rate": 0.000678726327761013, + "loss": 2.7335, + "step": 13147 + }, + { + "epoch": 0.38988227619132343, + "grad_norm": 0.11545047163963318, + "learning_rate": 0.0006786823846066176, + "loss": 2.729, + "step": 13148 + }, + { + "epoch": 0.3899119295436349, + "grad_norm": 0.11060495674610138, + "learning_rate": 0.0006786384398699588, + "loss": 2.7061, + "step": 13149 + }, + { + "epoch": 0.3899415828959464, + "grad_norm": 0.11683516204357147, + "learning_rate": 0.0006785944935514259, + "loss": 2.7527, + "step": 13150 + }, + { + "epoch": 0.38997123624825786, + "grad_norm": 0.11997362971305847, + "learning_rate": 0.0006785505456514082, + "loss": 2.724, + "step": 13151 + }, + { + "epoch": 0.39000088960056933, + "grad_norm": 0.10928940027952194, + "learning_rate": 0.0006785065961702947, + "loss": 2.7097, + "step": 13152 + }, + { + "epoch": 0.3900305429528808, + "grad_norm": 0.11573309451341629, + "learning_rate": 0.0006784626451084748, + "loss": 2.689, + "step": 13153 + }, + { + "epoch": 0.3900601963051923, + "grad_norm": 0.12181650102138519, + "learning_rate": 0.0006784186924663375, + "loss": 2.7527, + "step": 13154 + }, + { + "epoch": 0.39008984965750376, + "grad_norm": 0.15613600611686707, + "learning_rate": 0.000678374738244272, + "loss": 2.7061, + "step": 13155 + }, + { + "epoch": 0.39011950300981524, + "grad_norm": 0.15755392611026764, + "learning_rate": 0.0006783307824426674, + "loss": 2.7056, + "step": 13156 + }, + { + "epoch": 0.3901491563621267, + "grad_norm": 0.17124693095684052, + "learning_rate": 0.0006782868250619134, + "loss": 2.674, + "step": 13157 + }, + { + "epoch": 0.3901788097144382, + "grad_norm": 0.18374398350715637, + "learning_rate": 0.0006782428661023988, + "loss": 2.7318, + "step": 13158 + }, + { + "epoch": 0.3902084630667497, + "grad_norm": 0.1820138543844223, + "learning_rate": 0.0006781989055645132, + "loss": 2.749, + "step": 13159 + }, + { + "epoch": 0.3902381164190612, + "grad_norm": 0.16267354786396027, + "learning_rate": 0.0006781549434486456, + "loss": 2.7747, + "step": 13160 + }, + { + "epoch": 0.3902677697713727, + "grad_norm": 0.1317051351070404, + "learning_rate": 0.0006781109797551854, + "loss": 2.7116, + "step": 13161 + }, + { + "epoch": 0.39029742312368415, + "grad_norm": 0.13816949725151062, + "learning_rate": 0.0006780670144845218, + "loss": 2.7152, + "step": 13162 + }, + { + "epoch": 0.3903270764759956, + "grad_norm": 0.17749908566474915, + "learning_rate": 0.0006780230476370443, + "loss": 2.7567, + "step": 13163 + }, + { + "epoch": 0.3903567298283071, + "grad_norm": 0.13814310729503632, + "learning_rate": 0.0006779790792131421, + "loss": 2.7118, + "step": 13164 + }, + { + "epoch": 0.3903863831806186, + "grad_norm": 0.12886790931224823, + "learning_rate": 0.0006779351092132047, + "loss": 2.6973, + "step": 13165 + }, + { + "epoch": 0.39041603653293006, + "grad_norm": 0.13337092101573944, + "learning_rate": 0.0006778911376376215, + "loss": 2.7182, + "step": 13166 + }, + { + "epoch": 0.39044568988524153, + "grad_norm": 0.1187199279665947, + "learning_rate": 0.0006778471644867815, + "loss": 2.6838, + "step": 13167 + }, + { + "epoch": 0.390475343237553, + "grad_norm": 0.12083045393228531, + "learning_rate": 0.0006778031897610744, + "loss": 2.7339, + "step": 13168 + }, + { + "epoch": 0.3905049965898645, + "grad_norm": 0.1268676072359085, + "learning_rate": 0.0006777592134608895, + "loss": 2.7191, + "step": 13169 + }, + { + "epoch": 0.39053464994217596, + "grad_norm": 0.11180594563484192, + "learning_rate": 0.0006777152355866163, + "loss": 2.7233, + "step": 13170 + }, + { + "epoch": 0.39056430329448744, + "grad_norm": 0.11587706953287125, + "learning_rate": 0.0006776712561386442, + "loss": 2.727, + "step": 13171 + }, + { + "epoch": 0.3905939566467989, + "grad_norm": 0.11014246195554733, + "learning_rate": 0.0006776272751173627, + "loss": 2.7319, + "step": 13172 + }, + { + "epoch": 0.3906236099991104, + "grad_norm": 0.12177325040102005, + "learning_rate": 0.000677583292523161, + "loss": 2.7183, + "step": 13173 + }, + { + "epoch": 0.39065326335142186, + "grad_norm": 0.09312756359577179, + "learning_rate": 0.0006775393083564288, + "loss": 2.7146, + "step": 13174 + }, + { + "epoch": 0.39068291670373334, + "grad_norm": 0.11786720156669617, + "learning_rate": 0.0006774953226175557, + "loss": 2.7193, + "step": 13175 + }, + { + "epoch": 0.3907125700560448, + "grad_norm": 0.113099105656147, + "learning_rate": 0.0006774513353069309, + "loss": 2.7342, + "step": 13176 + }, + { + "epoch": 0.3907422234083563, + "grad_norm": 0.10776398330926895, + "learning_rate": 0.0006774073464249442, + "loss": 2.729, + "step": 13177 + }, + { + "epoch": 0.39077187676066777, + "grad_norm": 0.11830548942089081, + "learning_rate": 0.0006773633559719849, + "loss": 2.7157, + "step": 13178 + }, + { + "epoch": 0.39080153011297925, + "grad_norm": 0.11537499725818634, + "learning_rate": 0.0006773193639484426, + "loss": 2.7334, + "step": 13179 + }, + { + "epoch": 0.3908311834652908, + "grad_norm": 0.1153663769364357, + "learning_rate": 0.0006772753703547069, + "loss": 2.7388, + "step": 13180 + }, + { + "epoch": 0.39086083681760225, + "grad_norm": 0.11308275163173676, + "learning_rate": 0.0006772313751911677, + "loss": 2.7278, + "step": 13181 + }, + { + "epoch": 0.39089049016991373, + "grad_norm": 0.12670885026454926, + "learning_rate": 0.0006771873784582138, + "loss": 2.6675, + "step": 13182 + }, + { + "epoch": 0.3909201435222252, + "grad_norm": 0.1415470987558365, + "learning_rate": 0.0006771433801562354, + "loss": 2.7153, + "step": 13183 + }, + { + "epoch": 0.3909497968745367, + "grad_norm": 0.13633716106414795, + "learning_rate": 0.000677099380285622, + "loss": 2.7304, + "step": 13184 + }, + { + "epoch": 0.39097945022684816, + "grad_norm": 0.11705965548753738, + "learning_rate": 0.0006770553788467632, + "loss": 2.7541, + "step": 13185 + }, + { + "epoch": 0.39100910357915963, + "grad_norm": 0.11173124611377716, + "learning_rate": 0.0006770113758400487, + "loss": 2.6944, + "step": 13186 + }, + { + "epoch": 0.3910387569314711, + "grad_norm": 0.12315211445093155, + "learning_rate": 0.000676967371265868, + "loss": 2.6872, + "step": 13187 + }, + { + "epoch": 0.3910684102837826, + "grad_norm": 0.12506233155727386, + "learning_rate": 0.0006769233651246108, + "loss": 2.72, + "step": 13188 + }, + { + "epoch": 0.39109806363609406, + "grad_norm": 0.13694781064987183, + "learning_rate": 0.0006768793574166668, + "loss": 2.7013, + "step": 13189 + }, + { + "epoch": 0.39112771698840554, + "grad_norm": 0.176840677857399, + "learning_rate": 0.0006768353481424259, + "loss": 2.6891, + "step": 13190 + }, + { + "epoch": 0.391157370340717, + "grad_norm": 0.1597885638475418, + "learning_rate": 0.0006767913373022776, + "loss": 2.7006, + "step": 13191 + }, + { + "epoch": 0.3911870236930285, + "grad_norm": 0.12314088642597198, + "learning_rate": 0.0006767473248966116, + "loss": 2.7171, + "step": 13192 + }, + { + "epoch": 0.39121667704533997, + "grad_norm": 0.12869374454021454, + "learning_rate": 0.0006767033109258176, + "loss": 2.7611, + "step": 13193 + }, + { + "epoch": 0.39124633039765144, + "grad_norm": 0.1373235285282135, + "learning_rate": 0.0006766592953902856, + "loss": 2.6841, + "step": 13194 + }, + { + "epoch": 0.3912759837499629, + "grad_norm": 0.1338232159614563, + "learning_rate": 0.0006766152782904051, + "loss": 2.7144, + "step": 13195 + }, + { + "epoch": 0.3913056371022744, + "grad_norm": 0.12647467851638794, + "learning_rate": 0.0006765712596265661, + "loss": 2.7137, + "step": 13196 + }, + { + "epoch": 0.39133529045458587, + "grad_norm": 0.13819557428359985, + "learning_rate": 0.0006765272393991583, + "loss": 2.7203, + "step": 13197 + }, + { + "epoch": 0.39136494380689735, + "grad_norm": 0.12910126149654388, + "learning_rate": 0.0006764832176085714, + "loss": 2.6873, + "step": 13198 + }, + { + "epoch": 0.3913945971592088, + "grad_norm": 0.11500456929206848, + "learning_rate": 0.0006764391942551954, + "loss": 2.732, + "step": 13199 + }, + { + "epoch": 0.3914242505115203, + "grad_norm": 0.1378960758447647, + "learning_rate": 0.00067639516933942, + "loss": 2.6861, + "step": 13200 + }, + { + "epoch": 0.39145390386383183, + "grad_norm": 0.13664200901985168, + "learning_rate": 0.0006763511428616351, + "loss": 2.7475, + "step": 13201 + }, + { + "epoch": 0.3914835572161433, + "grad_norm": 0.12391982972621918, + "learning_rate": 0.0006763071148222306, + "loss": 2.7241, + "step": 13202 + }, + { + "epoch": 0.3915132105684548, + "grad_norm": 0.11935833096504211, + "learning_rate": 0.0006762630852215962, + "loss": 2.708, + "step": 13203 + }, + { + "epoch": 0.39154286392076626, + "grad_norm": 0.12191960215568542, + "learning_rate": 0.0006762190540601222, + "loss": 2.7322, + "step": 13204 + }, + { + "epoch": 0.39157251727307774, + "grad_norm": 0.12996551394462585, + "learning_rate": 0.000676175021338198, + "loss": 2.7245, + "step": 13205 + }, + { + "epoch": 0.3916021706253892, + "grad_norm": 0.13114018738269806, + "learning_rate": 0.0006761309870562138, + "loss": 2.7363, + "step": 13206 + }, + { + "epoch": 0.3916318239777007, + "grad_norm": 0.1287459433078766, + "learning_rate": 0.0006760869512145595, + "loss": 2.7288, + "step": 13207 + }, + { + "epoch": 0.39166147733001216, + "grad_norm": 0.15733391046524048, + "learning_rate": 0.000676042913813625, + "loss": 2.7201, + "step": 13208 + }, + { + "epoch": 0.39169113068232364, + "grad_norm": 0.16995440423488617, + "learning_rate": 0.0006759988748538003, + "loss": 2.7045, + "step": 13209 + }, + { + "epoch": 0.3917207840346351, + "grad_norm": 0.14612257480621338, + "learning_rate": 0.0006759548343354754, + "loss": 2.7316, + "step": 13210 + }, + { + "epoch": 0.3917504373869466, + "grad_norm": 0.16452640295028687, + "learning_rate": 0.0006759107922590402, + "loss": 2.7523, + "step": 13211 + }, + { + "epoch": 0.39178009073925807, + "grad_norm": 0.18512235581874847, + "learning_rate": 0.0006758667486248846, + "loss": 2.7192, + "step": 13212 + }, + { + "epoch": 0.39180974409156955, + "grad_norm": 0.1481136977672577, + "learning_rate": 0.000675822703433399, + "loss": 2.6951, + "step": 13213 + }, + { + "epoch": 0.391839397443881, + "grad_norm": 0.15924306213855743, + "learning_rate": 0.0006757786566849729, + "loss": 2.7544, + "step": 13214 + }, + { + "epoch": 0.3918690507961925, + "grad_norm": 0.14238283038139343, + "learning_rate": 0.0006757346083799969, + "loss": 2.7231, + "step": 13215 + }, + { + "epoch": 0.391898704148504, + "grad_norm": 0.14259593188762665, + "learning_rate": 0.0006756905585188607, + "loss": 2.7184, + "step": 13216 + }, + { + "epoch": 0.39192835750081545, + "grad_norm": 0.14837276935577393, + "learning_rate": 0.0006756465071019543, + "loss": 2.7183, + "step": 13217 + }, + { + "epoch": 0.3919580108531269, + "grad_norm": 0.11732189357280731, + "learning_rate": 0.000675602454129668, + "loss": 2.6871, + "step": 13218 + }, + { + "epoch": 0.3919876642054384, + "grad_norm": 0.11060524731874466, + "learning_rate": 0.0006755583996023919, + "loss": 2.7354, + "step": 13219 + }, + { + "epoch": 0.3920173175577499, + "grad_norm": 0.13453340530395508, + "learning_rate": 0.0006755143435205161, + "loss": 2.7434, + "step": 13220 + }, + { + "epoch": 0.3920469709100614, + "grad_norm": 0.1281934380531311, + "learning_rate": 0.0006754702858844303, + "loss": 2.704, + "step": 13221 + }, + { + "epoch": 0.3920766242623729, + "grad_norm": 0.13422423601150513, + "learning_rate": 0.0006754262266945254, + "loss": 2.7064, + "step": 13222 + }, + { + "epoch": 0.39210627761468436, + "grad_norm": 0.12887129187583923, + "learning_rate": 0.0006753821659511909, + "loss": 2.7449, + "step": 13223 + }, + { + "epoch": 0.39213593096699584, + "grad_norm": 0.13997292518615723, + "learning_rate": 0.0006753381036548175, + "loss": 2.7119, + "step": 13224 + }, + { + "epoch": 0.3921655843193073, + "grad_norm": 0.14171136915683746, + "learning_rate": 0.000675294039805795, + "loss": 2.7055, + "step": 13225 + }, + { + "epoch": 0.3921952376716188, + "grad_norm": 0.13520970940589905, + "learning_rate": 0.0006752499744045135, + "loss": 2.7298, + "step": 13226 + }, + { + "epoch": 0.39222489102393027, + "grad_norm": 0.14097611606121063, + "learning_rate": 0.0006752059074513634, + "loss": 2.7166, + "step": 13227 + }, + { + "epoch": 0.39225454437624174, + "grad_norm": 0.1116812601685524, + "learning_rate": 0.0006751618389467351, + "loss": 2.7306, + "step": 13228 + }, + { + "epoch": 0.3922841977285532, + "grad_norm": 0.12927955389022827, + "learning_rate": 0.0006751177688910186, + "loss": 2.7055, + "step": 13229 + }, + { + "epoch": 0.3923138510808647, + "grad_norm": 0.12437283992767334, + "learning_rate": 0.0006750736972846042, + "loss": 2.737, + "step": 13230 + }, + { + "epoch": 0.39234350443317617, + "grad_norm": 0.11874113231897354, + "learning_rate": 0.0006750296241278821, + "loss": 2.7211, + "step": 13231 + }, + { + "epoch": 0.39237315778548765, + "grad_norm": 0.10503698140382767, + "learning_rate": 0.0006749855494212427, + "loss": 2.7142, + "step": 13232 + }, + { + "epoch": 0.3924028111377991, + "grad_norm": 0.11546024680137634, + "learning_rate": 0.0006749414731650762, + "loss": 2.7499, + "step": 13233 + }, + { + "epoch": 0.3924324644901106, + "grad_norm": 0.1130623146891594, + "learning_rate": 0.0006748973953597727, + "loss": 2.7208, + "step": 13234 + }, + { + "epoch": 0.3924621178424221, + "grad_norm": 0.10306139290332794, + "learning_rate": 0.0006748533160057232, + "loss": 2.7136, + "step": 13235 + }, + { + "epoch": 0.39249177119473355, + "grad_norm": 0.12906448543071747, + "learning_rate": 0.0006748092351033173, + "loss": 2.7425, + "step": 13236 + }, + { + "epoch": 0.39252142454704503, + "grad_norm": 0.1266695111989975, + "learning_rate": 0.0006747651526529456, + "loss": 2.7011, + "step": 13237 + }, + { + "epoch": 0.3925510778993565, + "grad_norm": 0.13861548900604248, + "learning_rate": 0.0006747210686549987, + "loss": 2.6872, + "step": 13238 + }, + { + "epoch": 0.392580731251668, + "grad_norm": 0.15977708995342255, + "learning_rate": 0.0006746769831098664, + "loss": 2.7241, + "step": 13239 + }, + { + "epoch": 0.39261038460397946, + "grad_norm": 0.16976720094680786, + "learning_rate": 0.0006746328960179396, + "loss": 2.7115, + "step": 13240 + }, + { + "epoch": 0.39264003795629093, + "grad_norm": 0.136577308177948, + "learning_rate": 0.0006745888073796086, + "loss": 2.7112, + "step": 13241 + }, + { + "epoch": 0.39266969130860246, + "grad_norm": 0.12391053885221481, + "learning_rate": 0.0006745447171952637, + "loss": 2.7519, + "step": 13242 + }, + { + "epoch": 0.39269934466091394, + "grad_norm": 0.12269576638936996, + "learning_rate": 0.0006745006254652953, + "loss": 2.6914, + "step": 13243 + }, + { + "epoch": 0.3927289980132254, + "grad_norm": 0.13872788846492767, + "learning_rate": 0.000674456532190094, + "loss": 2.7169, + "step": 13244 + }, + { + "epoch": 0.3927586513655369, + "grad_norm": 0.14278414845466614, + "learning_rate": 0.0006744124373700501, + "loss": 2.7178, + "step": 13245 + }, + { + "epoch": 0.39278830471784837, + "grad_norm": 0.12341979146003723, + "learning_rate": 0.0006743683410055543, + "loss": 2.7303, + "step": 13246 + }, + { + "epoch": 0.39281795807015985, + "grad_norm": 0.1249215230345726, + "learning_rate": 0.0006743242430969965, + "loss": 2.7412, + "step": 13247 + }, + { + "epoch": 0.3928476114224713, + "grad_norm": 0.13384462893009186, + "learning_rate": 0.0006742801436447679, + "loss": 2.7539, + "step": 13248 + }, + { + "epoch": 0.3928772647747828, + "grad_norm": 0.12907111644744873, + "learning_rate": 0.0006742360426492587, + "loss": 2.7409, + "step": 13249 + }, + { + "epoch": 0.3929069181270943, + "grad_norm": 0.11589471250772476, + "learning_rate": 0.0006741919401108594, + "loss": 2.703, + "step": 13250 + }, + { + "epoch": 0.39293657147940575, + "grad_norm": 0.10300952196121216, + "learning_rate": 0.0006741478360299607, + "loss": 2.6972, + "step": 13251 + }, + { + "epoch": 0.3929662248317172, + "grad_norm": 0.0960015207529068, + "learning_rate": 0.0006741037304069529, + "loss": 2.7413, + "step": 13252 + }, + { + "epoch": 0.3929958781840287, + "grad_norm": 0.11381829530000687, + "learning_rate": 0.0006740596232422266, + "loss": 2.6867, + "step": 13253 + }, + { + "epoch": 0.3930255315363402, + "grad_norm": 0.12543849647045135, + "learning_rate": 0.0006740155145361726, + "loss": 2.7418, + "step": 13254 + }, + { + "epoch": 0.39305518488865165, + "grad_norm": 0.11715443432331085, + "learning_rate": 0.0006739714042891812, + "loss": 2.6983, + "step": 13255 + }, + { + "epoch": 0.39308483824096313, + "grad_norm": 0.14110951125621796, + "learning_rate": 0.0006739272925016433, + "loss": 2.6996, + "step": 13256 + }, + { + "epoch": 0.3931144915932746, + "grad_norm": 0.15286390483379364, + "learning_rate": 0.0006738831791739493, + "loss": 2.7294, + "step": 13257 + }, + { + "epoch": 0.3931441449455861, + "grad_norm": 0.15834380686283112, + "learning_rate": 0.00067383906430649, + "loss": 2.7268, + "step": 13258 + }, + { + "epoch": 0.39317379829789756, + "grad_norm": 0.13687987625598907, + "learning_rate": 0.0006737949478996559, + "loss": 2.7142, + "step": 13259 + }, + { + "epoch": 0.39320345165020903, + "grad_norm": 0.12489239871501923, + "learning_rate": 0.0006737508299538375, + "loss": 2.7127, + "step": 13260 + }, + { + "epoch": 0.3932331050025205, + "grad_norm": 0.1191849410533905, + "learning_rate": 0.0006737067104694258, + "loss": 2.7197, + "step": 13261 + }, + { + "epoch": 0.393262758354832, + "grad_norm": 0.12328537553548813, + "learning_rate": 0.0006736625894468116, + "loss": 2.7081, + "step": 13262 + }, + { + "epoch": 0.3932924117071435, + "grad_norm": 0.13732290267944336, + "learning_rate": 0.0006736184668863852, + "loss": 2.7136, + "step": 13263 + }, + { + "epoch": 0.393322065059455, + "grad_norm": 0.14783266186714172, + "learning_rate": 0.0006735743427885375, + "loss": 2.7256, + "step": 13264 + }, + { + "epoch": 0.39335171841176647, + "grad_norm": 0.13659749925136566, + "learning_rate": 0.0006735302171536591, + "loss": 2.6938, + "step": 13265 + }, + { + "epoch": 0.39338137176407795, + "grad_norm": 0.13428142666816711, + "learning_rate": 0.0006734860899821408, + "loss": 2.7043, + "step": 13266 + }, + { + "epoch": 0.3934110251163894, + "grad_norm": 0.12441059947013855, + "learning_rate": 0.0006734419612743736, + "loss": 2.7257, + "step": 13267 + }, + { + "epoch": 0.3934406784687009, + "grad_norm": 0.10546961426734924, + "learning_rate": 0.0006733978310307479, + "loss": 2.7147, + "step": 13268 + }, + { + "epoch": 0.3934703318210124, + "grad_norm": 0.10956922918558121, + "learning_rate": 0.0006733536992516546, + "loss": 2.7263, + "step": 13269 + }, + { + "epoch": 0.39349998517332385, + "grad_norm": 0.12741664052009583, + "learning_rate": 0.0006733095659374847, + "loss": 2.6982, + "step": 13270 + }, + { + "epoch": 0.39352963852563533, + "grad_norm": 0.12992435693740845, + "learning_rate": 0.0006732654310886288, + "loss": 2.7361, + "step": 13271 + }, + { + "epoch": 0.3935592918779468, + "grad_norm": 0.10562101751565933, + "learning_rate": 0.0006732212947054777, + "loss": 2.7324, + "step": 13272 + }, + { + "epoch": 0.3935889452302583, + "grad_norm": 0.11606238782405853, + "learning_rate": 0.0006731771567884223, + "loss": 2.715, + "step": 13273 + }, + { + "epoch": 0.39361859858256976, + "grad_norm": 0.1325705498456955, + "learning_rate": 0.0006731330173378535, + "loss": 2.7415, + "step": 13274 + }, + { + "epoch": 0.39364825193488123, + "grad_norm": 0.12023548781871796, + "learning_rate": 0.000673088876354162, + "loss": 2.6909, + "step": 13275 + }, + { + "epoch": 0.3936779052871927, + "grad_norm": 0.12868456542491913, + "learning_rate": 0.000673044733837739, + "loss": 2.7221, + "step": 13276 + }, + { + "epoch": 0.3937075586395042, + "grad_norm": 0.1254430115222931, + "learning_rate": 0.000673000589788975, + "loss": 2.7224, + "step": 13277 + }, + { + "epoch": 0.39373721199181566, + "grad_norm": 0.11979466676712036, + "learning_rate": 0.0006729564442082612, + "loss": 2.7192, + "step": 13278 + }, + { + "epoch": 0.39376686534412714, + "grad_norm": 0.10587485879659653, + "learning_rate": 0.0006729122970959884, + "loss": 2.7003, + "step": 13279 + }, + { + "epoch": 0.3937965186964386, + "grad_norm": 0.12314972281455994, + "learning_rate": 0.0006728681484525474, + "loss": 2.6925, + "step": 13280 + }, + { + "epoch": 0.3938261720487501, + "grad_norm": 0.14610253274440765, + "learning_rate": 0.0006728239982783294, + "loss": 2.7165, + "step": 13281 + }, + { + "epoch": 0.39385582540106157, + "grad_norm": 0.16502119600772858, + "learning_rate": 0.0006727798465737252, + "loss": 2.7598, + "step": 13282 + }, + { + "epoch": 0.39388547875337304, + "grad_norm": 0.2030264437198639, + "learning_rate": 0.0006727356933391257, + "loss": 2.7392, + "step": 13283 + }, + { + "epoch": 0.3939151321056846, + "grad_norm": 0.22513100504875183, + "learning_rate": 0.0006726915385749223, + "loss": 2.7604, + "step": 13284 + }, + { + "epoch": 0.39394478545799605, + "grad_norm": 0.19264043867588043, + "learning_rate": 0.0006726473822815055, + "loss": 2.7067, + "step": 13285 + }, + { + "epoch": 0.3939744388103075, + "grad_norm": 0.13925358653068542, + "learning_rate": 0.0006726032244592663, + "loss": 2.7334, + "step": 13286 + }, + { + "epoch": 0.394004092162619, + "grad_norm": 0.15697944164276123, + "learning_rate": 0.000672559065108596, + "loss": 2.7143, + "step": 13287 + }, + { + "epoch": 0.3940337455149305, + "grad_norm": 0.14062775671482086, + "learning_rate": 0.0006725149042298857, + "loss": 2.7046, + "step": 13288 + }, + { + "epoch": 0.39406339886724195, + "grad_norm": 0.1331038624048233, + "learning_rate": 0.0006724707418235262, + "loss": 2.7054, + "step": 13289 + }, + { + "epoch": 0.39409305221955343, + "grad_norm": 0.12826955318450928, + "learning_rate": 0.0006724265778899088, + "loss": 2.6817, + "step": 13290 + }, + { + "epoch": 0.3941227055718649, + "grad_norm": 0.11881602555513382, + "learning_rate": 0.0006723824124294244, + "loss": 2.7226, + "step": 13291 + }, + { + "epoch": 0.3941523589241764, + "grad_norm": 0.1248338595032692, + "learning_rate": 0.0006723382454424641, + "loss": 2.6987, + "step": 13292 + }, + { + "epoch": 0.39418201227648786, + "grad_norm": 0.11479002237319946, + "learning_rate": 0.000672294076929419, + "loss": 2.729, + "step": 13293 + }, + { + "epoch": 0.39421166562879933, + "grad_norm": 0.11273787915706635, + "learning_rate": 0.0006722499068906804, + "loss": 2.703, + "step": 13294 + }, + { + "epoch": 0.3942413189811108, + "grad_norm": 0.10982906818389893, + "learning_rate": 0.0006722057353266394, + "loss": 2.7374, + "step": 13295 + }, + { + "epoch": 0.3942709723334223, + "grad_norm": 0.11504428088665009, + "learning_rate": 0.0006721615622376869, + "loss": 2.7141, + "step": 13296 + }, + { + "epoch": 0.39430062568573376, + "grad_norm": 0.10683801025152206, + "learning_rate": 0.0006721173876242142, + "loss": 2.6934, + "step": 13297 + }, + { + "epoch": 0.39433027903804524, + "grad_norm": 0.10738595575094223, + "learning_rate": 0.0006720732114866124, + "loss": 2.6966, + "step": 13298 + }, + { + "epoch": 0.3943599323903567, + "grad_norm": 0.113582544028759, + "learning_rate": 0.0006720290338252729, + "loss": 2.6924, + "step": 13299 + }, + { + "epoch": 0.3943895857426682, + "grad_norm": 0.11029709875583649, + "learning_rate": 0.0006719848546405869, + "loss": 2.7583, + "step": 13300 + }, + { + "epoch": 0.39441923909497967, + "grad_norm": 0.1230650544166565, + "learning_rate": 0.0006719406739329454, + "loss": 2.7221, + "step": 13301 + }, + { + "epoch": 0.39444889244729114, + "grad_norm": 0.14165376126766205, + "learning_rate": 0.0006718964917027396, + "loss": 2.7195, + "step": 13302 + }, + { + "epoch": 0.3944785457996026, + "grad_norm": 0.14707602560520172, + "learning_rate": 0.000671852307950361, + "loss": 2.7094, + "step": 13303 + }, + { + "epoch": 0.3945081991519141, + "grad_norm": 0.12201125919818878, + "learning_rate": 0.0006718081226762007, + "loss": 2.7496, + "step": 13304 + }, + { + "epoch": 0.39453785250422563, + "grad_norm": 0.12192800641059875, + "learning_rate": 0.0006717639358806499, + "loss": 2.6986, + "step": 13305 + }, + { + "epoch": 0.3945675058565371, + "grad_norm": 0.11879398673772812, + "learning_rate": 0.0006717197475640999, + "loss": 2.7192, + "step": 13306 + }, + { + "epoch": 0.3945971592088486, + "grad_norm": 0.11857479065656662, + "learning_rate": 0.0006716755577269423, + "loss": 2.7376, + "step": 13307 + }, + { + "epoch": 0.39462681256116006, + "grad_norm": 0.12401127815246582, + "learning_rate": 0.000671631366369568, + "loss": 2.7117, + "step": 13308 + }, + { + "epoch": 0.39465646591347153, + "grad_norm": 0.12416911125183105, + "learning_rate": 0.0006715871734923685, + "loss": 2.7208, + "step": 13309 + }, + { + "epoch": 0.394686119265783, + "grad_norm": 0.13903668522834778, + "learning_rate": 0.0006715429790957352, + "loss": 2.7385, + "step": 13310 + }, + { + "epoch": 0.3947157726180945, + "grad_norm": 0.14979985356330872, + "learning_rate": 0.0006714987831800593, + "loss": 2.7234, + "step": 13311 + }, + { + "epoch": 0.39474542597040596, + "grad_norm": 0.12501558661460876, + "learning_rate": 0.0006714545857457322, + "loss": 2.683, + "step": 13312 + }, + { + "epoch": 0.39477507932271744, + "grad_norm": 0.1196155920624733, + "learning_rate": 0.0006714103867931455, + "loss": 2.7009, + "step": 13313 + }, + { + "epoch": 0.3948047326750289, + "grad_norm": 0.14337949454784393, + "learning_rate": 0.0006713661863226902, + "loss": 2.7273, + "step": 13314 + }, + { + "epoch": 0.3948343860273404, + "grad_norm": 0.11582396179437637, + "learning_rate": 0.000671321984334758, + "loss": 2.7065, + "step": 13315 + }, + { + "epoch": 0.39486403937965187, + "grad_norm": 0.12888982892036438, + "learning_rate": 0.0006712777808297402, + "loss": 2.719, + "step": 13316 + }, + { + "epoch": 0.39489369273196334, + "grad_norm": 0.11338363587856293, + "learning_rate": 0.0006712335758080283, + "loss": 2.7235, + "step": 13317 + }, + { + "epoch": 0.3949233460842748, + "grad_norm": 0.11275387555360794, + "learning_rate": 0.0006711893692700136, + "loss": 2.7334, + "step": 13318 + }, + { + "epoch": 0.3949529994365863, + "grad_norm": 0.10973986238241196, + "learning_rate": 0.0006711451612160877, + "loss": 2.6746, + "step": 13319 + }, + { + "epoch": 0.39498265278889777, + "grad_norm": 0.1205897107720375, + "learning_rate": 0.0006711009516466421, + "loss": 2.6891, + "step": 13320 + }, + { + "epoch": 0.39501230614120925, + "grad_norm": 0.12891700863838196, + "learning_rate": 0.0006710567405620681, + "loss": 2.7011, + "step": 13321 + }, + { + "epoch": 0.3950419594935207, + "grad_norm": 0.12938226759433746, + "learning_rate": 0.0006710125279627574, + "loss": 2.7151, + "step": 13322 + }, + { + "epoch": 0.3950716128458322, + "grad_norm": 0.13755148649215698, + "learning_rate": 0.0006709683138491014, + "loss": 2.7092, + "step": 13323 + }, + { + "epoch": 0.3951012661981437, + "grad_norm": 0.1213621273636818, + "learning_rate": 0.0006709240982214914, + "loss": 2.7193, + "step": 13324 + }, + { + "epoch": 0.3951309195504552, + "grad_norm": 0.12108168005943298, + "learning_rate": 0.0006708798810803194, + "loss": 2.6609, + "step": 13325 + }, + { + "epoch": 0.3951605729027667, + "grad_norm": 0.13162483274936676, + "learning_rate": 0.0006708356624259768, + "loss": 2.6928, + "step": 13326 + }, + { + "epoch": 0.39519022625507816, + "grad_norm": 0.15566149353981018, + "learning_rate": 0.0006707914422588548, + "loss": 2.6822, + "step": 13327 + }, + { + "epoch": 0.39521987960738963, + "grad_norm": 0.1694745272397995, + "learning_rate": 0.0006707472205793456, + "loss": 2.73, + "step": 13328 + }, + { + "epoch": 0.3952495329597011, + "grad_norm": 0.14418058097362518, + "learning_rate": 0.0006707029973878402, + "loss": 2.6939, + "step": 13329 + }, + { + "epoch": 0.3952791863120126, + "grad_norm": 0.1427016705274582, + "learning_rate": 0.0006706587726847306, + "loss": 2.7068, + "step": 13330 + }, + { + "epoch": 0.39530883966432406, + "grad_norm": 0.15212783217430115, + "learning_rate": 0.0006706145464704081, + "loss": 2.7132, + "step": 13331 + }, + { + "epoch": 0.39533849301663554, + "grad_norm": 0.1336003541946411, + "learning_rate": 0.0006705703187452646, + "loss": 2.7498, + "step": 13332 + }, + { + "epoch": 0.395368146368947, + "grad_norm": 0.12747822701931, + "learning_rate": 0.0006705260895096917, + "loss": 2.6971, + "step": 13333 + }, + { + "epoch": 0.3953977997212585, + "grad_norm": 0.13136810064315796, + "learning_rate": 0.0006704818587640811, + "loss": 2.7391, + "step": 13334 + }, + { + "epoch": 0.39542745307356997, + "grad_norm": 0.13491252064704895, + "learning_rate": 0.0006704376265088242, + "loss": 2.7095, + "step": 13335 + }, + { + "epoch": 0.39545710642588144, + "grad_norm": 0.14459475874900818, + "learning_rate": 0.0006703933927443129, + "loss": 2.7197, + "step": 13336 + }, + { + "epoch": 0.3954867597781929, + "grad_norm": 0.12404006719589233, + "learning_rate": 0.0006703491574709387, + "loss": 2.7109, + "step": 13337 + }, + { + "epoch": 0.3955164131305044, + "grad_norm": 0.12172258645296097, + "learning_rate": 0.0006703049206890938, + "loss": 2.6814, + "step": 13338 + }, + { + "epoch": 0.3955460664828159, + "grad_norm": 0.12886179983615875, + "learning_rate": 0.0006702606823991694, + "loss": 2.6901, + "step": 13339 + }, + { + "epoch": 0.39557571983512735, + "grad_norm": 0.13564351201057434, + "learning_rate": 0.0006702164426015575, + "loss": 2.7484, + "step": 13340 + }, + { + "epoch": 0.3956053731874388, + "grad_norm": 0.13662825524806976, + "learning_rate": 0.0006701722012966497, + "loss": 2.6863, + "step": 13341 + }, + { + "epoch": 0.3956350265397503, + "grad_norm": 0.1271437555551529, + "learning_rate": 0.0006701279584848379, + "loss": 2.7131, + "step": 13342 + }, + { + "epoch": 0.3956646798920618, + "grad_norm": 0.1232016533613205, + "learning_rate": 0.0006700837141665138, + "loss": 2.7216, + "step": 13343 + }, + { + "epoch": 0.39569433324437325, + "grad_norm": 0.1401103287935257, + "learning_rate": 0.0006700394683420693, + "loss": 2.7378, + "step": 13344 + }, + { + "epoch": 0.39572398659668473, + "grad_norm": 0.1253846138715744, + "learning_rate": 0.000669995221011896, + "loss": 2.6873, + "step": 13345 + }, + { + "epoch": 0.39575363994899626, + "grad_norm": 0.12827958166599274, + "learning_rate": 0.0006699509721763859, + "loss": 2.7219, + "step": 13346 + }, + { + "epoch": 0.39578329330130774, + "grad_norm": 0.11680588126182556, + "learning_rate": 0.0006699067218359308, + "loss": 2.6911, + "step": 13347 + }, + { + "epoch": 0.3958129466536192, + "grad_norm": 0.11988761276006699, + "learning_rate": 0.0006698624699909225, + "loss": 2.7041, + "step": 13348 + }, + { + "epoch": 0.3958426000059307, + "grad_norm": 0.13059666752815247, + "learning_rate": 0.0006698182166417528, + "loss": 2.7177, + "step": 13349 + }, + { + "epoch": 0.39587225335824217, + "grad_norm": 0.1308988332748413, + "learning_rate": 0.0006697739617888137, + "loss": 2.6851, + "step": 13350 + }, + { + "epoch": 0.39590190671055364, + "grad_norm": 0.1371147334575653, + "learning_rate": 0.000669729705432497, + "loss": 2.7658, + "step": 13351 + }, + { + "epoch": 0.3959315600628651, + "grad_norm": 0.1286085993051529, + "learning_rate": 0.0006696854475731947, + "loss": 2.7598, + "step": 13352 + }, + { + "epoch": 0.3959612134151766, + "grad_norm": 0.1175149604678154, + "learning_rate": 0.0006696411882112986, + "loss": 2.7298, + "step": 13353 + }, + { + "epoch": 0.39599086676748807, + "grad_norm": 0.12223319709300995, + "learning_rate": 0.0006695969273472007, + "loss": 2.6767, + "step": 13354 + }, + { + "epoch": 0.39602052011979955, + "grad_norm": 0.11028748005628586, + "learning_rate": 0.0006695526649812928, + "loss": 2.7077, + "step": 13355 + }, + { + "epoch": 0.396050173472111, + "grad_norm": 0.1023700162768364, + "learning_rate": 0.000669508401113967, + "loss": 2.7274, + "step": 13356 + }, + { + "epoch": 0.3960798268244225, + "grad_norm": 0.12194490432739258, + "learning_rate": 0.0006694641357456152, + "loss": 2.7059, + "step": 13357 + }, + { + "epoch": 0.396109480176734, + "grad_norm": 0.12876860797405243, + "learning_rate": 0.0006694198688766293, + "loss": 2.6948, + "step": 13358 + }, + { + "epoch": 0.39613913352904545, + "grad_norm": 0.12869787216186523, + "learning_rate": 0.0006693756005074016, + "loss": 2.7137, + "step": 13359 + }, + { + "epoch": 0.3961687868813569, + "grad_norm": 0.1455918401479721, + "learning_rate": 0.0006693313306383236, + "loss": 2.7447, + "step": 13360 + }, + { + "epoch": 0.3961984402336684, + "grad_norm": 0.15867061913013458, + "learning_rate": 0.0006692870592697879, + "loss": 2.7551, + "step": 13361 + }, + { + "epoch": 0.3962280935859799, + "grad_norm": 0.1686873733997345, + "learning_rate": 0.0006692427864021861, + "loss": 2.685, + "step": 13362 + }, + { + "epoch": 0.39625774693829136, + "grad_norm": 0.13955800235271454, + "learning_rate": 0.0006691985120359103, + "loss": 2.7088, + "step": 13363 + }, + { + "epoch": 0.39628740029060283, + "grad_norm": 0.14335642755031586, + "learning_rate": 0.0006691542361713527, + "loss": 2.7458, + "step": 13364 + }, + { + "epoch": 0.3963170536429143, + "grad_norm": 0.13435602188110352, + "learning_rate": 0.0006691099588089052, + "loss": 2.7261, + "step": 13365 + }, + { + "epoch": 0.3963467069952258, + "grad_norm": 0.13813607394695282, + "learning_rate": 0.0006690656799489602, + "loss": 2.7086, + "step": 13366 + }, + { + "epoch": 0.3963763603475373, + "grad_norm": 0.13930287957191467, + "learning_rate": 0.0006690213995919096, + "loss": 2.7077, + "step": 13367 + }, + { + "epoch": 0.3964060136998488, + "grad_norm": 0.1261899620294571, + "learning_rate": 0.0006689771177381453, + "loss": 2.7031, + "step": 13368 + }, + { + "epoch": 0.39643566705216027, + "grad_norm": 0.11628038436174393, + "learning_rate": 0.0006689328343880597, + "loss": 2.7016, + "step": 13369 + }, + { + "epoch": 0.39646532040447174, + "grad_norm": 0.12821201980113983, + "learning_rate": 0.0006688885495420447, + "loss": 2.7307, + "step": 13370 + }, + { + "epoch": 0.3964949737567832, + "grad_norm": 0.14660786092281342, + "learning_rate": 0.0006688442632004929, + "loss": 2.6993, + "step": 13371 + }, + { + "epoch": 0.3965246271090947, + "grad_norm": 0.12371601164340973, + "learning_rate": 0.000668799975363796, + "loss": 2.6893, + "step": 13372 + }, + { + "epoch": 0.3965542804614062, + "grad_norm": 0.11860447376966476, + "learning_rate": 0.0006687556860323464, + "loss": 2.7113, + "step": 13373 + }, + { + "epoch": 0.39658393381371765, + "grad_norm": 0.10884881019592285, + "learning_rate": 0.0006687113952065361, + "loss": 2.718, + "step": 13374 + }, + { + "epoch": 0.3966135871660291, + "grad_norm": 0.11247967183589935, + "learning_rate": 0.0006686671028867576, + "loss": 2.718, + "step": 13375 + }, + { + "epoch": 0.3966432405183406, + "grad_norm": 0.1649255007505417, + "learning_rate": 0.0006686228090734029, + "loss": 2.7146, + "step": 13376 + }, + { + "epoch": 0.3966728938706521, + "grad_norm": 0.12899671494960785, + "learning_rate": 0.0006685785137668642, + "loss": 2.7096, + "step": 13377 + }, + { + "epoch": 0.39670254722296355, + "grad_norm": 0.12268143892288208, + "learning_rate": 0.0006685342169675339, + "loss": 2.7186, + "step": 13378 + }, + { + "epoch": 0.39673220057527503, + "grad_norm": 0.11779843270778656, + "learning_rate": 0.0006684899186758042, + "loss": 2.7217, + "step": 13379 + }, + { + "epoch": 0.3967618539275865, + "grad_norm": 0.12198928743600845, + "learning_rate": 0.0006684456188920673, + "loss": 2.723, + "step": 13380 + }, + { + "epoch": 0.396791507279898, + "grad_norm": 0.1204998791217804, + "learning_rate": 0.0006684013176167155, + "loss": 2.7122, + "step": 13381 + }, + { + "epoch": 0.39682116063220946, + "grad_norm": 0.13025601208209991, + "learning_rate": 0.0006683570148501413, + "loss": 2.6903, + "step": 13382 + }, + { + "epoch": 0.39685081398452093, + "grad_norm": 0.12361446022987366, + "learning_rate": 0.0006683127105927367, + "loss": 2.7337, + "step": 13383 + }, + { + "epoch": 0.3968804673368324, + "grad_norm": 0.11377514153718948, + "learning_rate": 0.0006682684048448941, + "loss": 2.7089, + "step": 13384 + }, + { + "epoch": 0.3969101206891439, + "grad_norm": 0.12407315522432327, + "learning_rate": 0.000668224097607006, + "loss": 2.7428, + "step": 13385 + }, + { + "epoch": 0.39693977404145536, + "grad_norm": 0.12867027521133423, + "learning_rate": 0.0006681797888794645, + "loss": 2.6648, + "step": 13386 + }, + { + "epoch": 0.39696942739376684, + "grad_norm": 0.11015982925891876, + "learning_rate": 0.0006681354786626622, + "loss": 2.7271, + "step": 13387 + }, + { + "epoch": 0.39699908074607837, + "grad_norm": 0.1125105619430542, + "learning_rate": 0.0006680911669569915, + "loss": 2.7171, + "step": 13388 + }, + { + "epoch": 0.39702873409838985, + "grad_norm": 0.1389186531305313, + "learning_rate": 0.0006680468537628444, + "loss": 2.7494, + "step": 13389 + }, + { + "epoch": 0.3970583874507013, + "grad_norm": 0.14420394599437714, + "learning_rate": 0.0006680025390806138, + "loss": 2.7158, + "step": 13390 + }, + { + "epoch": 0.3970880408030128, + "grad_norm": 0.14052307605743408, + "learning_rate": 0.0006679582229106917, + "loss": 2.7284, + "step": 13391 + }, + { + "epoch": 0.3971176941553243, + "grad_norm": 0.11681898683309555, + "learning_rate": 0.0006679139052534708, + "loss": 2.7144, + "step": 13392 + }, + { + "epoch": 0.39714734750763575, + "grad_norm": 0.12850849330425262, + "learning_rate": 0.0006678695861093435, + "loss": 2.7148, + "step": 13393 + }, + { + "epoch": 0.3971770008599472, + "grad_norm": 0.15411609411239624, + "learning_rate": 0.0006678252654787022, + "loss": 2.7469, + "step": 13394 + }, + { + "epoch": 0.3972066542122587, + "grad_norm": 0.1737695336341858, + "learning_rate": 0.0006677809433619393, + "loss": 2.7149, + "step": 13395 + }, + { + "epoch": 0.3972363075645702, + "grad_norm": 0.15505976974964142, + "learning_rate": 0.0006677366197594474, + "loss": 2.6973, + "step": 13396 + }, + { + "epoch": 0.39726596091688166, + "grad_norm": 0.15540513396263123, + "learning_rate": 0.0006676922946716188, + "loss": 2.719, + "step": 13397 + }, + { + "epoch": 0.39729561426919313, + "grad_norm": 0.16101957857608795, + "learning_rate": 0.0006676479680988462, + "loss": 2.7136, + "step": 13398 + }, + { + "epoch": 0.3973252676215046, + "grad_norm": 0.13041740655899048, + "learning_rate": 0.0006676036400415222, + "loss": 2.7228, + "step": 13399 + }, + { + "epoch": 0.3973549209738161, + "grad_norm": 0.14134366810321808, + "learning_rate": 0.0006675593105000392, + "loss": 2.7243, + "step": 13400 + }, + { + "epoch": 0.39738457432612756, + "grad_norm": 0.12692421674728394, + "learning_rate": 0.0006675149794747897, + "loss": 2.7004, + "step": 13401 + }, + { + "epoch": 0.39741422767843904, + "grad_norm": 0.14183735847473145, + "learning_rate": 0.000667470646966166, + "loss": 2.7191, + "step": 13402 + }, + { + "epoch": 0.3974438810307505, + "grad_norm": 0.1432085931301117, + "learning_rate": 0.0006674263129745612, + "loss": 2.7227, + "step": 13403 + }, + { + "epoch": 0.397473534383062, + "grad_norm": 0.13613402843475342, + "learning_rate": 0.0006673819775003679, + "loss": 2.7166, + "step": 13404 + }, + { + "epoch": 0.39750318773537346, + "grad_norm": 0.1386568695306778, + "learning_rate": 0.0006673376405439783, + "loss": 2.6991, + "step": 13405 + }, + { + "epoch": 0.39753284108768494, + "grad_norm": 0.14206819236278534, + "learning_rate": 0.0006672933021057851, + "loss": 2.7075, + "step": 13406 + }, + { + "epoch": 0.3975624944399964, + "grad_norm": 0.14347165822982788, + "learning_rate": 0.000667248962186181, + "loss": 2.7066, + "step": 13407 + }, + { + "epoch": 0.3975921477923079, + "grad_norm": 0.12033072113990784, + "learning_rate": 0.0006672046207855585, + "loss": 2.7147, + "step": 13408 + }, + { + "epoch": 0.3976218011446194, + "grad_norm": 0.12147937715053558, + "learning_rate": 0.0006671602779043107, + "loss": 2.7418, + "step": 13409 + }, + { + "epoch": 0.3976514544969309, + "grad_norm": 0.12861880660057068, + "learning_rate": 0.0006671159335428298, + "loss": 2.7444, + "step": 13410 + }, + { + "epoch": 0.3976811078492424, + "grad_norm": 0.1238756775856018, + "learning_rate": 0.0006670715877015085, + "loss": 2.7478, + "step": 13411 + }, + { + "epoch": 0.39771076120155385, + "grad_norm": 0.11535187065601349, + "learning_rate": 0.0006670272403807397, + "loss": 2.7226, + "step": 13412 + }, + { + "epoch": 0.39774041455386533, + "grad_norm": 0.13435794413089752, + "learning_rate": 0.0006669828915809161, + "loss": 2.7365, + "step": 13413 + }, + { + "epoch": 0.3977700679061768, + "grad_norm": 0.11720798909664154, + "learning_rate": 0.0006669385413024302, + "loss": 2.6974, + "step": 13414 + }, + { + "epoch": 0.3977997212584883, + "grad_norm": 0.13052202761173248, + "learning_rate": 0.000666894189545675, + "loss": 2.7225, + "step": 13415 + }, + { + "epoch": 0.39782937461079976, + "grad_norm": 0.132454052567482, + "learning_rate": 0.0006668498363110429, + "loss": 2.6849, + "step": 13416 + }, + { + "epoch": 0.39785902796311123, + "grad_norm": 0.10710817575454712, + "learning_rate": 0.0006668054815989271, + "loss": 2.7055, + "step": 13417 + }, + { + "epoch": 0.3978886813154227, + "grad_norm": 0.1257435381412506, + "learning_rate": 0.0006667611254097199, + "loss": 2.6998, + "step": 13418 + }, + { + "epoch": 0.3979183346677342, + "grad_norm": 0.13779616355895996, + "learning_rate": 0.0006667167677438145, + "loss": 2.7075, + "step": 13419 + }, + { + "epoch": 0.39794798802004566, + "grad_norm": 0.14471150934696198, + "learning_rate": 0.0006666724086016034, + "loss": 2.6992, + "step": 13420 + }, + { + "epoch": 0.39797764137235714, + "grad_norm": 0.1319294422864914, + "learning_rate": 0.0006666280479834796, + "loss": 2.6994, + "step": 13421 + }, + { + "epoch": 0.3980072947246686, + "grad_norm": 0.12522341310977936, + "learning_rate": 0.0006665836858898357, + "loss": 2.7174, + "step": 13422 + }, + { + "epoch": 0.3980369480769801, + "grad_norm": 0.1384546011686325, + "learning_rate": 0.0006665393223210648, + "loss": 2.7097, + "step": 13423 + }, + { + "epoch": 0.39806660142929157, + "grad_norm": 0.15862023830413818, + "learning_rate": 0.0006664949572775596, + "loss": 2.7297, + "step": 13424 + }, + { + "epoch": 0.39809625478160304, + "grad_norm": 0.1559128314256668, + "learning_rate": 0.0006664505907597129, + "loss": 2.751, + "step": 13425 + }, + { + "epoch": 0.3981259081339145, + "grad_norm": 0.1587693840265274, + "learning_rate": 0.0006664062227679177, + "loss": 2.7598, + "step": 13426 + }, + { + "epoch": 0.398155561486226, + "grad_norm": 0.15071645379066467, + "learning_rate": 0.0006663618533025668, + "loss": 2.7069, + "step": 13427 + }, + { + "epoch": 0.39818521483853747, + "grad_norm": 0.1324128359556198, + "learning_rate": 0.0006663174823640531, + "loss": 2.7275, + "step": 13428 + }, + { + "epoch": 0.398214868190849, + "grad_norm": 0.13240282237529755, + "learning_rate": 0.0006662731099527695, + "loss": 2.6942, + "step": 13429 + }, + { + "epoch": 0.3982445215431605, + "grad_norm": 0.13818518817424774, + "learning_rate": 0.0006662287360691091, + "loss": 2.7479, + "step": 13430 + }, + { + "epoch": 0.39827417489547196, + "grad_norm": 0.14272764325141907, + "learning_rate": 0.0006661843607134649, + "loss": 2.7229, + "step": 13431 + }, + { + "epoch": 0.39830382824778343, + "grad_norm": 0.12292703241109848, + "learning_rate": 0.0006661399838862294, + "loss": 2.6989, + "step": 13432 + }, + { + "epoch": 0.3983334816000949, + "grad_norm": 0.1642650067806244, + "learning_rate": 0.0006660956055877959, + "loss": 2.6999, + "step": 13433 + }, + { + "epoch": 0.3983631349524064, + "grad_norm": 0.19155128300189972, + "learning_rate": 0.0006660512258185572, + "loss": 2.7152, + "step": 13434 + }, + { + "epoch": 0.39839278830471786, + "grad_norm": 0.15336407721042633, + "learning_rate": 0.0006660068445789064, + "loss": 2.7336, + "step": 13435 + }, + { + "epoch": 0.39842244165702934, + "grad_norm": 0.13058613240718842, + "learning_rate": 0.0006659624618692366, + "loss": 2.7614, + "step": 13436 + }, + { + "epoch": 0.3984520950093408, + "grad_norm": 0.13428013026714325, + "learning_rate": 0.0006659180776899407, + "loss": 2.7091, + "step": 13437 + }, + { + "epoch": 0.3984817483616523, + "grad_norm": 0.14686459302902222, + "learning_rate": 0.0006658736920414117, + "loss": 2.7215, + "step": 13438 + }, + { + "epoch": 0.39851140171396376, + "grad_norm": 0.13449501991271973, + "learning_rate": 0.0006658293049240427, + "loss": 2.6814, + "step": 13439 + }, + { + "epoch": 0.39854105506627524, + "grad_norm": 0.14188143610954285, + "learning_rate": 0.0006657849163382268, + "loss": 2.7352, + "step": 13440 + }, + { + "epoch": 0.3985707084185867, + "grad_norm": 0.1313934624195099, + "learning_rate": 0.0006657405262843568, + "loss": 2.7257, + "step": 13441 + }, + { + "epoch": 0.3986003617708982, + "grad_norm": 0.11614453047513962, + "learning_rate": 0.0006656961347628262, + "loss": 2.7138, + "step": 13442 + }, + { + "epoch": 0.39863001512320967, + "grad_norm": 0.12157204747200012, + "learning_rate": 0.0006656517417740279, + "loss": 2.7054, + "step": 13443 + }, + { + "epoch": 0.39865966847552115, + "grad_norm": 0.1308707594871521, + "learning_rate": 0.0006656073473183548, + "loss": 2.7056, + "step": 13444 + }, + { + "epoch": 0.3986893218278326, + "grad_norm": 0.11584772169589996, + "learning_rate": 0.0006655629513962004, + "loss": 2.7031, + "step": 13445 + }, + { + "epoch": 0.3987189751801441, + "grad_norm": 0.1311565488576889, + "learning_rate": 0.0006655185540079576, + "loss": 2.7342, + "step": 13446 + }, + { + "epoch": 0.3987486285324556, + "grad_norm": 0.1153138130903244, + "learning_rate": 0.0006654741551540195, + "loss": 2.709, + "step": 13447 + }, + { + "epoch": 0.39877828188476705, + "grad_norm": 0.13034336268901825, + "learning_rate": 0.0006654297548347794, + "loss": 2.7427, + "step": 13448 + }, + { + "epoch": 0.3988079352370785, + "grad_norm": 0.1269950121641159, + "learning_rate": 0.0006653853530506305, + "loss": 2.6843, + "step": 13449 + }, + { + "epoch": 0.39883758858939006, + "grad_norm": 0.1077704206109047, + "learning_rate": 0.0006653409498019658, + "loss": 2.7014, + "step": 13450 + }, + { + "epoch": 0.39886724194170153, + "grad_norm": 0.12173814326524734, + "learning_rate": 0.0006652965450891787, + "loss": 2.7014, + "step": 13451 + }, + { + "epoch": 0.398896895294013, + "grad_norm": 0.13865093886852264, + "learning_rate": 0.0006652521389126623, + "loss": 2.7342, + "step": 13452 + }, + { + "epoch": 0.3989265486463245, + "grad_norm": 0.14699146151542664, + "learning_rate": 0.0006652077312728098, + "loss": 2.738, + "step": 13453 + }, + { + "epoch": 0.39895620199863596, + "grad_norm": 0.1586427241563797, + "learning_rate": 0.0006651633221700145, + "loss": 2.6953, + "step": 13454 + }, + { + "epoch": 0.39898585535094744, + "grad_norm": 0.1294407695531845, + "learning_rate": 0.0006651189116046696, + "loss": 2.7243, + "step": 13455 + }, + { + "epoch": 0.3990155087032589, + "grad_norm": 0.11837604641914368, + "learning_rate": 0.0006650744995771685, + "loss": 2.7242, + "step": 13456 + }, + { + "epoch": 0.3990451620555704, + "grad_norm": 0.12123634666204453, + "learning_rate": 0.0006650300860879044, + "loss": 2.7252, + "step": 13457 + }, + { + "epoch": 0.39907481540788187, + "grad_norm": 0.13160663843154907, + "learning_rate": 0.0006649856711372704, + "loss": 2.6913, + "step": 13458 + }, + { + "epoch": 0.39910446876019334, + "grad_norm": 0.12365083396434784, + "learning_rate": 0.0006649412547256601, + "loss": 2.7556, + "step": 13459 + }, + { + "epoch": 0.3991341221125048, + "grad_norm": 0.11604820191860199, + "learning_rate": 0.0006648968368534666, + "loss": 2.7469, + "step": 13460 + }, + { + "epoch": 0.3991637754648163, + "grad_norm": 0.1279502809047699, + "learning_rate": 0.0006648524175210833, + "loss": 2.7326, + "step": 13461 + }, + { + "epoch": 0.39919342881712777, + "grad_norm": 0.12451600283384323, + "learning_rate": 0.0006648079967289035, + "loss": 2.7018, + "step": 13462 + }, + { + "epoch": 0.39922308216943925, + "grad_norm": 0.12518490850925446, + "learning_rate": 0.0006647635744773207, + "loss": 2.7419, + "step": 13463 + }, + { + "epoch": 0.3992527355217507, + "grad_norm": 0.1236630454659462, + "learning_rate": 0.0006647191507667282, + "loss": 2.753, + "step": 13464 + }, + { + "epoch": 0.3992823888740622, + "grad_norm": 0.135487899184227, + "learning_rate": 0.0006646747255975193, + "loss": 2.7251, + "step": 13465 + }, + { + "epoch": 0.3993120422263737, + "grad_norm": 0.13989832997322083, + "learning_rate": 0.0006646302989700874, + "loss": 2.7054, + "step": 13466 + }, + { + "epoch": 0.39934169557868515, + "grad_norm": 0.13692227005958557, + "learning_rate": 0.0006645858708848259, + "loss": 2.7273, + "step": 13467 + }, + { + "epoch": 0.39937134893099663, + "grad_norm": 0.13051737844944, + "learning_rate": 0.0006645414413421283, + "loss": 2.7334, + "step": 13468 + }, + { + "epoch": 0.3994010022833081, + "grad_norm": 0.10570613294839859, + "learning_rate": 0.0006644970103423882, + "loss": 2.7518, + "step": 13469 + }, + { + "epoch": 0.3994306556356196, + "grad_norm": 0.11009372770786285, + "learning_rate": 0.0006644525778859985, + "loss": 2.7527, + "step": 13470 + }, + { + "epoch": 0.3994603089879311, + "grad_norm": 0.10512693226337433, + "learning_rate": 0.0006644081439733532, + "loss": 2.7574, + "step": 13471 + }, + { + "epoch": 0.3994899623402426, + "grad_norm": 0.11589303612709045, + "learning_rate": 0.0006643637086048455, + "loss": 2.7342, + "step": 13472 + }, + { + "epoch": 0.39951961569255406, + "grad_norm": 0.10282228142023087, + "learning_rate": 0.0006643192717808689, + "loss": 2.6942, + "step": 13473 + }, + { + "epoch": 0.39954926904486554, + "grad_norm": 0.11028271913528442, + "learning_rate": 0.0006642748335018169, + "loss": 2.7102, + "step": 13474 + }, + { + "epoch": 0.399578922397177, + "grad_norm": 0.11473023146390915, + "learning_rate": 0.0006642303937680834, + "loss": 2.7332, + "step": 13475 + }, + { + "epoch": 0.3996085757494885, + "grad_norm": 0.1167851909995079, + "learning_rate": 0.0006641859525800614, + "loss": 2.6731, + "step": 13476 + }, + { + "epoch": 0.39963822910179997, + "grad_norm": 0.10210272669792175, + "learning_rate": 0.0006641415099381445, + "loss": 2.7085, + "step": 13477 + }, + { + "epoch": 0.39966788245411144, + "grad_norm": 0.11743980646133423, + "learning_rate": 0.0006640970658427263, + "loss": 2.7309, + "step": 13478 + }, + { + "epoch": 0.3996975358064229, + "grad_norm": 0.10222635418176651, + "learning_rate": 0.0006640526202942006, + "loss": 2.7347, + "step": 13479 + }, + { + "epoch": 0.3997271891587344, + "grad_norm": 0.1100638285279274, + "learning_rate": 0.0006640081732929606, + "loss": 2.7354, + "step": 13480 + }, + { + "epoch": 0.3997568425110459, + "grad_norm": 0.13041985034942627, + "learning_rate": 0.0006639637248394001, + "loss": 2.7492, + "step": 13481 + }, + { + "epoch": 0.39978649586335735, + "grad_norm": 0.15506389737129211, + "learning_rate": 0.0006639192749339129, + "loss": 2.7449, + "step": 13482 + }, + { + "epoch": 0.3998161492156688, + "grad_norm": 0.17546780407428741, + "learning_rate": 0.0006638748235768921, + "loss": 2.7226, + "step": 13483 + }, + { + "epoch": 0.3998458025679803, + "grad_norm": 0.181208997964859, + "learning_rate": 0.0006638303707687319, + "loss": 2.7084, + "step": 13484 + }, + { + "epoch": 0.3998754559202918, + "grad_norm": 0.14337576925754547, + "learning_rate": 0.0006637859165098255, + "loss": 2.7479, + "step": 13485 + }, + { + "epoch": 0.39990510927260325, + "grad_norm": 0.11605336517095566, + "learning_rate": 0.0006637414608005666, + "loss": 2.6821, + "step": 13486 + }, + { + "epoch": 0.39993476262491473, + "grad_norm": 0.14078648388385773, + "learning_rate": 0.000663697003641349, + "loss": 2.7612, + "step": 13487 + }, + { + "epoch": 0.3999644159772262, + "grad_norm": 0.1577371507883072, + "learning_rate": 0.0006636525450325663, + "loss": 2.7237, + "step": 13488 + }, + { + "epoch": 0.3999940693295377, + "grad_norm": 0.12335001677274704, + "learning_rate": 0.0006636080849746123, + "loss": 2.6678, + "step": 13489 + }, + { + "epoch": 0.40002372268184916, + "grad_norm": 0.12260187417268753, + "learning_rate": 0.0006635636234678807, + "loss": 2.7209, + "step": 13490 + }, + { + "epoch": 0.40005337603416063, + "grad_norm": 0.13043922185897827, + "learning_rate": 0.0006635191605127651, + "loss": 2.745, + "step": 13491 + }, + { + "epoch": 0.40008302938647217, + "grad_norm": 0.10267575085163116, + "learning_rate": 0.0006634746961096591, + "loss": 2.7889, + "step": 13492 + }, + { + "epoch": 0.40011268273878364, + "grad_norm": 0.12021942436695099, + "learning_rate": 0.0006634302302589568, + "loss": 2.7209, + "step": 13493 + }, + { + "epoch": 0.4001423360910951, + "grad_norm": 0.11305045336484909, + "learning_rate": 0.0006633857629610517, + "loss": 2.7154, + "step": 13494 + }, + { + "epoch": 0.4001719894434066, + "grad_norm": 0.12140203267335892, + "learning_rate": 0.0006633412942163376, + "loss": 2.7078, + "step": 13495 + }, + { + "epoch": 0.40020164279571807, + "grad_norm": 0.1110558807849884, + "learning_rate": 0.0006632968240252083, + "loss": 2.7015, + "step": 13496 + }, + { + "epoch": 0.40023129614802955, + "grad_norm": 0.12378890067338943, + "learning_rate": 0.0006632523523880577, + "loss": 2.6751, + "step": 13497 + }, + { + "epoch": 0.400260949500341, + "grad_norm": 0.11851757019758224, + "learning_rate": 0.0006632078793052794, + "loss": 2.741, + "step": 13498 + }, + { + "epoch": 0.4002906028526525, + "grad_norm": 0.128986656665802, + "learning_rate": 0.0006631634047772672, + "loss": 2.7089, + "step": 13499 + }, + { + "epoch": 0.400320256204964, + "grad_norm": 0.13628895580768585, + "learning_rate": 0.0006631189288044153, + "loss": 2.7193, + "step": 13500 + }, + { + "epoch": 0.40034990955727545, + "grad_norm": 0.16212423145771027, + "learning_rate": 0.0006630744513871171, + "loss": 2.7347, + "step": 13501 + }, + { + "epoch": 0.40037956290958693, + "grad_norm": 0.18184636533260345, + "learning_rate": 0.0006630299725257667, + "loss": 2.7508, + "step": 13502 + }, + { + "epoch": 0.4004092162618984, + "grad_norm": 0.1496196687221527, + "learning_rate": 0.0006629854922207579, + "loss": 2.7192, + "step": 13503 + }, + { + "epoch": 0.4004388696142099, + "grad_norm": 0.12079722434282303, + "learning_rate": 0.0006629410104724846, + "loss": 2.7109, + "step": 13504 + }, + { + "epoch": 0.40046852296652136, + "grad_norm": 0.15414360165596008, + "learning_rate": 0.0006628965272813406, + "loss": 2.7531, + "step": 13505 + }, + { + "epoch": 0.40049817631883283, + "grad_norm": 0.15021394193172455, + "learning_rate": 0.00066285204264772, + "loss": 2.7099, + "step": 13506 + }, + { + "epoch": 0.4005278296711443, + "grad_norm": 0.14101742208003998, + "learning_rate": 0.0006628075565720166, + "loss": 2.6997, + "step": 13507 + }, + { + "epoch": 0.4005574830234558, + "grad_norm": 0.13421250879764557, + "learning_rate": 0.0006627630690546243, + "loss": 2.7009, + "step": 13508 + }, + { + "epoch": 0.40058713637576726, + "grad_norm": 0.12198968976736069, + "learning_rate": 0.0006627185800959372, + "loss": 2.7119, + "step": 13509 + }, + { + "epoch": 0.40061678972807874, + "grad_norm": 0.13174928724765778, + "learning_rate": 0.000662674089696349, + "loss": 2.7267, + "step": 13510 + }, + { + "epoch": 0.4006464430803902, + "grad_norm": 0.12913690507411957, + "learning_rate": 0.0006626295978562538, + "loss": 2.7149, + "step": 13511 + }, + { + "epoch": 0.4006760964327017, + "grad_norm": 0.1167575940489769, + "learning_rate": 0.0006625851045760456, + "loss": 2.6938, + "step": 13512 + }, + { + "epoch": 0.4007057497850132, + "grad_norm": 0.1260625571012497, + "learning_rate": 0.0006625406098561186, + "loss": 2.7511, + "step": 13513 + }, + { + "epoch": 0.4007354031373247, + "grad_norm": 0.11794206500053406, + "learning_rate": 0.0006624961136968663, + "loss": 2.6913, + "step": 13514 + }, + { + "epoch": 0.4007650564896362, + "grad_norm": 0.12812688946723938, + "learning_rate": 0.0006624516160986833, + "loss": 2.7204, + "step": 13515 + }, + { + "epoch": 0.40079470984194765, + "grad_norm": 0.1260167360305786, + "learning_rate": 0.0006624071170619633, + "loss": 2.7255, + "step": 13516 + }, + { + "epoch": 0.4008243631942591, + "grad_norm": 0.1197354644536972, + "learning_rate": 0.0006623626165871002, + "loss": 2.6869, + "step": 13517 + }, + { + "epoch": 0.4008540165465706, + "grad_norm": 0.10890165716409683, + "learning_rate": 0.0006623181146744884, + "loss": 2.7512, + "step": 13518 + }, + { + "epoch": 0.4008836698988821, + "grad_norm": 0.10852250456809998, + "learning_rate": 0.0006622736113245218, + "loss": 2.7412, + "step": 13519 + }, + { + "epoch": 0.40091332325119355, + "grad_norm": 0.12160313874483109, + "learning_rate": 0.0006622291065375945, + "loss": 2.7237, + "step": 13520 + }, + { + "epoch": 0.40094297660350503, + "grad_norm": 0.13963326811790466, + "learning_rate": 0.0006621846003141007, + "loss": 2.7397, + "step": 13521 + }, + { + "epoch": 0.4009726299558165, + "grad_norm": 0.13272230327129364, + "learning_rate": 0.0006621400926544344, + "loss": 2.714, + "step": 13522 + }, + { + "epoch": 0.401002283308128, + "grad_norm": 0.13892138004302979, + "learning_rate": 0.0006620955835589897, + "loss": 2.7401, + "step": 13523 + }, + { + "epoch": 0.40103193666043946, + "grad_norm": 0.15476678311824799, + "learning_rate": 0.000662051073028161, + "loss": 2.7164, + "step": 13524 + }, + { + "epoch": 0.40106159001275093, + "grad_norm": 0.14075016975402832, + "learning_rate": 0.0006620065610623418, + "loss": 2.7244, + "step": 13525 + }, + { + "epoch": 0.4010912433650624, + "grad_norm": 0.12194120138883591, + "learning_rate": 0.000661962047661927, + "loss": 2.7209, + "step": 13526 + }, + { + "epoch": 0.4011208967173739, + "grad_norm": 0.13336461782455444, + "learning_rate": 0.0006619175328273104, + "loss": 2.7376, + "step": 13527 + }, + { + "epoch": 0.40115055006968536, + "grad_norm": 0.15640749037265778, + "learning_rate": 0.0006618730165588862, + "loss": 2.704, + "step": 13528 + }, + { + "epoch": 0.40118020342199684, + "grad_norm": 0.17317894101142883, + "learning_rate": 0.0006618284988570488, + "loss": 2.6803, + "step": 13529 + }, + { + "epoch": 0.4012098567743083, + "grad_norm": 0.13372108340263367, + "learning_rate": 0.0006617839797221923, + "loss": 2.7417, + "step": 13530 + }, + { + "epoch": 0.4012395101266198, + "grad_norm": 0.12773361802101135, + "learning_rate": 0.0006617394591547106, + "loss": 2.7241, + "step": 13531 + }, + { + "epoch": 0.40126916347893127, + "grad_norm": 0.13327647745609283, + "learning_rate": 0.0006616949371549983, + "loss": 2.7181, + "step": 13532 + }, + { + "epoch": 0.4012988168312428, + "grad_norm": 0.12166565656661987, + "learning_rate": 0.0006616504137234498, + "loss": 2.7319, + "step": 13533 + }, + { + "epoch": 0.4013284701835543, + "grad_norm": 0.12884052097797394, + "learning_rate": 0.000661605888860459, + "loss": 2.6927, + "step": 13534 + }, + { + "epoch": 0.40135812353586575, + "grad_norm": 0.1421699970960617, + "learning_rate": 0.0006615613625664204, + "loss": 2.7008, + "step": 13535 + }, + { + "epoch": 0.40138777688817723, + "grad_norm": 0.14199136197566986, + "learning_rate": 0.0006615168348417281, + "loss": 2.7512, + "step": 13536 + }, + { + "epoch": 0.4014174302404887, + "grad_norm": 0.1369410753250122, + "learning_rate": 0.0006614723056867765, + "loss": 2.7455, + "step": 13537 + }, + { + "epoch": 0.4014470835928002, + "grad_norm": 0.12195336073637009, + "learning_rate": 0.00066142777510196, + "loss": 2.7109, + "step": 13538 + }, + { + "epoch": 0.40147673694511166, + "grad_norm": 0.11246935278177261, + "learning_rate": 0.0006613832430876727, + "loss": 2.7125, + "step": 13539 + }, + { + "epoch": 0.40150639029742313, + "grad_norm": 0.11585773527622223, + "learning_rate": 0.0006613387096443093, + "loss": 2.7245, + "step": 13540 + }, + { + "epoch": 0.4015360436497346, + "grad_norm": 0.12952126562595367, + "learning_rate": 0.0006612941747722637, + "loss": 2.7237, + "step": 13541 + }, + { + "epoch": 0.4015656970020461, + "grad_norm": 0.1195094883441925, + "learning_rate": 0.0006612496384719306, + "loss": 2.7094, + "step": 13542 + }, + { + "epoch": 0.40159535035435756, + "grad_norm": 0.11557067185640335, + "learning_rate": 0.0006612051007437043, + "loss": 2.6766, + "step": 13543 + }, + { + "epoch": 0.40162500370666904, + "grad_norm": 0.10759817063808441, + "learning_rate": 0.000661160561587979, + "loss": 2.7233, + "step": 13544 + }, + { + "epoch": 0.4016546570589805, + "grad_norm": 0.11919151246547699, + "learning_rate": 0.0006611160210051496, + "loss": 2.7398, + "step": 13545 + }, + { + "epoch": 0.401684310411292, + "grad_norm": 0.13669784367084503, + "learning_rate": 0.0006610714789956099, + "loss": 2.7214, + "step": 13546 + }, + { + "epoch": 0.40171396376360347, + "grad_norm": 0.13941629230976105, + "learning_rate": 0.0006610269355597547, + "loss": 2.6889, + "step": 13547 + }, + { + "epoch": 0.40174361711591494, + "grad_norm": 0.13798651099205017, + "learning_rate": 0.0006609823906979784, + "loss": 2.7457, + "step": 13548 + }, + { + "epoch": 0.4017732704682264, + "grad_norm": 0.12714150547981262, + "learning_rate": 0.0006609378444106753, + "loss": 2.7425, + "step": 13549 + }, + { + "epoch": 0.4018029238205379, + "grad_norm": 0.11471926420927048, + "learning_rate": 0.0006608932966982399, + "loss": 2.7412, + "step": 13550 + }, + { + "epoch": 0.40183257717284937, + "grad_norm": 0.11121666431427002, + "learning_rate": 0.000660848747561067, + "loss": 2.7235, + "step": 13551 + }, + { + "epoch": 0.40186223052516085, + "grad_norm": 0.12136682122945786, + "learning_rate": 0.0006608041969995505, + "loss": 2.6833, + "step": 13552 + }, + { + "epoch": 0.4018918838774723, + "grad_norm": 0.11478979140520096, + "learning_rate": 0.0006607596450140855, + "loss": 2.7251, + "step": 13553 + }, + { + "epoch": 0.40192153722978385, + "grad_norm": 0.1254452019929886, + "learning_rate": 0.0006607150916050662, + "loss": 2.6917, + "step": 13554 + }, + { + "epoch": 0.40195119058209533, + "grad_norm": 0.1302916258573532, + "learning_rate": 0.000660670536772887, + "loss": 2.7113, + "step": 13555 + }, + { + "epoch": 0.4019808439344068, + "grad_norm": 0.1288219839334488, + "learning_rate": 0.0006606259805179427, + "loss": 2.7193, + "step": 13556 + }, + { + "epoch": 0.4020104972867183, + "grad_norm": 0.1334378868341446, + "learning_rate": 0.0006605814228406279, + "loss": 2.6889, + "step": 13557 + }, + { + "epoch": 0.40204015063902976, + "grad_norm": 0.15498808026313782, + "learning_rate": 0.0006605368637413369, + "loss": 2.7583, + "step": 13558 + }, + { + "epoch": 0.40206980399134123, + "grad_norm": 0.15005160868167877, + "learning_rate": 0.0006604923032204645, + "loss": 2.7168, + "step": 13559 + }, + { + "epoch": 0.4020994573436527, + "grad_norm": 0.1360413134098053, + "learning_rate": 0.0006604477412784051, + "loss": 2.739, + "step": 13560 + }, + { + "epoch": 0.4021291106959642, + "grad_norm": 0.1342017948627472, + "learning_rate": 0.0006604031779155534, + "loss": 2.7394, + "step": 13561 + }, + { + "epoch": 0.40215876404827566, + "grad_norm": 0.12271221727132797, + "learning_rate": 0.0006603586131323043, + "loss": 2.7064, + "step": 13562 + }, + { + "epoch": 0.40218841740058714, + "grad_norm": 0.13412655889987946, + "learning_rate": 0.0006603140469290521, + "loss": 2.6974, + "step": 13563 + }, + { + "epoch": 0.4022180707528986, + "grad_norm": 0.1500137597322464, + "learning_rate": 0.0006602694793061912, + "loss": 2.7309, + "step": 13564 + }, + { + "epoch": 0.4022477241052101, + "grad_norm": 0.1477990597486496, + "learning_rate": 0.0006602249102641166, + "loss": 2.7252, + "step": 13565 + }, + { + "epoch": 0.40227737745752157, + "grad_norm": 0.14067018032073975, + "learning_rate": 0.0006601803398032231, + "loss": 2.7204, + "step": 13566 + }, + { + "epoch": 0.40230703080983304, + "grad_norm": 0.1235160082578659, + "learning_rate": 0.0006601357679239052, + "loss": 2.7262, + "step": 13567 + }, + { + "epoch": 0.4023366841621445, + "grad_norm": 0.12318447977304459, + "learning_rate": 0.0006600911946265575, + "loss": 2.6797, + "step": 13568 + }, + { + "epoch": 0.402366337514456, + "grad_norm": 0.1379816234111786, + "learning_rate": 0.0006600466199115748, + "loss": 2.7094, + "step": 13569 + }, + { + "epoch": 0.4023959908667675, + "grad_norm": 0.14235012233257294, + "learning_rate": 0.0006600020437793518, + "loss": 2.7412, + "step": 13570 + }, + { + "epoch": 0.40242564421907895, + "grad_norm": 0.1404981166124344, + "learning_rate": 0.0006599574662302832, + "loss": 2.7015, + "step": 13571 + }, + { + "epoch": 0.4024552975713904, + "grad_norm": 0.1560685783624649, + "learning_rate": 0.000659912887264764, + "loss": 2.7485, + "step": 13572 + }, + { + "epoch": 0.4024849509237019, + "grad_norm": 0.1247619166970253, + "learning_rate": 0.0006598683068831885, + "loss": 2.7023, + "step": 13573 + }, + { + "epoch": 0.4025146042760134, + "grad_norm": 0.11641740053892136, + "learning_rate": 0.0006598237250859518, + "loss": 2.7232, + "step": 13574 + }, + { + "epoch": 0.4025442576283249, + "grad_norm": 0.14642290771007538, + "learning_rate": 0.0006597791418734485, + "loss": 2.6952, + "step": 13575 + }, + { + "epoch": 0.4025739109806364, + "grad_norm": 0.14577540755271912, + "learning_rate": 0.0006597345572460735, + "loss": 2.7033, + "step": 13576 + }, + { + "epoch": 0.40260356433294786, + "grad_norm": 0.1390317976474762, + "learning_rate": 0.0006596899712042216, + "loss": 2.7086, + "step": 13577 + }, + { + "epoch": 0.40263321768525934, + "grad_norm": 0.13008666038513184, + "learning_rate": 0.0006596453837482876, + "loss": 2.7341, + "step": 13578 + }, + { + "epoch": 0.4026628710375708, + "grad_norm": 0.13390040397644043, + "learning_rate": 0.0006596007948786665, + "loss": 2.7426, + "step": 13579 + }, + { + "epoch": 0.4026925243898823, + "grad_norm": 0.15328359603881836, + "learning_rate": 0.0006595562045957527, + "loss": 2.7103, + "step": 13580 + }, + { + "epoch": 0.40272217774219377, + "grad_norm": 0.14147412776947021, + "learning_rate": 0.0006595116128999414, + "loss": 2.6989, + "step": 13581 + }, + { + "epoch": 0.40275183109450524, + "grad_norm": 0.1332659274339676, + "learning_rate": 0.0006594670197916274, + "loss": 2.7185, + "step": 13582 + }, + { + "epoch": 0.4027814844468167, + "grad_norm": 0.13046114146709442, + "learning_rate": 0.0006594224252712055, + "loss": 2.7056, + "step": 13583 + }, + { + "epoch": 0.4028111377991282, + "grad_norm": 0.13354456424713135, + "learning_rate": 0.0006593778293390709, + "loss": 2.72, + "step": 13584 + }, + { + "epoch": 0.40284079115143967, + "grad_norm": 0.12807904183864594, + "learning_rate": 0.000659333231995618, + "loss": 2.7592, + "step": 13585 + }, + { + "epoch": 0.40287044450375115, + "grad_norm": 0.1313742697238922, + "learning_rate": 0.000659288633241242, + "loss": 2.6907, + "step": 13586 + }, + { + "epoch": 0.4029000978560626, + "grad_norm": 0.13824620842933655, + "learning_rate": 0.0006592440330763379, + "loss": 2.7126, + "step": 13587 + }, + { + "epoch": 0.4029297512083741, + "grad_norm": 0.13944880664348602, + "learning_rate": 0.0006591994315013006, + "loss": 2.7465, + "step": 13588 + }, + { + "epoch": 0.4029594045606856, + "grad_norm": 0.11637608706951141, + "learning_rate": 0.0006591548285165249, + "loss": 2.7271, + "step": 13589 + }, + { + "epoch": 0.40298905791299705, + "grad_norm": 0.12830540537834167, + "learning_rate": 0.0006591102241224059, + "loss": 2.7341, + "step": 13590 + }, + { + "epoch": 0.4030187112653085, + "grad_norm": 0.10615577548742294, + "learning_rate": 0.0006590656183193387, + "loss": 2.7159, + "step": 13591 + }, + { + "epoch": 0.40304836461762, + "grad_norm": 0.10855165868997574, + "learning_rate": 0.0006590210111077179, + "loss": 2.6937, + "step": 13592 + }, + { + "epoch": 0.4030780179699315, + "grad_norm": 0.10414310544729233, + "learning_rate": 0.0006589764024879388, + "loss": 2.7227, + "step": 13593 + }, + { + "epoch": 0.40310767132224296, + "grad_norm": 0.1003870815038681, + "learning_rate": 0.0006589317924603965, + "loss": 2.7104, + "step": 13594 + }, + { + "epoch": 0.40313732467455443, + "grad_norm": 0.111099012196064, + "learning_rate": 0.000658887181025486, + "loss": 2.6876, + "step": 13595 + }, + { + "epoch": 0.40316697802686596, + "grad_norm": 0.11772266775369644, + "learning_rate": 0.0006588425681836019, + "loss": 2.6975, + "step": 13596 + }, + { + "epoch": 0.40319663137917744, + "grad_norm": 0.10830113291740417, + "learning_rate": 0.0006587979539351399, + "loss": 2.7341, + "step": 13597 + }, + { + "epoch": 0.4032262847314889, + "grad_norm": 0.1151263564825058, + "learning_rate": 0.0006587533382804945, + "loss": 2.7075, + "step": 13598 + }, + { + "epoch": 0.4032559380838004, + "grad_norm": 0.12067190557718277, + "learning_rate": 0.0006587087212200612, + "loss": 2.6874, + "step": 13599 + }, + { + "epoch": 0.40328559143611187, + "grad_norm": 0.12599067389965057, + "learning_rate": 0.0006586641027542348, + "loss": 2.6655, + "step": 13600 + }, + { + "epoch": 0.40331524478842334, + "grad_norm": 0.12815943360328674, + "learning_rate": 0.0006586194828834109, + "loss": 2.7258, + "step": 13601 + }, + { + "epoch": 0.4033448981407348, + "grad_norm": 0.13722440600395203, + "learning_rate": 0.0006585748616079838, + "loss": 2.7262, + "step": 13602 + }, + { + "epoch": 0.4033745514930463, + "grad_norm": 0.12846051156520844, + "learning_rate": 0.0006585302389283493, + "loss": 2.7275, + "step": 13603 + }, + { + "epoch": 0.40340420484535777, + "grad_norm": 0.11897911876440048, + "learning_rate": 0.0006584856148449023, + "loss": 2.7123, + "step": 13604 + }, + { + "epoch": 0.40343385819766925, + "grad_norm": 0.12234877794981003, + "learning_rate": 0.000658440989358038, + "loss": 2.7183, + "step": 13605 + }, + { + "epoch": 0.4034635115499807, + "grad_norm": 0.12895776331424713, + "learning_rate": 0.0006583963624681515, + "loss": 2.7048, + "step": 13606 + }, + { + "epoch": 0.4034931649022922, + "grad_norm": 0.13535378873348236, + "learning_rate": 0.0006583517341756381, + "loss": 2.7065, + "step": 13607 + }, + { + "epoch": 0.4035228182546037, + "grad_norm": 0.14230169355869293, + "learning_rate": 0.0006583071044808928, + "loss": 2.6896, + "step": 13608 + }, + { + "epoch": 0.40355247160691515, + "grad_norm": 0.14727048575878143, + "learning_rate": 0.0006582624733843109, + "loss": 2.6944, + "step": 13609 + }, + { + "epoch": 0.40358212495922663, + "grad_norm": 0.1268029361963272, + "learning_rate": 0.0006582178408862877, + "loss": 2.7288, + "step": 13610 + }, + { + "epoch": 0.4036117783115381, + "grad_norm": 0.13151408731937408, + "learning_rate": 0.0006581732069872183, + "loss": 2.7244, + "step": 13611 + }, + { + "epoch": 0.4036414316638496, + "grad_norm": 0.1442350447177887, + "learning_rate": 0.0006581285716874981, + "loss": 2.7177, + "step": 13612 + }, + { + "epoch": 0.40367108501616106, + "grad_norm": 0.15432427823543549, + "learning_rate": 0.0006580839349875223, + "loss": 2.7436, + "step": 13613 + }, + { + "epoch": 0.40370073836847253, + "grad_norm": 0.13400889933109283, + "learning_rate": 0.000658039296887686, + "loss": 2.7417, + "step": 13614 + }, + { + "epoch": 0.403730391720784, + "grad_norm": 0.1283266544342041, + "learning_rate": 0.0006579946573883846, + "loss": 2.6953, + "step": 13615 + }, + { + "epoch": 0.4037600450730955, + "grad_norm": 0.12816870212554932, + "learning_rate": 0.0006579500164900135, + "loss": 2.7097, + "step": 13616 + }, + { + "epoch": 0.403789698425407, + "grad_norm": 0.13997521996498108, + "learning_rate": 0.0006579053741929678, + "loss": 2.7305, + "step": 13617 + }, + { + "epoch": 0.4038193517777185, + "grad_norm": 0.15768185257911682, + "learning_rate": 0.000657860730497643, + "loss": 2.7072, + "step": 13618 + }, + { + "epoch": 0.40384900513002997, + "grad_norm": 0.18615025281906128, + "learning_rate": 0.0006578160854044342, + "loss": 2.7309, + "step": 13619 + }, + { + "epoch": 0.40387865848234145, + "grad_norm": 0.18695394694805145, + "learning_rate": 0.0006577714389137369, + "loss": 2.6992, + "step": 13620 + }, + { + "epoch": 0.4039083118346529, + "grad_norm": 0.14238998293876648, + "learning_rate": 0.0006577267910259465, + "loss": 2.6733, + "step": 13621 + }, + { + "epoch": 0.4039379651869644, + "grad_norm": 0.15015879273414612, + "learning_rate": 0.0006576821417414582, + "loss": 2.7131, + "step": 13622 + }, + { + "epoch": 0.4039676185392759, + "grad_norm": 0.1531861573457718, + "learning_rate": 0.0006576374910606676, + "loss": 2.7073, + "step": 13623 + }, + { + "epoch": 0.40399727189158735, + "grad_norm": 0.13680784404277802, + "learning_rate": 0.0006575928389839698, + "loss": 2.7217, + "step": 13624 + }, + { + "epoch": 0.4040269252438988, + "grad_norm": 0.1308804154396057, + "learning_rate": 0.0006575481855117606, + "loss": 2.7381, + "step": 13625 + }, + { + "epoch": 0.4040565785962103, + "grad_norm": 0.12885452806949615, + "learning_rate": 0.0006575035306444349, + "loss": 2.7081, + "step": 13626 + }, + { + "epoch": 0.4040862319485218, + "grad_norm": 0.11231286823749542, + "learning_rate": 0.0006574588743823886, + "loss": 2.7203, + "step": 13627 + }, + { + "epoch": 0.40411588530083326, + "grad_norm": 0.12384894490242004, + "learning_rate": 0.0006574142167260168, + "loss": 2.7179, + "step": 13628 + }, + { + "epoch": 0.40414553865314473, + "grad_norm": 0.12495596706867218, + "learning_rate": 0.0006573695576757152, + "loss": 2.7253, + "step": 13629 + }, + { + "epoch": 0.4041751920054562, + "grad_norm": 0.13132570683956146, + "learning_rate": 0.000657324897231879, + "loss": 2.7182, + "step": 13630 + }, + { + "epoch": 0.4042048453577677, + "grad_norm": 0.10699288547039032, + "learning_rate": 0.000657280235394904, + "loss": 2.7028, + "step": 13631 + }, + { + "epoch": 0.40423449871007916, + "grad_norm": 0.12177635729312897, + "learning_rate": 0.0006572355721651855, + "loss": 2.7191, + "step": 13632 + }, + { + "epoch": 0.40426415206239064, + "grad_norm": 0.13225887715816498, + "learning_rate": 0.0006571909075431191, + "loss": 2.6666, + "step": 13633 + }, + { + "epoch": 0.4042938054147021, + "grad_norm": 0.1375460922718048, + "learning_rate": 0.0006571462415291, + "loss": 2.6855, + "step": 13634 + }, + { + "epoch": 0.4043234587670136, + "grad_norm": 0.12617185711860657, + "learning_rate": 0.000657101574123524, + "loss": 2.7707, + "step": 13635 + }, + { + "epoch": 0.40435311211932506, + "grad_norm": 0.13397091627120972, + "learning_rate": 0.0006570569053267867, + "loss": 2.7162, + "step": 13636 + }, + { + "epoch": 0.4043827654716366, + "grad_norm": 0.14222262799739838, + "learning_rate": 0.0006570122351392835, + "loss": 2.7325, + "step": 13637 + }, + { + "epoch": 0.40441241882394807, + "grad_norm": 0.14654849469661713, + "learning_rate": 0.0006569675635614099, + "loss": 2.7126, + "step": 13638 + }, + { + "epoch": 0.40444207217625955, + "grad_norm": 0.1258467584848404, + "learning_rate": 0.0006569228905935618, + "loss": 2.6883, + "step": 13639 + }, + { + "epoch": 0.404471725528571, + "grad_norm": 0.11026905477046967, + "learning_rate": 0.0006568782162361344, + "loss": 2.6888, + "step": 13640 + }, + { + "epoch": 0.4045013788808825, + "grad_norm": 0.12272977083921432, + "learning_rate": 0.0006568335404895235, + "loss": 2.6877, + "step": 13641 + }, + { + "epoch": 0.404531032233194, + "grad_norm": 0.12126146256923676, + "learning_rate": 0.0006567888633541247, + "loss": 2.7026, + "step": 13642 + }, + { + "epoch": 0.40456068558550545, + "grad_norm": 0.11472786962985992, + "learning_rate": 0.0006567441848303336, + "loss": 2.697, + "step": 13643 + }, + { + "epoch": 0.40459033893781693, + "grad_norm": 0.11611055582761765, + "learning_rate": 0.0006566995049185461, + "loss": 2.6871, + "step": 13644 + }, + { + "epoch": 0.4046199922901284, + "grad_norm": 0.11043563485145569, + "learning_rate": 0.0006566548236191571, + "loss": 2.6833, + "step": 13645 + }, + { + "epoch": 0.4046496456424399, + "grad_norm": 0.10913605988025665, + "learning_rate": 0.0006566101409325631, + "loss": 2.7234, + "step": 13646 + }, + { + "epoch": 0.40467929899475136, + "grad_norm": 0.10516674071550369, + "learning_rate": 0.0006565654568591592, + "loss": 2.6911, + "step": 13647 + }, + { + "epoch": 0.40470895234706283, + "grad_norm": 0.11412319540977478, + "learning_rate": 0.0006565207713993413, + "loss": 2.7427, + "step": 13648 + }, + { + "epoch": 0.4047386056993743, + "grad_norm": 0.12154685705900192, + "learning_rate": 0.0006564760845535054, + "loss": 2.6924, + "step": 13649 + }, + { + "epoch": 0.4047682590516858, + "grad_norm": 0.14176423847675323, + "learning_rate": 0.0006564313963220468, + "loss": 2.7277, + "step": 13650 + }, + { + "epoch": 0.40479791240399726, + "grad_norm": 0.1539294272661209, + "learning_rate": 0.0006563867067053611, + "loss": 2.6812, + "step": 13651 + }, + { + "epoch": 0.40482756575630874, + "grad_norm": 0.1622598022222519, + "learning_rate": 0.0006563420157038444, + "loss": 2.7158, + "step": 13652 + }, + { + "epoch": 0.4048572191086202, + "grad_norm": 0.13684320449829102, + "learning_rate": 0.0006562973233178923, + "loss": 2.7096, + "step": 13653 + }, + { + "epoch": 0.4048868724609317, + "grad_norm": 0.12580962479114532, + "learning_rate": 0.0006562526295479008, + "loss": 2.6943, + "step": 13654 + }, + { + "epoch": 0.40491652581324317, + "grad_norm": 0.12914003431797028, + "learning_rate": 0.0006562079343942652, + "loss": 2.714, + "step": 13655 + }, + { + "epoch": 0.40494617916555464, + "grad_norm": 0.11562826484441757, + "learning_rate": 0.0006561632378573817, + "loss": 2.6703, + "step": 13656 + }, + { + "epoch": 0.4049758325178661, + "grad_norm": 0.13056999444961548, + "learning_rate": 0.0006561185399376457, + "loss": 2.7103, + "step": 13657 + }, + { + "epoch": 0.40500548587017765, + "grad_norm": 0.12660427391529083, + "learning_rate": 0.0006560738406354532, + "loss": 2.7309, + "step": 13658 + }, + { + "epoch": 0.4050351392224891, + "grad_norm": 0.11419374495744705, + "learning_rate": 0.0006560291399512003, + "loss": 2.7315, + "step": 13659 + }, + { + "epoch": 0.4050647925748006, + "grad_norm": 0.1199161633849144, + "learning_rate": 0.0006559844378852825, + "loss": 2.6891, + "step": 13660 + }, + { + "epoch": 0.4050944459271121, + "grad_norm": 0.10884834825992584, + "learning_rate": 0.0006559397344380958, + "loss": 2.7278, + "step": 13661 + }, + { + "epoch": 0.40512409927942356, + "grad_norm": 0.1376819759607315, + "learning_rate": 0.0006558950296100358, + "loss": 2.719, + "step": 13662 + }, + { + "epoch": 0.40515375263173503, + "grad_norm": 0.134218230843544, + "learning_rate": 0.0006558503234014986, + "loss": 2.7386, + "step": 13663 + }, + { + "epoch": 0.4051834059840465, + "grad_norm": 0.13229796290397644, + "learning_rate": 0.0006558056158128802, + "loss": 2.7332, + "step": 13664 + }, + { + "epoch": 0.405213059336358, + "grad_norm": 0.1332937330007553, + "learning_rate": 0.0006557609068445761, + "loss": 2.7214, + "step": 13665 + }, + { + "epoch": 0.40524271268866946, + "grad_norm": 0.14965155720710754, + "learning_rate": 0.0006557161964969826, + "loss": 2.7149, + "step": 13666 + }, + { + "epoch": 0.40527236604098094, + "grad_norm": 0.15477563440799713, + "learning_rate": 0.0006556714847704954, + "loss": 2.7215, + "step": 13667 + }, + { + "epoch": 0.4053020193932924, + "grad_norm": 0.1379527449607849, + "learning_rate": 0.0006556267716655104, + "loss": 2.6776, + "step": 13668 + }, + { + "epoch": 0.4053316727456039, + "grad_norm": 0.13259094953536987, + "learning_rate": 0.0006555820571824237, + "loss": 2.7273, + "step": 13669 + }, + { + "epoch": 0.40536132609791536, + "grad_norm": 0.12587931752204895, + "learning_rate": 0.0006555373413216312, + "loss": 2.7476, + "step": 13670 + }, + { + "epoch": 0.40539097945022684, + "grad_norm": 0.11192183196544647, + "learning_rate": 0.0006554926240835288, + "loss": 2.7301, + "step": 13671 + }, + { + "epoch": 0.4054206328025383, + "grad_norm": 0.13325631618499756, + "learning_rate": 0.0006554479054685126, + "loss": 2.7243, + "step": 13672 + }, + { + "epoch": 0.4054502861548498, + "grad_norm": 0.13521011173725128, + "learning_rate": 0.0006554031854769784, + "loss": 2.7621, + "step": 13673 + }, + { + "epoch": 0.40547993950716127, + "grad_norm": 0.11816277354955673, + "learning_rate": 0.0006553584641093225, + "loss": 2.6801, + "step": 13674 + }, + { + "epoch": 0.40550959285947275, + "grad_norm": 0.12192777544260025, + "learning_rate": 0.0006553137413659405, + "loss": 2.7168, + "step": 13675 + }, + { + "epoch": 0.4055392462117842, + "grad_norm": 0.1261204183101654, + "learning_rate": 0.0006552690172472288, + "loss": 2.7151, + "step": 13676 + }, + { + "epoch": 0.4055688995640957, + "grad_norm": 0.11484947800636292, + "learning_rate": 0.0006552242917535834, + "loss": 2.6898, + "step": 13677 + }, + { + "epoch": 0.4055985529164072, + "grad_norm": 0.13679181039333344, + "learning_rate": 0.0006551795648854, + "loss": 2.7147, + "step": 13678 + }, + { + "epoch": 0.4056282062687187, + "grad_norm": 0.15412341058254242, + "learning_rate": 0.0006551348366430752, + "loss": 2.7579, + "step": 13679 + }, + { + "epoch": 0.4056578596210302, + "grad_norm": 0.12051670998334885, + "learning_rate": 0.0006550901070270044, + "loss": 2.6896, + "step": 13680 + }, + { + "epoch": 0.40568751297334166, + "grad_norm": 0.14041446149349213, + "learning_rate": 0.0006550453760375843, + "loss": 2.719, + "step": 13681 + }, + { + "epoch": 0.40571716632565313, + "grad_norm": 0.11468902975320816, + "learning_rate": 0.000655000643675211, + "loss": 2.7161, + "step": 13682 + }, + { + "epoch": 0.4057468196779646, + "grad_norm": 0.12384206056594849, + "learning_rate": 0.0006549559099402801, + "loss": 2.6965, + "step": 13683 + }, + { + "epoch": 0.4057764730302761, + "grad_norm": 0.1106400191783905, + "learning_rate": 0.0006549111748331882, + "loss": 2.7106, + "step": 13684 + }, + { + "epoch": 0.40580612638258756, + "grad_norm": 0.10926418006420135, + "learning_rate": 0.0006548664383543312, + "loss": 2.6654, + "step": 13685 + }, + { + "epoch": 0.40583577973489904, + "grad_norm": 0.11798100918531418, + "learning_rate": 0.000654821700504105, + "loss": 2.704, + "step": 13686 + }, + { + "epoch": 0.4058654330872105, + "grad_norm": 0.11872699111700058, + "learning_rate": 0.0006547769612829065, + "loss": 2.6854, + "step": 13687 + }, + { + "epoch": 0.405895086439522, + "grad_norm": 0.14053282141685486, + "learning_rate": 0.0006547322206911313, + "loss": 2.7344, + "step": 13688 + }, + { + "epoch": 0.40592473979183347, + "grad_norm": 0.15761089324951172, + "learning_rate": 0.0006546874787291757, + "loss": 2.7303, + "step": 13689 + }, + { + "epoch": 0.40595439314414494, + "grad_norm": 0.14911293983459473, + "learning_rate": 0.0006546427353974359, + "loss": 2.7329, + "step": 13690 + }, + { + "epoch": 0.4059840464964564, + "grad_norm": 0.13389672338962555, + "learning_rate": 0.0006545979906963082, + "loss": 2.7184, + "step": 13691 + }, + { + "epoch": 0.4060136998487679, + "grad_norm": 0.1376112848520279, + "learning_rate": 0.0006545532446261887, + "loss": 2.7242, + "step": 13692 + }, + { + "epoch": 0.40604335320107937, + "grad_norm": 0.14998039603233337, + "learning_rate": 0.0006545084971874737, + "loss": 2.7276, + "step": 13693 + }, + { + "epoch": 0.40607300655339085, + "grad_norm": 0.1452750265598297, + "learning_rate": 0.0006544637483805595, + "loss": 2.7353, + "step": 13694 + }, + { + "epoch": 0.4061026599057023, + "grad_norm": 0.16722068190574646, + "learning_rate": 0.0006544189982058422, + "loss": 2.7352, + "step": 13695 + }, + { + "epoch": 0.4061323132580138, + "grad_norm": 0.16730299592018127, + "learning_rate": 0.0006543742466637183, + "loss": 2.7075, + "step": 13696 + }, + { + "epoch": 0.4061619666103253, + "grad_norm": 0.1583929806947708, + "learning_rate": 0.0006543294937545838, + "loss": 2.7282, + "step": 13697 + }, + { + "epoch": 0.40619161996263675, + "grad_norm": 0.14310131967067719, + "learning_rate": 0.0006542847394788351, + "loss": 2.7298, + "step": 13698 + }, + { + "epoch": 0.40622127331494823, + "grad_norm": 0.13887721300125122, + "learning_rate": 0.0006542399838368688, + "loss": 2.7216, + "step": 13699 + }, + { + "epoch": 0.40625092666725976, + "grad_norm": 0.15364743769168854, + "learning_rate": 0.0006541952268290807, + "loss": 2.721, + "step": 13700 + }, + { + "epoch": 0.40628058001957124, + "grad_norm": 0.1424485445022583, + "learning_rate": 0.0006541504684558676, + "loss": 2.7354, + "step": 13701 + }, + { + "epoch": 0.4063102333718827, + "grad_norm": 0.1451488584280014, + "learning_rate": 0.0006541057087176256, + "loss": 2.6788, + "step": 13702 + }, + { + "epoch": 0.4063398867241942, + "grad_norm": 0.11965189129114151, + "learning_rate": 0.000654060947614751, + "loss": 2.7271, + "step": 13703 + }, + { + "epoch": 0.40636954007650566, + "grad_norm": 0.13592995703220367, + "learning_rate": 0.0006540161851476404, + "loss": 2.7044, + "step": 13704 + }, + { + "epoch": 0.40639919342881714, + "grad_norm": 0.13698700070381165, + "learning_rate": 0.0006539714213166899, + "loss": 2.6934, + "step": 13705 + }, + { + "epoch": 0.4064288467811286, + "grad_norm": 0.14990076422691345, + "learning_rate": 0.000653926656122296, + "loss": 2.7348, + "step": 13706 + }, + { + "epoch": 0.4064585001334401, + "grad_norm": 0.14037199318408966, + "learning_rate": 0.0006538818895648553, + "loss": 2.7281, + "step": 13707 + }, + { + "epoch": 0.40648815348575157, + "grad_norm": 0.14271216094493866, + "learning_rate": 0.000653837121644764, + "loss": 2.7245, + "step": 13708 + }, + { + "epoch": 0.40651780683806304, + "grad_norm": 0.13583239912986755, + "learning_rate": 0.0006537923523624187, + "loss": 2.7356, + "step": 13709 + }, + { + "epoch": 0.4065474601903745, + "grad_norm": 0.12106715142726898, + "learning_rate": 0.0006537475817182156, + "loss": 2.7147, + "step": 13710 + }, + { + "epoch": 0.406577113542686, + "grad_norm": 0.13828963041305542, + "learning_rate": 0.0006537028097125513, + "loss": 2.7276, + "step": 13711 + }, + { + "epoch": 0.4066067668949975, + "grad_norm": 0.144025981426239, + "learning_rate": 0.000653658036345822, + "loss": 2.676, + "step": 13712 + }, + { + "epoch": 0.40663642024730895, + "grad_norm": 0.13687191903591156, + "learning_rate": 0.0006536132616184247, + "loss": 2.7073, + "step": 13713 + }, + { + "epoch": 0.4066660735996204, + "grad_norm": 0.1446695327758789, + "learning_rate": 0.0006535684855307556, + "loss": 2.7281, + "step": 13714 + }, + { + "epoch": 0.4066957269519319, + "grad_norm": 0.1345459371805191, + "learning_rate": 0.0006535237080832111, + "loss": 2.7494, + "step": 13715 + }, + { + "epoch": 0.4067253803042434, + "grad_norm": 0.12187764793634415, + "learning_rate": 0.0006534789292761879, + "loss": 2.7303, + "step": 13716 + }, + { + "epoch": 0.40675503365655485, + "grad_norm": 0.12393273413181305, + "learning_rate": 0.0006534341491100824, + "loss": 2.7015, + "step": 13717 + }, + { + "epoch": 0.40678468700886633, + "grad_norm": 0.12429635226726532, + "learning_rate": 0.0006533893675852911, + "loss": 2.7137, + "step": 13718 + }, + { + "epoch": 0.4068143403611778, + "grad_norm": 0.12688961625099182, + "learning_rate": 0.0006533445847022106, + "loss": 2.679, + "step": 13719 + }, + { + "epoch": 0.4068439937134893, + "grad_norm": 0.10414112359285355, + "learning_rate": 0.0006532998004612376, + "loss": 2.7041, + "step": 13720 + }, + { + "epoch": 0.4068736470658008, + "grad_norm": 0.1143430769443512, + "learning_rate": 0.0006532550148627685, + "loss": 2.6892, + "step": 13721 + }, + { + "epoch": 0.4069033004181123, + "grad_norm": 0.1251591593027115, + "learning_rate": 0.0006532102279071999, + "loss": 2.744, + "step": 13722 + }, + { + "epoch": 0.40693295377042377, + "grad_norm": 0.11322032660245895, + "learning_rate": 0.0006531654395949284, + "loss": 2.7408, + "step": 13723 + }, + { + "epoch": 0.40696260712273524, + "grad_norm": 0.12131539732217789, + "learning_rate": 0.0006531206499263508, + "loss": 2.7208, + "step": 13724 + }, + { + "epoch": 0.4069922604750467, + "grad_norm": 0.1069265678524971, + "learning_rate": 0.0006530758589018635, + "loss": 2.7096, + "step": 13725 + }, + { + "epoch": 0.4070219138273582, + "grad_norm": 0.11602215468883514, + "learning_rate": 0.0006530310665218632, + "loss": 2.7091, + "step": 13726 + }, + { + "epoch": 0.40705156717966967, + "grad_norm": 0.11698157340288162, + "learning_rate": 0.0006529862727867465, + "loss": 2.6629, + "step": 13727 + }, + { + "epoch": 0.40708122053198115, + "grad_norm": 0.11606529355049133, + "learning_rate": 0.00065294147769691, + "loss": 2.6714, + "step": 13728 + }, + { + "epoch": 0.4071108738842926, + "grad_norm": 0.1400260180234909, + "learning_rate": 0.0006528966812527506, + "loss": 2.7042, + "step": 13729 + }, + { + "epoch": 0.4071405272366041, + "grad_norm": 0.1531425565481186, + "learning_rate": 0.0006528518834546649, + "loss": 2.6821, + "step": 13730 + }, + { + "epoch": 0.4071701805889156, + "grad_norm": 0.19020293653011322, + "learning_rate": 0.0006528070843030494, + "loss": 2.7032, + "step": 13731 + }, + { + "epoch": 0.40719983394122705, + "grad_norm": 0.15112975239753723, + "learning_rate": 0.0006527622837983009, + "loss": 2.7107, + "step": 13732 + }, + { + "epoch": 0.40722948729353853, + "grad_norm": 0.12299463152885437, + "learning_rate": 0.0006527174819408164, + "loss": 2.6746, + "step": 13733 + }, + { + "epoch": 0.40725914064585, + "grad_norm": 0.14632169902324677, + "learning_rate": 0.0006526726787309922, + "loss": 2.6983, + "step": 13734 + }, + { + "epoch": 0.4072887939981615, + "grad_norm": 0.143844336271286, + "learning_rate": 0.0006526278741692252, + "loss": 2.6958, + "step": 13735 + }, + { + "epoch": 0.40731844735047296, + "grad_norm": 0.12517152726650238, + "learning_rate": 0.0006525830682559122, + "loss": 2.6797, + "step": 13736 + }, + { + "epoch": 0.40734810070278443, + "grad_norm": 0.13989298045635223, + "learning_rate": 0.0006525382609914501, + "loss": 2.7171, + "step": 13737 + }, + { + "epoch": 0.4073777540550959, + "grad_norm": 0.14299501478672028, + "learning_rate": 0.0006524934523762353, + "loss": 2.7145, + "step": 13738 + }, + { + "epoch": 0.4074074074074074, + "grad_norm": 0.12785115838050842, + "learning_rate": 0.0006524486424106648, + "loss": 2.6641, + "step": 13739 + }, + { + "epoch": 0.40743706075971886, + "grad_norm": 0.15154235064983368, + "learning_rate": 0.0006524038310951356, + "loss": 2.7009, + "step": 13740 + }, + { + "epoch": 0.4074667141120304, + "grad_norm": 0.10743311047554016, + "learning_rate": 0.0006523590184300441, + "loss": 2.7259, + "step": 13741 + }, + { + "epoch": 0.40749636746434187, + "grad_norm": 0.12545543909072876, + "learning_rate": 0.0006523142044157875, + "loss": 2.7272, + "step": 13742 + }, + { + "epoch": 0.40752602081665334, + "grad_norm": 0.11756786704063416, + "learning_rate": 0.0006522693890527625, + "loss": 2.7079, + "step": 13743 + }, + { + "epoch": 0.4075556741689648, + "grad_norm": 0.13261841237545013, + "learning_rate": 0.0006522245723413658, + "loss": 2.7371, + "step": 13744 + }, + { + "epoch": 0.4075853275212763, + "grad_norm": 0.12724648416042328, + "learning_rate": 0.0006521797542819944, + "loss": 2.6852, + "step": 13745 + }, + { + "epoch": 0.4076149808735878, + "grad_norm": 0.1294175684452057, + "learning_rate": 0.0006521349348750452, + "loss": 2.7453, + "step": 13746 + }, + { + "epoch": 0.40764463422589925, + "grad_norm": 0.10670628398656845, + "learning_rate": 0.000652090114120915, + "loss": 2.6992, + "step": 13747 + }, + { + "epoch": 0.4076742875782107, + "grad_norm": 0.12135085463523865, + "learning_rate": 0.0006520452920200008, + "loss": 2.7014, + "step": 13748 + }, + { + "epoch": 0.4077039409305222, + "grad_norm": 0.11557207256555557, + "learning_rate": 0.0006520004685726994, + "loss": 2.6975, + "step": 13749 + }, + { + "epoch": 0.4077335942828337, + "grad_norm": 0.12355345487594604, + "learning_rate": 0.0006519556437794078, + "loss": 2.7678, + "step": 13750 + }, + { + "epoch": 0.40776324763514515, + "grad_norm": 0.14321279525756836, + "learning_rate": 0.0006519108176405227, + "loss": 2.699, + "step": 13751 + }, + { + "epoch": 0.40779290098745663, + "grad_norm": 0.1402176022529602, + "learning_rate": 0.0006518659901564414, + "loss": 2.7188, + "step": 13752 + }, + { + "epoch": 0.4078225543397681, + "grad_norm": 0.14917276799678802, + "learning_rate": 0.0006518211613275607, + "loss": 2.7211, + "step": 13753 + }, + { + "epoch": 0.4078522076920796, + "grad_norm": 0.14624641835689545, + "learning_rate": 0.0006517763311542776, + "loss": 2.7298, + "step": 13754 + }, + { + "epoch": 0.40788186104439106, + "grad_norm": 0.13610856235027313, + "learning_rate": 0.000651731499636989, + "loss": 2.7187, + "step": 13755 + }, + { + "epoch": 0.40791151439670253, + "grad_norm": 0.14166252315044403, + "learning_rate": 0.0006516866667760919, + "loss": 2.684, + "step": 13756 + }, + { + "epoch": 0.407941167749014, + "grad_norm": 0.1434766948223114, + "learning_rate": 0.0006516418325719833, + "loss": 2.7042, + "step": 13757 + }, + { + "epoch": 0.4079708211013255, + "grad_norm": 0.1306876242160797, + "learning_rate": 0.0006515969970250601, + "loss": 2.7038, + "step": 13758 + }, + { + "epoch": 0.40800047445363696, + "grad_norm": 0.1394549310207367, + "learning_rate": 0.0006515521601357197, + "loss": 2.702, + "step": 13759 + }, + { + "epoch": 0.40803012780594844, + "grad_norm": 0.10700274258852005, + "learning_rate": 0.0006515073219043589, + "loss": 2.7035, + "step": 13760 + }, + { + "epoch": 0.4080597811582599, + "grad_norm": 0.1224813237786293, + "learning_rate": 0.0006514624823313746, + "loss": 2.7159, + "step": 13761 + }, + { + "epoch": 0.40808943451057145, + "grad_norm": 0.12482808530330658, + "learning_rate": 0.0006514176414171642, + "loss": 2.7206, + "step": 13762 + }, + { + "epoch": 0.4081190878628829, + "grad_norm": 0.11169838160276413, + "learning_rate": 0.0006513727991621246, + "loss": 2.7022, + "step": 13763 + }, + { + "epoch": 0.4081487412151944, + "grad_norm": 0.11381155252456665, + "learning_rate": 0.0006513279555666527, + "loss": 2.7228, + "step": 13764 + }, + { + "epoch": 0.4081783945675059, + "grad_norm": 0.12421989440917969, + "learning_rate": 0.0006512831106311459, + "loss": 2.7275, + "step": 13765 + }, + { + "epoch": 0.40820804791981735, + "grad_norm": 0.14045986533164978, + "learning_rate": 0.0006512382643560011, + "loss": 2.7072, + "step": 13766 + }, + { + "epoch": 0.40823770127212883, + "grad_norm": 0.13170866668224335, + "learning_rate": 0.0006511934167416156, + "loss": 2.6982, + "step": 13767 + }, + { + "epoch": 0.4082673546244403, + "grad_norm": 0.1108061820268631, + "learning_rate": 0.0006511485677883863, + "loss": 2.7121, + "step": 13768 + }, + { + "epoch": 0.4082970079767518, + "grad_norm": 0.14666198194026947, + "learning_rate": 0.0006511037174967107, + "loss": 2.7441, + "step": 13769 + }, + { + "epoch": 0.40832666132906326, + "grad_norm": 0.17229387164115906, + "learning_rate": 0.0006510588658669856, + "loss": 2.7317, + "step": 13770 + }, + { + "epoch": 0.40835631468137473, + "grad_norm": 0.15193893015384674, + "learning_rate": 0.0006510140128996084, + "loss": 2.7303, + "step": 13771 + }, + { + "epoch": 0.4083859680336862, + "grad_norm": 0.13687121868133545, + "learning_rate": 0.0006509691585949762, + "loss": 2.6863, + "step": 13772 + }, + { + "epoch": 0.4084156213859977, + "grad_norm": 0.13131791353225708, + "learning_rate": 0.0006509243029534862, + "loss": 2.6789, + "step": 13773 + }, + { + "epoch": 0.40844527473830916, + "grad_norm": 0.15214592218399048, + "learning_rate": 0.0006508794459755354, + "loss": 2.7028, + "step": 13774 + }, + { + "epoch": 0.40847492809062064, + "grad_norm": 0.14393587410449982, + "learning_rate": 0.0006508345876615215, + "loss": 2.7146, + "step": 13775 + }, + { + "epoch": 0.4085045814429321, + "grad_norm": 0.11562883108854294, + "learning_rate": 0.0006507897280118413, + "loss": 2.678, + "step": 13776 + }, + { + "epoch": 0.4085342347952436, + "grad_norm": 0.1408456712961197, + "learning_rate": 0.000650744867026892, + "loss": 2.7135, + "step": 13777 + }, + { + "epoch": 0.40856388814755507, + "grad_norm": 0.1410718858242035, + "learning_rate": 0.0006507000047070711, + "loss": 2.74, + "step": 13778 + }, + { + "epoch": 0.40859354149986654, + "grad_norm": 0.12138534337282181, + "learning_rate": 0.0006506551410527759, + "loss": 2.6812, + "step": 13779 + }, + { + "epoch": 0.408623194852178, + "grad_norm": 0.1345670521259308, + "learning_rate": 0.0006506102760644037, + "loss": 2.7219, + "step": 13780 + }, + { + "epoch": 0.4086528482044895, + "grad_norm": 0.12603972852230072, + "learning_rate": 0.0006505654097423515, + "loss": 2.7442, + "step": 13781 + }, + { + "epoch": 0.40868250155680097, + "grad_norm": 0.12067136913537979, + "learning_rate": 0.0006505205420870167, + "loss": 2.7732, + "step": 13782 + }, + { + "epoch": 0.4087121549091125, + "grad_norm": 0.1302514225244522, + "learning_rate": 0.0006504756730987966, + "loss": 2.7174, + "step": 13783 + }, + { + "epoch": 0.408741808261424, + "grad_norm": 0.11782794445753098, + "learning_rate": 0.0006504308027780887, + "loss": 2.6862, + "step": 13784 + }, + { + "epoch": 0.40877146161373545, + "grad_norm": 0.12362536042928696, + "learning_rate": 0.0006503859311252903, + "loss": 2.7001, + "step": 13785 + }, + { + "epoch": 0.40880111496604693, + "grad_norm": 0.1210065633058548, + "learning_rate": 0.0006503410581407986, + "loss": 2.6946, + "step": 13786 + }, + { + "epoch": 0.4088307683183584, + "grad_norm": 0.1192399337887764, + "learning_rate": 0.000650296183825011, + "loss": 2.7208, + "step": 13787 + }, + { + "epoch": 0.4088604216706699, + "grad_norm": 0.10881827026605606, + "learning_rate": 0.0006502513081783249, + "loss": 2.7015, + "step": 13788 + }, + { + "epoch": 0.40889007502298136, + "grad_norm": 0.11750317364931107, + "learning_rate": 0.0006502064312011377, + "loss": 2.7142, + "step": 13789 + }, + { + "epoch": 0.40891972837529283, + "grad_norm": 0.1139058768749237, + "learning_rate": 0.0006501615528938466, + "loss": 2.7171, + "step": 13790 + }, + { + "epoch": 0.4089493817276043, + "grad_norm": 0.11021127551794052, + "learning_rate": 0.0006501166732568494, + "loss": 2.7299, + "step": 13791 + }, + { + "epoch": 0.4089790350799158, + "grad_norm": 0.11704361438751221, + "learning_rate": 0.0006500717922905433, + "loss": 2.7154, + "step": 13792 + }, + { + "epoch": 0.40900868843222726, + "grad_norm": 0.10814732313156128, + "learning_rate": 0.0006500269099953256, + "loss": 2.6899, + "step": 13793 + }, + { + "epoch": 0.40903834178453874, + "grad_norm": 0.10400835424661636, + "learning_rate": 0.0006499820263715938, + "loss": 2.7131, + "step": 13794 + }, + { + "epoch": 0.4090679951368502, + "grad_norm": 0.11294947564601898, + "learning_rate": 0.0006499371414197454, + "loss": 2.6933, + "step": 13795 + }, + { + "epoch": 0.4090976484891617, + "grad_norm": 0.11579704284667969, + "learning_rate": 0.0006498922551401781, + "loss": 2.6796, + "step": 13796 + }, + { + "epoch": 0.40912730184147317, + "grad_norm": 0.11032024025917053, + "learning_rate": 0.000649847367533289, + "loss": 2.7277, + "step": 13797 + }, + { + "epoch": 0.40915695519378464, + "grad_norm": 0.12270217388868332, + "learning_rate": 0.0006498024785994758, + "loss": 2.6775, + "step": 13798 + }, + { + "epoch": 0.4091866085460961, + "grad_norm": 0.13084882497787476, + "learning_rate": 0.0006497575883391359, + "loss": 2.7176, + "step": 13799 + }, + { + "epoch": 0.4092162618984076, + "grad_norm": 0.1384873241186142, + "learning_rate": 0.0006497126967526668, + "loss": 2.7244, + "step": 13800 + }, + { + "epoch": 0.4092459152507191, + "grad_norm": 0.12979023158550262, + "learning_rate": 0.0006496678038404662, + "loss": 2.7378, + "step": 13801 + }, + { + "epoch": 0.40927556860303055, + "grad_norm": 0.12012363970279694, + "learning_rate": 0.0006496229096029314, + "loss": 2.7208, + "step": 13802 + }, + { + "epoch": 0.409305221955342, + "grad_norm": 0.12169776111841202, + "learning_rate": 0.0006495780140404601, + "loss": 2.6976, + "step": 13803 + }, + { + "epoch": 0.40933487530765356, + "grad_norm": 0.12622103095054626, + "learning_rate": 0.0006495331171534498, + "loss": 2.7058, + "step": 13804 + }, + { + "epoch": 0.40936452865996503, + "grad_norm": 0.12460830062627792, + "learning_rate": 0.0006494882189422981, + "loss": 2.7213, + "step": 13805 + }, + { + "epoch": 0.4093941820122765, + "grad_norm": 0.12833160161972046, + "learning_rate": 0.0006494433194074025, + "loss": 2.7211, + "step": 13806 + }, + { + "epoch": 0.409423835364588, + "grad_norm": 0.12892429530620575, + "learning_rate": 0.0006493984185491607, + "loss": 2.7042, + "step": 13807 + }, + { + "epoch": 0.40945348871689946, + "grad_norm": 0.14139921963214874, + "learning_rate": 0.0006493535163679704, + "loss": 2.6887, + "step": 13808 + }, + { + "epoch": 0.40948314206921094, + "grad_norm": 0.14368698000907898, + "learning_rate": 0.0006493086128642288, + "loss": 2.7253, + "step": 13809 + }, + { + "epoch": 0.4095127954215224, + "grad_norm": 0.13170041143894196, + "learning_rate": 0.0006492637080383339, + "loss": 2.7078, + "step": 13810 + }, + { + "epoch": 0.4095424487738339, + "grad_norm": 0.1275295466184616, + "learning_rate": 0.0006492188018906833, + "loss": 2.7158, + "step": 13811 + }, + { + "epoch": 0.40957210212614537, + "grad_norm": 0.12733343243598938, + "learning_rate": 0.0006491738944216746, + "loss": 2.7053, + "step": 13812 + }, + { + "epoch": 0.40960175547845684, + "grad_norm": 0.12507210671901703, + "learning_rate": 0.0006491289856317055, + "loss": 2.693, + "step": 13813 + }, + { + "epoch": 0.4096314088307683, + "grad_norm": 0.13082163035869598, + "learning_rate": 0.0006490840755211736, + "loss": 2.7392, + "step": 13814 + }, + { + "epoch": 0.4096610621830798, + "grad_norm": 0.11484599858522415, + "learning_rate": 0.0006490391640904766, + "loss": 2.7093, + "step": 13815 + }, + { + "epoch": 0.40969071553539127, + "grad_norm": 0.11428795009851456, + "learning_rate": 0.0006489942513400121, + "loss": 2.7118, + "step": 13816 + }, + { + "epoch": 0.40972036888770275, + "grad_norm": 0.1273728758096695, + "learning_rate": 0.000648949337270178, + "loss": 2.7141, + "step": 13817 + }, + { + "epoch": 0.4097500222400142, + "grad_norm": 0.13325795531272888, + "learning_rate": 0.0006489044218813722, + "loss": 2.7237, + "step": 13818 + }, + { + "epoch": 0.4097796755923257, + "grad_norm": 0.13242536783218384, + "learning_rate": 0.0006488595051739919, + "loss": 2.7211, + "step": 13819 + }, + { + "epoch": 0.4098093289446372, + "grad_norm": 0.12846143543720245, + "learning_rate": 0.0006488145871484352, + "loss": 2.6953, + "step": 13820 + }, + { + "epoch": 0.40983898229694865, + "grad_norm": 0.1278098076581955, + "learning_rate": 0.0006487696678050998, + "loss": 2.7149, + "step": 13821 + }, + { + "epoch": 0.4098686356492601, + "grad_norm": 0.1294444352388382, + "learning_rate": 0.0006487247471443833, + "loss": 2.702, + "step": 13822 + }, + { + "epoch": 0.4098982890015716, + "grad_norm": 0.11249087750911713, + "learning_rate": 0.000648679825166684, + "loss": 2.6969, + "step": 13823 + }, + { + "epoch": 0.4099279423538831, + "grad_norm": 0.11754810065031052, + "learning_rate": 0.0006486349018723992, + "loss": 2.7047, + "step": 13824 + }, + { + "epoch": 0.4099575957061946, + "grad_norm": 0.12093421816825867, + "learning_rate": 0.0006485899772619266, + "loss": 2.6995, + "step": 13825 + }, + { + "epoch": 0.4099872490585061, + "grad_norm": 0.1205667033791542, + "learning_rate": 0.0006485450513356644, + "loss": 2.6863, + "step": 13826 + }, + { + "epoch": 0.41001690241081756, + "grad_norm": 0.14640319347381592, + "learning_rate": 0.0006485001240940102, + "loss": 2.7087, + "step": 13827 + }, + { + "epoch": 0.41004655576312904, + "grad_norm": 0.1481156349182129, + "learning_rate": 0.0006484551955373621, + "loss": 2.6846, + "step": 13828 + }, + { + "epoch": 0.4100762091154405, + "grad_norm": 0.14352326095104218, + "learning_rate": 0.0006484102656661176, + "loss": 2.7548, + "step": 13829 + }, + { + "epoch": 0.410105862467752, + "grad_norm": 0.12002433836460114, + "learning_rate": 0.0006483653344806749, + "loss": 2.7666, + "step": 13830 + }, + { + "epoch": 0.41013551582006347, + "grad_norm": 0.11717656254768372, + "learning_rate": 0.0006483204019814315, + "loss": 2.7092, + "step": 13831 + }, + { + "epoch": 0.41016516917237494, + "grad_norm": 0.13731224834918976, + "learning_rate": 0.0006482754681687854, + "loss": 2.7202, + "step": 13832 + }, + { + "epoch": 0.4101948225246864, + "grad_norm": 0.13670186698436737, + "learning_rate": 0.0006482305330431349, + "loss": 2.6941, + "step": 13833 + }, + { + "epoch": 0.4102244758769979, + "grad_norm": 0.1405555009841919, + "learning_rate": 0.0006481855966048773, + "loss": 2.6979, + "step": 13834 + }, + { + "epoch": 0.41025412922930937, + "grad_norm": 0.14359140396118164, + "learning_rate": 0.0006481406588544109, + "loss": 2.7197, + "step": 13835 + }, + { + "epoch": 0.41028378258162085, + "grad_norm": 0.14839640259742737, + "learning_rate": 0.0006480957197921336, + "loss": 2.7124, + "step": 13836 + }, + { + "epoch": 0.4103134359339323, + "grad_norm": 0.15189634263515472, + "learning_rate": 0.0006480507794184431, + "loss": 2.6947, + "step": 13837 + }, + { + "epoch": 0.4103430892862438, + "grad_norm": 0.1377696543931961, + "learning_rate": 0.0006480058377337377, + "loss": 2.6817, + "step": 13838 + }, + { + "epoch": 0.4103727426385553, + "grad_norm": 0.14086827635765076, + "learning_rate": 0.0006479608947384152, + "loss": 2.6681, + "step": 13839 + }, + { + "epoch": 0.41040239599086675, + "grad_norm": 0.13533824682235718, + "learning_rate": 0.0006479159504328736, + "loss": 2.6769, + "step": 13840 + }, + { + "epoch": 0.41043204934317823, + "grad_norm": 0.1483597308397293, + "learning_rate": 0.0006478710048175109, + "loss": 2.6662, + "step": 13841 + }, + { + "epoch": 0.4104617026954897, + "grad_norm": 0.16542541980743408, + "learning_rate": 0.0006478260578927249, + "loss": 2.6944, + "step": 13842 + }, + { + "epoch": 0.4104913560478012, + "grad_norm": 0.13752414286136627, + "learning_rate": 0.0006477811096589139, + "loss": 2.6804, + "step": 13843 + }, + { + "epoch": 0.41052100940011266, + "grad_norm": 0.1272381842136383, + "learning_rate": 0.0006477361601164757, + "loss": 2.7132, + "step": 13844 + }, + { + "epoch": 0.4105506627524242, + "grad_norm": 0.13731026649475098, + "learning_rate": 0.0006476912092658085, + "loss": 2.7253, + "step": 13845 + }, + { + "epoch": 0.41058031610473567, + "grad_norm": 0.122622549533844, + "learning_rate": 0.0006476462571073105, + "loss": 2.693, + "step": 13846 + }, + { + "epoch": 0.41060996945704714, + "grad_norm": 0.11285386234521866, + "learning_rate": 0.0006476013036413792, + "loss": 2.7222, + "step": 13847 + }, + { + "epoch": 0.4106396228093586, + "grad_norm": 0.11098190397024155, + "learning_rate": 0.0006475563488684132, + "loss": 2.6617, + "step": 13848 + }, + { + "epoch": 0.4106692761616701, + "grad_norm": 0.11131612211465836, + "learning_rate": 0.0006475113927888103, + "loss": 2.7252, + "step": 13849 + }, + { + "epoch": 0.41069892951398157, + "grad_norm": 0.12127232551574707, + "learning_rate": 0.0006474664354029689, + "loss": 2.6789, + "step": 13850 + }, + { + "epoch": 0.41072858286629305, + "grad_norm": 0.1055464893579483, + "learning_rate": 0.0006474214767112869, + "loss": 2.6649, + "step": 13851 + }, + { + "epoch": 0.4107582362186045, + "grad_norm": 0.1155494824051857, + "learning_rate": 0.0006473765167141623, + "loss": 2.6864, + "step": 13852 + }, + { + "epoch": 0.410787889570916, + "grad_norm": 0.11857437342405319, + "learning_rate": 0.0006473315554119933, + "loss": 2.6802, + "step": 13853 + }, + { + "epoch": 0.4108175429232275, + "grad_norm": 0.11387506872415543, + "learning_rate": 0.0006472865928051781, + "loss": 2.6945, + "step": 13854 + }, + { + "epoch": 0.41084719627553895, + "grad_norm": 0.10910197347402573, + "learning_rate": 0.0006472416288941149, + "loss": 2.7094, + "step": 13855 + }, + { + "epoch": 0.4108768496278504, + "grad_norm": 0.10079821199178696, + "learning_rate": 0.0006471966636792018, + "loss": 2.7006, + "step": 13856 + }, + { + "epoch": 0.4109065029801619, + "grad_norm": 0.11303587257862091, + "learning_rate": 0.000647151697160837, + "loss": 2.7103, + "step": 13857 + }, + { + "epoch": 0.4109361563324734, + "grad_norm": 0.11888686567544937, + "learning_rate": 0.0006471067293394187, + "loss": 2.6843, + "step": 13858 + }, + { + "epoch": 0.41096580968478486, + "grad_norm": 0.13332366943359375, + "learning_rate": 0.0006470617602153449, + "loss": 2.7027, + "step": 13859 + }, + { + "epoch": 0.41099546303709633, + "grad_norm": 0.16803471744060516, + "learning_rate": 0.0006470167897890141, + "loss": 2.7257, + "step": 13860 + }, + { + "epoch": 0.4110251163894078, + "grad_norm": 0.16514520347118378, + "learning_rate": 0.0006469718180608243, + "loss": 2.6801, + "step": 13861 + }, + { + "epoch": 0.4110547697417193, + "grad_norm": 0.13844461739063263, + "learning_rate": 0.0006469268450311739, + "loss": 2.7597, + "step": 13862 + }, + { + "epoch": 0.41108442309403076, + "grad_norm": 0.1384754478931427, + "learning_rate": 0.000646881870700461, + "loss": 2.7348, + "step": 13863 + }, + { + "epoch": 0.41111407644634224, + "grad_norm": 0.13691948354244232, + "learning_rate": 0.000646836895069084, + "loss": 2.7387, + "step": 13864 + }, + { + "epoch": 0.4111437297986537, + "grad_norm": 0.1536283791065216, + "learning_rate": 0.0006467919181374409, + "loss": 2.7133, + "step": 13865 + }, + { + "epoch": 0.41117338315096524, + "grad_norm": 0.12797830998897552, + "learning_rate": 0.0006467469399059304, + "loss": 2.7197, + "step": 13866 + }, + { + "epoch": 0.4112030365032767, + "grad_norm": 0.12929901480674744, + "learning_rate": 0.0006467019603749504, + "loss": 2.69, + "step": 13867 + }, + { + "epoch": 0.4112326898555882, + "grad_norm": 0.14513647556304932, + "learning_rate": 0.0006466569795448995, + "loss": 2.7691, + "step": 13868 + }, + { + "epoch": 0.41126234320789967, + "grad_norm": 0.14568324387073517, + "learning_rate": 0.0006466119974161759, + "loss": 2.7002, + "step": 13869 + }, + { + "epoch": 0.41129199656021115, + "grad_norm": 0.1324825882911682, + "learning_rate": 0.0006465670139891777, + "loss": 2.7041, + "step": 13870 + }, + { + "epoch": 0.4113216499125226, + "grad_norm": 0.13082024455070496, + "learning_rate": 0.0006465220292643036, + "loss": 2.6872, + "step": 13871 + }, + { + "epoch": 0.4113513032648341, + "grad_norm": 0.11180850863456726, + "learning_rate": 0.0006464770432419518, + "loss": 2.7081, + "step": 13872 + }, + { + "epoch": 0.4113809566171456, + "grad_norm": 0.12972594797611237, + "learning_rate": 0.0006464320559225205, + "loss": 2.7472, + "step": 13873 + }, + { + "epoch": 0.41141060996945705, + "grad_norm": 0.12854509055614471, + "learning_rate": 0.0006463870673064083, + "loss": 2.7007, + "step": 13874 + }, + { + "epoch": 0.41144026332176853, + "grad_norm": 0.13215667009353638, + "learning_rate": 0.0006463420773940135, + "loss": 2.7113, + "step": 13875 + }, + { + "epoch": 0.41146991667408, + "grad_norm": 0.12770622968673706, + "learning_rate": 0.0006462970861857343, + "loss": 2.7187, + "step": 13876 + }, + { + "epoch": 0.4114995700263915, + "grad_norm": 0.14002712070941925, + "learning_rate": 0.0006462520936819695, + "loss": 2.7345, + "step": 13877 + }, + { + "epoch": 0.41152922337870296, + "grad_norm": 0.15825381875038147, + "learning_rate": 0.0006462070998831172, + "loss": 2.7251, + "step": 13878 + }, + { + "epoch": 0.41155887673101443, + "grad_norm": 0.14920957386493683, + "learning_rate": 0.0006461621047895761, + "loss": 2.7368, + "step": 13879 + }, + { + "epoch": 0.4115885300833259, + "grad_norm": 0.14580245316028595, + "learning_rate": 0.0006461171084017443, + "loss": 2.7107, + "step": 13880 + }, + { + "epoch": 0.4116181834356374, + "grad_norm": 0.13313856720924377, + "learning_rate": 0.0006460721107200205, + "loss": 2.7167, + "step": 13881 + }, + { + "epoch": 0.41164783678794886, + "grad_norm": 0.12888920307159424, + "learning_rate": 0.0006460271117448029, + "loss": 2.6904, + "step": 13882 + }, + { + "epoch": 0.41167749014026034, + "grad_norm": 0.12473361194133759, + "learning_rate": 0.0006459821114764904, + "loss": 2.7055, + "step": 13883 + }, + { + "epoch": 0.4117071434925718, + "grad_norm": 0.12229020893573761, + "learning_rate": 0.0006459371099154813, + "loss": 2.6787, + "step": 13884 + }, + { + "epoch": 0.4117367968448833, + "grad_norm": 0.14443708956241608, + "learning_rate": 0.0006458921070621739, + "loss": 2.7453, + "step": 13885 + }, + { + "epoch": 0.41176645019719477, + "grad_norm": 0.14749875664710999, + "learning_rate": 0.0006458471029169669, + "loss": 2.7036, + "step": 13886 + }, + { + "epoch": 0.4117961035495063, + "grad_norm": 0.1385919749736786, + "learning_rate": 0.0006458020974802587, + "loss": 2.6856, + "step": 13887 + }, + { + "epoch": 0.4118257569018178, + "grad_norm": 0.14191439747810364, + "learning_rate": 0.0006457570907524478, + "loss": 2.7294, + "step": 13888 + }, + { + "epoch": 0.41185541025412925, + "grad_norm": 0.13369230926036835, + "learning_rate": 0.0006457120827339331, + "loss": 2.7212, + "step": 13889 + }, + { + "epoch": 0.4118850636064407, + "grad_norm": 0.12874260544776917, + "learning_rate": 0.0006456670734251127, + "loss": 2.6898, + "step": 13890 + }, + { + "epoch": 0.4119147169587522, + "grad_norm": 0.11398496478796005, + "learning_rate": 0.0006456220628263856, + "loss": 2.6711, + "step": 13891 + }, + { + "epoch": 0.4119443703110637, + "grad_norm": 0.12540623545646667, + "learning_rate": 0.0006455770509381499, + "loss": 2.7128, + "step": 13892 + }, + { + "epoch": 0.41197402366337516, + "grad_norm": 0.12380698323249817, + "learning_rate": 0.0006455320377608043, + "loss": 2.6851, + "step": 13893 + }, + { + "epoch": 0.41200367701568663, + "grad_norm": 0.12329841405153275, + "learning_rate": 0.0006454870232947479, + "loss": 2.7117, + "step": 13894 + }, + { + "epoch": 0.4120333303679981, + "grad_norm": 0.11885910481214523, + "learning_rate": 0.0006454420075403788, + "loss": 2.6748, + "step": 13895 + }, + { + "epoch": 0.4120629837203096, + "grad_norm": 0.11519446223974228, + "learning_rate": 0.0006453969904980957, + "loss": 2.7458, + "step": 13896 + }, + { + "epoch": 0.41209263707262106, + "grad_norm": 0.11683551967144012, + "learning_rate": 0.0006453519721682972, + "loss": 2.7048, + "step": 13897 + }, + { + "epoch": 0.41212229042493254, + "grad_norm": 0.13542115688323975, + "learning_rate": 0.0006453069525513822, + "loss": 2.6927, + "step": 13898 + }, + { + "epoch": 0.412151943777244, + "grad_norm": 0.11676700413227081, + "learning_rate": 0.0006452619316477491, + "loss": 2.6954, + "step": 13899 + }, + { + "epoch": 0.4121815971295555, + "grad_norm": 0.11205760389566422, + "learning_rate": 0.0006452169094577967, + "loss": 2.6859, + "step": 13900 + }, + { + "epoch": 0.41221125048186696, + "grad_norm": 0.12859880924224854, + "learning_rate": 0.0006451718859819236, + "loss": 2.7162, + "step": 13901 + }, + { + "epoch": 0.41224090383417844, + "grad_norm": 0.1306154578924179, + "learning_rate": 0.0006451268612205287, + "loss": 2.7248, + "step": 13902 + }, + { + "epoch": 0.4122705571864899, + "grad_norm": 0.1115477979183197, + "learning_rate": 0.0006450818351740104, + "loss": 2.6617, + "step": 13903 + }, + { + "epoch": 0.4123002105388014, + "grad_norm": 0.17068231105804443, + "learning_rate": 0.0006450368078427675, + "loss": 2.6713, + "step": 13904 + }, + { + "epoch": 0.41232986389111287, + "grad_norm": 0.12632568180561066, + "learning_rate": 0.0006449917792271989, + "loss": 2.7312, + "step": 13905 + }, + { + "epoch": 0.41235951724342434, + "grad_norm": 0.13114599883556366, + "learning_rate": 0.000644946749327703, + "loss": 2.7406, + "step": 13906 + }, + { + "epoch": 0.4123891705957358, + "grad_norm": 0.11945394426584244, + "learning_rate": 0.0006449017181446791, + "loss": 2.7189, + "step": 13907 + }, + { + "epoch": 0.41241882394804735, + "grad_norm": 0.11505025625228882, + "learning_rate": 0.0006448566856785253, + "loss": 2.7267, + "step": 13908 + }, + { + "epoch": 0.41244847730035883, + "grad_norm": 0.13998548686504364, + "learning_rate": 0.0006448116519296407, + "loss": 2.7054, + "step": 13909 + }, + { + "epoch": 0.4124781306526703, + "grad_norm": 0.13819481432437897, + "learning_rate": 0.0006447666168984242, + "loss": 2.7036, + "step": 13910 + }, + { + "epoch": 0.4125077840049818, + "grad_norm": 0.1310182809829712, + "learning_rate": 0.0006447215805852745, + "loss": 2.7057, + "step": 13911 + }, + { + "epoch": 0.41253743735729326, + "grad_norm": 0.12758012115955353, + "learning_rate": 0.0006446765429905903, + "loss": 2.7216, + "step": 13912 + }, + { + "epoch": 0.41256709070960473, + "grad_norm": 0.12596546113491058, + "learning_rate": 0.0006446315041147703, + "loss": 2.7171, + "step": 13913 + }, + { + "epoch": 0.4125967440619162, + "grad_norm": 0.11480674147605896, + "learning_rate": 0.0006445864639582136, + "loss": 2.7335, + "step": 13914 + }, + { + "epoch": 0.4126263974142277, + "grad_norm": 0.1343289017677307, + "learning_rate": 0.000644541422521319, + "loss": 2.7175, + "step": 13915 + }, + { + "epoch": 0.41265605076653916, + "grad_norm": 0.14920054376125336, + "learning_rate": 0.0006444963798044854, + "loss": 2.6898, + "step": 13916 + }, + { + "epoch": 0.41268570411885064, + "grad_norm": 0.15434254705905914, + "learning_rate": 0.0006444513358081114, + "loss": 2.7301, + "step": 13917 + }, + { + "epoch": 0.4127153574711621, + "grad_norm": 0.13441717624664307, + "learning_rate": 0.0006444062905325962, + "loss": 2.7029, + "step": 13918 + }, + { + "epoch": 0.4127450108234736, + "grad_norm": 0.12531225383281708, + "learning_rate": 0.0006443612439783382, + "loss": 2.6988, + "step": 13919 + }, + { + "epoch": 0.41277466417578507, + "grad_norm": 0.1195775493979454, + "learning_rate": 0.0006443161961457368, + "loss": 2.7191, + "step": 13920 + }, + { + "epoch": 0.41280431752809654, + "grad_norm": 0.12887294590473175, + "learning_rate": 0.0006442711470351907, + "loss": 2.6675, + "step": 13921 + }, + { + "epoch": 0.412833970880408, + "grad_norm": 0.14423678815364838, + "learning_rate": 0.000644226096647099, + "loss": 2.6848, + "step": 13922 + }, + { + "epoch": 0.4128636242327195, + "grad_norm": 0.15675242245197296, + "learning_rate": 0.0006441810449818602, + "loss": 2.7401, + "step": 13923 + }, + { + "epoch": 0.41289327758503097, + "grad_norm": 0.12954184412956238, + "learning_rate": 0.0006441359920398736, + "loss": 2.6915, + "step": 13924 + }, + { + "epoch": 0.41292293093734245, + "grad_norm": 0.12587952613830566, + "learning_rate": 0.000644090937821538, + "loss": 2.7326, + "step": 13925 + }, + { + "epoch": 0.4129525842896539, + "grad_norm": 0.15397275984287262, + "learning_rate": 0.0006440458823272524, + "loss": 2.6808, + "step": 13926 + }, + { + "epoch": 0.4129822376419654, + "grad_norm": 0.14438991248607635, + "learning_rate": 0.0006440008255574159, + "loss": 2.7072, + "step": 13927 + }, + { + "epoch": 0.4130118909942769, + "grad_norm": 0.13488204777240753, + "learning_rate": 0.0006439557675124273, + "loss": 2.7127, + "step": 13928 + }, + { + "epoch": 0.4130415443465884, + "grad_norm": 0.1264200359582901, + "learning_rate": 0.0006439107081926857, + "loss": 2.6871, + "step": 13929 + }, + { + "epoch": 0.4130711976988999, + "grad_norm": 0.12700608372688293, + "learning_rate": 0.00064386564759859, + "loss": 2.6994, + "step": 13930 + }, + { + "epoch": 0.41310085105121136, + "grad_norm": 0.13281388580799103, + "learning_rate": 0.0006438205857305395, + "loss": 2.6845, + "step": 13931 + }, + { + "epoch": 0.41313050440352284, + "grad_norm": 0.13810734450817108, + "learning_rate": 0.0006437755225889328, + "loss": 2.7371, + "step": 13932 + }, + { + "epoch": 0.4131601577558343, + "grad_norm": 0.13466039299964905, + "learning_rate": 0.0006437304581741692, + "loss": 2.7574, + "step": 13933 + }, + { + "epoch": 0.4131898111081458, + "grad_norm": 0.14252892136573792, + "learning_rate": 0.0006436853924866479, + "loss": 2.696, + "step": 13934 + }, + { + "epoch": 0.41321946446045726, + "grad_norm": 0.13574504852294922, + "learning_rate": 0.0006436403255267676, + "loss": 2.7186, + "step": 13935 + }, + { + "epoch": 0.41324911781276874, + "grad_norm": 0.11890700459480286, + "learning_rate": 0.0006435952572949275, + "loss": 2.6988, + "step": 13936 + }, + { + "epoch": 0.4132787711650802, + "grad_norm": 0.11796953529119492, + "learning_rate": 0.0006435501877915269, + "loss": 2.7081, + "step": 13937 + }, + { + "epoch": 0.4133084245173917, + "grad_norm": 0.12111178785562515, + "learning_rate": 0.0006435051170169647, + "loss": 2.7313, + "step": 13938 + }, + { + "epoch": 0.41333807786970317, + "grad_norm": 0.12461800128221512, + "learning_rate": 0.0006434600449716401, + "loss": 2.7043, + "step": 13939 + }, + { + "epoch": 0.41336773122201464, + "grad_norm": 0.1071576476097107, + "learning_rate": 0.0006434149716559521, + "loss": 2.7306, + "step": 13940 + }, + { + "epoch": 0.4133973845743261, + "grad_norm": 0.125370055437088, + "learning_rate": 0.0006433698970703001, + "loss": 2.6841, + "step": 13941 + }, + { + "epoch": 0.4134270379266376, + "grad_norm": 0.15001387894153595, + "learning_rate": 0.0006433248212150828, + "loss": 2.6889, + "step": 13942 + }, + { + "epoch": 0.4134566912789491, + "grad_norm": 0.16966919600963593, + "learning_rate": 0.0006432797440906997, + "loss": 2.7214, + "step": 13943 + }, + { + "epoch": 0.41348634463126055, + "grad_norm": 0.173681378364563, + "learning_rate": 0.0006432346656975499, + "loss": 2.7525, + "step": 13944 + }, + { + "epoch": 0.413515997983572, + "grad_norm": 0.16399531066417694, + "learning_rate": 0.0006431895860360325, + "loss": 2.7192, + "step": 13945 + }, + { + "epoch": 0.4135456513358835, + "grad_norm": 0.13394789397716522, + "learning_rate": 0.0006431445051065468, + "loss": 2.7121, + "step": 13946 + }, + { + "epoch": 0.413575304688195, + "grad_norm": 0.1279182881116867, + "learning_rate": 0.0006430994229094919, + "loss": 2.7087, + "step": 13947 + }, + { + "epoch": 0.41360495804050645, + "grad_norm": 0.1285274475812912, + "learning_rate": 0.000643054339445267, + "loss": 2.7193, + "step": 13948 + }, + { + "epoch": 0.413634611392818, + "grad_norm": 0.11375090479850769, + "learning_rate": 0.0006430092547142716, + "loss": 2.711, + "step": 13949 + }, + { + "epoch": 0.41366426474512946, + "grad_norm": 0.11624019593000412, + "learning_rate": 0.0006429641687169046, + "loss": 2.6847, + "step": 13950 + }, + { + "epoch": 0.41369391809744094, + "grad_norm": 0.11448962986469269, + "learning_rate": 0.0006429190814535651, + "loss": 2.6658, + "step": 13951 + }, + { + "epoch": 0.4137235714497524, + "grad_norm": 0.12214548885822296, + "learning_rate": 0.0006428739929246527, + "loss": 2.7143, + "step": 13952 + }, + { + "epoch": 0.4137532248020639, + "grad_norm": 0.11621768772602081, + "learning_rate": 0.0006428289031305668, + "loss": 2.715, + "step": 13953 + }, + { + "epoch": 0.41378287815437537, + "grad_norm": 0.10449449717998505, + "learning_rate": 0.0006427838120717062, + "loss": 2.7263, + "step": 13954 + }, + { + "epoch": 0.41381253150668684, + "grad_norm": 0.1186084970831871, + "learning_rate": 0.0006427387197484707, + "loss": 2.7149, + "step": 13955 + }, + { + "epoch": 0.4138421848589983, + "grad_norm": 0.12504833936691284, + "learning_rate": 0.0006426936261612591, + "loss": 2.7265, + "step": 13956 + }, + { + "epoch": 0.4138718382113098, + "grad_norm": 0.1290210485458374, + "learning_rate": 0.000642648531310471, + "loss": 2.6933, + "step": 13957 + }, + { + "epoch": 0.41390149156362127, + "grad_norm": 0.13189053535461426, + "learning_rate": 0.0006426034351965055, + "loss": 2.7193, + "step": 13958 + }, + { + "epoch": 0.41393114491593275, + "grad_norm": 0.13295510411262512, + "learning_rate": 0.0006425583378197624, + "loss": 2.7141, + "step": 13959 + }, + { + "epoch": 0.4139607982682442, + "grad_norm": 0.14218467473983765, + "learning_rate": 0.0006425132391806406, + "loss": 2.731, + "step": 13960 + }, + { + "epoch": 0.4139904516205557, + "grad_norm": 0.13754640519618988, + "learning_rate": 0.0006424681392795397, + "loss": 2.7148, + "step": 13961 + }, + { + "epoch": 0.4140201049728672, + "grad_norm": 0.14306481182575226, + "learning_rate": 0.000642423038116859, + "loss": 2.7329, + "step": 13962 + }, + { + "epoch": 0.41404975832517865, + "grad_norm": 0.1422378122806549, + "learning_rate": 0.0006423779356929978, + "loss": 2.7358, + "step": 13963 + }, + { + "epoch": 0.41407941167749013, + "grad_norm": 0.1243026852607727, + "learning_rate": 0.0006423328320083552, + "loss": 2.7236, + "step": 13964 + }, + { + "epoch": 0.4141090650298016, + "grad_norm": 0.14040926098823547, + "learning_rate": 0.0006422877270633314, + "loss": 2.7628, + "step": 13965 + }, + { + "epoch": 0.4141387183821131, + "grad_norm": 0.16441886126995087, + "learning_rate": 0.0006422426208583252, + "loss": 2.6899, + "step": 13966 + }, + { + "epoch": 0.41416837173442456, + "grad_norm": 0.1483585685491562, + "learning_rate": 0.0006421975133937361, + "loss": 2.7063, + "step": 13967 + }, + { + "epoch": 0.41419802508673603, + "grad_norm": 0.142951101064682, + "learning_rate": 0.0006421524046699639, + "loss": 2.7236, + "step": 13968 + }, + { + "epoch": 0.4142276784390475, + "grad_norm": 0.1435350626707077, + "learning_rate": 0.0006421072946874073, + "loss": 2.7293, + "step": 13969 + }, + { + "epoch": 0.41425733179135904, + "grad_norm": 0.12144951522350311, + "learning_rate": 0.0006420621834464666, + "loss": 2.7213, + "step": 13970 + }, + { + "epoch": 0.4142869851436705, + "grad_norm": 0.11842405796051025, + "learning_rate": 0.0006420170709475407, + "loss": 2.7417, + "step": 13971 + }, + { + "epoch": 0.414316638495982, + "grad_norm": 0.13778096437454224, + "learning_rate": 0.0006419719571910293, + "loss": 2.7073, + "step": 13972 + }, + { + "epoch": 0.41434629184829347, + "grad_norm": 0.11172939091920853, + "learning_rate": 0.0006419268421773319, + "loss": 2.6895, + "step": 13973 + }, + { + "epoch": 0.41437594520060494, + "grad_norm": 0.10688258707523346, + "learning_rate": 0.0006418817259068478, + "loss": 2.6806, + "step": 13974 + }, + { + "epoch": 0.4144055985529164, + "grad_norm": 0.10578501224517822, + "learning_rate": 0.0006418366083799767, + "loss": 2.7474, + "step": 13975 + }, + { + "epoch": 0.4144352519052279, + "grad_norm": 0.1101115494966507, + "learning_rate": 0.0006417914895971182, + "loss": 2.7172, + "step": 13976 + }, + { + "epoch": 0.4144649052575394, + "grad_norm": 0.11227412521839142, + "learning_rate": 0.0006417463695586718, + "loss": 2.7263, + "step": 13977 + }, + { + "epoch": 0.41449455860985085, + "grad_norm": 0.10304741561412811, + "learning_rate": 0.0006417012482650367, + "loss": 2.6721, + "step": 13978 + }, + { + "epoch": 0.4145242119621623, + "grad_norm": 0.12800179421901703, + "learning_rate": 0.0006416561257166129, + "loss": 2.7323, + "step": 13979 + }, + { + "epoch": 0.4145538653144738, + "grad_norm": 0.12423171103000641, + "learning_rate": 0.0006416110019137997, + "loss": 2.6906, + "step": 13980 + }, + { + "epoch": 0.4145835186667853, + "grad_norm": 0.12328983098268509, + "learning_rate": 0.0006415658768569968, + "loss": 2.7287, + "step": 13981 + }, + { + "epoch": 0.41461317201909675, + "grad_norm": 0.1280042976140976, + "learning_rate": 0.0006415207505466038, + "loss": 2.7103, + "step": 13982 + }, + { + "epoch": 0.41464282537140823, + "grad_norm": 0.11388939619064331, + "learning_rate": 0.0006414756229830203, + "loss": 2.6881, + "step": 13983 + }, + { + "epoch": 0.4146724787237197, + "grad_norm": 0.12828610837459564, + "learning_rate": 0.0006414304941666458, + "loss": 2.7098, + "step": 13984 + }, + { + "epoch": 0.4147021320760312, + "grad_norm": 0.12660641968250275, + "learning_rate": 0.00064138536409788, + "loss": 2.7198, + "step": 13985 + }, + { + "epoch": 0.41473178542834266, + "grad_norm": 0.13454778492450714, + "learning_rate": 0.0006413402327771225, + "loss": 2.7111, + "step": 13986 + }, + { + "epoch": 0.41476143878065413, + "grad_norm": 0.1529783308506012, + "learning_rate": 0.0006412951002047731, + "loss": 2.7714, + "step": 13987 + }, + { + "epoch": 0.4147910921329656, + "grad_norm": 0.1387983113527298, + "learning_rate": 0.0006412499663812313, + "loss": 2.6653, + "step": 13988 + }, + { + "epoch": 0.4148207454852771, + "grad_norm": 0.1579095721244812, + "learning_rate": 0.0006412048313068967, + "loss": 2.708, + "step": 13989 + }, + { + "epoch": 0.41485039883758856, + "grad_norm": 0.17544381320476532, + "learning_rate": 0.0006411596949821691, + "loss": 2.7027, + "step": 13990 + }, + { + "epoch": 0.4148800521899001, + "grad_norm": 0.16371312737464905, + "learning_rate": 0.0006411145574074481, + "loss": 2.7303, + "step": 13991 + }, + { + "epoch": 0.41490970554221157, + "grad_norm": 0.15489330887794495, + "learning_rate": 0.0006410694185831337, + "loss": 2.7091, + "step": 13992 + }, + { + "epoch": 0.41493935889452305, + "grad_norm": 0.1352759152650833, + "learning_rate": 0.0006410242785096254, + "loss": 2.7231, + "step": 13993 + }, + { + "epoch": 0.4149690122468345, + "grad_norm": 0.13428035378456116, + "learning_rate": 0.0006409791371873228, + "loss": 2.6966, + "step": 13994 + }, + { + "epoch": 0.414998665599146, + "grad_norm": 0.13854172825813293, + "learning_rate": 0.0006409339946166257, + "loss": 2.7014, + "step": 13995 + }, + { + "epoch": 0.4150283189514575, + "grad_norm": 0.13316354155540466, + "learning_rate": 0.0006408888507979339, + "loss": 2.7259, + "step": 13996 + }, + { + "epoch": 0.41505797230376895, + "grad_norm": 0.1216440498828888, + "learning_rate": 0.000640843705731647, + "loss": 2.7338, + "step": 13997 + }, + { + "epoch": 0.41508762565608043, + "grad_norm": 0.13093872368335724, + "learning_rate": 0.0006407985594181653, + "loss": 2.7465, + "step": 13998 + }, + { + "epoch": 0.4151172790083919, + "grad_norm": 0.11669307947158813, + "learning_rate": 0.0006407534118578878, + "loss": 2.68, + "step": 13999 + }, + { + "epoch": 0.4151469323607034, + "grad_norm": 0.12865565717220306, + "learning_rate": 0.0006407082630512148, + "loss": 2.7484, + "step": 14000 + }, + { + "epoch": 0.41517658571301486, + "grad_norm": 0.14064620435237885, + "learning_rate": 0.000640663112998546, + "loss": 2.7215, + "step": 14001 + }, + { + "epoch": 0.41520623906532633, + "grad_norm": 0.13142681121826172, + "learning_rate": 0.0006406179617002813, + "loss": 2.6846, + "step": 14002 + }, + { + "epoch": 0.4152358924176378, + "grad_norm": 0.1309579461812973, + "learning_rate": 0.0006405728091568203, + "loss": 2.7342, + "step": 14003 + }, + { + "epoch": 0.4152655457699493, + "grad_norm": 0.11710792034864426, + "learning_rate": 0.0006405276553685629, + "loss": 2.706, + "step": 14004 + }, + { + "epoch": 0.41529519912226076, + "grad_norm": 0.1088993027806282, + "learning_rate": 0.0006404825003359091, + "loss": 2.6896, + "step": 14005 + }, + { + "epoch": 0.41532485247457224, + "grad_norm": 0.11945335566997528, + "learning_rate": 0.0006404373440592586, + "loss": 2.6945, + "step": 14006 + }, + { + "epoch": 0.4153545058268837, + "grad_norm": 0.12850601971149445, + "learning_rate": 0.0006403921865390112, + "loss": 2.6906, + "step": 14007 + }, + { + "epoch": 0.4153841591791952, + "grad_norm": 0.10856343805789948, + "learning_rate": 0.0006403470277755671, + "loss": 2.7016, + "step": 14008 + }, + { + "epoch": 0.41541381253150667, + "grad_norm": 0.10117480158805847, + "learning_rate": 0.0006403018677693258, + "loss": 2.6949, + "step": 14009 + }, + { + "epoch": 0.41544346588381814, + "grad_norm": 0.11950403451919556, + "learning_rate": 0.0006402567065206875, + "loss": 2.7611, + "step": 14010 + }, + { + "epoch": 0.4154731192361296, + "grad_norm": 0.12096261233091354, + "learning_rate": 0.000640211544030052, + "loss": 2.6694, + "step": 14011 + }, + { + "epoch": 0.41550277258844115, + "grad_norm": 0.12113796919584274, + "learning_rate": 0.000640166380297819, + "loss": 2.6941, + "step": 14012 + }, + { + "epoch": 0.4155324259407526, + "grad_norm": 0.1269807368516922, + "learning_rate": 0.000640121215324389, + "loss": 2.7192, + "step": 14013 + }, + { + "epoch": 0.4155620792930641, + "grad_norm": 0.1224256381392479, + "learning_rate": 0.0006400760491101613, + "loss": 2.7195, + "step": 14014 + }, + { + "epoch": 0.4155917326453756, + "grad_norm": 0.12844409048557281, + "learning_rate": 0.0006400308816555362, + "loss": 2.7044, + "step": 14015 + }, + { + "epoch": 0.41562138599768705, + "grad_norm": 0.1411695033311844, + "learning_rate": 0.0006399857129609135, + "loss": 2.6569, + "step": 14016 + }, + { + "epoch": 0.41565103934999853, + "grad_norm": 0.13654138147830963, + "learning_rate": 0.0006399405430266935, + "loss": 2.7211, + "step": 14017 + }, + { + "epoch": 0.41568069270231, + "grad_norm": 0.13599033653736115, + "learning_rate": 0.0006398953718532758, + "loss": 2.6905, + "step": 14018 + }, + { + "epoch": 0.4157103460546215, + "grad_norm": 0.13027909398078918, + "learning_rate": 0.0006398501994410607, + "loss": 2.7162, + "step": 14019 + }, + { + "epoch": 0.41573999940693296, + "grad_norm": 0.16275429725646973, + "learning_rate": 0.0006398050257904482, + "loss": 2.7095, + "step": 14020 + }, + { + "epoch": 0.41576965275924443, + "grad_norm": 0.13743259012699127, + "learning_rate": 0.000639759850901838, + "loss": 2.6974, + "step": 14021 + }, + { + "epoch": 0.4157993061115559, + "grad_norm": 0.12490201741456985, + "learning_rate": 0.0006397146747756304, + "loss": 2.7218, + "step": 14022 + }, + { + "epoch": 0.4158289594638674, + "grad_norm": 0.11780110746622086, + "learning_rate": 0.0006396694974122253, + "loss": 2.6981, + "step": 14023 + }, + { + "epoch": 0.41585861281617886, + "grad_norm": 0.1247681975364685, + "learning_rate": 0.0006396243188120228, + "loss": 2.7116, + "step": 14024 + }, + { + "epoch": 0.41588826616849034, + "grad_norm": 0.13218282163143158, + "learning_rate": 0.0006395791389754231, + "loss": 2.6949, + "step": 14025 + }, + { + "epoch": 0.4159179195208018, + "grad_norm": 0.11983265727758408, + "learning_rate": 0.0006395339579028261, + "loss": 2.7078, + "step": 14026 + }, + { + "epoch": 0.4159475728731133, + "grad_norm": 0.0987960547208786, + "learning_rate": 0.000639488775594632, + "loss": 2.697, + "step": 14027 + }, + { + "epoch": 0.41597722622542477, + "grad_norm": 0.13674576580524445, + "learning_rate": 0.0006394435920512408, + "loss": 2.7176, + "step": 14028 + }, + { + "epoch": 0.41600687957773624, + "grad_norm": 0.1385212242603302, + "learning_rate": 0.0006393984072730525, + "loss": 2.7147, + "step": 14029 + }, + { + "epoch": 0.4160365329300477, + "grad_norm": 0.14011061191558838, + "learning_rate": 0.0006393532212604676, + "loss": 2.7367, + "step": 14030 + }, + { + "epoch": 0.4160661862823592, + "grad_norm": 0.13800795376300812, + "learning_rate": 0.0006393080340138861, + "loss": 2.7306, + "step": 14031 + }, + { + "epoch": 0.4160958396346707, + "grad_norm": 0.1349092423915863, + "learning_rate": 0.000639262845533708, + "loss": 2.7192, + "step": 14032 + }, + { + "epoch": 0.4161254929869822, + "grad_norm": 0.13860727846622467, + "learning_rate": 0.0006392176558203333, + "loss": 2.7196, + "step": 14033 + }, + { + "epoch": 0.4161551463392937, + "grad_norm": 0.13216307759284973, + "learning_rate": 0.0006391724648741625, + "loss": 2.6798, + "step": 14034 + }, + { + "epoch": 0.41618479969160516, + "grad_norm": 0.13056831061840057, + "learning_rate": 0.0006391272726955955, + "loss": 2.7278, + "step": 14035 + }, + { + "epoch": 0.41621445304391663, + "grad_norm": 0.15430913865566254, + "learning_rate": 0.0006390820792850328, + "loss": 2.7041, + "step": 14036 + }, + { + "epoch": 0.4162441063962281, + "grad_norm": 0.13153153657913208, + "learning_rate": 0.0006390368846428743, + "loss": 2.6897, + "step": 14037 + }, + { + "epoch": 0.4162737597485396, + "grad_norm": 0.13354885578155518, + "learning_rate": 0.0006389916887695204, + "loss": 2.7265, + "step": 14038 + }, + { + "epoch": 0.41630341310085106, + "grad_norm": 0.1327657252550125, + "learning_rate": 0.0006389464916653711, + "loss": 2.7268, + "step": 14039 + }, + { + "epoch": 0.41633306645316254, + "grad_norm": 0.15602698922157288, + "learning_rate": 0.000638901293330827, + "loss": 2.708, + "step": 14040 + }, + { + "epoch": 0.416362719805474, + "grad_norm": 0.1517293006181717, + "learning_rate": 0.000638856093766288, + "loss": 2.6967, + "step": 14041 + }, + { + "epoch": 0.4163923731577855, + "grad_norm": 0.13268499076366425, + "learning_rate": 0.0006388108929721543, + "loss": 2.7067, + "step": 14042 + }, + { + "epoch": 0.41642202651009697, + "grad_norm": 0.14912846684455872, + "learning_rate": 0.0006387656909488264, + "loss": 2.6783, + "step": 14043 + }, + { + "epoch": 0.41645167986240844, + "grad_norm": 0.13472259044647217, + "learning_rate": 0.0006387204876967046, + "loss": 2.7128, + "step": 14044 + }, + { + "epoch": 0.4164813332147199, + "grad_norm": 0.1351279765367508, + "learning_rate": 0.0006386752832161889, + "loss": 2.7196, + "step": 14045 + }, + { + "epoch": 0.4165109865670314, + "grad_norm": 0.14277616143226624, + "learning_rate": 0.0006386300775076799, + "loss": 2.6677, + "step": 14046 + }, + { + "epoch": 0.41654063991934287, + "grad_norm": 0.11934056133031845, + "learning_rate": 0.0006385848705715778, + "loss": 2.6655, + "step": 14047 + }, + { + "epoch": 0.41657029327165435, + "grad_norm": 0.14430777728557587, + "learning_rate": 0.0006385396624082828, + "loss": 2.7128, + "step": 14048 + }, + { + "epoch": 0.4165999466239658, + "grad_norm": 0.10047568380832672, + "learning_rate": 0.0006384944530181953, + "loss": 2.6851, + "step": 14049 + }, + { + "epoch": 0.4166295999762773, + "grad_norm": 0.12323278933763504, + "learning_rate": 0.0006384492424017157, + "loss": 2.7183, + "step": 14050 + }, + { + "epoch": 0.4166592533285888, + "grad_norm": 0.11534887552261353, + "learning_rate": 0.0006384040305592442, + "loss": 2.6998, + "step": 14051 + }, + { + "epoch": 0.41668890668090025, + "grad_norm": 0.119362972676754, + "learning_rate": 0.0006383588174911813, + "loss": 2.7318, + "step": 14052 + }, + { + "epoch": 0.4167185600332118, + "grad_norm": 0.13013173639774323, + "learning_rate": 0.0006383136031979274, + "loss": 2.7126, + "step": 14053 + }, + { + "epoch": 0.41674821338552326, + "grad_norm": 0.12695887684822083, + "learning_rate": 0.0006382683876798829, + "loss": 2.6994, + "step": 14054 + }, + { + "epoch": 0.41677786673783473, + "grad_norm": 0.12399139255285263, + "learning_rate": 0.0006382231709374477, + "loss": 2.7042, + "step": 14055 + }, + { + "epoch": 0.4168075200901462, + "grad_norm": 0.130340576171875, + "learning_rate": 0.0006381779529710229, + "loss": 2.7166, + "step": 14056 + }, + { + "epoch": 0.4168371734424577, + "grad_norm": 0.12655647099018097, + "learning_rate": 0.0006381327337810084, + "loss": 2.7099, + "step": 14057 + }, + { + "epoch": 0.41686682679476916, + "grad_norm": 0.12685123085975647, + "learning_rate": 0.0006380875133678052, + "loss": 2.7025, + "step": 14058 + }, + { + "epoch": 0.41689648014708064, + "grad_norm": 0.13463833928108215, + "learning_rate": 0.0006380422917318131, + "loss": 2.7173, + "step": 14059 + }, + { + "epoch": 0.4169261334993921, + "grad_norm": 0.13633184134960175, + "learning_rate": 0.0006379970688734327, + "loss": 2.6981, + "step": 14060 + }, + { + "epoch": 0.4169557868517036, + "grad_norm": 0.12522676587104797, + "learning_rate": 0.0006379518447930648, + "loss": 2.726, + "step": 14061 + }, + { + "epoch": 0.41698544020401507, + "grad_norm": 0.1470107138156891, + "learning_rate": 0.0006379066194911095, + "loss": 2.6868, + "step": 14062 + }, + { + "epoch": 0.41701509355632654, + "grad_norm": 0.15502247214317322, + "learning_rate": 0.0006378613929679675, + "loss": 2.7252, + "step": 14063 + }, + { + "epoch": 0.417044746908638, + "grad_norm": 0.16319392621517181, + "learning_rate": 0.0006378161652240391, + "loss": 2.716, + "step": 14064 + }, + { + "epoch": 0.4170744002609495, + "grad_norm": 0.18404322862625122, + "learning_rate": 0.0006377709362597251, + "loss": 2.7101, + "step": 14065 + }, + { + "epoch": 0.41710405361326097, + "grad_norm": 0.16336777806282043, + "learning_rate": 0.0006377257060754257, + "loss": 2.6996, + "step": 14066 + }, + { + "epoch": 0.41713370696557245, + "grad_norm": 0.14057278633117676, + "learning_rate": 0.0006376804746715414, + "loss": 2.6828, + "step": 14067 + }, + { + "epoch": 0.4171633603178839, + "grad_norm": 0.1373700499534607, + "learning_rate": 0.0006376352420484728, + "loss": 2.734, + "step": 14068 + }, + { + "epoch": 0.4171930136701954, + "grad_norm": 0.1476706862449646, + "learning_rate": 0.000637590008206621, + "loss": 2.7273, + "step": 14069 + }, + { + "epoch": 0.4172226670225069, + "grad_norm": 0.15358814597129822, + "learning_rate": 0.0006375447731463857, + "loss": 2.7242, + "step": 14070 + }, + { + "epoch": 0.41725232037481835, + "grad_norm": 0.13347706198692322, + "learning_rate": 0.0006374995368681678, + "loss": 2.677, + "step": 14071 + }, + { + "epoch": 0.41728197372712983, + "grad_norm": 0.13075922429561615, + "learning_rate": 0.000637454299372368, + "loss": 2.7273, + "step": 14072 + }, + { + "epoch": 0.4173116270794413, + "grad_norm": 0.14870932698249817, + "learning_rate": 0.0006374090606593867, + "loss": 2.6954, + "step": 14073 + }, + { + "epoch": 0.41734128043175284, + "grad_norm": 0.12020551413297653, + "learning_rate": 0.0006373638207296246, + "loss": 2.6728, + "step": 14074 + }, + { + "epoch": 0.4173709337840643, + "grad_norm": 0.12301424890756607, + "learning_rate": 0.0006373185795834823, + "loss": 2.7103, + "step": 14075 + }, + { + "epoch": 0.4174005871363758, + "grad_norm": 0.1300835907459259, + "learning_rate": 0.0006372733372213605, + "loss": 2.7427, + "step": 14076 + }, + { + "epoch": 0.41743024048868727, + "grad_norm": 0.11475612223148346, + "learning_rate": 0.0006372280936436597, + "loss": 2.6778, + "step": 14077 + }, + { + "epoch": 0.41745989384099874, + "grad_norm": 0.11856023967266083, + "learning_rate": 0.0006371828488507805, + "loss": 2.7447, + "step": 14078 + }, + { + "epoch": 0.4174895471933102, + "grad_norm": 0.13023829460144043, + "learning_rate": 0.0006371376028431237, + "loss": 2.7267, + "step": 14079 + }, + { + "epoch": 0.4175192005456217, + "grad_norm": 0.12715259194374084, + "learning_rate": 0.0006370923556210898, + "loss": 2.7043, + "step": 14080 + }, + { + "epoch": 0.41754885389793317, + "grad_norm": 0.12154920399188995, + "learning_rate": 0.0006370471071850797, + "loss": 2.7035, + "step": 14081 + }, + { + "epoch": 0.41757850725024465, + "grad_norm": 0.11523303389549255, + "learning_rate": 0.0006370018575354938, + "loss": 2.7131, + "step": 14082 + }, + { + "epoch": 0.4176081606025561, + "grad_norm": 0.11843223124742508, + "learning_rate": 0.000636956606672733, + "loss": 2.7004, + "step": 14083 + }, + { + "epoch": 0.4176378139548676, + "grad_norm": 0.10740230232477188, + "learning_rate": 0.000636911354597198, + "loss": 2.6863, + "step": 14084 + }, + { + "epoch": 0.4176674673071791, + "grad_norm": 0.10980705171823502, + "learning_rate": 0.0006368661013092893, + "loss": 2.7073, + "step": 14085 + }, + { + "epoch": 0.41769712065949055, + "grad_norm": 0.1341886669397354, + "learning_rate": 0.000636820846809408, + "loss": 2.684, + "step": 14086 + }, + { + "epoch": 0.417726774011802, + "grad_norm": 0.12133851647377014, + "learning_rate": 0.0006367755910979543, + "loss": 2.6982, + "step": 14087 + }, + { + "epoch": 0.4177564273641135, + "grad_norm": 0.12393985688686371, + "learning_rate": 0.0006367303341753294, + "loss": 2.7119, + "step": 14088 + }, + { + "epoch": 0.417786080716425, + "grad_norm": 0.11525299400091171, + "learning_rate": 0.000636685076041934, + "loss": 2.7056, + "step": 14089 + }, + { + "epoch": 0.41781573406873646, + "grad_norm": 0.10957042872905731, + "learning_rate": 0.0006366398166981689, + "loss": 2.7281, + "step": 14090 + }, + { + "epoch": 0.41784538742104793, + "grad_norm": 0.12030340731143951, + "learning_rate": 0.0006365945561444346, + "loss": 2.7327, + "step": 14091 + }, + { + "epoch": 0.4178750407733594, + "grad_norm": 0.1293923258781433, + "learning_rate": 0.0006365492943811321, + "loss": 2.723, + "step": 14092 + }, + { + "epoch": 0.4179046941256709, + "grad_norm": 0.11581818014383316, + "learning_rate": 0.0006365040314086622, + "loss": 2.7133, + "step": 14093 + }, + { + "epoch": 0.41793434747798236, + "grad_norm": 0.11498330533504486, + "learning_rate": 0.0006364587672274255, + "loss": 2.7311, + "step": 14094 + }, + { + "epoch": 0.4179640008302939, + "grad_norm": 0.11860304325819016, + "learning_rate": 0.0006364135018378231, + "loss": 2.7122, + "step": 14095 + }, + { + "epoch": 0.41799365418260537, + "grad_norm": 0.13519439101219177, + "learning_rate": 0.0006363682352402558, + "loss": 2.7391, + "step": 14096 + }, + { + "epoch": 0.41802330753491684, + "grad_norm": 0.12911701202392578, + "learning_rate": 0.0006363229674351243, + "loss": 2.7104, + "step": 14097 + }, + { + "epoch": 0.4180529608872283, + "grad_norm": 0.13625949621200562, + "learning_rate": 0.0006362776984228295, + "loss": 2.651, + "step": 14098 + }, + { + "epoch": 0.4180826142395398, + "grad_norm": 0.14056028425693512, + "learning_rate": 0.0006362324282037724, + "loss": 2.6953, + "step": 14099 + }, + { + "epoch": 0.41811226759185127, + "grad_norm": 0.16268832981586456, + "learning_rate": 0.0006361871567783536, + "loss": 2.6691, + "step": 14100 + }, + { + "epoch": 0.41814192094416275, + "grad_norm": 0.15191759169101715, + "learning_rate": 0.0006361418841469743, + "loss": 2.693, + "step": 14101 + }, + { + "epoch": 0.4181715742964742, + "grad_norm": 0.15064121782779694, + "learning_rate": 0.0006360966103100352, + "loss": 2.7295, + "step": 14102 + }, + { + "epoch": 0.4182012276487857, + "grad_norm": 0.1752772182226181, + "learning_rate": 0.0006360513352679372, + "loss": 2.6874, + "step": 14103 + }, + { + "epoch": 0.4182308810010972, + "grad_norm": 0.16416415572166443, + "learning_rate": 0.0006360060590210814, + "loss": 2.6929, + "step": 14104 + }, + { + "epoch": 0.41826053435340865, + "grad_norm": 0.14390172064304352, + "learning_rate": 0.0006359607815698685, + "loss": 2.6892, + "step": 14105 + }, + { + "epoch": 0.41829018770572013, + "grad_norm": 0.1495966762304306, + "learning_rate": 0.0006359155029146995, + "loss": 2.7092, + "step": 14106 + }, + { + "epoch": 0.4183198410580316, + "grad_norm": 0.13963906466960907, + "learning_rate": 0.0006358702230559755, + "loss": 2.7448, + "step": 14107 + }, + { + "epoch": 0.4183494944103431, + "grad_norm": 0.13638420403003693, + "learning_rate": 0.0006358249419940972, + "loss": 2.6971, + "step": 14108 + }, + { + "epoch": 0.41837914776265456, + "grad_norm": 0.12903639674186707, + "learning_rate": 0.0006357796597294659, + "loss": 2.6717, + "step": 14109 + }, + { + "epoch": 0.41840880111496603, + "grad_norm": 0.1376136839389801, + "learning_rate": 0.0006357343762624823, + "loss": 2.6753, + "step": 14110 + }, + { + "epoch": 0.4184384544672775, + "grad_norm": 0.1211959719657898, + "learning_rate": 0.0006356890915935475, + "loss": 2.6727, + "step": 14111 + }, + { + "epoch": 0.418468107819589, + "grad_norm": 0.123679518699646, + "learning_rate": 0.0006356438057230626, + "loss": 2.6369, + "step": 14112 + }, + { + "epoch": 0.41849776117190046, + "grad_norm": 0.11959624290466309, + "learning_rate": 0.0006355985186514284, + "loss": 2.6924, + "step": 14113 + }, + { + "epoch": 0.41852741452421194, + "grad_norm": 0.10079233348369598, + "learning_rate": 0.0006355532303790461, + "loss": 2.6874, + "step": 14114 + }, + { + "epoch": 0.4185570678765234, + "grad_norm": 0.11478500813245773, + "learning_rate": 0.0006355079409063167, + "loss": 2.7134, + "step": 14115 + }, + { + "epoch": 0.41858672122883495, + "grad_norm": 0.13683336973190308, + "learning_rate": 0.0006354626502336412, + "loss": 2.6918, + "step": 14116 + }, + { + "epoch": 0.4186163745811464, + "grad_norm": 0.15179339051246643, + "learning_rate": 0.0006354173583614207, + "loss": 2.71, + "step": 14117 + }, + { + "epoch": 0.4186460279334579, + "grad_norm": 0.1196797639131546, + "learning_rate": 0.0006353720652900561, + "loss": 2.6824, + "step": 14118 + }, + { + "epoch": 0.4186756812857694, + "grad_norm": 0.11438514292240143, + "learning_rate": 0.0006353267710199488, + "loss": 2.6948, + "step": 14119 + }, + { + "epoch": 0.41870533463808085, + "grad_norm": 0.1260775923728943, + "learning_rate": 0.0006352814755514997, + "loss": 2.6966, + "step": 14120 + }, + { + "epoch": 0.4187349879903923, + "grad_norm": 0.12131358683109283, + "learning_rate": 0.0006352361788851098, + "loss": 2.7126, + "step": 14121 + }, + { + "epoch": 0.4187646413427038, + "grad_norm": 0.1218286007642746, + "learning_rate": 0.0006351908810211804, + "loss": 2.6976, + "step": 14122 + }, + { + "epoch": 0.4187942946950153, + "grad_norm": 0.14312033355236053, + "learning_rate": 0.0006351455819601125, + "loss": 2.7253, + "step": 14123 + }, + { + "epoch": 0.41882394804732676, + "grad_norm": 0.12785851955413818, + "learning_rate": 0.0006351002817023075, + "loss": 2.7239, + "step": 14124 + }, + { + "epoch": 0.41885360139963823, + "grad_norm": 0.12504112720489502, + "learning_rate": 0.0006350549802481661, + "loss": 2.7115, + "step": 14125 + }, + { + "epoch": 0.4188832547519497, + "grad_norm": 0.1279270499944687, + "learning_rate": 0.0006350096775980896, + "loss": 2.6977, + "step": 14126 + }, + { + "epoch": 0.4189129081042612, + "grad_norm": 0.13138580322265625, + "learning_rate": 0.0006349643737524793, + "loss": 2.6836, + "step": 14127 + }, + { + "epoch": 0.41894256145657266, + "grad_norm": 0.10810942947864532, + "learning_rate": 0.0006349190687117363, + "loss": 2.7103, + "step": 14128 + }, + { + "epoch": 0.41897221480888414, + "grad_norm": 0.11757685989141464, + "learning_rate": 0.0006348737624762619, + "loss": 2.7159, + "step": 14129 + }, + { + "epoch": 0.4190018681611956, + "grad_norm": 0.11935412883758545, + "learning_rate": 0.0006348284550464572, + "loss": 2.7144, + "step": 14130 + }, + { + "epoch": 0.4190315215135071, + "grad_norm": 0.10876473784446716, + "learning_rate": 0.0006347831464227233, + "loss": 2.7467, + "step": 14131 + }, + { + "epoch": 0.41906117486581856, + "grad_norm": 0.1301940530538559, + "learning_rate": 0.0006347378366054614, + "loss": 2.7101, + "step": 14132 + }, + { + "epoch": 0.41909082821813004, + "grad_norm": 0.14544124901294708, + "learning_rate": 0.0006346925255950728, + "loss": 2.7046, + "step": 14133 + }, + { + "epoch": 0.4191204815704415, + "grad_norm": 0.11887682229280472, + "learning_rate": 0.0006346472133919591, + "loss": 2.7001, + "step": 14134 + }, + { + "epoch": 0.419150134922753, + "grad_norm": 0.12433471530675888, + "learning_rate": 0.0006346018999965209, + "loss": 2.7073, + "step": 14135 + }, + { + "epoch": 0.41917978827506447, + "grad_norm": 0.13579095900058746, + "learning_rate": 0.0006345565854091599, + "loss": 2.7104, + "step": 14136 + }, + { + "epoch": 0.419209441627376, + "grad_norm": 0.121931292116642, + "learning_rate": 0.0006345112696302772, + "loss": 2.696, + "step": 14137 + }, + { + "epoch": 0.4192390949796875, + "grad_norm": 0.1253630667924881, + "learning_rate": 0.0006344659526602742, + "loss": 2.672, + "step": 14138 + }, + { + "epoch": 0.41926874833199895, + "grad_norm": 0.126763254404068, + "learning_rate": 0.000634420634499552, + "loss": 2.6594, + "step": 14139 + }, + { + "epoch": 0.41929840168431043, + "grad_norm": 0.12772828340530396, + "learning_rate": 0.0006343753151485121, + "loss": 2.7154, + "step": 14140 + }, + { + "epoch": 0.4193280550366219, + "grad_norm": 0.14328360557556152, + "learning_rate": 0.0006343299946075556, + "loss": 2.7032, + "step": 14141 + }, + { + "epoch": 0.4193577083889334, + "grad_norm": 0.15370343625545502, + "learning_rate": 0.0006342846728770841, + "loss": 2.6918, + "step": 14142 + }, + { + "epoch": 0.41938736174124486, + "grad_norm": 0.13372653722763062, + "learning_rate": 0.0006342393499574986, + "loss": 2.721, + "step": 14143 + }, + { + "epoch": 0.41941701509355633, + "grad_norm": 0.1294231414794922, + "learning_rate": 0.0006341940258492007, + "loss": 2.6753, + "step": 14144 + }, + { + "epoch": 0.4194466684458678, + "grad_norm": 0.1197662428021431, + "learning_rate": 0.0006341487005525917, + "loss": 2.7111, + "step": 14145 + }, + { + "epoch": 0.4194763217981793, + "grad_norm": 0.14069189131259918, + "learning_rate": 0.0006341033740680729, + "loss": 2.7034, + "step": 14146 + }, + { + "epoch": 0.41950597515049076, + "grad_norm": 0.13027063012123108, + "learning_rate": 0.0006340580463960457, + "loss": 2.6786, + "step": 14147 + }, + { + "epoch": 0.41953562850280224, + "grad_norm": 0.13370567560195923, + "learning_rate": 0.0006340127175369115, + "loss": 2.7485, + "step": 14148 + }, + { + "epoch": 0.4195652818551137, + "grad_norm": 0.12837933003902435, + "learning_rate": 0.0006339673874910716, + "loss": 2.6936, + "step": 14149 + }, + { + "epoch": 0.4195949352074252, + "grad_norm": 0.12257184833288193, + "learning_rate": 0.0006339220562589276, + "loss": 2.7079, + "step": 14150 + }, + { + "epoch": 0.41962458855973667, + "grad_norm": 0.12758374214172363, + "learning_rate": 0.0006338767238408809, + "loss": 2.6906, + "step": 14151 + }, + { + "epoch": 0.41965424191204814, + "grad_norm": 0.11626388877630234, + "learning_rate": 0.0006338313902373325, + "loss": 2.6985, + "step": 14152 + }, + { + "epoch": 0.4196838952643596, + "grad_norm": 0.11580109596252441, + "learning_rate": 0.0006337860554486844, + "loss": 2.7133, + "step": 14153 + }, + { + "epoch": 0.4197135486166711, + "grad_norm": 0.12661516666412354, + "learning_rate": 0.0006337407194753377, + "loss": 2.6994, + "step": 14154 + }, + { + "epoch": 0.41974320196898257, + "grad_norm": 0.10668443143367767, + "learning_rate": 0.0006336953823176941, + "loss": 2.6755, + "step": 14155 + }, + { + "epoch": 0.41977285532129405, + "grad_norm": 0.12831784784793854, + "learning_rate": 0.0006336500439761549, + "loss": 2.6945, + "step": 14156 + }, + { + "epoch": 0.4198025086736055, + "grad_norm": 0.11414190381765366, + "learning_rate": 0.0006336047044511217, + "loss": 2.6686, + "step": 14157 + }, + { + "epoch": 0.41983216202591706, + "grad_norm": 0.13428910076618195, + "learning_rate": 0.0006335593637429957, + "loss": 2.7343, + "step": 14158 + }, + { + "epoch": 0.41986181537822853, + "grad_norm": 0.12865214049816132, + "learning_rate": 0.0006335140218521788, + "loss": 2.7298, + "step": 14159 + }, + { + "epoch": 0.41989146873054, + "grad_norm": 0.12212948501110077, + "learning_rate": 0.0006334686787790722, + "loss": 2.7309, + "step": 14160 + }, + { + "epoch": 0.4199211220828515, + "grad_norm": 0.12084874510765076, + "learning_rate": 0.0006334233345240776, + "loss": 2.6885, + "step": 14161 + }, + { + "epoch": 0.41995077543516296, + "grad_norm": 0.14107325673103333, + "learning_rate": 0.0006333779890875966, + "loss": 2.6655, + "step": 14162 + }, + { + "epoch": 0.41998042878747444, + "grad_norm": 0.14109072089195251, + "learning_rate": 0.0006333326424700304, + "loss": 2.7216, + "step": 14163 + }, + { + "epoch": 0.4200100821397859, + "grad_norm": 0.12958185374736786, + "learning_rate": 0.000633287294671781, + "loss": 2.7004, + "step": 14164 + }, + { + "epoch": 0.4200397354920974, + "grad_norm": 0.12093593180179596, + "learning_rate": 0.0006332419456932493, + "loss": 2.7328, + "step": 14165 + }, + { + "epoch": 0.42006938884440886, + "grad_norm": 0.13104408979415894, + "learning_rate": 0.0006331965955348375, + "loss": 2.7116, + "step": 14166 + }, + { + "epoch": 0.42009904219672034, + "grad_norm": 0.12289398908615112, + "learning_rate": 0.0006331512441969473, + "loss": 2.7086, + "step": 14167 + }, + { + "epoch": 0.4201286955490318, + "grad_norm": 0.14148937165737152, + "learning_rate": 0.0006331058916799797, + "loss": 2.7246, + "step": 14168 + }, + { + "epoch": 0.4201583489013433, + "grad_norm": 0.10586618632078171, + "learning_rate": 0.0006330605379843366, + "loss": 2.7281, + "step": 14169 + }, + { + "epoch": 0.42018800225365477, + "grad_norm": 0.12312048673629761, + "learning_rate": 0.0006330151831104196, + "loss": 2.6761, + "step": 14170 + }, + { + "epoch": 0.42021765560596624, + "grad_norm": 0.1387835294008255, + "learning_rate": 0.0006329698270586302, + "loss": 2.7049, + "step": 14171 + }, + { + "epoch": 0.4202473089582777, + "grad_norm": 0.1364779770374298, + "learning_rate": 0.0006329244698293704, + "loss": 2.7083, + "step": 14172 + }, + { + "epoch": 0.4202769623105892, + "grad_norm": 0.1327020674943924, + "learning_rate": 0.0006328791114230414, + "loss": 2.7011, + "step": 14173 + }, + { + "epoch": 0.4203066156629007, + "grad_norm": 0.13738670945167542, + "learning_rate": 0.0006328337518400453, + "loss": 2.6987, + "step": 14174 + }, + { + "epoch": 0.42033626901521215, + "grad_norm": 0.13141147792339325, + "learning_rate": 0.0006327883910807832, + "loss": 2.7157, + "step": 14175 + }, + { + "epoch": 0.4203659223675236, + "grad_norm": 0.131684347987175, + "learning_rate": 0.0006327430291456573, + "loss": 2.6886, + "step": 14176 + }, + { + "epoch": 0.4203955757198351, + "grad_norm": 0.15796402096748352, + "learning_rate": 0.0006326976660350691, + "loss": 2.7384, + "step": 14177 + }, + { + "epoch": 0.42042522907214663, + "grad_norm": 0.19461975991725922, + "learning_rate": 0.0006326523017494202, + "loss": 2.7288, + "step": 14178 + }, + { + "epoch": 0.4204548824244581, + "grad_norm": 0.17790383100509644, + "learning_rate": 0.0006326069362891125, + "loss": 2.7196, + "step": 14179 + }, + { + "epoch": 0.4204845357767696, + "grad_norm": 0.14039570093154907, + "learning_rate": 0.0006325615696545476, + "loss": 2.7011, + "step": 14180 + }, + { + "epoch": 0.42051418912908106, + "grad_norm": 0.13119317591190338, + "learning_rate": 0.0006325162018461272, + "loss": 2.7106, + "step": 14181 + }, + { + "epoch": 0.42054384248139254, + "grad_norm": 0.14574937522411346, + "learning_rate": 0.0006324708328642531, + "loss": 2.6902, + "step": 14182 + }, + { + "epoch": 0.420573495833704, + "grad_norm": 0.13969534635543823, + "learning_rate": 0.0006324254627093271, + "loss": 2.7104, + "step": 14183 + }, + { + "epoch": 0.4206031491860155, + "grad_norm": 0.1221127137541771, + "learning_rate": 0.0006323800913817508, + "loss": 2.7025, + "step": 14184 + }, + { + "epoch": 0.42063280253832697, + "grad_norm": 0.1313803344964981, + "learning_rate": 0.000632334718881926, + "loss": 2.7208, + "step": 14185 + }, + { + "epoch": 0.42066245589063844, + "grad_norm": 0.14706535637378693, + "learning_rate": 0.0006322893452102548, + "loss": 2.7144, + "step": 14186 + }, + { + "epoch": 0.4206921092429499, + "grad_norm": 0.12707597017288208, + "learning_rate": 0.0006322439703671385, + "loss": 2.7006, + "step": 14187 + }, + { + "epoch": 0.4207217625952614, + "grad_norm": 0.1328766644001007, + "learning_rate": 0.0006321985943529793, + "loss": 2.7066, + "step": 14188 + }, + { + "epoch": 0.42075141594757287, + "grad_norm": 0.13558553159236908, + "learning_rate": 0.0006321532171681788, + "loss": 2.7121, + "step": 14189 + }, + { + "epoch": 0.42078106929988435, + "grad_norm": 0.12131116539239883, + "learning_rate": 0.000632107838813139, + "loss": 2.7267, + "step": 14190 + }, + { + "epoch": 0.4208107226521958, + "grad_norm": 0.1296631544828415, + "learning_rate": 0.0006320624592882614, + "loss": 2.6952, + "step": 14191 + }, + { + "epoch": 0.4208403760045073, + "grad_norm": 0.1277940422296524, + "learning_rate": 0.0006320170785939481, + "loss": 2.7176, + "step": 14192 + }, + { + "epoch": 0.4208700293568188, + "grad_norm": 0.1220906674861908, + "learning_rate": 0.000631971696730601, + "loss": 2.7216, + "step": 14193 + }, + { + "epoch": 0.42089968270913025, + "grad_norm": 0.13049843907356262, + "learning_rate": 0.0006319263136986218, + "loss": 2.7111, + "step": 14194 + }, + { + "epoch": 0.42092933606144173, + "grad_norm": 0.14218370616436005, + "learning_rate": 0.0006318809294984125, + "loss": 2.7448, + "step": 14195 + }, + { + "epoch": 0.4209589894137532, + "grad_norm": 0.14930284023284912, + "learning_rate": 0.000631835544130375, + "loss": 2.689, + "step": 14196 + }, + { + "epoch": 0.4209886427660647, + "grad_norm": 0.11559825390577316, + "learning_rate": 0.0006317901575949109, + "loss": 2.6931, + "step": 14197 + }, + { + "epoch": 0.42101829611837616, + "grad_norm": 0.12661738693714142, + "learning_rate": 0.0006317447698924223, + "loss": 2.7373, + "step": 14198 + }, + { + "epoch": 0.4210479494706877, + "grad_norm": 0.13706810772418976, + "learning_rate": 0.0006316993810233114, + "loss": 2.7109, + "step": 14199 + }, + { + "epoch": 0.42107760282299916, + "grad_norm": 0.12267930060625076, + "learning_rate": 0.00063165399098798, + "loss": 2.7276, + "step": 14200 + }, + { + "epoch": 0.42110725617531064, + "grad_norm": 0.11649032682180405, + "learning_rate": 0.0006316085997868297, + "loss": 2.7328, + "step": 14201 + }, + { + "epoch": 0.4211369095276221, + "grad_norm": 0.11640917509794235, + "learning_rate": 0.0006315632074202626, + "loss": 2.7125, + "step": 14202 + }, + { + "epoch": 0.4211665628799336, + "grad_norm": 0.10317764431238174, + "learning_rate": 0.0006315178138886808, + "loss": 2.7034, + "step": 14203 + }, + { + "epoch": 0.42119621623224507, + "grad_norm": 0.10293765366077423, + "learning_rate": 0.000631472419192486, + "loss": 2.7185, + "step": 14204 + }, + { + "epoch": 0.42122586958455654, + "grad_norm": 0.09766088426113129, + "learning_rate": 0.0006314270233320806, + "loss": 2.7055, + "step": 14205 + }, + { + "epoch": 0.421255522936868, + "grad_norm": 0.10206327587366104, + "learning_rate": 0.0006313816263078662, + "loss": 2.6899, + "step": 14206 + }, + { + "epoch": 0.4212851762891795, + "grad_norm": 0.11197154968976974, + "learning_rate": 0.000631336228120245, + "loss": 2.7043, + "step": 14207 + }, + { + "epoch": 0.421314829641491, + "grad_norm": 0.12111078947782516, + "learning_rate": 0.0006312908287696191, + "loss": 2.6939, + "step": 14208 + }, + { + "epoch": 0.42134448299380245, + "grad_norm": 0.11229080706834793, + "learning_rate": 0.0006312454282563902, + "loss": 2.7384, + "step": 14209 + }, + { + "epoch": 0.4213741363461139, + "grad_norm": 0.1098097413778305, + "learning_rate": 0.0006312000265809606, + "loss": 2.705, + "step": 14210 + }, + { + "epoch": 0.4214037896984254, + "grad_norm": 0.11237044632434845, + "learning_rate": 0.0006311546237437321, + "loss": 2.6638, + "step": 14211 + }, + { + "epoch": 0.4214334430507369, + "grad_norm": 0.1212485060095787, + "learning_rate": 0.000631109219745107, + "loss": 2.7284, + "step": 14212 + }, + { + "epoch": 0.42146309640304835, + "grad_norm": 0.12921464443206787, + "learning_rate": 0.0006310638145854872, + "loss": 2.7262, + "step": 14213 + }, + { + "epoch": 0.42149274975535983, + "grad_norm": 0.14054147899150848, + "learning_rate": 0.000631018408265275, + "loss": 2.7099, + "step": 14214 + }, + { + "epoch": 0.4215224031076713, + "grad_norm": 0.13054321706295013, + "learning_rate": 0.0006309730007848722, + "loss": 2.7122, + "step": 14215 + }, + { + "epoch": 0.4215520564599828, + "grad_norm": 0.12784023582935333, + "learning_rate": 0.0006309275921446808, + "loss": 2.733, + "step": 14216 + }, + { + "epoch": 0.42158170981229426, + "grad_norm": 0.12518952786922455, + "learning_rate": 0.0006308821823451035, + "loss": 2.7045, + "step": 14217 + }, + { + "epoch": 0.42161136316460573, + "grad_norm": 0.1318475902080536, + "learning_rate": 0.0006308367713865416, + "loss": 2.6951, + "step": 14218 + }, + { + "epoch": 0.4216410165169172, + "grad_norm": 0.1402355134487152, + "learning_rate": 0.0006307913592693979, + "loss": 2.7052, + "step": 14219 + }, + { + "epoch": 0.42167066986922874, + "grad_norm": 0.11899781972169876, + "learning_rate": 0.0006307459459940741, + "loss": 2.7089, + "step": 14220 + }, + { + "epoch": 0.4217003232215402, + "grad_norm": 0.1148705929517746, + "learning_rate": 0.0006307005315609726, + "loss": 2.7344, + "step": 14221 + }, + { + "epoch": 0.4217299765738517, + "grad_norm": 0.13638506829738617, + "learning_rate": 0.0006306551159704955, + "loss": 2.7134, + "step": 14222 + }, + { + "epoch": 0.42175962992616317, + "grad_norm": 0.14210674166679382, + "learning_rate": 0.0006306096992230448, + "loss": 2.6861, + "step": 14223 + }, + { + "epoch": 0.42178928327847465, + "grad_norm": 0.15284359455108643, + "learning_rate": 0.0006305642813190229, + "loss": 2.7153, + "step": 14224 + }, + { + "epoch": 0.4218189366307861, + "grad_norm": 0.1349484771490097, + "learning_rate": 0.0006305188622588318, + "loss": 2.6839, + "step": 14225 + }, + { + "epoch": 0.4218485899830976, + "grad_norm": 0.1504364162683487, + "learning_rate": 0.0006304734420428739, + "loss": 2.6901, + "step": 14226 + }, + { + "epoch": 0.4218782433354091, + "grad_norm": 0.14760683476924896, + "learning_rate": 0.0006304280206715511, + "loss": 2.7068, + "step": 14227 + }, + { + "epoch": 0.42190789668772055, + "grad_norm": 0.12149283289909363, + "learning_rate": 0.000630382598145266, + "loss": 2.7163, + "step": 14228 + }, + { + "epoch": 0.42193755004003203, + "grad_norm": 0.12991267442703247, + "learning_rate": 0.0006303371744644203, + "loss": 2.6943, + "step": 14229 + }, + { + "epoch": 0.4219672033923435, + "grad_norm": 0.12571269273757935, + "learning_rate": 0.0006302917496294168, + "loss": 2.7255, + "step": 14230 + }, + { + "epoch": 0.421996856744655, + "grad_norm": 0.14639106392860413, + "learning_rate": 0.0006302463236406573, + "loss": 2.7192, + "step": 14231 + }, + { + "epoch": 0.42202651009696646, + "grad_norm": 0.12879326939582825, + "learning_rate": 0.0006302008964985444, + "loss": 2.7088, + "step": 14232 + }, + { + "epoch": 0.42205616344927793, + "grad_norm": 0.13045558333396912, + "learning_rate": 0.0006301554682034803, + "loss": 2.7345, + "step": 14233 + }, + { + "epoch": 0.4220858168015894, + "grad_norm": 0.1326521784067154, + "learning_rate": 0.0006301100387558671, + "loss": 2.7294, + "step": 14234 + }, + { + "epoch": 0.4221154701539009, + "grad_norm": 0.15426357090473175, + "learning_rate": 0.0006300646081561071, + "loss": 2.6913, + "step": 14235 + }, + { + "epoch": 0.42214512350621236, + "grad_norm": 0.1624263972043991, + "learning_rate": 0.0006300191764046026, + "loss": 2.7348, + "step": 14236 + }, + { + "epoch": 0.42217477685852384, + "grad_norm": 0.13743294775485992, + "learning_rate": 0.0006299737435017562, + "loss": 2.6776, + "step": 14237 + }, + { + "epoch": 0.4222044302108353, + "grad_norm": 0.12412616610527039, + "learning_rate": 0.0006299283094479699, + "loss": 2.7249, + "step": 14238 + }, + { + "epoch": 0.4222340835631468, + "grad_norm": 0.14540915191173553, + "learning_rate": 0.0006298828742436461, + "loss": 2.7023, + "step": 14239 + }, + { + "epoch": 0.42226373691545827, + "grad_norm": 0.1301984190940857, + "learning_rate": 0.0006298374378891871, + "loss": 2.6931, + "step": 14240 + }, + { + "epoch": 0.4222933902677698, + "grad_norm": 0.13232284784317017, + "learning_rate": 0.0006297920003849954, + "loss": 2.6863, + "step": 14241 + }, + { + "epoch": 0.4223230436200813, + "grad_norm": 0.14824053645133972, + "learning_rate": 0.0006297465617314731, + "loss": 2.708, + "step": 14242 + }, + { + "epoch": 0.42235269697239275, + "grad_norm": 0.14043444395065308, + "learning_rate": 0.000629701121929023, + "loss": 2.6984, + "step": 14243 + }, + { + "epoch": 0.4223823503247042, + "grad_norm": 0.12678717076778412, + "learning_rate": 0.0006296556809780471, + "loss": 2.7127, + "step": 14244 + }, + { + "epoch": 0.4224120036770157, + "grad_norm": 0.13477428257465363, + "learning_rate": 0.0006296102388789477, + "loss": 2.7068, + "step": 14245 + }, + { + "epoch": 0.4224416570293272, + "grad_norm": 0.11516331136226654, + "learning_rate": 0.0006295647956321276, + "loss": 2.7281, + "step": 14246 + }, + { + "epoch": 0.42247131038163865, + "grad_norm": 0.12857210636138916, + "learning_rate": 0.0006295193512379888, + "loss": 2.7167, + "step": 14247 + }, + { + "epoch": 0.42250096373395013, + "grad_norm": 0.1258857697248459, + "learning_rate": 0.0006294739056969341, + "loss": 2.7217, + "step": 14248 + }, + { + "epoch": 0.4225306170862616, + "grad_norm": 0.1169322282075882, + "learning_rate": 0.0006294284590093657, + "loss": 2.7158, + "step": 14249 + }, + { + "epoch": 0.4225602704385731, + "grad_norm": 0.12297577410936356, + "learning_rate": 0.000629383011175686, + "loss": 2.7099, + "step": 14250 + }, + { + "epoch": 0.42258992379088456, + "grad_norm": 0.11983393132686615, + "learning_rate": 0.0006293375621962975, + "loss": 2.7117, + "step": 14251 + }, + { + "epoch": 0.42261957714319603, + "grad_norm": 0.13239075243473053, + "learning_rate": 0.0006292921120716029, + "loss": 2.7155, + "step": 14252 + }, + { + "epoch": 0.4226492304955075, + "grad_norm": 0.11879435926675797, + "learning_rate": 0.0006292466608020043, + "loss": 2.7117, + "step": 14253 + }, + { + "epoch": 0.422678883847819, + "grad_norm": 0.1167101263999939, + "learning_rate": 0.0006292012083879044, + "loss": 2.7028, + "step": 14254 + }, + { + "epoch": 0.42270853720013046, + "grad_norm": 0.12112045288085938, + "learning_rate": 0.0006291557548297055, + "loss": 2.6581, + "step": 14255 + }, + { + "epoch": 0.42273819055244194, + "grad_norm": 0.11753402650356293, + "learning_rate": 0.0006291103001278102, + "loss": 2.6849, + "step": 14256 + }, + { + "epoch": 0.4227678439047534, + "grad_norm": 0.11275482922792435, + "learning_rate": 0.0006290648442826213, + "loss": 2.7493, + "step": 14257 + }, + { + "epoch": 0.4227974972570649, + "grad_norm": 0.14211666584014893, + "learning_rate": 0.0006290193872945408, + "loss": 2.7224, + "step": 14258 + }, + { + "epoch": 0.42282715060937637, + "grad_norm": 0.14187940955162048, + "learning_rate": 0.0006289739291639716, + "loss": 2.7129, + "step": 14259 + }, + { + "epoch": 0.42285680396168784, + "grad_norm": 0.12367641180753708, + "learning_rate": 0.0006289284698913161, + "loss": 2.6898, + "step": 14260 + }, + { + "epoch": 0.4228864573139993, + "grad_norm": 0.1303272247314453, + "learning_rate": 0.0006288830094769768, + "loss": 2.6735, + "step": 14261 + }, + { + "epoch": 0.42291611066631085, + "grad_norm": 0.11662798374891281, + "learning_rate": 0.0006288375479213564, + "loss": 2.7253, + "step": 14262 + }, + { + "epoch": 0.4229457640186223, + "grad_norm": 0.14145740866661072, + "learning_rate": 0.0006287920852248573, + "loss": 2.7127, + "step": 14263 + }, + { + "epoch": 0.4229754173709338, + "grad_norm": 0.16856177151203156, + "learning_rate": 0.0006287466213878824, + "loss": 2.7296, + "step": 14264 + }, + { + "epoch": 0.4230050707232453, + "grad_norm": 0.19904714822769165, + "learning_rate": 0.0006287011564108338, + "loss": 2.6759, + "step": 14265 + }, + { + "epoch": 0.42303472407555676, + "grad_norm": 0.19421671330928802, + "learning_rate": 0.0006286556902941145, + "loss": 2.7114, + "step": 14266 + }, + { + "epoch": 0.42306437742786823, + "grad_norm": 0.14075979590415955, + "learning_rate": 0.000628610223038127, + "loss": 2.6662, + "step": 14267 + }, + { + "epoch": 0.4230940307801797, + "grad_norm": 0.12483430653810501, + "learning_rate": 0.0006285647546432738, + "loss": 2.7117, + "step": 14268 + }, + { + "epoch": 0.4231236841324912, + "grad_norm": 0.16673463582992554, + "learning_rate": 0.0006285192851099577, + "loss": 2.7019, + "step": 14269 + }, + { + "epoch": 0.42315333748480266, + "grad_norm": 0.13616083562374115, + "learning_rate": 0.0006284738144385812, + "loss": 2.6846, + "step": 14270 + }, + { + "epoch": 0.42318299083711414, + "grad_norm": 0.12474112957715988, + "learning_rate": 0.0006284283426295471, + "loss": 2.7046, + "step": 14271 + }, + { + "epoch": 0.4232126441894256, + "grad_norm": 0.15456409752368927, + "learning_rate": 0.0006283828696832581, + "loss": 2.6757, + "step": 14272 + }, + { + "epoch": 0.4232422975417371, + "grad_norm": 0.1314333975315094, + "learning_rate": 0.0006283373956001167, + "loss": 2.7195, + "step": 14273 + }, + { + "epoch": 0.42327195089404857, + "grad_norm": 0.12007545679807663, + "learning_rate": 0.0006282919203805255, + "loss": 2.7154, + "step": 14274 + }, + { + "epoch": 0.42330160424636004, + "grad_norm": 0.13577322661876678, + "learning_rate": 0.0006282464440248872, + "loss": 2.6938, + "step": 14275 + }, + { + "epoch": 0.4233312575986715, + "grad_norm": 0.13347361981868744, + "learning_rate": 0.0006282009665336049, + "loss": 2.6903, + "step": 14276 + }, + { + "epoch": 0.423360910950983, + "grad_norm": 0.12815743684768677, + "learning_rate": 0.000628155487907081, + "loss": 2.7204, + "step": 14277 + }, + { + "epoch": 0.42339056430329447, + "grad_norm": 0.14568805694580078, + "learning_rate": 0.0006281100081457181, + "loss": 2.7188, + "step": 14278 + }, + { + "epoch": 0.42342021765560595, + "grad_norm": 0.14639219641685486, + "learning_rate": 0.0006280645272499193, + "loss": 2.7247, + "step": 14279 + }, + { + "epoch": 0.4234498710079174, + "grad_norm": 0.12817247211933136, + "learning_rate": 0.000628019045220087, + "loss": 2.7032, + "step": 14280 + }, + { + "epoch": 0.4234795243602289, + "grad_norm": 0.12768720090389252, + "learning_rate": 0.000627973562056624, + "loss": 2.6764, + "step": 14281 + }, + { + "epoch": 0.42350917771254043, + "grad_norm": 0.11764764785766602, + "learning_rate": 0.0006279280777599332, + "loss": 2.73, + "step": 14282 + }, + { + "epoch": 0.4235388310648519, + "grad_norm": 0.11776132881641388, + "learning_rate": 0.0006278825923304174, + "loss": 2.6987, + "step": 14283 + }, + { + "epoch": 0.4235684844171634, + "grad_norm": 0.1225048154592514, + "learning_rate": 0.0006278371057684793, + "loss": 2.676, + "step": 14284 + }, + { + "epoch": 0.42359813776947486, + "grad_norm": 0.12498664110898972, + "learning_rate": 0.0006277916180745215, + "loss": 2.7108, + "step": 14285 + }, + { + "epoch": 0.42362779112178633, + "grad_norm": 0.11868403106927872, + "learning_rate": 0.0006277461292489473, + "loss": 2.6556, + "step": 14286 + }, + { + "epoch": 0.4236574444740978, + "grad_norm": 0.11941874772310257, + "learning_rate": 0.000627700639292159, + "loss": 2.6996, + "step": 14287 + }, + { + "epoch": 0.4236870978264093, + "grad_norm": 0.12739351391792297, + "learning_rate": 0.0006276551482045596, + "loss": 2.7238, + "step": 14288 + }, + { + "epoch": 0.42371675117872076, + "grad_norm": 0.1225125789642334, + "learning_rate": 0.000627609655986552, + "loss": 2.6969, + "step": 14289 + }, + { + "epoch": 0.42374640453103224, + "grad_norm": 0.13355110585689545, + "learning_rate": 0.0006275641626385389, + "loss": 2.6993, + "step": 14290 + }, + { + "epoch": 0.4237760578833437, + "grad_norm": 0.1454695612192154, + "learning_rate": 0.0006275186681609233, + "loss": 2.6955, + "step": 14291 + }, + { + "epoch": 0.4238057112356552, + "grad_norm": 0.11141476780176163, + "learning_rate": 0.0006274731725541081, + "loss": 2.6999, + "step": 14292 + }, + { + "epoch": 0.42383536458796667, + "grad_norm": 0.10540972650051117, + "learning_rate": 0.0006274276758184961, + "loss": 2.7189, + "step": 14293 + }, + { + "epoch": 0.42386501794027814, + "grad_norm": 0.12391334772109985, + "learning_rate": 0.0006273821779544899, + "loss": 2.6785, + "step": 14294 + }, + { + "epoch": 0.4238946712925896, + "grad_norm": 0.1334204524755478, + "learning_rate": 0.0006273366789624928, + "loss": 2.7317, + "step": 14295 + }, + { + "epoch": 0.4239243246449011, + "grad_norm": 0.12455202639102936, + "learning_rate": 0.0006272911788429076, + "loss": 2.6947, + "step": 14296 + }, + { + "epoch": 0.42395397799721257, + "grad_norm": 0.11187351495027542, + "learning_rate": 0.0006272456775961371, + "loss": 2.7067, + "step": 14297 + }, + { + "epoch": 0.42398363134952405, + "grad_norm": 0.12681619822978973, + "learning_rate": 0.0006272001752225844, + "loss": 2.7037, + "step": 14298 + }, + { + "epoch": 0.4240132847018355, + "grad_norm": 0.12405822426080704, + "learning_rate": 0.0006271546717226522, + "loss": 2.6968, + "step": 14299 + }, + { + "epoch": 0.424042938054147, + "grad_norm": 0.11762741953134537, + "learning_rate": 0.0006271091670967436, + "loss": 2.7583, + "step": 14300 + }, + { + "epoch": 0.4240725914064585, + "grad_norm": 0.1196674257516861, + "learning_rate": 0.0006270636613452614, + "loss": 2.7141, + "step": 14301 + }, + { + "epoch": 0.42410224475876995, + "grad_norm": 0.13088086247444153, + "learning_rate": 0.0006270181544686086, + "loss": 2.6882, + "step": 14302 + }, + { + "epoch": 0.4241318981110815, + "grad_norm": 0.13134604692459106, + "learning_rate": 0.0006269726464671885, + "loss": 2.6798, + "step": 14303 + }, + { + "epoch": 0.42416155146339296, + "grad_norm": 0.13791239261627197, + "learning_rate": 0.0006269271373414039, + "loss": 2.6821, + "step": 14304 + }, + { + "epoch": 0.42419120481570444, + "grad_norm": 0.12634895741939545, + "learning_rate": 0.0006268816270916574, + "loss": 2.6995, + "step": 14305 + }, + { + "epoch": 0.4242208581680159, + "grad_norm": 0.1515227109193802, + "learning_rate": 0.0006268361157183524, + "loss": 2.6952, + "step": 14306 + }, + { + "epoch": 0.4242505115203274, + "grad_norm": 0.13402746617794037, + "learning_rate": 0.0006267906032218917, + "loss": 2.7229, + "step": 14307 + }, + { + "epoch": 0.42428016487263887, + "grad_norm": 0.15845714509487152, + "learning_rate": 0.0006267450896026787, + "loss": 2.7237, + "step": 14308 + }, + { + "epoch": 0.42430981822495034, + "grad_norm": 0.12275777012109756, + "learning_rate": 0.0006266995748611162, + "loss": 2.6894, + "step": 14309 + }, + { + "epoch": 0.4243394715772618, + "grad_norm": 0.1284051388502121, + "learning_rate": 0.0006266540589976071, + "loss": 2.7321, + "step": 14310 + }, + { + "epoch": 0.4243691249295733, + "grad_norm": 0.12431475520133972, + "learning_rate": 0.0006266085420125546, + "loss": 2.702, + "step": 14311 + }, + { + "epoch": 0.42439877828188477, + "grad_norm": 0.13100750744342804, + "learning_rate": 0.0006265630239063617, + "loss": 2.7301, + "step": 14312 + }, + { + "epoch": 0.42442843163419625, + "grad_norm": 0.1422487199306488, + "learning_rate": 0.0006265175046794313, + "loss": 2.6879, + "step": 14313 + }, + { + "epoch": 0.4244580849865077, + "grad_norm": 0.13805583119392395, + "learning_rate": 0.000626471984332167, + "loss": 2.6892, + "step": 14314 + }, + { + "epoch": 0.4244877383388192, + "grad_norm": 0.12625455856323242, + "learning_rate": 0.0006264264628649714, + "loss": 2.6664, + "step": 14315 + }, + { + "epoch": 0.4245173916911307, + "grad_norm": 0.1094849482178688, + "learning_rate": 0.0006263809402782479, + "loss": 2.6478, + "step": 14316 + }, + { + "epoch": 0.42454704504344215, + "grad_norm": 0.11951036006212234, + "learning_rate": 0.0006263354165723993, + "loss": 2.7147, + "step": 14317 + }, + { + "epoch": 0.4245766983957536, + "grad_norm": 0.13501176238059998, + "learning_rate": 0.000626289891747829, + "loss": 2.6939, + "step": 14318 + }, + { + "epoch": 0.4246063517480651, + "grad_norm": 0.13966456055641174, + "learning_rate": 0.00062624436580494, + "loss": 2.7124, + "step": 14319 + }, + { + "epoch": 0.4246360051003766, + "grad_norm": 0.1283186972141266, + "learning_rate": 0.0006261988387441356, + "loss": 2.7299, + "step": 14320 + }, + { + "epoch": 0.42466565845268806, + "grad_norm": 0.1174016147851944, + "learning_rate": 0.0006261533105658187, + "loss": 2.7237, + "step": 14321 + }, + { + "epoch": 0.42469531180499953, + "grad_norm": 0.10631980746984482, + "learning_rate": 0.0006261077812703926, + "loss": 2.7003, + "step": 14322 + }, + { + "epoch": 0.424724965157311, + "grad_norm": 0.12195051461458206, + "learning_rate": 0.0006260622508582604, + "loss": 2.7063, + "step": 14323 + }, + { + "epoch": 0.42475461850962254, + "grad_norm": 0.13657155632972717, + "learning_rate": 0.0006260167193298254, + "loss": 2.716, + "step": 14324 + }, + { + "epoch": 0.424784271861934, + "grad_norm": 0.13227620720863342, + "learning_rate": 0.0006259711866854906, + "loss": 2.7357, + "step": 14325 + }, + { + "epoch": 0.4248139252142455, + "grad_norm": 0.12436505407094955, + "learning_rate": 0.0006259256529256596, + "loss": 2.7001, + "step": 14326 + }, + { + "epoch": 0.42484357856655697, + "grad_norm": 0.12600170075893402, + "learning_rate": 0.0006258801180507351, + "loss": 2.6928, + "step": 14327 + }, + { + "epoch": 0.42487323191886844, + "grad_norm": 0.12494690716266632, + "learning_rate": 0.0006258345820611206, + "loss": 2.6954, + "step": 14328 + }, + { + "epoch": 0.4249028852711799, + "grad_norm": 0.14723056554794312, + "learning_rate": 0.0006257890449572192, + "loss": 2.7266, + "step": 14329 + }, + { + "epoch": 0.4249325386234914, + "grad_norm": 0.13822731375694275, + "learning_rate": 0.0006257435067394344, + "loss": 2.6857, + "step": 14330 + }, + { + "epoch": 0.42496219197580287, + "grad_norm": 0.12678954005241394, + "learning_rate": 0.0006256979674081692, + "loss": 2.6567, + "step": 14331 + }, + { + "epoch": 0.42499184532811435, + "grad_norm": 0.13649411499500275, + "learning_rate": 0.0006256524269638268, + "loss": 2.7061, + "step": 14332 + }, + { + "epoch": 0.4250214986804258, + "grad_norm": 0.13110868632793427, + "learning_rate": 0.0006256068854068107, + "loss": 2.7129, + "step": 14333 + }, + { + "epoch": 0.4250511520327373, + "grad_norm": 0.11198518425226212, + "learning_rate": 0.000625561342737524, + "loss": 2.7195, + "step": 14334 + }, + { + "epoch": 0.4250808053850488, + "grad_norm": 0.1251976191997528, + "learning_rate": 0.00062551579895637, + "loss": 2.6913, + "step": 14335 + }, + { + "epoch": 0.42511045873736025, + "grad_norm": 0.12951581180095673, + "learning_rate": 0.0006254702540637523, + "loss": 2.7357, + "step": 14336 + }, + { + "epoch": 0.42514011208967173, + "grad_norm": 0.1388462334871292, + "learning_rate": 0.0006254247080600738, + "loss": 2.6992, + "step": 14337 + }, + { + "epoch": 0.4251697654419832, + "grad_norm": 0.13578955829143524, + "learning_rate": 0.000625379160945738, + "loss": 2.7079, + "step": 14338 + }, + { + "epoch": 0.4251994187942947, + "grad_norm": 0.12780079245567322, + "learning_rate": 0.0006253336127211481, + "loss": 2.7048, + "step": 14339 + }, + { + "epoch": 0.42522907214660616, + "grad_norm": 0.11648762226104736, + "learning_rate": 0.0006252880633867077, + "loss": 2.6934, + "step": 14340 + }, + { + "epoch": 0.42525872549891763, + "grad_norm": 0.1287994086742401, + "learning_rate": 0.00062524251294282, + "loss": 2.6755, + "step": 14341 + }, + { + "epoch": 0.4252883788512291, + "grad_norm": 0.12740032374858856, + "learning_rate": 0.0006251969613898882, + "loss": 2.7181, + "step": 14342 + }, + { + "epoch": 0.4253180322035406, + "grad_norm": 0.12787704169750214, + "learning_rate": 0.000625151408728316, + "loss": 2.7026, + "step": 14343 + }, + { + "epoch": 0.42534768555585206, + "grad_norm": 0.13308852910995483, + "learning_rate": 0.0006251058549585065, + "loss": 2.6831, + "step": 14344 + }, + { + "epoch": 0.4253773389081636, + "grad_norm": 0.1195242628455162, + "learning_rate": 0.0006250603000808632, + "loss": 2.6889, + "step": 14345 + }, + { + "epoch": 0.42540699226047507, + "grad_norm": 0.12580537796020508, + "learning_rate": 0.0006250147440957894, + "loss": 2.7277, + "step": 14346 + }, + { + "epoch": 0.42543664561278655, + "grad_norm": 0.13393716514110565, + "learning_rate": 0.0006249691870036886, + "loss": 2.6975, + "step": 14347 + }, + { + "epoch": 0.425466298965098, + "grad_norm": 0.12131760269403458, + "learning_rate": 0.0006249236288049644, + "loss": 2.6988, + "step": 14348 + }, + { + "epoch": 0.4254959523174095, + "grad_norm": 0.1270434409379959, + "learning_rate": 0.0006248780695000198, + "loss": 2.7097, + "step": 14349 + }, + { + "epoch": 0.425525605669721, + "grad_norm": 0.12042088806629181, + "learning_rate": 0.0006248325090892585, + "loss": 2.7198, + "step": 14350 + }, + { + "epoch": 0.42555525902203245, + "grad_norm": 0.12819993495941162, + "learning_rate": 0.0006247869475730839, + "loss": 2.7122, + "step": 14351 + }, + { + "epoch": 0.4255849123743439, + "grad_norm": 0.14821073412895203, + "learning_rate": 0.0006247413849518995, + "loss": 2.6887, + "step": 14352 + }, + { + "epoch": 0.4256145657266554, + "grad_norm": 0.138186514377594, + "learning_rate": 0.0006246958212261087, + "loss": 2.7081, + "step": 14353 + }, + { + "epoch": 0.4256442190789669, + "grad_norm": 0.12562529742717743, + "learning_rate": 0.0006246502563961151, + "loss": 2.7126, + "step": 14354 + }, + { + "epoch": 0.42567387243127836, + "grad_norm": 0.09870579093694687, + "learning_rate": 0.0006246046904623219, + "loss": 2.6899, + "step": 14355 + }, + { + "epoch": 0.42570352578358983, + "grad_norm": 0.11529935896396637, + "learning_rate": 0.0006245591234251329, + "loss": 2.7467, + "step": 14356 + }, + { + "epoch": 0.4257331791359013, + "grad_norm": 0.12729676067829132, + "learning_rate": 0.0006245135552849514, + "loss": 2.7201, + "step": 14357 + }, + { + "epoch": 0.4257628324882128, + "grad_norm": 0.12153246998786926, + "learning_rate": 0.0006244679860421811, + "loss": 2.6721, + "step": 14358 + }, + { + "epoch": 0.42579248584052426, + "grad_norm": 0.13835664093494415, + "learning_rate": 0.0006244224156972254, + "loss": 2.6908, + "step": 14359 + }, + { + "epoch": 0.42582213919283574, + "grad_norm": 0.12755438685417175, + "learning_rate": 0.0006243768442504878, + "loss": 2.7239, + "step": 14360 + }, + { + "epoch": 0.4258517925451472, + "grad_norm": 0.12395195662975311, + "learning_rate": 0.000624331271702372, + "loss": 2.6944, + "step": 14361 + }, + { + "epoch": 0.4258814458974587, + "grad_norm": 0.13768906891345978, + "learning_rate": 0.0006242856980532813, + "loss": 2.71, + "step": 14362 + }, + { + "epoch": 0.42591109924977016, + "grad_norm": 0.14569957554340363, + "learning_rate": 0.0006242401233036195, + "loss": 2.7065, + "step": 14363 + }, + { + "epoch": 0.42594075260208164, + "grad_norm": 0.1369589865207672, + "learning_rate": 0.0006241945474537901, + "loss": 2.7001, + "step": 14364 + }, + { + "epoch": 0.4259704059543931, + "grad_norm": 0.11077485233545303, + "learning_rate": 0.0006241489705041965, + "loss": 2.6977, + "step": 14365 + }, + { + "epoch": 0.42600005930670465, + "grad_norm": 0.12869605422019958, + "learning_rate": 0.0006241033924552427, + "loss": 2.7105, + "step": 14366 + }, + { + "epoch": 0.4260297126590161, + "grad_norm": 0.1380397230386734, + "learning_rate": 0.0006240578133073319, + "loss": 2.68, + "step": 14367 + }, + { + "epoch": 0.4260593660113276, + "grad_norm": 0.10904652625322342, + "learning_rate": 0.0006240122330608679, + "loss": 2.7067, + "step": 14368 + }, + { + "epoch": 0.4260890193636391, + "grad_norm": 0.12261782586574554, + "learning_rate": 0.0006239666517162543, + "loss": 2.6852, + "step": 14369 + }, + { + "epoch": 0.42611867271595055, + "grad_norm": 0.12556874752044678, + "learning_rate": 0.0006239210692738948, + "loss": 2.6856, + "step": 14370 + }, + { + "epoch": 0.42614832606826203, + "grad_norm": 0.12101628631353378, + "learning_rate": 0.0006238754857341929, + "loss": 2.6864, + "step": 14371 + }, + { + "epoch": 0.4261779794205735, + "grad_norm": 0.12800276279449463, + "learning_rate": 0.0006238299010975522, + "loss": 2.6958, + "step": 14372 + }, + { + "epoch": 0.426207632772885, + "grad_norm": 0.12815189361572266, + "learning_rate": 0.0006237843153643765, + "loss": 2.7112, + "step": 14373 + }, + { + "epoch": 0.42623728612519646, + "grad_norm": 0.1197986900806427, + "learning_rate": 0.0006237387285350696, + "loss": 2.6957, + "step": 14374 + }, + { + "epoch": 0.42626693947750793, + "grad_norm": 0.1232394278049469, + "learning_rate": 0.0006236931406100349, + "loss": 2.6883, + "step": 14375 + }, + { + "epoch": 0.4262965928298194, + "grad_norm": 0.12011054158210754, + "learning_rate": 0.0006236475515896762, + "loss": 2.7099, + "step": 14376 + }, + { + "epoch": 0.4263262461821309, + "grad_norm": 0.13296034932136536, + "learning_rate": 0.0006236019614743973, + "loss": 2.6799, + "step": 14377 + }, + { + "epoch": 0.42635589953444236, + "grad_norm": 0.1345575898885727, + "learning_rate": 0.0006235563702646017, + "loss": 2.7168, + "step": 14378 + }, + { + "epoch": 0.42638555288675384, + "grad_norm": 0.13462664186954498, + "learning_rate": 0.0006235107779606932, + "loss": 2.7219, + "step": 14379 + }, + { + "epoch": 0.4264152062390653, + "grad_norm": 0.13583314418792725, + "learning_rate": 0.0006234651845630758, + "loss": 2.6927, + "step": 14380 + }, + { + "epoch": 0.4264448595913768, + "grad_norm": 0.14899462461471558, + "learning_rate": 0.0006234195900721528, + "loss": 2.732, + "step": 14381 + }, + { + "epoch": 0.42647451294368827, + "grad_norm": 0.12384682893753052, + "learning_rate": 0.0006233739944883283, + "loss": 2.6927, + "step": 14382 + }, + { + "epoch": 0.42650416629599974, + "grad_norm": 0.1095922663807869, + "learning_rate": 0.0006233283978120057, + "loss": 2.7025, + "step": 14383 + }, + { + "epoch": 0.4265338196483112, + "grad_norm": 0.12620313465595245, + "learning_rate": 0.0006232828000435891, + "loss": 2.7391, + "step": 14384 + }, + { + "epoch": 0.4265634730006227, + "grad_norm": 0.12549246847629547, + "learning_rate": 0.000623237201183482, + "loss": 2.6819, + "step": 14385 + }, + { + "epoch": 0.4265931263529342, + "grad_norm": 0.11505098640918732, + "learning_rate": 0.0006231916012320884, + "loss": 2.6836, + "step": 14386 + }, + { + "epoch": 0.4266227797052457, + "grad_norm": 0.12647588551044464, + "learning_rate": 0.0006231460001898121, + "loss": 2.7067, + "step": 14387 + }, + { + "epoch": 0.4266524330575572, + "grad_norm": 0.1515708863735199, + "learning_rate": 0.0006231003980570567, + "loss": 2.7064, + "step": 14388 + }, + { + "epoch": 0.42668208640986865, + "grad_norm": 0.1494462490081787, + "learning_rate": 0.0006230547948342264, + "loss": 2.7163, + "step": 14389 + }, + { + "epoch": 0.42671173976218013, + "grad_norm": 0.15283793210983276, + "learning_rate": 0.0006230091905217246, + "loss": 2.7372, + "step": 14390 + }, + { + "epoch": 0.4267413931144916, + "grad_norm": 0.13913068175315857, + "learning_rate": 0.0006229635851199552, + "loss": 2.7081, + "step": 14391 + }, + { + "epoch": 0.4267710464668031, + "grad_norm": 0.13832825422286987, + "learning_rate": 0.0006229179786293223, + "loss": 2.7071, + "step": 14392 + }, + { + "epoch": 0.42680069981911456, + "grad_norm": 0.17444875836372375, + "learning_rate": 0.0006228723710502295, + "loss": 2.7091, + "step": 14393 + }, + { + "epoch": 0.42683035317142604, + "grad_norm": 0.15674825012683868, + "learning_rate": 0.0006228267623830809, + "loss": 2.6969, + "step": 14394 + }, + { + "epoch": 0.4268600065237375, + "grad_norm": 0.15179429948329926, + "learning_rate": 0.00062278115262828, + "loss": 2.7003, + "step": 14395 + }, + { + "epoch": 0.426889659876049, + "grad_norm": 0.14357955753803253, + "learning_rate": 0.0006227355417862311, + "loss": 2.6889, + "step": 14396 + }, + { + "epoch": 0.42691931322836046, + "grad_norm": 0.11556092649698257, + "learning_rate": 0.0006226899298573381, + "loss": 2.6875, + "step": 14397 + }, + { + "epoch": 0.42694896658067194, + "grad_norm": 0.11222751438617706, + "learning_rate": 0.0006226443168420045, + "loss": 2.7093, + "step": 14398 + }, + { + "epoch": 0.4269786199329834, + "grad_norm": 0.11466723680496216, + "learning_rate": 0.0006225987027406343, + "loss": 2.7022, + "step": 14399 + }, + { + "epoch": 0.4270082732852949, + "grad_norm": 0.11936601996421814, + "learning_rate": 0.0006225530875536316, + "loss": 2.7473, + "step": 14400 + }, + { + "epoch": 0.42703792663760637, + "grad_norm": 0.11472412943840027, + "learning_rate": 0.0006225074712814004, + "loss": 2.7207, + "step": 14401 + }, + { + "epoch": 0.42706757998991784, + "grad_norm": 0.11429967731237411, + "learning_rate": 0.0006224618539243445, + "loss": 2.6816, + "step": 14402 + }, + { + "epoch": 0.4270972333422293, + "grad_norm": 0.11939691007137299, + "learning_rate": 0.0006224162354828679, + "loss": 2.6755, + "step": 14403 + }, + { + "epoch": 0.4271268866945408, + "grad_norm": 0.1187867596745491, + "learning_rate": 0.0006223706159573742, + "loss": 2.6732, + "step": 14404 + }, + { + "epoch": 0.4271565400468523, + "grad_norm": 0.12846754491329193, + "learning_rate": 0.0006223249953482679, + "loss": 2.7163, + "step": 14405 + }, + { + "epoch": 0.42718619339916375, + "grad_norm": 0.14588315784931183, + "learning_rate": 0.0006222793736559529, + "loss": 2.717, + "step": 14406 + }, + { + "epoch": 0.4272158467514753, + "grad_norm": 0.15426994860172272, + "learning_rate": 0.000622233750880833, + "loss": 2.6624, + "step": 14407 + }, + { + "epoch": 0.42724550010378676, + "grad_norm": 0.12870875000953674, + "learning_rate": 0.0006221881270233123, + "loss": 2.7058, + "step": 14408 + }, + { + "epoch": 0.42727515345609823, + "grad_norm": 0.13509786128997803, + "learning_rate": 0.0006221425020837947, + "loss": 2.6859, + "step": 14409 + }, + { + "epoch": 0.4273048068084097, + "grad_norm": 0.14064472913742065, + "learning_rate": 0.000622096876062684, + "loss": 2.7419, + "step": 14410 + }, + { + "epoch": 0.4273344601607212, + "grad_norm": 0.13626497983932495, + "learning_rate": 0.0006220512489603847, + "loss": 2.664, + "step": 14411 + }, + { + "epoch": 0.42736411351303266, + "grad_norm": 0.1434209793806076, + "learning_rate": 0.0006220056207773008, + "loss": 2.7006, + "step": 14412 + }, + { + "epoch": 0.42739376686534414, + "grad_norm": 0.14580866694450378, + "learning_rate": 0.0006219599915138361, + "loss": 2.6773, + "step": 14413 + }, + { + "epoch": 0.4274234202176556, + "grad_norm": 0.14438197016716003, + "learning_rate": 0.0006219143611703948, + "loss": 2.6903, + "step": 14414 + }, + { + "epoch": 0.4274530735699671, + "grad_norm": 0.13516734540462494, + "learning_rate": 0.0006218687297473808, + "loss": 2.6782, + "step": 14415 + }, + { + "epoch": 0.42748272692227857, + "grad_norm": 0.12090648710727692, + "learning_rate": 0.0006218230972451983, + "loss": 2.7526, + "step": 14416 + }, + { + "epoch": 0.42751238027459004, + "grad_norm": 0.1335197240114212, + "learning_rate": 0.0006217774636642512, + "loss": 2.7145, + "step": 14417 + }, + { + "epoch": 0.4275420336269015, + "grad_norm": 0.11032723635435104, + "learning_rate": 0.000621731829004944, + "loss": 2.711, + "step": 14418 + }, + { + "epoch": 0.427571686979213, + "grad_norm": 0.11238705366849899, + "learning_rate": 0.0006216861932676806, + "loss": 2.6768, + "step": 14419 + }, + { + "epoch": 0.42760134033152447, + "grad_norm": 0.11130715161561966, + "learning_rate": 0.0006216405564528649, + "loss": 2.7055, + "step": 14420 + }, + { + "epoch": 0.42763099368383595, + "grad_norm": 0.12067550420761108, + "learning_rate": 0.0006215949185609012, + "loss": 2.711, + "step": 14421 + }, + { + "epoch": 0.4276606470361474, + "grad_norm": 0.11663460731506348, + "learning_rate": 0.0006215492795921938, + "loss": 2.7326, + "step": 14422 + }, + { + "epoch": 0.4276903003884589, + "grad_norm": 0.1086127907037735, + "learning_rate": 0.0006215036395471465, + "loss": 2.6796, + "step": 14423 + }, + { + "epoch": 0.4277199537407704, + "grad_norm": 0.11433950066566467, + "learning_rate": 0.0006214579984261636, + "loss": 2.6951, + "step": 14424 + }, + { + "epoch": 0.42774960709308185, + "grad_norm": 0.10750305652618408, + "learning_rate": 0.0006214123562296493, + "loss": 2.6857, + "step": 14425 + }, + { + "epoch": 0.42777926044539333, + "grad_norm": 0.14018093049526215, + "learning_rate": 0.0006213667129580079, + "loss": 2.6768, + "step": 14426 + }, + { + "epoch": 0.4278089137977048, + "grad_norm": 0.1524360626935959, + "learning_rate": 0.0006213210686116433, + "loss": 2.7163, + "step": 14427 + }, + { + "epoch": 0.42783856715001634, + "grad_norm": 0.11501690745353699, + "learning_rate": 0.0006212754231909597, + "loss": 2.7203, + "step": 14428 + }, + { + "epoch": 0.4278682205023278, + "grad_norm": 0.13309359550476074, + "learning_rate": 0.0006212297766963617, + "loss": 2.6644, + "step": 14429 + }, + { + "epoch": 0.4278978738546393, + "grad_norm": 0.15276023745536804, + "learning_rate": 0.0006211841291282529, + "loss": 2.7109, + "step": 14430 + }, + { + "epoch": 0.42792752720695076, + "grad_norm": 0.15169872343540192, + "learning_rate": 0.000621138480487038, + "loss": 2.7268, + "step": 14431 + }, + { + "epoch": 0.42795718055926224, + "grad_norm": 0.15649160742759705, + "learning_rate": 0.000621092830773121, + "loss": 2.6836, + "step": 14432 + }, + { + "epoch": 0.4279868339115737, + "grad_norm": 0.14923445880413055, + "learning_rate": 0.0006210471799869062, + "loss": 2.6835, + "step": 14433 + }, + { + "epoch": 0.4280164872638852, + "grad_norm": 0.13741376996040344, + "learning_rate": 0.000621001528128798, + "loss": 2.7119, + "step": 14434 + }, + { + "epoch": 0.42804614061619667, + "grad_norm": 0.16126923263072968, + "learning_rate": 0.0006209558751992004, + "loss": 2.7158, + "step": 14435 + }, + { + "epoch": 0.42807579396850814, + "grad_norm": 0.12422297149896622, + "learning_rate": 0.0006209102211985177, + "loss": 2.6971, + "step": 14436 + }, + { + "epoch": 0.4281054473208196, + "grad_norm": 0.12696945667266846, + "learning_rate": 0.0006208645661271542, + "loss": 2.7062, + "step": 14437 + }, + { + "epoch": 0.4281351006731311, + "grad_norm": 0.1527632474899292, + "learning_rate": 0.0006208189099855143, + "loss": 2.7152, + "step": 14438 + }, + { + "epoch": 0.4281647540254426, + "grad_norm": 0.1320784091949463, + "learning_rate": 0.0006207732527740022, + "loss": 2.7143, + "step": 14439 + }, + { + "epoch": 0.42819440737775405, + "grad_norm": 0.15036942064762115, + "learning_rate": 0.0006207275944930224, + "loss": 2.7103, + "step": 14440 + }, + { + "epoch": 0.4282240607300655, + "grad_norm": 0.13572995364665985, + "learning_rate": 0.0006206819351429789, + "loss": 2.7069, + "step": 14441 + }, + { + "epoch": 0.428253714082377, + "grad_norm": 0.13431040942668915, + "learning_rate": 0.0006206362747242761, + "loss": 2.6974, + "step": 14442 + }, + { + "epoch": 0.4282833674346885, + "grad_norm": 0.14081451296806335, + "learning_rate": 0.0006205906132373182, + "loss": 2.7179, + "step": 14443 + }, + { + "epoch": 0.42831302078699995, + "grad_norm": 0.14218130707740784, + "learning_rate": 0.0006205449506825099, + "loss": 2.7319, + "step": 14444 + }, + { + "epoch": 0.42834267413931143, + "grad_norm": 0.132759690284729, + "learning_rate": 0.0006204992870602555, + "loss": 2.7287, + "step": 14445 + }, + { + "epoch": 0.4283723274916229, + "grad_norm": 0.11764516681432724, + "learning_rate": 0.0006204536223709591, + "loss": 2.709, + "step": 14446 + }, + { + "epoch": 0.4284019808439344, + "grad_norm": 0.12997660040855408, + "learning_rate": 0.0006204079566150253, + "loss": 2.7044, + "step": 14447 + }, + { + "epoch": 0.42843163419624586, + "grad_norm": 0.12247703224420547, + "learning_rate": 0.0006203622897928583, + "loss": 2.68, + "step": 14448 + }, + { + "epoch": 0.4284612875485574, + "grad_norm": 0.11259481310844421, + "learning_rate": 0.0006203166219048623, + "loss": 2.715, + "step": 14449 + }, + { + "epoch": 0.42849094090086887, + "grad_norm": 0.11388444900512695, + "learning_rate": 0.0006202709529514424, + "loss": 2.6797, + "step": 14450 + }, + { + "epoch": 0.42852059425318034, + "grad_norm": 0.12262433767318726, + "learning_rate": 0.0006202252829330024, + "loss": 2.7615, + "step": 14451 + }, + { + "epoch": 0.4285502476054918, + "grad_norm": 0.12223541736602783, + "learning_rate": 0.0006201796118499469, + "loss": 2.6724, + "step": 14452 + }, + { + "epoch": 0.4285799009578033, + "grad_norm": 0.11629438400268555, + "learning_rate": 0.0006201339397026802, + "loss": 2.6834, + "step": 14453 + }, + { + "epoch": 0.42860955431011477, + "grad_norm": 0.11811760067939758, + "learning_rate": 0.0006200882664916069, + "loss": 2.7331, + "step": 14454 + }, + { + "epoch": 0.42863920766242625, + "grad_norm": 0.12535610795021057, + "learning_rate": 0.0006200425922171315, + "loss": 2.6833, + "step": 14455 + }, + { + "epoch": 0.4286688610147377, + "grad_norm": 0.12953031063079834, + "learning_rate": 0.0006199969168796581, + "loss": 2.6711, + "step": 14456 + }, + { + "epoch": 0.4286985143670492, + "grad_norm": 0.1184961348772049, + "learning_rate": 0.0006199512404795916, + "loss": 2.7198, + "step": 14457 + }, + { + "epoch": 0.4287281677193607, + "grad_norm": 0.1070168986916542, + "learning_rate": 0.0006199055630173362, + "loss": 2.6907, + "step": 14458 + }, + { + "epoch": 0.42875782107167215, + "grad_norm": 0.12163619697093964, + "learning_rate": 0.0006198598844932965, + "loss": 2.7118, + "step": 14459 + }, + { + "epoch": 0.42878747442398363, + "grad_norm": 0.1122695580124855, + "learning_rate": 0.0006198142049078769, + "loss": 2.7331, + "step": 14460 + }, + { + "epoch": 0.4288171277762951, + "grad_norm": 0.126474067568779, + "learning_rate": 0.000619768524261482, + "loss": 2.6846, + "step": 14461 + }, + { + "epoch": 0.4288467811286066, + "grad_norm": 0.11737210303544998, + "learning_rate": 0.0006197228425545162, + "loss": 2.7054, + "step": 14462 + }, + { + "epoch": 0.42887643448091806, + "grad_norm": 0.10997407883405685, + "learning_rate": 0.0006196771597873842, + "loss": 2.712, + "step": 14463 + }, + { + "epoch": 0.42890608783322953, + "grad_norm": 0.12354523688554764, + "learning_rate": 0.0006196314759604902, + "loss": 2.6766, + "step": 14464 + }, + { + "epoch": 0.428935741185541, + "grad_norm": 0.14388982951641083, + "learning_rate": 0.0006195857910742391, + "loss": 2.7174, + "step": 14465 + }, + { + "epoch": 0.4289653945378525, + "grad_norm": 0.14354988932609558, + "learning_rate": 0.0006195401051290353, + "loss": 2.6953, + "step": 14466 + }, + { + "epoch": 0.42899504789016396, + "grad_norm": 0.12120278924703598, + "learning_rate": 0.0006194944181252834, + "loss": 2.7144, + "step": 14467 + }, + { + "epoch": 0.42902470124247544, + "grad_norm": 0.11515779048204422, + "learning_rate": 0.0006194487300633879, + "loss": 2.7296, + "step": 14468 + }, + { + "epoch": 0.4290543545947869, + "grad_norm": 0.11825732886791229, + "learning_rate": 0.0006194030409437531, + "loss": 2.7099, + "step": 14469 + }, + { + "epoch": 0.42908400794709844, + "grad_norm": 0.13729731738567352, + "learning_rate": 0.0006193573507667842, + "loss": 2.7059, + "step": 14470 + }, + { + "epoch": 0.4291136612994099, + "grad_norm": 0.1430366188287735, + "learning_rate": 0.0006193116595328853, + "loss": 2.6665, + "step": 14471 + }, + { + "epoch": 0.4291433146517214, + "grad_norm": 0.1277540773153305, + "learning_rate": 0.0006192659672424612, + "loss": 2.6961, + "step": 14472 + }, + { + "epoch": 0.4291729680040329, + "grad_norm": 0.12741915881633759, + "learning_rate": 0.0006192202738959168, + "loss": 2.7192, + "step": 14473 + }, + { + "epoch": 0.42920262135634435, + "grad_norm": 0.14352230727672577, + "learning_rate": 0.0006191745794936561, + "loss": 2.6755, + "step": 14474 + }, + { + "epoch": 0.4292322747086558, + "grad_norm": 0.14761440455913544, + "learning_rate": 0.000619128884036084, + "loss": 2.7128, + "step": 14475 + }, + { + "epoch": 0.4292619280609673, + "grad_norm": 0.1612474024295807, + "learning_rate": 0.0006190831875236051, + "loss": 2.7065, + "step": 14476 + }, + { + "epoch": 0.4292915814132788, + "grad_norm": 0.1790187507867813, + "learning_rate": 0.0006190374899566244, + "loss": 2.6896, + "step": 14477 + }, + { + "epoch": 0.42932123476559025, + "grad_norm": 0.17391860485076904, + "learning_rate": 0.0006189917913355463, + "loss": 2.6818, + "step": 14478 + }, + { + "epoch": 0.42935088811790173, + "grad_norm": 0.1338108628988266, + "learning_rate": 0.0006189460916607754, + "loss": 2.6979, + "step": 14479 + }, + { + "epoch": 0.4293805414702132, + "grad_norm": 0.13116668164730072, + "learning_rate": 0.0006189003909327163, + "loss": 2.7186, + "step": 14480 + }, + { + "epoch": 0.4294101948225247, + "grad_norm": 0.1296216994524002, + "learning_rate": 0.000618854689151774, + "loss": 2.7216, + "step": 14481 + }, + { + "epoch": 0.42943984817483616, + "grad_norm": 0.1357191950082779, + "learning_rate": 0.0006188089863183528, + "loss": 2.7169, + "step": 14482 + }, + { + "epoch": 0.42946950152714763, + "grad_norm": 0.1297803521156311, + "learning_rate": 0.000618763282432858, + "loss": 2.706, + "step": 14483 + }, + { + "epoch": 0.4294991548794591, + "grad_norm": 0.12289859354496002, + "learning_rate": 0.0006187175774956937, + "loss": 2.7175, + "step": 14484 + }, + { + "epoch": 0.4295288082317706, + "grad_norm": 0.12606556713581085, + "learning_rate": 0.0006186718715072649, + "loss": 2.7119, + "step": 14485 + }, + { + "epoch": 0.42955846158408206, + "grad_norm": 0.11974157392978668, + "learning_rate": 0.0006186261644679763, + "loss": 2.6764, + "step": 14486 + }, + { + "epoch": 0.42958811493639354, + "grad_norm": 0.12009833753108978, + "learning_rate": 0.0006185804563782327, + "loss": 2.7072, + "step": 14487 + }, + { + "epoch": 0.429617768288705, + "grad_norm": 0.11934585869312286, + "learning_rate": 0.0006185347472384388, + "loss": 2.716, + "step": 14488 + }, + { + "epoch": 0.4296474216410165, + "grad_norm": 0.1271020919084549, + "learning_rate": 0.0006184890370489992, + "loss": 2.7117, + "step": 14489 + }, + { + "epoch": 0.429677074993328, + "grad_norm": 0.12025513499975204, + "learning_rate": 0.0006184433258103191, + "loss": 2.6923, + "step": 14490 + }, + { + "epoch": 0.4297067283456395, + "grad_norm": 0.13189387321472168, + "learning_rate": 0.0006183976135228029, + "loss": 2.6782, + "step": 14491 + }, + { + "epoch": 0.429736381697951, + "grad_norm": 0.11059210449457169, + "learning_rate": 0.0006183519001868555, + "loss": 2.7017, + "step": 14492 + }, + { + "epoch": 0.42976603505026245, + "grad_norm": 0.12241204082965851, + "learning_rate": 0.0006183061858028818, + "loss": 2.7142, + "step": 14493 + }, + { + "epoch": 0.4297956884025739, + "grad_norm": 0.11497676372528076, + "learning_rate": 0.0006182604703712864, + "loss": 2.6972, + "step": 14494 + }, + { + "epoch": 0.4298253417548854, + "grad_norm": 0.11324291676282883, + "learning_rate": 0.0006182147538924742, + "loss": 2.6718, + "step": 14495 + }, + { + "epoch": 0.4298549951071969, + "grad_norm": 0.12227078527212143, + "learning_rate": 0.0006181690363668502, + "loss": 2.6719, + "step": 14496 + }, + { + "epoch": 0.42988464845950836, + "grad_norm": 0.1088162362575531, + "learning_rate": 0.000618123317794819, + "loss": 2.6823, + "step": 14497 + }, + { + "epoch": 0.42991430181181983, + "grad_norm": 0.10549761354923248, + "learning_rate": 0.0006180775981767856, + "loss": 2.698, + "step": 14498 + }, + { + "epoch": 0.4299439551641313, + "grad_norm": 0.12517239153385162, + "learning_rate": 0.0006180318775131548, + "loss": 2.6986, + "step": 14499 + }, + { + "epoch": 0.4299736085164428, + "grad_norm": 0.1329399198293686, + "learning_rate": 0.0006179861558043316, + "loss": 2.7063, + "step": 14500 + }, + { + "epoch": 0.43000326186875426, + "grad_norm": 0.125367671251297, + "learning_rate": 0.0006179404330507205, + "loss": 2.6992, + "step": 14501 + }, + { + "epoch": 0.43003291522106574, + "grad_norm": 0.12241244316101074, + "learning_rate": 0.0006178947092527267, + "loss": 2.7154, + "step": 14502 + }, + { + "epoch": 0.4300625685733772, + "grad_norm": 0.1164378821849823, + "learning_rate": 0.000617848984410755, + "loss": 2.6724, + "step": 14503 + }, + { + "epoch": 0.4300922219256887, + "grad_norm": 0.13218726217746735, + "learning_rate": 0.0006178032585252102, + "loss": 2.7342, + "step": 14504 + }, + { + "epoch": 0.43012187527800017, + "grad_norm": 0.1326121985912323, + "learning_rate": 0.0006177575315964976, + "loss": 2.7075, + "step": 14505 + }, + { + "epoch": 0.43015152863031164, + "grad_norm": 0.10426585376262665, + "learning_rate": 0.0006177118036250217, + "loss": 2.7072, + "step": 14506 + }, + { + "epoch": 0.4301811819826231, + "grad_norm": 0.12010699510574341, + "learning_rate": 0.0006176660746111875, + "loss": 2.7126, + "step": 14507 + }, + { + "epoch": 0.4302108353349346, + "grad_norm": 0.132675901055336, + "learning_rate": 0.0006176203445554002, + "loss": 2.6742, + "step": 14508 + }, + { + "epoch": 0.43024048868724607, + "grad_norm": 0.13523723185062408, + "learning_rate": 0.0006175746134580645, + "loss": 2.6752, + "step": 14509 + }, + { + "epoch": 0.43027014203955755, + "grad_norm": 0.15069831907749176, + "learning_rate": 0.0006175288813195852, + "loss": 2.6832, + "step": 14510 + }, + { + "epoch": 0.4302997953918691, + "grad_norm": 0.14798595011234283, + "learning_rate": 0.0006174831481403678, + "loss": 2.7349, + "step": 14511 + }, + { + "epoch": 0.43032944874418055, + "grad_norm": 0.12227892130613327, + "learning_rate": 0.0006174374139208168, + "loss": 2.7278, + "step": 14512 + }, + { + "epoch": 0.43035910209649203, + "grad_norm": 0.12582072615623474, + "learning_rate": 0.0006173916786613374, + "loss": 2.7018, + "step": 14513 + }, + { + "epoch": 0.4303887554488035, + "grad_norm": 0.11643778532743454, + "learning_rate": 0.0006173459423623344, + "loss": 2.6674, + "step": 14514 + }, + { + "epoch": 0.430418408801115, + "grad_norm": 0.12592674791812897, + "learning_rate": 0.0006173002050242129, + "loss": 2.7116, + "step": 14515 + }, + { + "epoch": 0.43044806215342646, + "grad_norm": 0.13660897314548492, + "learning_rate": 0.0006172544666473783, + "loss": 2.6824, + "step": 14516 + }, + { + "epoch": 0.43047771550573793, + "grad_norm": 0.14913135766983032, + "learning_rate": 0.000617208727232235, + "loss": 2.6886, + "step": 14517 + }, + { + "epoch": 0.4305073688580494, + "grad_norm": 0.14028194546699524, + "learning_rate": 0.0006171629867791884, + "loss": 2.7246, + "step": 14518 + }, + { + "epoch": 0.4305370222103609, + "grad_norm": 0.11701681464910507, + "learning_rate": 0.0006171172452886433, + "loss": 2.7074, + "step": 14519 + }, + { + "epoch": 0.43056667556267236, + "grad_norm": 0.11256961524486542, + "learning_rate": 0.0006170715027610049, + "loss": 2.6966, + "step": 14520 + }, + { + "epoch": 0.43059632891498384, + "grad_norm": 0.12729893624782562, + "learning_rate": 0.0006170257591966784, + "loss": 2.7202, + "step": 14521 + }, + { + "epoch": 0.4306259822672953, + "grad_norm": 0.12042776495218277, + "learning_rate": 0.0006169800145960686, + "loss": 2.72, + "step": 14522 + }, + { + "epoch": 0.4306556356196068, + "grad_norm": 0.129987433552742, + "learning_rate": 0.0006169342689595808, + "loss": 2.765, + "step": 14523 + }, + { + "epoch": 0.43068528897191827, + "grad_norm": 0.1273619532585144, + "learning_rate": 0.00061688852228762, + "loss": 2.6869, + "step": 14524 + }, + { + "epoch": 0.43071494232422974, + "grad_norm": 0.12138063460588455, + "learning_rate": 0.0006168427745805911, + "loss": 2.6635, + "step": 14525 + }, + { + "epoch": 0.4307445956765412, + "grad_norm": 0.12291053682565689, + "learning_rate": 0.0006167970258388994, + "loss": 2.7187, + "step": 14526 + }, + { + "epoch": 0.4307742490288527, + "grad_norm": 0.12072394788265228, + "learning_rate": 0.0006167512760629501, + "loss": 2.6737, + "step": 14527 + }, + { + "epoch": 0.43080390238116417, + "grad_norm": 0.1138559952378273, + "learning_rate": 0.0006167055252531482, + "loss": 2.7214, + "step": 14528 + }, + { + "epoch": 0.43083355573347565, + "grad_norm": 0.11255647987127304, + "learning_rate": 0.0006166597734098987, + "loss": 2.7011, + "step": 14529 + }, + { + "epoch": 0.4308632090857871, + "grad_norm": 0.13139881193637848, + "learning_rate": 0.0006166140205336071, + "loss": 2.6944, + "step": 14530 + }, + { + "epoch": 0.4308928624380986, + "grad_norm": 0.13312020897865295, + "learning_rate": 0.0006165682666246781, + "loss": 2.7137, + "step": 14531 + }, + { + "epoch": 0.43092251579041013, + "grad_norm": 0.1374308317899704, + "learning_rate": 0.0006165225116835173, + "loss": 2.6872, + "step": 14532 + }, + { + "epoch": 0.4309521691427216, + "grad_norm": 0.13553526997566223, + "learning_rate": 0.0006164767557105296, + "loss": 2.6411, + "step": 14533 + }, + { + "epoch": 0.4309818224950331, + "grad_norm": 0.13622435927391052, + "learning_rate": 0.00061643099870612, + "loss": 2.7365, + "step": 14534 + }, + { + "epoch": 0.43101147584734456, + "grad_norm": 0.13530156016349792, + "learning_rate": 0.0006163852406706942, + "loss": 2.6618, + "step": 14535 + }, + { + "epoch": 0.43104112919965604, + "grad_norm": 0.11556734889745712, + "learning_rate": 0.000616339481604657, + "loss": 2.6961, + "step": 14536 + }, + { + "epoch": 0.4310707825519675, + "grad_norm": 0.12909142673015594, + "learning_rate": 0.0006162937215084137, + "loss": 2.7112, + "step": 14537 + }, + { + "epoch": 0.431100435904279, + "grad_norm": 0.14497654139995575, + "learning_rate": 0.0006162479603823698, + "loss": 2.6995, + "step": 14538 + }, + { + "epoch": 0.43113008925659047, + "grad_norm": 0.12680616974830627, + "learning_rate": 0.00061620219822693, + "loss": 2.6622, + "step": 14539 + }, + { + "epoch": 0.43115974260890194, + "grad_norm": 0.124099962413311, + "learning_rate": 0.0006161564350424997, + "loss": 2.7327, + "step": 14540 + }, + { + "epoch": 0.4311893959612134, + "grad_norm": 0.11813393980264664, + "learning_rate": 0.0006161106708294843, + "loss": 2.6663, + "step": 14541 + }, + { + "epoch": 0.4312190493135249, + "grad_norm": 0.12318800389766693, + "learning_rate": 0.0006160649055882891, + "loss": 2.7082, + "step": 14542 + }, + { + "epoch": 0.43124870266583637, + "grad_norm": 0.12430139631032944, + "learning_rate": 0.0006160191393193193, + "loss": 2.6998, + "step": 14543 + }, + { + "epoch": 0.43127835601814785, + "grad_norm": 0.14137807488441467, + "learning_rate": 0.0006159733720229799, + "loss": 2.6978, + "step": 14544 + }, + { + "epoch": 0.4313080093704593, + "grad_norm": 0.15192288160324097, + "learning_rate": 0.0006159276036996766, + "loss": 2.69, + "step": 14545 + }, + { + "epoch": 0.4313376627227708, + "grad_norm": 0.14380311965942383, + "learning_rate": 0.0006158818343498143, + "loss": 2.6998, + "step": 14546 + }, + { + "epoch": 0.4313673160750823, + "grad_norm": 0.12579257786273956, + "learning_rate": 0.0006158360639737984, + "loss": 2.6861, + "step": 14547 + }, + { + "epoch": 0.43139696942739375, + "grad_norm": 0.13157819211483002, + "learning_rate": 0.0006157902925720345, + "loss": 2.7248, + "step": 14548 + }, + { + "epoch": 0.4314266227797052, + "grad_norm": 0.12216828763484955, + "learning_rate": 0.0006157445201449276, + "loss": 2.6895, + "step": 14549 + }, + { + "epoch": 0.4314562761320167, + "grad_norm": 0.14244621992111206, + "learning_rate": 0.000615698746692883, + "loss": 2.6673, + "step": 14550 + }, + { + "epoch": 0.4314859294843282, + "grad_norm": 0.15250778198242188, + "learning_rate": 0.0006156529722163062, + "loss": 2.7101, + "step": 14551 + }, + { + "epoch": 0.43151558283663966, + "grad_norm": 0.1409878432750702, + "learning_rate": 0.0006156071967156025, + "loss": 2.6676, + "step": 14552 + }, + { + "epoch": 0.4315452361889512, + "grad_norm": 0.12596222758293152, + "learning_rate": 0.0006155614201911771, + "loss": 2.7202, + "step": 14553 + }, + { + "epoch": 0.43157488954126266, + "grad_norm": 0.11616567522287369, + "learning_rate": 0.0006155156426434357, + "loss": 2.7177, + "step": 14554 + }, + { + "epoch": 0.43160454289357414, + "grad_norm": 0.11104466021060944, + "learning_rate": 0.0006154698640727834, + "loss": 2.7213, + "step": 14555 + }, + { + "epoch": 0.4316341962458856, + "grad_norm": 0.1181381344795227, + "learning_rate": 0.0006154240844796256, + "loss": 2.7076, + "step": 14556 + }, + { + "epoch": 0.4316638495981971, + "grad_norm": 0.13220570981502533, + "learning_rate": 0.0006153783038643678, + "loss": 2.7342, + "step": 14557 + }, + { + "epoch": 0.43169350295050857, + "grad_norm": 0.13959136605262756, + "learning_rate": 0.0006153325222274152, + "loss": 2.7088, + "step": 14558 + }, + { + "epoch": 0.43172315630282004, + "grad_norm": 0.13652561604976654, + "learning_rate": 0.0006152867395691732, + "loss": 2.7112, + "step": 14559 + }, + { + "epoch": 0.4317528096551315, + "grad_norm": 0.11619631201028824, + "learning_rate": 0.0006152409558900475, + "loss": 2.6782, + "step": 14560 + }, + { + "epoch": 0.431782463007443, + "grad_norm": 0.1296364665031433, + "learning_rate": 0.0006151951711904435, + "loss": 2.686, + "step": 14561 + }, + { + "epoch": 0.43181211635975447, + "grad_norm": 0.13551582396030426, + "learning_rate": 0.0006151493854707663, + "loss": 2.7336, + "step": 14562 + }, + { + "epoch": 0.43184176971206595, + "grad_norm": 0.14543133974075317, + "learning_rate": 0.0006151035987314215, + "loss": 2.7493, + "step": 14563 + }, + { + "epoch": 0.4318714230643774, + "grad_norm": 0.12856324017047882, + "learning_rate": 0.0006150578109728146, + "loss": 2.7311, + "step": 14564 + }, + { + "epoch": 0.4319010764166889, + "grad_norm": 0.13214106857776642, + "learning_rate": 0.000615012022195351, + "loss": 2.6882, + "step": 14565 + }, + { + "epoch": 0.4319307297690004, + "grad_norm": 0.12919804453849792, + "learning_rate": 0.0006149662323994363, + "loss": 2.6905, + "step": 14566 + }, + { + "epoch": 0.43196038312131185, + "grad_norm": 0.13985320925712585, + "learning_rate": 0.0006149204415854759, + "loss": 2.6997, + "step": 14567 + }, + { + "epoch": 0.43199003647362333, + "grad_norm": 0.12983262538909912, + "learning_rate": 0.0006148746497538752, + "loss": 2.7258, + "step": 14568 + }, + { + "epoch": 0.4320196898259348, + "grad_norm": 0.1290683001279831, + "learning_rate": 0.0006148288569050398, + "loss": 2.722, + "step": 14569 + }, + { + "epoch": 0.4320493431782463, + "grad_norm": 0.13777470588684082, + "learning_rate": 0.0006147830630393751, + "loss": 2.7035, + "step": 14570 + }, + { + "epoch": 0.43207899653055776, + "grad_norm": 0.13709719479084015, + "learning_rate": 0.0006147372681572868, + "loss": 2.7062, + "step": 14571 + }, + { + "epoch": 0.43210864988286923, + "grad_norm": 0.15043677389621735, + "learning_rate": 0.0006146914722591801, + "loss": 2.717, + "step": 14572 + }, + { + "epoch": 0.4321383032351807, + "grad_norm": 0.13653278350830078, + "learning_rate": 0.0006146456753454608, + "loss": 2.6976, + "step": 14573 + }, + { + "epoch": 0.43216795658749224, + "grad_norm": 0.1229679211974144, + "learning_rate": 0.0006145998774165344, + "loss": 2.7112, + "step": 14574 + }, + { + "epoch": 0.4321976099398037, + "grad_norm": 0.13341782987117767, + "learning_rate": 0.0006145540784728063, + "loss": 2.7413, + "step": 14575 + }, + { + "epoch": 0.4322272632921152, + "grad_norm": 0.15457093715667725, + "learning_rate": 0.0006145082785146825, + "loss": 2.7077, + "step": 14576 + }, + { + "epoch": 0.43225691664442667, + "grad_norm": 0.16158218681812286, + "learning_rate": 0.000614462477542568, + "loss": 2.7026, + "step": 14577 + }, + { + "epoch": 0.43228656999673815, + "grad_norm": 0.15777355432510376, + "learning_rate": 0.0006144166755568685, + "loss": 2.719, + "step": 14578 + }, + { + "epoch": 0.4323162233490496, + "grad_norm": 0.13458365201950073, + "learning_rate": 0.0006143708725579899, + "loss": 2.7269, + "step": 14579 + }, + { + "epoch": 0.4323458767013611, + "grad_norm": 0.14152802526950836, + "learning_rate": 0.0006143250685463374, + "loss": 2.7142, + "step": 14580 + }, + { + "epoch": 0.4323755300536726, + "grad_norm": 0.14439703524112701, + "learning_rate": 0.000614279263522317, + "loss": 2.6963, + "step": 14581 + }, + { + "epoch": 0.43240518340598405, + "grad_norm": 0.13808394968509674, + "learning_rate": 0.0006142334574863341, + "loss": 2.6968, + "step": 14582 + }, + { + "epoch": 0.4324348367582955, + "grad_norm": 0.1316208839416504, + "learning_rate": 0.0006141876504387942, + "loss": 2.6718, + "step": 14583 + }, + { + "epoch": 0.432464490110607, + "grad_norm": 0.14096274971961975, + "learning_rate": 0.0006141418423801031, + "loss": 2.7036, + "step": 14584 + }, + { + "epoch": 0.4324941434629185, + "grad_norm": 0.1306753158569336, + "learning_rate": 0.0006140960333106664, + "loss": 2.6989, + "step": 14585 + }, + { + "epoch": 0.43252379681522996, + "grad_norm": 0.12565314769744873, + "learning_rate": 0.0006140502232308897, + "loss": 2.6761, + "step": 14586 + }, + { + "epoch": 0.43255345016754143, + "grad_norm": 0.14377614855766296, + "learning_rate": 0.0006140044121411787, + "loss": 2.6938, + "step": 14587 + }, + { + "epoch": 0.4325831035198529, + "grad_norm": 0.1308189481496811, + "learning_rate": 0.0006139586000419392, + "loss": 2.7031, + "step": 14588 + }, + { + "epoch": 0.4326127568721644, + "grad_norm": 0.12539781630039215, + "learning_rate": 0.0006139127869335766, + "loss": 2.6682, + "step": 14589 + }, + { + "epoch": 0.43264241022447586, + "grad_norm": 0.1239590272307396, + "learning_rate": 0.0006138669728164968, + "loss": 2.7223, + "step": 14590 + }, + { + "epoch": 0.43267206357678734, + "grad_norm": 0.1112506166100502, + "learning_rate": 0.0006138211576911051, + "loss": 2.7188, + "step": 14591 + }, + { + "epoch": 0.4327017169290988, + "grad_norm": 0.12894092500209808, + "learning_rate": 0.000613775341557808, + "loss": 2.7093, + "step": 14592 + }, + { + "epoch": 0.4327313702814103, + "grad_norm": 0.10462051630020142, + "learning_rate": 0.0006137295244170105, + "loss": 2.7353, + "step": 14593 + }, + { + "epoch": 0.4327610236337218, + "grad_norm": 0.1126476302742958, + "learning_rate": 0.0006136837062691186, + "loss": 2.6908, + "step": 14594 + }, + { + "epoch": 0.4327906769860333, + "grad_norm": 0.11039893329143524, + "learning_rate": 0.0006136378871145377, + "loss": 2.686, + "step": 14595 + }, + { + "epoch": 0.43282033033834477, + "grad_norm": 0.126542329788208, + "learning_rate": 0.0006135920669536741, + "loss": 2.7194, + "step": 14596 + }, + { + "epoch": 0.43284998369065625, + "grad_norm": 0.12187135964632034, + "learning_rate": 0.0006135462457869331, + "loss": 2.7069, + "step": 14597 + }, + { + "epoch": 0.4328796370429677, + "grad_norm": 0.13010430335998535, + "learning_rate": 0.0006135004236147207, + "loss": 2.7154, + "step": 14598 + }, + { + "epoch": 0.4329092903952792, + "grad_norm": 0.1262630671262741, + "learning_rate": 0.0006134546004374425, + "loss": 2.6947, + "step": 14599 + }, + { + "epoch": 0.4329389437475907, + "grad_norm": 0.117245614528656, + "learning_rate": 0.0006134087762555044, + "loss": 2.6869, + "step": 14600 + }, + { + "epoch": 0.43296859709990215, + "grad_norm": 0.10506922751665115, + "learning_rate": 0.0006133629510693121, + "loss": 2.6992, + "step": 14601 + }, + { + "epoch": 0.43299825045221363, + "grad_norm": 0.11934744566679001, + "learning_rate": 0.0006133171248792713, + "loss": 2.707, + "step": 14602 + }, + { + "epoch": 0.4330279038045251, + "grad_norm": 0.10355493426322937, + "learning_rate": 0.000613271297685788, + "loss": 2.6884, + "step": 14603 + }, + { + "epoch": 0.4330575571568366, + "grad_norm": 0.13017727434635162, + "learning_rate": 0.0006132254694892679, + "loss": 2.7199, + "step": 14604 + }, + { + "epoch": 0.43308721050914806, + "grad_norm": 0.14193324744701385, + "learning_rate": 0.0006131796402901169, + "loss": 2.6931, + "step": 14605 + }, + { + "epoch": 0.43311686386145953, + "grad_norm": 0.14089976251125336, + "learning_rate": 0.0006131338100887407, + "loss": 2.6906, + "step": 14606 + }, + { + "epoch": 0.433146517213771, + "grad_norm": 0.16813179850578308, + "learning_rate": 0.0006130879788855452, + "loss": 2.7273, + "step": 14607 + }, + { + "epoch": 0.4331761705660825, + "grad_norm": 0.16188666224479675, + "learning_rate": 0.0006130421466809361, + "loss": 2.7141, + "step": 14608 + }, + { + "epoch": 0.43320582391839396, + "grad_norm": 0.14918018877506256, + "learning_rate": 0.0006129963134753197, + "loss": 2.6951, + "step": 14609 + }, + { + "epoch": 0.43323547727070544, + "grad_norm": 0.12776292860507965, + "learning_rate": 0.0006129504792691014, + "loss": 2.691, + "step": 14610 + }, + { + "epoch": 0.4332651306230169, + "grad_norm": 0.14320221543312073, + "learning_rate": 0.0006129046440626871, + "loss": 2.6779, + "step": 14611 + }, + { + "epoch": 0.4332947839753284, + "grad_norm": 0.12974965572357178, + "learning_rate": 0.0006128588078564829, + "loss": 2.7123, + "step": 14612 + }, + { + "epoch": 0.43332443732763987, + "grad_norm": 0.1251734495162964, + "learning_rate": 0.0006128129706508946, + "loss": 2.6947, + "step": 14613 + }, + { + "epoch": 0.43335409067995134, + "grad_norm": 0.1318468302488327, + "learning_rate": 0.0006127671324463281, + "loss": 2.6793, + "step": 14614 + }, + { + "epoch": 0.4333837440322629, + "grad_norm": 0.1405114084482193, + "learning_rate": 0.0006127212932431893, + "loss": 2.6882, + "step": 14615 + }, + { + "epoch": 0.43341339738457435, + "grad_norm": 0.1417596936225891, + "learning_rate": 0.000612675453041884, + "loss": 2.696, + "step": 14616 + }, + { + "epoch": 0.4334430507368858, + "grad_norm": 0.1395249366760254, + "learning_rate": 0.0006126296118428181, + "loss": 2.6714, + "step": 14617 + }, + { + "epoch": 0.4334727040891973, + "grad_norm": 0.13261792063713074, + "learning_rate": 0.0006125837696463978, + "loss": 2.6582, + "step": 14618 + }, + { + "epoch": 0.4335023574415088, + "grad_norm": 0.1215883269906044, + "learning_rate": 0.000612537926453029, + "loss": 2.7262, + "step": 14619 + }, + { + "epoch": 0.43353201079382025, + "grad_norm": 0.13421161472797394, + "learning_rate": 0.0006124920822631175, + "loss": 2.717, + "step": 14620 + }, + { + "epoch": 0.43356166414613173, + "grad_norm": 0.11116252094507217, + "learning_rate": 0.0006124462370770692, + "loss": 2.7315, + "step": 14621 + }, + { + "epoch": 0.4335913174984432, + "grad_norm": 0.11745918542146683, + "learning_rate": 0.0006124003908952903, + "loss": 2.7717, + "step": 14622 + }, + { + "epoch": 0.4336209708507547, + "grad_norm": 0.11501044780015945, + "learning_rate": 0.0006123545437181865, + "loss": 2.6713, + "step": 14623 + }, + { + "epoch": 0.43365062420306616, + "grad_norm": 0.12801994383335114, + "learning_rate": 0.000612308695546164, + "loss": 2.7054, + "step": 14624 + }, + { + "epoch": 0.43368027755537764, + "grad_norm": 0.12750177085399628, + "learning_rate": 0.0006122628463796288, + "loss": 2.704, + "step": 14625 + }, + { + "epoch": 0.4337099309076891, + "grad_norm": 0.1304907202720642, + "learning_rate": 0.0006122169962189867, + "loss": 2.7012, + "step": 14626 + }, + { + "epoch": 0.4337395842600006, + "grad_norm": 0.12193053215742111, + "learning_rate": 0.0006121711450646439, + "loss": 2.7047, + "step": 14627 + }, + { + "epoch": 0.43376923761231206, + "grad_norm": 0.13799872994422913, + "learning_rate": 0.0006121252929170063, + "loss": 2.7064, + "step": 14628 + }, + { + "epoch": 0.43379889096462354, + "grad_norm": 0.1361333429813385, + "learning_rate": 0.0006120794397764801, + "loss": 2.7312, + "step": 14629 + }, + { + "epoch": 0.433828544316935, + "grad_norm": 0.14032778143882751, + "learning_rate": 0.0006120335856434711, + "loss": 2.7168, + "step": 14630 + }, + { + "epoch": 0.4338581976692465, + "grad_norm": 0.1250600665807724, + "learning_rate": 0.0006119877305183855, + "loss": 2.7182, + "step": 14631 + }, + { + "epoch": 0.43388785102155797, + "grad_norm": 0.12280841916799545, + "learning_rate": 0.0006119418744016294, + "loss": 2.7467, + "step": 14632 + }, + { + "epoch": 0.43391750437386944, + "grad_norm": 0.11631885170936584, + "learning_rate": 0.0006118960172936087, + "loss": 2.7125, + "step": 14633 + }, + { + "epoch": 0.4339471577261809, + "grad_norm": 0.1355302929878235, + "learning_rate": 0.0006118501591947296, + "loss": 2.663, + "step": 14634 + }, + { + "epoch": 0.4339768110784924, + "grad_norm": 0.1339566856622696, + "learning_rate": 0.0006118043001053981, + "loss": 2.6891, + "step": 14635 + }, + { + "epoch": 0.43400646443080393, + "grad_norm": 0.12709473073482513, + "learning_rate": 0.0006117584400260204, + "loss": 2.6762, + "step": 14636 + }, + { + "epoch": 0.4340361177831154, + "grad_norm": 0.12580451369285583, + "learning_rate": 0.0006117125789570025, + "loss": 2.7185, + "step": 14637 + }, + { + "epoch": 0.4340657711354269, + "grad_norm": 0.11651459336280823, + "learning_rate": 0.0006116667168987505, + "loss": 2.7261, + "step": 14638 + }, + { + "epoch": 0.43409542448773836, + "grad_norm": 0.1179472878575325, + "learning_rate": 0.0006116208538516707, + "loss": 2.6782, + "step": 14639 + }, + { + "epoch": 0.43412507784004983, + "grad_norm": 0.11639350652694702, + "learning_rate": 0.0006115749898161688, + "loss": 2.698, + "step": 14640 + }, + { + "epoch": 0.4341547311923613, + "grad_norm": 0.11255259811878204, + "learning_rate": 0.0006115291247926515, + "loss": 2.7087, + "step": 14641 + }, + { + "epoch": 0.4341843845446728, + "grad_norm": 0.1183556541800499, + "learning_rate": 0.0006114832587815247, + "loss": 2.6749, + "step": 14642 + }, + { + "epoch": 0.43421403789698426, + "grad_norm": 0.11337882280349731, + "learning_rate": 0.0006114373917831942, + "loss": 2.7205, + "step": 14643 + }, + { + "epoch": 0.43424369124929574, + "grad_norm": 0.11438482254743576, + "learning_rate": 0.0006113915237980666, + "loss": 2.6862, + "step": 14644 + }, + { + "epoch": 0.4342733446016072, + "grad_norm": 0.12033949047327042, + "learning_rate": 0.0006113456548265479, + "loss": 2.7089, + "step": 14645 + }, + { + "epoch": 0.4343029979539187, + "grad_norm": 0.11695753782987595, + "learning_rate": 0.0006112997848690444, + "loss": 2.6954, + "step": 14646 + }, + { + "epoch": 0.43433265130623017, + "grad_norm": 0.10522003471851349, + "learning_rate": 0.0006112539139259623, + "loss": 2.7089, + "step": 14647 + }, + { + "epoch": 0.43436230465854164, + "grad_norm": 0.12586729228496552, + "learning_rate": 0.0006112080419977075, + "loss": 2.7223, + "step": 14648 + }, + { + "epoch": 0.4343919580108531, + "grad_norm": 0.15392005443572998, + "learning_rate": 0.0006111621690846865, + "loss": 2.6984, + "step": 14649 + }, + { + "epoch": 0.4344216113631646, + "grad_norm": 0.1532248854637146, + "learning_rate": 0.0006111162951873052, + "loss": 2.7188, + "step": 14650 + }, + { + "epoch": 0.43445126471547607, + "grad_norm": 0.16300596296787262, + "learning_rate": 0.0006110704203059703, + "loss": 2.7097, + "step": 14651 + }, + { + "epoch": 0.43448091806778755, + "grad_norm": 0.16896364092826843, + "learning_rate": 0.0006110245444410876, + "loss": 2.6867, + "step": 14652 + }, + { + "epoch": 0.434510571420099, + "grad_norm": 0.19068093597888947, + "learning_rate": 0.0006109786675930636, + "loss": 2.7136, + "step": 14653 + }, + { + "epoch": 0.4345402247724105, + "grad_norm": 0.16670167446136475, + "learning_rate": 0.0006109327897623045, + "loss": 2.7183, + "step": 14654 + }, + { + "epoch": 0.434569878124722, + "grad_norm": 0.13144735991954803, + "learning_rate": 0.0006108869109492165, + "loss": 2.7061, + "step": 14655 + }, + { + "epoch": 0.43459953147703345, + "grad_norm": 0.13128694891929626, + "learning_rate": 0.0006108410311542056, + "loss": 2.6733, + "step": 14656 + }, + { + "epoch": 0.434629184829345, + "grad_norm": 0.12806250154972076, + "learning_rate": 0.0006107951503776785, + "loss": 2.732, + "step": 14657 + }, + { + "epoch": 0.43465883818165646, + "grad_norm": 0.14291943609714508, + "learning_rate": 0.0006107492686200415, + "loss": 2.7246, + "step": 14658 + }, + { + "epoch": 0.43468849153396794, + "grad_norm": 0.11640368402004242, + "learning_rate": 0.0006107033858817006, + "loss": 2.718, + "step": 14659 + }, + { + "epoch": 0.4347181448862794, + "grad_norm": 0.12047921121120453, + "learning_rate": 0.0006106575021630621, + "loss": 2.7009, + "step": 14660 + }, + { + "epoch": 0.4347477982385909, + "grad_norm": 0.10361148416996002, + "learning_rate": 0.0006106116174645327, + "loss": 2.6887, + "step": 14661 + }, + { + "epoch": 0.43477745159090236, + "grad_norm": 0.12063819169998169, + "learning_rate": 0.0006105657317865182, + "loss": 2.7001, + "step": 14662 + }, + { + "epoch": 0.43480710494321384, + "grad_norm": 0.1248234361410141, + "learning_rate": 0.0006105198451294251, + "loss": 2.7257, + "step": 14663 + }, + { + "epoch": 0.4348367582955253, + "grad_norm": 0.11586140096187592, + "learning_rate": 0.0006104739574936599, + "loss": 2.6983, + "step": 14664 + }, + { + "epoch": 0.4348664116478368, + "grad_norm": 0.11831521987915039, + "learning_rate": 0.000610428068879629, + "loss": 2.6953, + "step": 14665 + }, + { + "epoch": 0.43489606500014827, + "grad_norm": 0.13557952642440796, + "learning_rate": 0.0006103821792877384, + "loss": 2.6935, + "step": 14666 + }, + { + "epoch": 0.43492571835245974, + "grad_norm": 0.14873556792736053, + "learning_rate": 0.0006103362887183947, + "loss": 2.6815, + "step": 14667 + }, + { + "epoch": 0.4349553717047712, + "grad_norm": 0.145670086145401, + "learning_rate": 0.0006102903971720043, + "loss": 2.6897, + "step": 14668 + }, + { + "epoch": 0.4349850250570827, + "grad_norm": 0.14444299042224884, + "learning_rate": 0.0006102445046489736, + "loss": 2.6964, + "step": 14669 + }, + { + "epoch": 0.4350146784093942, + "grad_norm": 0.13762934505939484, + "learning_rate": 0.0006101986111497087, + "loss": 2.6737, + "step": 14670 + }, + { + "epoch": 0.43504433176170565, + "grad_norm": 0.12694098055362701, + "learning_rate": 0.0006101527166746161, + "loss": 2.6762, + "step": 14671 + }, + { + "epoch": 0.4350739851140171, + "grad_norm": 0.13180577754974365, + "learning_rate": 0.0006101068212241024, + "loss": 2.6742, + "step": 14672 + }, + { + "epoch": 0.4351036384663286, + "grad_norm": 0.12059806287288666, + "learning_rate": 0.000610060924798574, + "loss": 2.7439, + "step": 14673 + }, + { + "epoch": 0.4351332918186401, + "grad_norm": 0.1189170554280281, + "learning_rate": 0.000610015027398437, + "loss": 2.711, + "step": 14674 + }, + { + "epoch": 0.43516294517095155, + "grad_norm": 0.10620106011629105, + "learning_rate": 0.0006099691290240984, + "loss": 2.7048, + "step": 14675 + }, + { + "epoch": 0.43519259852326303, + "grad_norm": 0.10704117268323898, + "learning_rate": 0.000609923229675964, + "loss": 2.7156, + "step": 14676 + }, + { + "epoch": 0.4352222518755745, + "grad_norm": 0.12166791409254074, + "learning_rate": 0.0006098773293544405, + "loss": 2.7229, + "step": 14677 + }, + { + "epoch": 0.43525190522788604, + "grad_norm": 0.12409872561693192, + "learning_rate": 0.0006098314280599345, + "loss": 2.7209, + "step": 14678 + }, + { + "epoch": 0.4352815585801975, + "grad_norm": 0.10646498203277588, + "learning_rate": 0.0006097855257928522, + "loss": 2.6635, + "step": 14679 + }, + { + "epoch": 0.435311211932509, + "grad_norm": 0.10661117732524872, + "learning_rate": 0.0006097396225536006, + "loss": 2.7285, + "step": 14680 + }, + { + "epoch": 0.43534086528482047, + "grad_norm": 0.12183023244142532, + "learning_rate": 0.0006096937183425856, + "loss": 2.683, + "step": 14681 + }, + { + "epoch": 0.43537051863713194, + "grad_norm": 0.12583614885807037, + "learning_rate": 0.0006096478131602137, + "loss": 2.6814, + "step": 14682 + }, + { + "epoch": 0.4354001719894434, + "grad_norm": 0.12441454082727432, + "learning_rate": 0.0006096019070068918, + "loss": 2.6983, + "step": 14683 + }, + { + "epoch": 0.4354298253417549, + "grad_norm": 0.1287475824356079, + "learning_rate": 0.0006095559998830261, + "loss": 2.7242, + "step": 14684 + }, + { + "epoch": 0.43545947869406637, + "grad_norm": 0.15624503791332245, + "learning_rate": 0.0006095100917890234, + "loss": 2.7108, + "step": 14685 + }, + { + "epoch": 0.43548913204637785, + "grad_norm": 0.1945105493068695, + "learning_rate": 0.00060946418272529, + "loss": 2.7256, + "step": 14686 + }, + { + "epoch": 0.4355187853986893, + "grad_norm": 0.172898530960083, + "learning_rate": 0.0006094182726922323, + "loss": 2.6655, + "step": 14687 + }, + { + "epoch": 0.4355484387510008, + "grad_norm": 0.12745949625968933, + "learning_rate": 0.0006093723616902569, + "loss": 2.6751, + "step": 14688 + }, + { + "epoch": 0.4355780921033123, + "grad_norm": 0.12153028696775436, + "learning_rate": 0.0006093264497197707, + "loss": 2.7031, + "step": 14689 + }, + { + "epoch": 0.43560774545562375, + "grad_norm": 0.11522944271564484, + "learning_rate": 0.0006092805367811801, + "loss": 2.6709, + "step": 14690 + }, + { + "epoch": 0.4356373988079352, + "grad_norm": 0.13470886647701263, + "learning_rate": 0.0006092346228748915, + "loss": 2.6741, + "step": 14691 + }, + { + "epoch": 0.4356670521602467, + "grad_norm": 0.12737751007080078, + "learning_rate": 0.0006091887080013115, + "loss": 2.7104, + "step": 14692 + }, + { + "epoch": 0.4356967055125582, + "grad_norm": 0.13587570190429688, + "learning_rate": 0.0006091427921608468, + "loss": 2.7107, + "step": 14693 + }, + { + "epoch": 0.43572635886486966, + "grad_norm": 0.1181800588965416, + "learning_rate": 0.0006090968753539039, + "loss": 2.7052, + "step": 14694 + }, + { + "epoch": 0.43575601221718113, + "grad_norm": 0.14853444695472717, + "learning_rate": 0.0006090509575808893, + "loss": 2.6974, + "step": 14695 + }, + { + "epoch": 0.4357856655694926, + "grad_norm": 0.14629991352558136, + "learning_rate": 0.0006090050388422102, + "loss": 2.711, + "step": 14696 + }, + { + "epoch": 0.4358153189218041, + "grad_norm": 0.12422433495521545, + "learning_rate": 0.0006089591191382724, + "loss": 2.7228, + "step": 14697 + }, + { + "epoch": 0.4358449722741156, + "grad_norm": 0.11939224600791931, + "learning_rate": 0.000608913198469483, + "loss": 2.6798, + "step": 14698 + }, + { + "epoch": 0.4358746256264271, + "grad_norm": 0.11650332063436508, + "learning_rate": 0.0006088672768362485, + "loss": 2.731, + "step": 14699 + }, + { + "epoch": 0.43590427897873857, + "grad_norm": 0.1306498795747757, + "learning_rate": 0.0006088213542389756, + "loss": 2.6731, + "step": 14700 + }, + { + "epoch": 0.43593393233105004, + "grad_norm": 0.14888519048690796, + "learning_rate": 0.000608775430678071, + "loss": 2.7179, + "step": 14701 + }, + { + "epoch": 0.4359635856833615, + "grad_norm": 0.14835235476493835, + "learning_rate": 0.0006087295061539412, + "loss": 2.6629, + "step": 14702 + }, + { + "epoch": 0.435993239035673, + "grad_norm": 0.13367429375648499, + "learning_rate": 0.000608683580666993, + "loss": 2.6767, + "step": 14703 + }, + { + "epoch": 0.4360228923879845, + "grad_norm": 0.12456692010164261, + "learning_rate": 0.000608637654217633, + "loss": 2.714, + "step": 14704 + }, + { + "epoch": 0.43605254574029595, + "grad_norm": 0.11803243309259415, + "learning_rate": 0.0006085917268062679, + "loss": 2.6407, + "step": 14705 + }, + { + "epoch": 0.4360821990926074, + "grad_norm": 0.14175647497177124, + "learning_rate": 0.0006085457984333044, + "loss": 2.666, + "step": 14706 + }, + { + "epoch": 0.4361118524449189, + "grad_norm": 0.1633017659187317, + "learning_rate": 0.0006084998690991495, + "loss": 2.6866, + "step": 14707 + }, + { + "epoch": 0.4361415057972304, + "grad_norm": 0.1264990121126175, + "learning_rate": 0.0006084539388042092, + "loss": 2.6548, + "step": 14708 + }, + { + "epoch": 0.43617115914954185, + "grad_norm": 0.11397767066955566, + "learning_rate": 0.0006084080075488909, + "loss": 2.7449, + "step": 14709 + }, + { + "epoch": 0.43620081250185333, + "grad_norm": 0.1182592585682869, + "learning_rate": 0.0006083620753336011, + "loss": 2.7241, + "step": 14710 + }, + { + "epoch": 0.4362304658541648, + "grad_norm": 0.108907051384449, + "learning_rate": 0.0006083161421587464, + "loss": 2.7084, + "step": 14711 + }, + { + "epoch": 0.4362601192064763, + "grad_norm": 0.11670037358999252, + "learning_rate": 0.0006082702080247338, + "loss": 2.7043, + "step": 14712 + }, + { + "epoch": 0.43628977255878776, + "grad_norm": 0.11315235495567322, + "learning_rate": 0.00060822427293197, + "loss": 2.7422, + "step": 14713 + }, + { + "epoch": 0.43631942591109923, + "grad_norm": 0.12011086195707321, + "learning_rate": 0.0006081783368808614, + "loss": 2.726, + "step": 14714 + }, + { + "epoch": 0.4363490792634107, + "grad_norm": 0.11676505208015442, + "learning_rate": 0.0006081323998718152, + "loss": 2.6695, + "step": 14715 + }, + { + "epoch": 0.4363787326157222, + "grad_norm": 0.10700253397226334, + "learning_rate": 0.0006080864619052381, + "loss": 2.7403, + "step": 14716 + }, + { + "epoch": 0.43640838596803366, + "grad_norm": 0.10841428488492966, + "learning_rate": 0.0006080405229815368, + "loss": 2.7177, + "step": 14717 + }, + { + "epoch": 0.43643803932034514, + "grad_norm": 0.1124424934387207, + "learning_rate": 0.0006079945831011182, + "loss": 2.7577, + "step": 14718 + }, + { + "epoch": 0.43646769267265667, + "grad_norm": 0.1245645061135292, + "learning_rate": 0.000607948642264389, + "loss": 2.6919, + "step": 14719 + }, + { + "epoch": 0.43649734602496815, + "grad_norm": 0.11910327523946762, + "learning_rate": 0.0006079027004717559, + "loss": 2.7084, + "step": 14720 + }, + { + "epoch": 0.4365269993772796, + "grad_norm": 0.11467362940311432, + "learning_rate": 0.0006078567577236259, + "loss": 2.7278, + "step": 14721 + }, + { + "epoch": 0.4365566527295911, + "grad_norm": 0.12630222737789154, + "learning_rate": 0.0006078108140204058, + "loss": 2.7176, + "step": 14722 + }, + { + "epoch": 0.4365863060819026, + "grad_norm": 0.15570281445980072, + "learning_rate": 0.0006077648693625027, + "loss": 2.701, + "step": 14723 + }, + { + "epoch": 0.43661595943421405, + "grad_norm": 0.15980888903141022, + "learning_rate": 0.0006077189237503229, + "loss": 2.711, + "step": 14724 + }, + { + "epoch": 0.4366456127865255, + "grad_norm": 0.12693439424037933, + "learning_rate": 0.0006076729771842736, + "loss": 2.7407, + "step": 14725 + }, + { + "epoch": 0.436675266138837, + "grad_norm": 0.12607912719249725, + "learning_rate": 0.0006076270296647615, + "loss": 2.7055, + "step": 14726 + }, + { + "epoch": 0.4367049194911485, + "grad_norm": 0.13259543478488922, + "learning_rate": 0.0006075810811921936, + "loss": 2.6802, + "step": 14727 + }, + { + "epoch": 0.43673457284345996, + "grad_norm": 0.11077552288770676, + "learning_rate": 0.0006075351317669771, + "loss": 2.7014, + "step": 14728 + }, + { + "epoch": 0.43676422619577143, + "grad_norm": 0.11173071712255478, + "learning_rate": 0.0006074891813895182, + "loss": 2.6837, + "step": 14729 + }, + { + "epoch": 0.4367938795480829, + "grad_norm": 0.13858112692832947, + "learning_rate": 0.0006074432300602243, + "loss": 2.6928, + "step": 14730 + }, + { + "epoch": 0.4368235329003944, + "grad_norm": 0.10752952843904495, + "learning_rate": 0.0006073972777795021, + "loss": 2.7088, + "step": 14731 + }, + { + "epoch": 0.43685318625270586, + "grad_norm": 0.13099971413612366, + "learning_rate": 0.0006073513245477586, + "loss": 2.6781, + "step": 14732 + }, + { + "epoch": 0.43688283960501734, + "grad_norm": 0.13980168104171753, + "learning_rate": 0.0006073053703654006, + "loss": 2.6981, + "step": 14733 + }, + { + "epoch": 0.4369124929573288, + "grad_norm": 0.12317977845668793, + "learning_rate": 0.0006072594152328353, + "loss": 2.7134, + "step": 14734 + }, + { + "epoch": 0.4369421463096403, + "grad_norm": 0.11183801293373108, + "learning_rate": 0.0006072134591504692, + "loss": 2.7156, + "step": 14735 + }, + { + "epoch": 0.43697179966195177, + "grad_norm": 0.12453726679086685, + "learning_rate": 0.0006071675021187097, + "loss": 2.6879, + "step": 14736 + }, + { + "epoch": 0.43700145301426324, + "grad_norm": 0.10961928218603134, + "learning_rate": 0.0006071215441379636, + "loss": 2.6902, + "step": 14737 + }, + { + "epoch": 0.4370311063665747, + "grad_norm": 0.12548795342445374, + "learning_rate": 0.0006070755852086378, + "loss": 2.7128, + "step": 14738 + }, + { + "epoch": 0.4370607597188862, + "grad_norm": 0.13899004459381104, + "learning_rate": 0.0006070296253311392, + "loss": 2.7202, + "step": 14739 + }, + { + "epoch": 0.4370904130711977, + "grad_norm": 0.1392858326435089, + "learning_rate": 0.0006069836645058751, + "loss": 2.6999, + "step": 14740 + }, + { + "epoch": 0.4371200664235092, + "grad_norm": 0.14290542900562286, + "learning_rate": 0.0006069377027332522, + "loss": 2.6887, + "step": 14741 + }, + { + "epoch": 0.4371497197758207, + "grad_norm": 0.14808037877082825, + "learning_rate": 0.0006068917400136775, + "loss": 2.7322, + "step": 14742 + }, + { + "epoch": 0.43717937312813215, + "grad_norm": 0.13257934153079987, + "learning_rate": 0.0006068457763475582, + "loss": 2.6915, + "step": 14743 + }, + { + "epoch": 0.43720902648044363, + "grad_norm": 0.1327885091304779, + "learning_rate": 0.0006067998117353011, + "loss": 2.6842, + "step": 14744 + }, + { + "epoch": 0.4372386798327551, + "grad_norm": 0.12288661301136017, + "learning_rate": 0.0006067538461773137, + "loss": 2.7205, + "step": 14745 + }, + { + "epoch": 0.4372683331850666, + "grad_norm": 0.1076141819357872, + "learning_rate": 0.0006067078796740023, + "loss": 2.6898, + "step": 14746 + }, + { + "epoch": 0.43729798653737806, + "grad_norm": 0.12464892119169235, + "learning_rate": 0.0006066619122257743, + "loss": 2.713, + "step": 14747 + }, + { + "epoch": 0.43732763988968953, + "grad_norm": 0.12806127965450287, + "learning_rate": 0.0006066159438330369, + "loss": 2.713, + "step": 14748 + }, + { + "epoch": 0.437357293242001, + "grad_norm": 0.12927484512329102, + "learning_rate": 0.000606569974496197, + "loss": 2.7131, + "step": 14749 + }, + { + "epoch": 0.4373869465943125, + "grad_norm": 0.1495915800333023, + "learning_rate": 0.0006065240042156616, + "loss": 2.6344, + "step": 14750 + }, + { + "epoch": 0.43741659994662396, + "grad_norm": 0.13690970838069916, + "learning_rate": 0.000606478032991838, + "loss": 2.6961, + "step": 14751 + }, + { + "epoch": 0.43744625329893544, + "grad_norm": 0.14080145955085754, + "learning_rate": 0.000606432060825133, + "loss": 2.7176, + "step": 14752 + }, + { + "epoch": 0.4374759066512469, + "grad_norm": 0.1372949630022049, + "learning_rate": 0.0006063860877159538, + "loss": 2.6853, + "step": 14753 + }, + { + "epoch": 0.4375055600035584, + "grad_norm": 0.11799118667840958, + "learning_rate": 0.0006063401136647077, + "loss": 2.7067, + "step": 14754 + }, + { + "epoch": 0.43753521335586987, + "grad_norm": 0.12009140849113464, + "learning_rate": 0.0006062941386718015, + "loss": 2.7232, + "step": 14755 + }, + { + "epoch": 0.43756486670818134, + "grad_norm": 0.12204422801733017, + "learning_rate": 0.0006062481627376426, + "loss": 2.6913, + "step": 14756 + }, + { + "epoch": 0.4375945200604928, + "grad_norm": 0.11581283807754517, + "learning_rate": 0.000606202185862638, + "loss": 2.6911, + "step": 14757 + }, + { + "epoch": 0.4376241734128043, + "grad_norm": 0.12726883590221405, + "learning_rate": 0.0006061562080471947, + "loss": 2.7128, + "step": 14758 + }, + { + "epoch": 0.43765382676511577, + "grad_norm": 0.1294868439435959, + "learning_rate": 0.0006061102292917199, + "loss": 2.7102, + "step": 14759 + }, + { + "epoch": 0.43768348011742725, + "grad_norm": 0.12452669441699982, + "learning_rate": 0.0006060642495966207, + "loss": 2.6935, + "step": 14760 + }, + { + "epoch": 0.4377131334697388, + "grad_norm": 0.11733229458332062, + "learning_rate": 0.0006060182689623047, + "loss": 2.7221, + "step": 14761 + }, + { + "epoch": 0.43774278682205026, + "grad_norm": 0.1132039725780487, + "learning_rate": 0.0006059722873891786, + "loss": 2.6905, + "step": 14762 + }, + { + "epoch": 0.43777244017436173, + "grad_norm": 0.12055090069770813, + "learning_rate": 0.0006059263048776496, + "loss": 2.7462, + "step": 14763 + }, + { + "epoch": 0.4378020935266732, + "grad_norm": 0.13388149440288544, + "learning_rate": 0.0006058803214281252, + "loss": 2.6604, + "step": 14764 + }, + { + "epoch": 0.4378317468789847, + "grad_norm": 0.1284072995185852, + "learning_rate": 0.0006058343370410123, + "loss": 2.724, + "step": 14765 + }, + { + "epoch": 0.43786140023129616, + "grad_norm": 0.13017518818378448, + "learning_rate": 0.0006057883517167179, + "loss": 2.7351, + "step": 14766 + }, + { + "epoch": 0.43789105358360764, + "grad_norm": 0.1374116688966751, + "learning_rate": 0.0006057423654556497, + "loss": 2.706, + "step": 14767 + }, + { + "epoch": 0.4379207069359191, + "grad_norm": 0.12188976258039474, + "learning_rate": 0.0006056963782582148, + "loss": 2.6935, + "step": 14768 + }, + { + "epoch": 0.4379503602882306, + "grad_norm": 0.14670701324939728, + "learning_rate": 0.0006056503901248203, + "loss": 2.7298, + "step": 14769 + }, + { + "epoch": 0.43798001364054207, + "grad_norm": 0.14772257208824158, + "learning_rate": 0.0006056044010558733, + "loss": 2.6881, + "step": 14770 + }, + { + "epoch": 0.43800966699285354, + "grad_norm": 0.1279476284980774, + "learning_rate": 0.0006055584110517813, + "loss": 2.6829, + "step": 14771 + }, + { + "epoch": 0.438039320345165, + "grad_norm": 0.11170930415391922, + "learning_rate": 0.0006055124201129515, + "loss": 2.6882, + "step": 14772 + }, + { + "epoch": 0.4380689736974765, + "grad_norm": 0.10458774864673615, + "learning_rate": 0.000605466428239791, + "loss": 2.7089, + "step": 14773 + }, + { + "epoch": 0.43809862704978797, + "grad_norm": 0.11024881899356842, + "learning_rate": 0.0006054204354327073, + "loss": 2.7013, + "step": 14774 + }, + { + "epoch": 0.43812828040209945, + "grad_norm": 0.10456480085849762, + "learning_rate": 0.0006053744416921075, + "loss": 2.6972, + "step": 14775 + }, + { + "epoch": 0.4381579337544109, + "grad_norm": 0.11988924443721771, + "learning_rate": 0.0006053284470183989, + "loss": 2.7082, + "step": 14776 + }, + { + "epoch": 0.4381875871067224, + "grad_norm": 0.13828608393669128, + "learning_rate": 0.0006052824514119888, + "loss": 2.7167, + "step": 14777 + }, + { + "epoch": 0.4382172404590339, + "grad_norm": 0.14116206765174866, + "learning_rate": 0.0006052364548732848, + "loss": 2.7149, + "step": 14778 + }, + { + "epoch": 0.43824689381134535, + "grad_norm": 0.1515396386384964, + "learning_rate": 0.0006051904574026935, + "loss": 2.6856, + "step": 14779 + }, + { + "epoch": 0.4382765471636568, + "grad_norm": 0.11701861023902893, + "learning_rate": 0.0006051444590006227, + "loss": 2.7144, + "step": 14780 + }, + { + "epoch": 0.4383062005159683, + "grad_norm": 0.11670562624931335, + "learning_rate": 0.0006050984596674798, + "loss": 2.7376, + "step": 14781 + }, + { + "epoch": 0.43833585386827983, + "grad_norm": 0.15073050558567047, + "learning_rate": 0.0006050524594036721, + "loss": 2.7017, + "step": 14782 + }, + { + "epoch": 0.4383655072205913, + "grad_norm": 0.13145224750041962, + "learning_rate": 0.0006050064582096069, + "loss": 2.7324, + "step": 14783 + }, + { + "epoch": 0.4383951605729028, + "grad_norm": 0.11577048152685165, + "learning_rate": 0.0006049604560856913, + "loss": 2.6837, + "step": 14784 + }, + { + "epoch": 0.43842481392521426, + "grad_norm": 0.12022414058446884, + "learning_rate": 0.0006049144530323327, + "loss": 2.674, + "step": 14785 + }, + { + "epoch": 0.43845446727752574, + "grad_norm": 0.11994179338216782, + "learning_rate": 0.0006048684490499389, + "loss": 2.7044, + "step": 14786 + }, + { + "epoch": 0.4384841206298372, + "grad_norm": 0.12856842577457428, + "learning_rate": 0.0006048224441389168, + "loss": 2.731, + "step": 14787 + }, + { + "epoch": 0.4385137739821487, + "grad_norm": 0.13822127878665924, + "learning_rate": 0.0006047764382996741, + "loss": 2.7143, + "step": 14788 + }, + { + "epoch": 0.43854342733446017, + "grad_norm": 0.12793190777301788, + "learning_rate": 0.0006047304315326181, + "loss": 2.7077, + "step": 14789 + }, + { + "epoch": 0.43857308068677164, + "grad_norm": 0.15318644046783447, + "learning_rate": 0.0006046844238381561, + "loss": 2.6844, + "step": 14790 + }, + { + "epoch": 0.4386027340390831, + "grad_norm": 0.14234337210655212, + "learning_rate": 0.0006046384152166953, + "loss": 2.7178, + "step": 14791 + }, + { + "epoch": 0.4386323873913946, + "grad_norm": 0.14453807473182678, + "learning_rate": 0.0006045924056686436, + "loss": 2.6905, + "step": 14792 + }, + { + "epoch": 0.43866204074370607, + "grad_norm": 0.1398313045501709, + "learning_rate": 0.0006045463951944081, + "loss": 2.7224, + "step": 14793 + }, + { + "epoch": 0.43869169409601755, + "grad_norm": 0.12487900257110596, + "learning_rate": 0.0006045003837943965, + "loss": 2.7151, + "step": 14794 + }, + { + "epoch": 0.438721347448329, + "grad_norm": 0.14075374603271484, + "learning_rate": 0.000604454371469016, + "loss": 2.7247, + "step": 14795 + }, + { + "epoch": 0.4387510008006405, + "grad_norm": 0.13870225846767426, + "learning_rate": 0.000604408358218674, + "loss": 2.6985, + "step": 14796 + }, + { + "epoch": 0.438780654152952, + "grad_norm": 0.14704176783561707, + "learning_rate": 0.0006043623440437781, + "loss": 2.7025, + "step": 14797 + }, + { + "epoch": 0.43881030750526345, + "grad_norm": 0.14941219985485077, + "learning_rate": 0.0006043163289447357, + "loss": 2.7083, + "step": 14798 + }, + { + "epoch": 0.43883996085757493, + "grad_norm": 0.11667969822883606, + "learning_rate": 0.0006042703129219545, + "loss": 2.6783, + "step": 14799 + }, + { + "epoch": 0.4388696142098864, + "grad_norm": 0.1275615692138672, + "learning_rate": 0.0006042242959758416, + "loss": 2.6532, + "step": 14800 + }, + { + "epoch": 0.4388992675621979, + "grad_norm": 0.1276015341281891, + "learning_rate": 0.0006041782781068046, + "loss": 2.6902, + "step": 14801 + }, + { + "epoch": 0.4389289209145094, + "grad_norm": 0.11896279454231262, + "learning_rate": 0.0006041322593152513, + "loss": 2.6847, + "step": 14802 + }, + { + "epoch": 0.4389585742668209, + "grad_norm": 0.11602828651666641, + "learning_rate": 0.0006040862396015888, + "loss": 2.6881, + "step": 14803 + }, + { + "epoch": 0.43898822761913237, + "grad_norm": 0.12284643948078156, + "learning_rate": 0.0006040402189662248, + "loss": 2.6526, + "step": 14804 + }, + { + "epoch": 0.43901788097144384, + "grad_norm": 0.1322106122970581, + "learning_rate": 0.0006039941974095669, + "loss": 2.7115, + "step": 14805 + }, + { + "epoch": 0.4390475343237553, + "grad_norm": 0.1270458698272705, + "learning_rate": 0.0006039481749320225, + "loss": 2.7148, + "step": 14806 + }, + { + "epoch": 0.4390771876760668, + "grad_norm": 0.11664918810129166, + "learning_rate": 0.0006039021515339991, + "loss": 2.7025, + "step": 14807 + }, + { + "epoch": 0.43910684102837827, + "grad_norm": 0.11683676391839981, + "learning_rate": 0.0006038561272159043, + "loss": 2.6952, + "step": 14808 + }, + { + "epoch": 0.43913649438068975, + "grad_norm": 0.13591016829013824, + "learning_rate": 0.0006038101019781457, + "loss": 2.6986, + "step": 14809 + }, + { + "epoch": 0.4391661477330012, + "grad_norm": 0.10789430141448975, + "learning_rate": 0.0006037640758211309, + "loss": 2.6936, + "step": 14810 + }, + { + "epoch": 0.4391958010853127, + "grad_norm": 0.11373145133256912, + "learning_rate": 0.0006037180487452674, + "loss": 2.6937, + "step": 14811 + }, + { + "epoch": 0.4392254544376242, + "grad_norm": 0.12717851996421814, + "learning_rate": 0.0006036720207509628, + "loss": 2.6596, + "step": 14812 + }, + { + "epoch": 0.43925510778993565, + "grad_norm": 0.12665879726409912, + "learning_rate": 0.0006036259918386245, + "loss": 2.6586, + "step": 14813 + }, + { + "epoch": 0.4392847611422471, + "grad_norm": 0.1660076379776001, + "learning_rate": 0.0006035799620086603, + "loss": 2.695, + "step": 14814 + }, + { + "epoch": 0.4393144144945586, + "grad_norm": 0.15685348212718964, + "learning_rate": 0.0006035339312614778, + "loss": 2.7049, + "step": 14815 + }, + { + "epoch": 0.4393440678468701, + "grad_norm": 0.12289419770240784, + "learning_rate": 0.0006034878995974846, + "loss": 2.6963, + "step": 14816 + }, + { + "epoch": 0.43937372119918155, + "grad_norm": 0.15355776250362396, + "learning_rate": 0.0006034418670170882, + "loss": 2.7317, + "step": 14817 + }, + { + "epoch": 0.43940337455149303, + "grad_norm": 0.15145839750766754, + "learning_rate": 0.0006033958335206963, + "loss": 2.7175, + "step": 14818 + }, + { + "epoch": 0.4394330279038045, + "grad_norm": 0.12351672351360321, + "learning_rate": 0.0006033497991087166, + "loss": 2.6881, + "step": 14819 + }, + { + "epoch": 0.439462681256116, + "grad_norm": 0.15334750711917877, + "learning_rate": 0.0006033037637815567, + "loss": 2.675, + "step": 14820 + }, + { + "epoch": 0.43949233460842746, + "grad_norm": 0.16895119845867157, + "learning_rate": 0.0006032577275396243, + "loss": 2.6853, + "step": 14821 + }, + { + "epoch": 0.43952198796073894, + "grad_norm": 0.15305571258068085, + "learning_rate": 0.0006032116903833269, + "loss": 2.6949, + "step": 14822 + }, + { + "epoch": 0.43955164131305047, + "grad_norm": 0.13556882739067078, + "learning_rate": 0.0006031656523130724, + "loss": 2.714, + "step": 14823 + }, + { + "epoch": 0.43958129466536194, + "grad_norm": 0.14231565594673157, + "learning_rate": 0.0006031196133292682, + "loss": 2.7384, + "step": 14824 + }, + { + "epoch": 0.4396109480176734, + "grad_norm": 0.13932839035987854, + "learning_rate": 0.000603073573432322, + "loss": 2.6873, + "step": 14825 + }, + { + "epoch": 0.4396406013699849, + "grad_norm": 0.13948455452919006, + "learning_rate": 0.0006030275326226417, + "loss": 2.7137, + "step": 14826 + }, + { + "epoch": 0.43967025472229637, + "grad_norm": 0.1203116625547409, + "learning_rate": 0.0006029814909006353, + "loss": 2.7147, + "step": 14827 + }, + { + "epoch": 0.43969990807460785, + "grad_norm": 0.15710008144378662, + "learning_rate": 0.0006029354482667097, + "loss": 2.6861, + "step": 14828 + }, + { + "epoch": 0.4397295614269193, + "grad_norm": 0.14046244323253632, + "learning_rate": 0.0006028894047212732, + "loss": 2.673, + "step": 14829 + }, + { + "epoch": 0.4397592147792308, + "grad_norm": 0.1437794268131256, + "learning_rate": 0.0006028433602647333, + "loss": 2.6965, + "step": 14830 + }, + { + "epoch": 0.4397888681315423, + "grad_norm": 0.14789815247058868, + "learning_rate": 0.0006027973148974977, + "loss": 2.6986, + "step": 14831 + }, + { + "epoch": 0.43981852148385375, + "grad_norm": 0.11678913235664368, + "learning_rate": 0.0006027512686199745, + "loss": 2.7134, + "step": 14832 + }, + { + "epoch": 0.43984817483616523, + "grad_norm": 0.13223762810230255, + "learning_rate": 0.0006027052214325709, + "loss": 2.6785, + "step": 14833 + }, + { + "epoch": 0.4398778281884767, + "grad_norm": 0.12873341143131256, + "learning_rate": 0.000602659173335695, + "loss": 2.6753, + "step": 14834 + }, + { + "epoch": 0.4399074815407882, + "grad_norm": 0.1246834322810173, + "learning_rate": 0.0006026131243297546, + "loss": 2.6855, + "step": 14835 + }, + { + "epoch": 0.43993713489309966, + "grad_norm": 0.11584971845149994, + "learning_rate": 0.0006025670744151573, + "loss": 2.7096, + "step": 14836 + }, + { + "epoch": 0.43996678824541113, + "grad_norm": 0.12456822395324707, + "learning_rate": 0.000602521023592311, + "loss": 2.7311, + "step": 14837 + }, + { + "epoch": 0.4399964415977226, + "grad_norm": 0.14158384501934052, + "learning_rate": 0.0006024749718616234, + "loss": 2.7063, + "step": 14838 + }, + { + "epoch": 0.4400260949500341, + "grad_norm": 0.1622512936592102, + "learning_rate": 0.0006024289192235023, + "loss": 2.6781, + "step": 14839 + }, + { + "epoch": 0.44005574830234556, + "grad_norm": 0.1389591246843338, + "learning_rate": 0.0006023828656783555, + "loss": 2.6913, + "step": 14840 + }, + { + "epoch": 0.44008540165465704, + "grad_norm": 0.11819928139448166, + "learning_rate": 0.000602336811226591, + "loss": 2.6746, + "step": 14841 + }, + { + "epoch": 0.4401150550069685, + "grad_norm": 0.11920709908008575, + "learning_rate": 0.0006022907558686164, + "loss": 2.6915, + "step": 14842 + }, + { + "epoch": 0.44014470835928, + "grad_norm": 0.10527054965496063, + "learning_rate": 0.0006022446996048396, + "loss": 2.6759, + "step": 14843 + }, + { + "epoch": 0.4401743617115915, + "grad_norm": 0.1213935911655426, + "learning_rate": 0.0006021986424356684, + "loss": 2.7128, + "step": 14844 + }, + { + "epoch": 0.440204015063903, + "grad_norm": 0.1312914937734604, + "learning_rate": 0.0006021525843615108, + "loss": 2.6812, + "step": 14845 + }, + { + "epoch": 0.4402336684162145, + "grad_norm": 0.12939006090164185, + "learning_rate": 0.0006021065253827744, + "loss": 2.659, + "step": 14846 + }, + { + "epoch": 0.44026332176852595, + "grad_norm": 0.12668122351169586, + "learning_rate": 0.0006020604654998671, + "loss": 2.6905, + "step": 14847 + }, + { + "epoch": 0.4402929751208374, + "grad_norm": 0.1341915875673294, + "learning_rate": 0.0006020144047131971, + "loss": 2.7067, + "step": 14848 + }, + { + "epoch": 0.4403226284731489, + "grad_norm": 0.144780233502388, + "learning_rate": 0.0006019683430231721, + "loss": 2.6709, + "step": 14849 + }, + { + "epoch": 0.4403522818254604, + "grad_norm": 0.11568555980920792, + "learning_rate": 0.0006019222804301996, + "loss": 2.6992, + "step": 14850 + }, + { + "epoch": 0.44038193517777185, + "grad_norm": 0.15556761622428894, + "learning_rate": 0.000601876216934688, + "loss": 2.7132, + "step": 14851 + }, + { + "epoch": 0.44041158853008333, + "grad_norm": 0.13813140988349915, + "learning_rate": 0.0006018301525370449, + "loss": 2.6741, + "step": 14852 + }, + { + "epoch": 0.4404412418823948, + "grad_norm": 0.10725460946559906, + "learning_rate": 0.0006017840872376784, + "loss": 2.6994, + "step": 14853 + }, + { + "epoch": 0.4404708952347063, + "grad_norm": 0.14809389412403107, + "learning_rate": 0.0006017380210369965, + "loss": 2.6857, + "step": 14854 + }, + { + "epoch": 0.44050054858701776, + "grad_norm": 0.15691767632961273, + "learning_rate": 0.0006016919539354068, + "loss": 2.7196, + "step": 14855 + }, + { + "epoch": 0.44053020193932924, + "grad_norm": 0.15879344940185547, + "learning_rate": 0.0006016458859333173, + "loss": 2.6904, + "step": 14856 + }, + { + "epoch": 0.4405598552916407, + "grad_norm": 0.16490261256694794, + "learning_rate": 0.000601599817031136, + "loss": 2.6948, + "step": 14857 + }, + { + "epoch": 0.4405895086439522, + "grad_norm": 0.135880246758461, + "learning_rate": 0.000601553747229271, + "loss": 2.6867, + "step": 14858 + }, + { + "epoch": 0.44061916199626366, + "grad_norm": 0.12096485495567322, + "learning_rate": 0.0006015076765281301, + "loss": 2.7003, + "step": 14859 + }, + { + "epoch": 0.44064881534857514, + "grad_norm": 0.10928502678871155, + "learning_rate": 0.0006014616049281216, + "loss": 2.7018, + "step": 14860 + }, + { + "epoch": 0.4406784687008866, + "grad_norm": 0.13019156455993652, + "learning_rate": 0.0006014155324296528, + "loss": 2.7125, + "step": 14861 + }, + { + "epoch": 0.4407081220531981, + "grad_norm": 0.13509860634803772, + "learning_rate": 0.0006013694590331321, + "loss": 2.682, + "step": 14862 + }, + { + "epoch": 0.44073777540550957, + "grad_norm": 0.12702560424804688, + "learning_rate": 0.0006013233847389674, + "loss": 2.6867, + "step": 14863 + }, + { + "epoch": 0.44076742875782104, + "grad_norm": 0.101042740046978, + "learning_rate": 0.0006012773095475668, + "loss": 2.6795, + "step": 14864 + }, + { + "epoch": 0.4407970821101326, + "grad_norm": 0.11714256554841995, + "learning_rate": 0.0006012312334593385, + "loss": 2.671, + "step": 14865 + }, + { + "epoch": 0.44082673546244405, + "grad_norm": 0.122254878282547, + "learning_rate": 0.0006011851564746899, + "loss": 2.6664, + "step": 14866 + }, + { + "epoch": 0.44085638881475553, + "grad_norm": 0.12184807658195496, + "learning_rate": 0.0006011390785940296, + "loss": 2.714, + "step": 14867 + }, + { + "epoch": 0.440886042167067, + "grad_norm": 0.12089920789003372, + "learning_rate": 0.0006010929998177653, + "loss": 2.6948, + "step": 14868 + }, + { + "epoch": 0.4409156955193785, + "grad_norm": 0.10985089838504791, + "learning_rate": 0.000601046920146305, + "loss": 2.709, + "step": 14869 + }, + { + "epoch": 0.44094534887168996, + "grad_norm": 0.11733199656009674, + "learning_rate": 0.0006010008395800571, + "loss": 2.7104, + "step": 14870 + }, + { + "epoch": 0.44097500222400143, + "grad_norm": 0.11566948145627975, + "learning_rate": 0.0006009547581194293, + "loss": 2.7133, + "step": 14871 + }, + { + "epoch": 0.4410046555763129, + "grad_norm": 0.14955729246139526, + "learning_rate": 0.0006009086757648299, + "loss": 2.7528, + "step": 14872 + }, + { + "epoch": 0.4410343089286244, + "grad_norm": 0.1470199078321457, + "learning_rate": 0.0006008625925166668, + "loss": 2.6995, + "step": 14873 + }, + { + "epoch": 0.44106396228093586, + "grad_norm": 0.12050365656614304, + "learning_rate": 0.0006008165083753481, + "loss": 2.7108, + "step": 14874 + }, + { + "epoch": 0.44109361563324734, + "grad_norm": 0.1393241137266159, + "learning_rate": 0.0006007704233412819, + "loss": 2.696, + "step": 14875 + }, + { + "epoch": 0.4411232689855588, + "grad_norm": 0.1227167621254921, + "learning_rate": 0.0006007243374148763, + "loss": 2.6752, + "step": 14876 + }, + { + "epoch": 0.4411529223378703, + "grad_norm": 0.11537287384271622, + "learning_rate": 0.0006006782505965395, + "loss": 2.7271, + "step": 14877 + }, + { + "epoch": 0.44118257569018177, + "grad_norm": 0.1284399926662445, + "learning_rate": 0.0006006321628866794, + "loss": 2.6939, + "step": 14878 + }, + { + "epoch": 0.44121222904249324, + "grad_norm": 0.1339646428823471, + "learning_rate": 0.0006005860742857042, + "loss": 2.7166, + "step": 14879 + }, + { + "epoch": 0.4412418823948047, + "grad_norm": 0.10616957396268845, + "learning_rate": 0.0006005399847940221, + "loss": 2.7131, + "step": 14880 + }, + { + "epoch": 0.4412715357471162, + "grad_norm": 0.11435724794864655, + "learning_rate": 0.0006004938944120413, + "loss": 2.6901, + "step": 14881 + }, + { + "epoch": 0.44130118909942767, + "grad_norm": 0.12906543910503387, + "learning_rate": 0.0006004478031401697, + "loss": 2.7477, + "step": 14882 + }, + { + "epoch": 0.44133084245173915, + "grad_norm": 0.11882822960615158, + "learning_rate": 0.0006004017109788156, + "loss": 2.6747, + "step": 14883 + }, + { + "epoch": 0.4413604958040506, + "grad_norm": 0.11513006687164307, + "learning_rate": 0.000600355617928387, + "loss": 2.7181, + "step": 14884 + }, + { + "epoch": 0.4413901491563621, + "grad_norm": 0.12649478018283844, + "learning_rate": 0.0006003095239892923, + "loss": 2.6532, + "step": 14885 + }, + { + "epoch": 0.44141980250867363, + "grad_norm": 0.13556787371635437, + "learning_rate": 0.0006002634291619396, + "loss": 2.7244, + "step": 14886 + }, + { + "epoch": 0.4414494558609851, + "grad_norm": 0.13235171139240265, + "learning_rate": 0.0006002173334467369, + "loss": 2.6993, + "step": 14887 + }, + { + "epoch": 0.4414791092132966, + "grad_norm": 0.11499206721782684, + "learning_rate": 0.0006001712368440926, + "loss": 2.6775, + "step": 14888 + }, + { + "epoch": 0.44150876256560806, + "grad_norm": 0.1254093050956726, + "learning_rate": 0.0006001251393544146, + "loss": 2.7323, + "step": 14889 + }, + { + "epoch": 0.44153841591791954, + "grad_norm": 0.12285739928483963, + "learning_rate": 0.0006000790409781115, + "loss": 2.6942, + "step": 14890 + }, + { + "epoch": 0.441568069270231, + "grad_norm": 0.12954860925674438, + "learning_rate": 0.0006000329417155912, + "loss": 2.709, + "step": 14891 + }, + { + "epoch": 0.4415977226225425, + "grad_norm": 0.13713018596172333, + "learning_rate": 0.0005999868415672622, + "loss": 2.7028, + "step": 14892 + }, + { + "epoch": 0.44162737597485396, + "grad_norm": 0.1420866996049881, + "learning_rate": 0.0005999407405335325, + "loss": 2.6779, + "step": 14893 + }, + { + "epoch": 0.44165702932716544, + "grad_norm": 0.13270758092403412, + "learning_rate": 0.0005998946386148104, + "loss": 2.6973, + "step": 14894 + }, + { + "epoch": 0.4416866826794769, + "grad_norm": 0.11662440001964569, + "learning_rate": 0.0005998485358115039, + "loss": 2.7117, + "step": 14895 + }, + { + "epoch": 0.4417163360317884, + "grad_norm": 0.116458959877491, + "learning_rate": 0.0005998024321240217, + "loss": 2.6789, + "step": 14896 + }, + { + "epoch": 0.44174598938409987, + "grad_norm": 0.14110027253627777, + "learning_rate": 0.0005997563275527717, + "loss": 2.7155, + "step": 14897 + }, + { + "epoch": 0.44177564273641134, + "grad_norm": 0.13629302382469177, + "learning_rate": 0.0005997102220981625, + "loss": 2.6869, + "step": 14898 + }, + { + "epoch": 0.4418052960887228, + "grad_norm": 0.17630864679813385, + "learning_rate": 0.000599664115760602, + "loss": 2.6329, + "step": 14899 + }, + { + "epoch": 0.4418349494410343, + "grad_norm": 0.1798364371061325, + "learning_rate": 0.0005996180085404987, + "loss": 2.68, + "step": 14900 + }, + { + "epoch": 0.4418646027933458, + "grad_norm": 0.15860286355018616, + "learning_rate": 0.0005995719004382609, + "loss": 2.6798, + "step": 14901 + }, + { + "epoch": 0.44189425614565725, + "grad_norm": 0.1556946486234665, + "learning_rate": 0.0005995257914542966, + "loss": 2.668, + "step": 14902 + }, + { + "epoch": 0.4419239094979687, + "grad_norm": 0.1370343714952469, + "learning_rate": 0.0005994796815890146, + "loss": 2.6877, + "step": 14903 + }, + { + "epoch": 0.4419535628502802, + "grad_norm": 0.12062016874551773, + "learning_rate": 0.0005994335708428228, + "loss": 2.6988, + "step": 14904 + }, + { + "epoch": 0.4419832162025917, + "grad_norm": 0.1325918436050415, + "learning_rate": 0.0005993874592161297, + "loss": 2.7124, + "step": 14905 + }, + { + "epoch": 0.4420128695549032, + "grad_norm": 0.10908836126327515, + "learning_rate": 0.0005993413467093436, + "loss": 2.6796, + "step": 14906 + }, + { + "epoch": 0.4420425229072147, + "grad_norm": 0.1289723515510559, + "learning_rate": 0.0005992952333228728, + "loss": 2.6684, + "step": 14907 + }, + { + "epoch": 0.44207217625952616, + "grad_norm": 0.11945801228284836, + "learning_rate": 0.0005992491190571256, + "loss": 2.7148, + "step": 14908 + }, + { + "epoch": 0.44210182961183764, + "grad_norm": 0.13323092460632324, + "learning_rate": 0.0005992030039125105, + "loss": 2.6315, + "step": 14909 + }, + { + "epoch": 0.4421314829641491, + "grad_norm": 0.12158481776714325, + "learning_rate": 0.0005991568878894358, + "loss": 2.6815, + "step": 14910 + }, + { + "epoch": 0.4421611363164606, + "grad_norm": 0.13791315257549286, + "learning_rate": 0.0005991107709883098, + "loss": 2.7178, + "step": 14911 + }, + { + "epoch": 0.44219078966877207, + "grad_norm": 0.13439884781837463, + "learning_rate": 0.000599064653209541, + "loss": 2.7145, + "step": 14912 + }, + { + "epoch": 0.44222044302108354, + "grad_norm": 0.11990322917699814, + "learning_rate": 0.0005990185345535375, + "loss": 2.6981, + "step": 14913 + }, + { + "epoch": 0.442250096373395, + "grad_norm": 0.12909887731075287, + "learning_rate": 0.000598972415020708, + "loss": 2.6594, + "step": 14914 + }, + { + "epoch": 0.4422797497257065, + "grad_norm": 0.12256183475255966, + "learning_rate": 0.0005989262946114607, + "loss": 2.7127, + "step": 14915 + }, + { + "epoch": 0.44230940307801797, + "grad_norm": 0.12288036942481995, + "learning_rate": 0.0005988801733262042, + "loss": 2.691, + "step": 14916 + }, + { + "epoch": 0.44233905643032945, + "grad_norm": 0.11985668540000916, + "learning_rate": 0.0005988340511653467, + "loss": 2.714, + "step": 14917 + }, + { + "epoch": 0.4423687097826409, + "grad_norm": 0.125044047832489, + "learning_rate": 0.0005987879281292968, + "loss": 2.6833, + "step": 14918 + }, + { + "epoch": 0.4423983631349524, + "grad_norm": 0.1190633773803711, + "learning_rate": 0.0005987418042184627, + "loss": 2.6616, + "step": 14919 + }, + { + "epoch": 0.4424280164872639, + "grad_norm": 0.12138780951499939, + "learning_rate": 0.0005986956794332533, + "loss": 2.6761, + "step": 14920 + }, + { + "epoch": 0.44245766983957535, + "grad_norm": 0.10715045034885406, + "learning_rate": 0.0005986495537740762, + "loss": 2.7257, + "step": 14921 + }, + { + "epoch": 0.4424873231918868, + "grad_norm": 0.12601354718208313, + "learning_rate": 0.0005986034272413407, + "loss": 2.7097, + "step": 14922 + }, + { + "epoch": 0.4425169765441983, + "grad_norm": 0.11741317063570023, + "learning_rate": 0.0005985572998354549, + "loss": 2.7138, + "step": 14923 + }, + { + "epoch": 0.4425466298965098, + "grad_norm": 0.1296658217906952, + "learning_rate": 0.0005985111715568273, + "loss": 2.7208, + "step": 14924 + }, + { + "epoch": 0.44257628324882126, + "grad_norm": 0.15218156576156616, + "learning_rate": 0.0005984650424058664, + "loss": 2.6906, + "step": 14925 + }, + { + "epoch": 0.44260593660113273, + "grad_norm": 0.14831426739692688, + "learning_rate": 0.0005984189123829806, + "loss": 2.697, + "step": 14926 + }, + { + "epoch": 0.44263558995344426, + "grad_norm": 0.15094004571437836, + "learning_rate": 0.0005983727814885783, + "loss": 2.7055, + "step": 14927 + }, + { + "epoch": 0.44266524330575574, + "grad_norm": 0.1523084044456482, + "learning_rate": 0.0005983266497230681, + "loss": 2.6938, + "step": 14928 + }, + { + "epoch": 0.4426948966580672, + "grad_norm": 0.1240503117442131, + "learning_rate": 0.0005982805170868589, + "loss": 2.6919, + "step": 14929 + }, + { + "epoch": 0.4427245500103787, + "grad_norm": 0.14502987265586853, + "learning_rate": 0.0005982343835803587, + "loss": 2.7062, + "step": 14930 + }, + { + "epoch": 0.44275420336269017, + "grad_norm": 0.13066788017749786, + "learning_rate": 0.0005981882492039761, + "loss": 2.6965, + "step": 14931 + }, + { + "epoch": 0.44278385671500164, + "grad_norm": 0.1564689427614212, + "learning_rate": 0.0005981421139581197, + "loss": 2.6797, + "step": 14932 + }, + { + "epoch": 0.4428135100673131, + "grad_norm": 0.17986586689949036, + "learning_rate": 0.000598095977843198, + "loss": 2.693, + "step": 14933 + }, + { + "epoch": 0.4428431634196246, + "grad_norm": 0.1255330890417099, + "learning_rate": 0.0005980498408596196, + "loss": 2.7017, + "step": 14934 + }, + { + "epoch": 0.4428728167719361, + "grad_norm": 0.14485131204128265, + "learning_rate": 0.000598003703007793, + "loss": 2.6732, + "step": 14935 + }, + { + "epoch": 0.44290247012424755, + "grad_norm": 0.15459221601486206, + "learning_rate": 0.0005979575642881268, + "loss": 2.7173, + "step": 14936 + }, + { + "epoch": 0.442932123476559, + "grad_norm": 0.15664738416671753, + "learning_rate": 0.0005979114247010297, + "loss": 2.6663, + "step": 14937 + }, + { + "epoch": 0.4429617768288705, + "grad_norm": 0.15180116891860962, + "learning_rate": 0.00059786528424691, + "loss": 2.6983, + "step": 14938 + }, + { + "epoch": 0.442991430181182, + "grad_norm": 0.13968275487422943, + "learning_rate": 0.0005978191429261764, + "loss": 2.6851, + "step": 14939 + }, + { + "epoch": 0.44302108353349345, + "grad_norm": 0.13335487246513367, + "learning_rate": 0.0005977730007392376, + "loss": 2.6737, + "step": 14940 + }, + { + "epoch": 0.44305073688580493, + "grad_norm": 0.12411832064390182, + "learning_rate": 0.000597726857686502, + "loss": 2.7185, + "step": 14941 + }, + { + "epoch": 0.4430803902381164, + "grad_norm": 0.12104841321706772, + "learning_rate": 0.0005976807137683783, + "loss": 2.7204, + "step": 14942 + }, + { + "epoch": 0.4431100435904279, + "grad_norm": 0.1199624165892601, + "learning_rate": 0.0005976345689852751, + "loss": 2.707, + "step": 14943 + }, + { + "epoch": 0.44313969694273936, + "grad_norm": 0.11129643768072128, + "learning_rate": 0.0005975884233376011, + "loss": 2.7066, + "step": 14944 + }, + { + "epoch": 0.44316935029505083, + "grad_norm": 0.12096492201089859, + "learning_rate": 0.0005975422768257648, + "loss": 2.7164, + "step": 14945 + }, + { + "epoch": 0.4431990036473623, + "grad_norm": 0.1139802485704422, + "learning_rate": 0.0005974961294501751, + "loss": 2.6969, + "step": 14946 + }, + { + "epoch": 0.4432286569996738, + "grad_norm": 0.10674832761287689, + "learning_rate": 0.0005974499812112402, + "loss": 2.7151, + "step": 14947 + }, + { + "epoch": 0.4432583103519853, + "grad_norm": 0.11350798606872559, + "learning_rate": 0.000597403832109369, + "loss": 2.6914, + "step": 14948 + }, + { + "epoch": 0.4432879637042968, + "grad_norm": 0.0997878909111023, + "learning_rate": 0.0005973576821449703, + "loss": 2.7077, + "step": 14949 + }, + { + "epoch": 0.44331761705660827, + "grad_norm": 0.11618360131978989, + "learning_rate": 0.0005973115313184525, + "loss": 2.7024, + "step": 14950 + }, + { + "epoch": 0.44334727040891975, + "grad_norm": 0.10839074850082397, + "learning_rate": 0.0005972653796302243, + "loss": 2.6886, + "step": 14951 + }, + { + "epoch": 0.4433769237612312, + "grad_norm": 0.12195640057325363, + "learning_rate": 0.0005972192270806946, + "loss": 2.7021, + "step": 14952 + }, + { + "epoch": 0.4434065771135427, + "grad_norm": 0.12846633791923523, + "learning_rate": 0.000597173073670272, + "loss": 2.7174, + "step": 14953 + }, + { + "epoch": 0.4434362304658542, + "grad_norm": 0.11991050839424133, + "learning_rate": 0.000597126919399365, + "loss": 2.7284, + "step": 14954 + }, + { + "epoch": 0.44346588381816565, + "grad_norm": 0.12403804063796997, + "learning_rate": 0.0005970807642683827, + "loss": 2.7301, + "step": 14955 + }, + { + "epoch": 0.4434955371704771, + "grad_norm": 0.11446738243103027, + "learning_rate": 0.0005970346082777333, + "loss": 2.6783, + "step": 14956 + }, + { + "epoch": 0.4435251905227886, + "grad_norm": 0.11244377493858337, + "learning_rate": 0.0005969884514278259, + "loss": 2.6853, + "step": 14957 + }, + { + "epoch": 0.4435548438751001, + "grad_norm": 0.11615338176488876, + "learning_rate": 0.0005969422937190692, + "loss": 2.7194, + "step": 14958 + }, + { + "epoch": 0.44358449722741156, + "grad_norm": 0.11209339648485184, + "learning_rate": 0.0005968961351518718, + "loss": 2.6674, + "step": 14959 + }, + { + "epoch": 0.44361415057972303, + "grad_norm": 0.1076408252120018, + "learning_rate": 0.0005968499757266424, + "loss": 2.662, + "step": 14960 + }, + { + "epoch": 0.4436438039320345, + "grad_norm": 0.10904266685247421, + "learning_rate": 0.0005968038154437898, + "loss": 2.687, + "step": 14961 + }, + { + "epoch": 0.443673457284346, + "grad_norm": 0.12542365491390228, + "learning_rate": 0.0005967576543037229, + "loss": 2.6993, + "step": 14962 + }, + { + "epoch": 0.44370311063665746, + "grad_norm": 0.1211930438876152, + "learning_rate": 0.0005967114923068505, + "loss": 2.7068, + "step": 14963 + }, + { + "epoch": 0.44373276398896894, + "grad_norm": 0.1238841563463211, + "learning_rate": 0.0005966653294535811, + "loss": 2.6657, + "step": 14964 + }, + { + "epoch": 0.4437624173412804, + "grad_norm": 0.12085838615894318, + "learning_rate": 0.0005966191657443236, + "loss": 2.7051, + "step": 14965 + }, + { + "epoch": 0.4437920706935919, + "grad_norm": 0.13061188161373138, + "learning_rate": 0.0005965730011794866, + "loss": 2.7194, + "step": 14966 + }, + { + "epoch": 0.44382172404590337, + "grad_norm": 0.13852559030056, + "learning_rate": 0.0005965268357594794, + "loss": 2.711, + "step": 14967 + }, + { + "epoch": 0.44385137739821484, + "grad_norm": 0.11563058197498322, + "learning_rate": 0.0005964806694847104, + "loss": 2.7215, + "step": 14968 + }, + { + "epoch": 0.4438810307505264, + "grad_norm": 0.13040325045585632, + "learning_rate": 0.0005964345023555886, + "loss": 2.6882, + "step": 14969 + }, + { + "epoch": 0.44391068410283785, + "grad_norm": 0.12895677983760834, + "learning_rate": 0.0005963883343725226, + "loss": 2.7345, + "step": 14970 + }, + { + "epoch": 0.4439403374551493, + "grad_norm": 0.1370963454246521, + "learning_rate": 0.0005963421655359215, + "loss": 2.6562, + "step": 14971 + }, + { + "epoch": 0.4439699908074608, + "grad_norm": 0.1612546443939209, + "learning_rate": 0.0005962959958461939, + "loss": 2.6805, + "step": 14972 + }, + { + "epoch": 0.4439996441597723, + "grad_norm": 0.1674947738647461, + "learning_rate": 0.0005962498253037485, + "loss": 2.7049, + "step": 14973 + }, + { + "epoch": 0.44402929751208375, + "grad_norm": 0.13131743669509888, + "learning_rate": 0.0005962036539089948, + "loss": 2.7039, + "step": 14974 + }, + { + "epoch": 0.44405895086439523, + "grad_norm": 0.12538617849349976, + "learning_rate": 0.0005961574816623409, + "loss": 2.696, + "step": 14975 + }, + { + "epoch": 0.4440886042167067, + "grad_norm": 0.14292457699775696, + "learning_rate": 0.0005961113085641962, + "loss": 2.7021, + "step": 14976 + }, + { + "epoch": 0.4441182575690182, + "grad_norm": 0.13481025397777557, + "learning_rate": 0.0005960651346149692, + "loss": 2.7083, + "step": 14977 + }, + { + "epoch": 0.44414791092132966, + "grad_norm": 0.13981784880161285, + "learning_rate": 0.000596018959815069, + "loss": 2.6766, + "step": 14978 + }, + { + "epoch": 0.44417756427364113, + "grad_norm": 0.13908320665359497, + "learning_rate": 0.0005959727841649045, + "loss": 2.7465, + "step": 14979 + }, + { + "epoch": 0.4442072176259526, + "grad_norm": 0.14085827767848969, + "learning_rate": 0.0005959266076648845, + "loss": 2.656, + "step": 14980 + }, + { + "epoch": 0.4442368709782641, + "grad_norm": 0.1274341493844986, + "learning_rate": 0.000595880430315418, + "loss": 2.7111, + "step": 14981 + }, + { + "epoch": 0.44426652433057556, + "grad_norm": 0.13066311180591583, + "learning_rate": 0.0005958342521169137, + "loss": 2.7243, + "step": 14982 + }, + { + "epoch": 0.44429617768288704, + "grad_norm": 0.13421630859375, + "learning_rate": 0.0005957880730697807, + "loss": 2.6937, + "step": 14983 + }, + { + "epoch": 0.4443258310351985, + "grad_norm": 0.12145797908306122, + "learning_rate": 0.0005957418931744279, + "loss": 2.6801, + "step": 14984 + }, + { + "epoch": 0.44435548438751, + "grad_norm": 0.1185101866722107, + "learning_rate": 0.0005956957124312642, + "loss": 2.6816, + "step": 14985 + }, + { + "epoch": 0.44438513773982147, + "grad_norm": 0.10338234901428223, + "learning_rate": 0.0005956495308406984, + "loss": 2.6934, + "step": 14986 + }, + { + "epoch": 0.44441479109213294, + "grad_norm": 0.11356990784406662, + "learning_rate": 0.0005956033484031396, + "loss": 2.6839, + "step": 14987 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.11852099746465683, + "learning_rate": 0.0005955571651189968, + "loss": 2.7159, + "step": 14988 + }, + { + "epoch": 0.4444740977967559, + "grad_norm": 0.10809007287025452, + "learning_rate": 0.0005955109809886789, + "loss": 2.6844, + "step": 14989 + }, + { + "epoch": 0.4445037511490674, + "grad_norm": 0.14238791167736053, + "learning_rate": 0.000595464796012595, + "loss": 2.7122, + "step": 14990 + }, + { + "epoch": 0.4445334045013789, + "grad_norm": 0.12569360435009003, + "learning_rate": 0.0005954186101911538, + "loss": 2.7083, + "step": 14991 + }, + { + "epoch": 0.4445630578536904, + "grad_norm": 0.11734399199485779, + "learning_rate": 0.0005953724235247645, + "loss": 2.6761, + "step": 14992 + }, + { + "epoch": 0.44459271120600186, + "grad_norm": 0.14688464999198914, + "learning_rate": 0.0005953262360138359, + "loss": 2.7019, + "step": 14993 + }, + { + "epoch": 0.44462236455831333, + "grad_norm": 0.1542433351278305, + "learning_rate": 0.0005952800476587772, + "loss": 2.6886, + "step": 14994 + }, + { + "epoch": 0.4446520179106248, + "grad_norm": 0.1453598141670227, + "learning_rate": 0.0005952338584599973, + "loss": 2.6869, + "step": 14995 + }, + { + "epoch": 0.4446816712629363, + "grad_norm": 0.13813312351703644, + "learning_rate": 0.0005951876684179054, + "loss": 2.719, + "step": 14996 + }, + { + "epoch": 0.44471132461524776, + "grad_norm": 0.14266394078731537, + "learning_rate": 0.0005951414775329102, + "loss": 2.6697, + "step": 14997 + }, + { + "epoch": 0.44474097796755924, + "grad_norm": 0.1387433111667633, + "learning_rate": 0.0005950952858054209, + "loss": 2.6727, + "step": 14998 + }, + { + "epoch": 0.4447706313198707, + "grad_norm": 0.1371363401412964, + "learning_rate": 0.0005950490932358464, + "loss": 2.6916, + "step": 14999 + }, + { + "epoch": 0.4448002846721822, + "grad_norm": 0.11839868128299713, + "learning_rate": 0.0005950028998245961, + "loss": 2.6803, + "step": 15000 + }, + { + "epoch": 0.44482993802449367, + "grad_norm": 0.11978676915168762, + "learning_rate": 0.0005949567055720788, + "loss": 2.7237, + "step": 15001 + }, + { + "epoch": 0.44485959137680514, + "grad_norm": 0.11006128042936325, + "learning_rate": 0.0005949105104787035, + "loss": 2.6618, + "step": 15002 + }, + { + "epoch": 0.4448892447291166, + "grad_norm": 0.11282137036323547, + "learning_rate": 0.0005948643145448794, + "loss": 2.7017, + "step": 15003 + }, + { + "epoch": 0.4449188980814281, + "grad_norm": 0.12236032634973526, + "learning_rate": 0.0005948181177710154, + "loss": 2.7072, + "step": 15004 + }, + { + "epoch": 0.44494855143373957, + "grad_norm": 0.12113878130912781, + "learning_rate": 0.0005947719201575206, + "loss": 2.6828, + "step": 15005 + }, + { + "epoch": 0.44497820478605105, + "grad_norm": 0.10757718235254288, + "learning_rate": 0.0005947257217048044, + "loss": 2.7072, + "step": 15006 + }, + { + "epoch": 0.4450078581383625, + "grad_norm": 0.10768810659646988, + "learning_rate": 0.0005946795224132755, + "loss": 2.6954, + "step": 15007 + }, + { + "epoch": 0.445037511490674, + "grad_norm": 0.11239506304264069, + "learning_rate": 0.0005946333222833433, + "loss": 2.6744, + "step": 15008 + }, + { + "epoch": 0.4450671648429855, + "grad_norm": 0.09933900833129883, + "learning_rate": 0.0005945871213154167, + "loss": 2.7177, + "step": 15009 + }, + { + "epoch": 0.445096818195297, + "grad_norm": 0.12219429016113281, + "learning_rate": 0.0005945409195099049, + "loss": 2.7145, + "step": 15010 + }, + { + "epoch": 0.4451264715476085, + "grad_norm": 0.12146388739347458, + "learning_rate": 0.0005944947168672172, + "loss": 2.6806, + "step": 15011 + }, + { + "epoch": 0.44515612489991996, + "grad_norm": 0.13707689940929413, + "learning_rate": 0.0005944485133877624, + "loss": 2.7278, + "step": 15012 + }, + { + "epoch": 0.44518577825223143, + "grad_norm": 0.143843874335289, + "learning_rate": 0.0005944023090719498, + "loss": 2.7394, + "step": 15013 + }, + { + "epoch": 0.4452154316045429, + "grad_norm": 0.16662943363189697, + "learning_rate": 0.0005943561039201885, + "loss": 2.7267, + "step": 15014 + }, + { + "epoch": 0.4452450849568544, + "grad_norm": 0.18956752121448517, + "learning_rate": 0.0005943098979328878, + "loss": 2.6969, + "step": 15015 + }, + { + "epoch": 0.44527473830916586, + "grad_norm": 0.15680237114429474, + "learning_rate": 0.0005942636911104569, + "loss": 2.7175, + "step": 15016 + }, + { + "epoch": 0.44530439166147734, + "grad_norm": 0.14289693534374237, + "learning_rate": 0.0005942174834533046, + "loss": 2.7145, + "step": 15017 + }, + { + "epoch": 0.4453340450137888, + "grad_norm": 0.16031044721603394, + "learning_rate": 0.0005941712749618404, + "loss": 2.6971, + "step": 15018 + }, + { + "epoch": 0.4453636983661003, + "grad_norm": 0.15543951094150543, + "learning_rate": 0.0005941250656364734, + "loss": 2.6902, + "step": 15019 + }, + { + "epoch": 0.44539335171841177, + "grad_norm": 0.15126168727874756, + "learning_rate": 0.0005940788554776128, + "loss": 2.6984, + "step": 15020 + }, + { + "epoch": 0.44542300507072324, + "grad_norm": 0.1379512995481491, + "learning_rate": 0.0005940326444856677, + "loss": 2.7077, + "step": 15021 + }, + { + "epoch": 0.4454526584230347, + "grad_norm": 0.13801899552345276, + "learning_rate": 0.0005939864326610475, + "loss": 2.7108, + "step": 15022 + }, + { + "epoch": 0.4454823117753462, + "grad_norm": 0.12460941821336746, + "learning_rate": 0.0005939402200041614, + "loss": 2.6904, + "step": 15023 + }, + { + "epoch": 0.44551196512765767, + "grad_norm": 0.14105267822742462, + "learning_rate": 0.0005938940065154185, + "loss": 2.667, + "step": 15024 + }, + { + "epoch": 0.44554161847996915, + "grad_norm": 0.1369011253118515, + "learning_rate": 0.0005938477921952278, + "loss": 2.6848, + "step": 15025 + }, + { + "epoch": 0.4455712718322806, + "grad_norm": 0.1314428448677063, + "learning_rate": 0.000593801577043999, + "loss": 2.6956, + "step": 15026 + }, + { + "epoch": 0.4456009251845921, + "grad_norm": 0.12639029324054718, + "learning_rate": 0.000593755361062141, + "loss": 2.7025, + "step": 15027 + }, + { + "epoch": 0.4456305785369036, + "grad_norm": 0.12145485728979111, + "learning_rate": 0.0005937091442500633, + "loss": 2.7096, + "step": 15028 + }, + { + "epoch": 0.44566023188921505, + "grad_norm": 0.12104518711566925, + "learning_rate": 0.0005936629266081751, + "loss": 2.6698, + "step": 15029 + }, + { + "epoch": 0.44568988524152653, + "grad_norm": 0.12384415417909622, + "learning_rate": 0.0005936167081368855, + "loss": 2.6868, + "step": 15030 + }, + { + "epoch": 0.44571953859383806, + "grad_norm": 0.12271036207675934, + "learning_rate": 0.0005935704888366038, + "loss": 2.6734, + "step": 15031 + }, + { + "epoch": 0.44574919194614954, + "grad_norm": 0.11198626458644867, + "learning_rate": 0.0005935242687077394, + "loss": 2.6715, + "step": 15032 + }, + { + "epoch": 0.445778845298461, + "grad_norm": 0.10997151583433151, + "learning_rate": 0.0005934780477507017, + "loss": 2.6938, + "step": 15033 + }, + { + "epoch": 0.4458084986507725, + "grad_norm": 0.11871073395013809, + "learning_rate": 0.0005934318259658998, + "loss": 2.7241, + "step": 15034 + }, + { + "epoch": 0.44583815200308397, + "grad_norm": 0.11849186569452286, + "learning_rate": 0.000593385603353743, + "loss": 2.6861, + "step": 15035 + }, + { + "epoch": 0.44586780535539544, + "grad_norm": 0.13375017046928406, + "learning_rate": 0.0005933393799146407, + "loss": 2.7054, + "step": 15036 + }, + { + "epoch": 0.4458974587077069, + "grad_norm": 0.10464534908533096, + "learning_rate": 0.0005932931556490021, + "loss": 2.7002, + "step": 15037 + }, + { + "epoch": 0.4459271120600184, + "grad_norm": 0.11450327932834625, + "learning_rate": 0.0005932469305572365, + "loss": 2.6936, + "step": 15038 + }, + { + "epoch": 0.44595676541232987, + "grad_norm": 0.13404618203639984, + "learning_rate": 0.0005932007046397536, + "loss": 2.7168, + "step": 15039 + }, + { + "epoch": 0.44598641876464135, + "grad_norm": 0.11482535302639008, + "learning_rate": 0.0005931544778969622, + "loss": 2.7012, + "step": 15040 + }, + { + "epoch": 0.4460160721169528, + "grad_norm": 0.11247675120830536, + "learning_rate": 0.0005931082503292719, + "loss": 2.6484, + "step": 15041 + }, + { + "epoch": 0.4460457254692643, + "grad_norm": 0.10985364764928818, + "learning_rate": 0.0005930620219370922, + "loss": 2.6688, + "step": 15042 + }, + { + "epoch": 0.4460753788215758, + "grad_norm": 0.10619907081127167, + "learning_rate": 0.0005930157927208323, + "loss": 2.7186, + "step": 15043 + }, + { + "epoch": 0.44610503217388725, + "grad_norm": 0.1058485358953476, + "learning_rate": 0.0005929695626809016, + "loss": 2.6836, + "step": 15044 + }, + { + "epoch": 0.4461346855261987, + "grad_norm": 0.1055215522646904, + "learning_rate": 0.0005929233318177095, + "loss": 2.7272, + "step": 15045 + }, + { + "epoch": 0.4461643388785102, + "grad_norm": 0.10171598941087723, + "learning_rate": 0.0005928771001316653, + "loss": 2.7047, + "step": 15046 + }, + { + "epoch": 0.4461939922308217, + "grad_norm": 0.10234741866588593, + "learning_rate": 0.0005928308676231784, + "loss": 2.6854, + "step": 15047 + }, + { + "epoch": 0.44622364558313315, + "grad_norm": 0.10653091222047806, + "learning_rate": 0.0005927846342926582, + "loss": 2.7232, + "step": 15048 + }, + { + "epoch": 0.44625329893544463, + "grad_norm": 0.10696864873170853, + "learning_rate": 0.0005927384001405141, + "loss": 2.6913, + "step": 15049 + }, + { + "epoch": 0.4462829522877561, + "grad_norm": 0.11415541917085648, + "learning_rate": 0.0005926921651671557, + "loss": 2.702, + "step": 15050 + }, + { + "epoch": 0.4463126056400676, + "grad_norm": 0.12474745512008667, + "learning_rate": 0.0005926459293729922, + "loss": 2.7139, + "step": 15051 + }, + { + "epoch": 0.4463422589923791, + "grad_norm": 0.14034365117549896, + "learning_rate": 0.0005925996927584332, + "loss": 2.695, + "step": 15052 + }, + { + "epoch": 0.4463719123446906, + "grad_norm": 0.13845452666282654, + "learning_rate": 0.000592553455323888, + "loss": 2.6765, + "step": 15053 + }, + { + "epoch": 0.44640156569700207, + "grad_norm": 0.14749334752559662, + "learning_rate": 0.0005925072170697658, + "loss": 2.7123, + "step": 15054 + }, + { + "epoch": 0.44643121904931354, + "grad_norm": 0.13289709389209747, + "learning_rate": 0.0005924609779964766, + "loss": 2.7003, + "step": 15055 + }, + { + "epoch": 0.446460872401625, + "grad_norm": 0.12932920455932617, + "learning_rate": 0.0005924147381044296, + "loss": 2.6941, + "step": 15056 + }, + { + "epoch": 0.4464905257539365, + "grad_norm": 0.14650897681713104, + "learning_rate": 0.000592368497394034, + "loss": 2.7261, + "step": 15057 + }, + { + "epoch": 0.44652017910624797, + "grad_norm": 0.15978118777275085, + "learning_rate": 0.0005923222558656996, + "loss": 2.6639, + "step": 15058 + }, + { + "epoch": 0.44654983245855945, + "grad_norm": 0.1465444564819336, + "learning_rate": 0.0005922760135198357, + "loss": 2.7095, + "step": 15059 + }, + { + "epoch": 0.4465794858108709, + "grad_norm": 0.12300363183021545, + "learning_rate": 0.000592229770356852, + "loss": 2.7117, + "step": 15060 + }, + { + "epoch": 0.4466091391631824, + "grad_norm": 0.1310863047838211, + "learning_rate": 0.0005921835263771578, + "loss": 2.7029, + "step": 15061 + }, + { + "epoch": 0.4466387925154939, + "grad_norm": 0.14639784395694733, + "learning_rate": 0.0005921372815811628, + "loss": 2.7044, + "step": 15062 + }, + { + "epoch": 0.44666844586780535, + "grad_norm": 0.13189339637756348, + "learning_rate": 0.000592091035969276, + "loss": 2.6897, + "step": 15063 + }, + { + "epoch": 0.44669809922011683, + "grad_norm": 0.14549583196640015, + "learning_rate": 0.0005920447895419076, + "loss": 2.6535, + "step": 15064 + }, + { + "epoch": 0.4467277525724283, + "grad_norm": 0.1491217017173767, + "learning_rate": 0.0005919985422994666, + "loss": 2.7121, + "step": 15065 + }, + { + "epoch": 0.4467574059247398, + "grad_norm": 0.1239793449640274, + "learning_rate": 0.0005919522942423628, + "loss": 2.7119, + "step": 15066 + }, + { + "epoch": 0.44678705927705126, + "grad_norm": 0.11268675327301025, + "learning_rate": 0.0005919060453710057, + "loss": 2.6942, + "step": 15067 + }, + { + "epoch": 0.44681671262936273, + "grad_norm": 0.13267631828784943, + "learning_rate": 0.0005918597956858047, + "loss": 2.7165, + "step": 15068 + }, + { + "epoch": 0.4468463659816742, + "grad_norm": 0.11886676400899887, + "learning_rate": 0.0005918135451871696, + "loss": 2.7115, + "step": 15069 + }, + { + "epoch": 0.4468760193339857, + "grad_norm": 0.12734274566173553, + "learning_rate": 0.0005917672938755094, + "loss": 2.6974, + "step": 15070 + }, + { + "epoch": 0.44690567268629716, + "grad_norm": 0.12496178597211838, + "learning_rate": 0.0005917210417512344, + "loss": 2.7051, + "step": 15071 + }, + { + "epoch": 0.44693532603860864, + "grad_norm": 0.1168091744184494, + "learning_rate": 0.0005916747888147539, + "loss": 2.7013, + "step": 15072 + }, + { + "epoch": 0.44696497939092017, + "grad_norm": 0.13266713917255402, + "learning_rate": 0.0005916285350664772, + "loss": 2.7063, + "step": 15073 + }, + { + "epoch": 0.44699463274323165, + "grad_norm": 0.13898621499538422, + "learning_rate": 0.0005915822805068142, + "loss": 2.7389, + "step": 15074 + }, + { + "epoch": 0.4470242860955431, + "grad_norm": 0.12219838052988052, + "learning_rate": 0.0005915360251361743, + "loss": 2.6938, + "step": 15075 + }, + { + "epoch": 0.4470539394478546, + "grad_norm": 0.12442207336425781, + "learning_rate": 0.0005914897689549672, + "loss": 2.6624, + "step": 15076 + }, + { + "epoch": 0.4470835928001661, + "grad_norm": 0.12221554666757584, + "learning_rate": 0.0005914435119636026, + "loss": 2.6841, + "step": 15077 + }, + { + "epoch": 0.44711324615247755, + "grad_norm": 0.11879755556583405, + "learning_rate": 0.0005913972541624899, + "loss": 2.7056, + "step": 15078 + }, + { + "epoch": 0.447142899504789, + "grad_norm": 0.12897035479545593, + "learning_rate": 0.0005913509955520388, + "loss": 2.6835, + "step": 15079 + }, + { + "epoch": 0.4471725528571005, + "grad_norm": 0.13321281969547272, + "learning_rate": 0.0005913047361326591, + "loss": 2.6779, + "step": 15080 + }, + { + "epoch": 0.447202206209412, + "grad_norm": 0.13734368979930878, + "learning_rate": 0.0005912584759047603, + "loss": 2.6668, + "step": 15081 + }, + { + "epoch": 0.44723185956172345, + "grad_norm": 0.12880510091781616, + "learning_rate": 0.0005912122148687518, + "loss": 2.6996, + "step": 15082 + }, + { + "epoch": 0.44726151291403493, + "grad_norm": 0.11390876024961472, + "learning_rate": 0.0005911659530250436, + "loss": 2.7197, + "step": 15083 + }, + { + "epoch": 0.4472911662663464, + "grad_norm": 0.1246863603591919, + "learning_rate": 0.0005911196903740453, + "loss": 2.6834, + "step": 15084 + }, + { + "epoch": 0.4473208196186579, + "grad_norm": 0.13414156436920166, + "learning_rate": 0.0005910734269161664, + "loss": 2.6818, + "step": 15085 + }, + { + "epoch": 0.44735047297096936, + "grad_norm": 0.12557415664196014, + "learning_rate": 0.0005910271626518168, + "loss": 2.6725, + "step": 15086 + }, + { + "epoch": 0.44738012632328084, + "grad_norm": 0.10588064789772034, + "learning_rate": 0.000590980897581406, + "loss": 2.7258, + "step": 15087 + }, + { + "epoch": 0.4474097796755923, + "grad_norm": 0.1261693388223648, + "learning_rate": 0.0005909346317053436, + "loss": 2.7135, + "step": 15088 + }, + { + "epoch": 0.4474394330279038, + "grad_norm": 0.14133745431900024, + "learning_rate": 0.0005908883650240396, + "loss": 2.7125, + "step": 15089 + }, + { + "epoch": 0.44746908638021526, + "grad_norm": 0.12507687509059906, + "learning_rate": 0.0005908420975379034, + "loss": 2.6855, + "step": 15090 + }, + { + "epoch": 0.44749873973252674, + "grad_norm": 0.11488121747970581, + "learning_rate": 0.000590795829247345, + "loss": 2.7204, + "step": 15091 + }, + { + "epoch": 0.4475283930848382, + "grad_norm": 0.14176785945892334, + "learning_rate": 0.0005907495601527738, + "loss": 2.6912, + "step": 15092 + }, + { + "epoch": 0.4475580464371497, + "grad_norm": 0.15983040630817413, + "learning_rate": 0.0005907032902545997, + "loss": 2.6843, + "step": 15093 + }, + { + "epoch": 0.4475876997894612, + "grad_norm": 0.18470877408981323, + "learning_rate": 0.0005906570195532325, + "loss": 2.6619, + "step": 15094 + }, + { + "epoch": 0.4476173531417727, + "grad_norm": 0.14743176102638245, + "learning_rate": 0.0005906107480490818, + "loss": 2.7299, + "step": 15095 + }, + { + "epoch": 0.4476470064940842, + "grad_norm": 0.1442335546016693, + "learning_rate": 0.0005905644757425571, + "loss": 2.7277, + "step": 15096 + }, + { + "epoch": 0.44767665984639565, + "grad_norm": 0.16579361259937286, + "learning_rate": 0.0005905182026340688, + "loss": 2.6899, + "step": 15097 + }, + { + "epoch": 0.44770631319870713, + "grad_norm": 0.15068285167217255, + "learning_rate": 0.000590471928724026, + "loss": 2.723, + "step": 15098 + }, + { + "epoch": 0.4477359665510186, + "grad_norm": 0.14183937013149261, + "learning_rate": 0.0005904256540128389, + "loss": 2.7007, + "step": 15099 + }, + { + "epoch": 0.4477656199033301, + "grad_norm": 0.1294754445552826, + "learning_rate": 0.0005903793785009172, + "loss": 2.6873, + "step": 15100 + }, + { + "epoch": 0.44779527325564156, + "grad_norm": 0.13404417037963867, + "learning_rate": 0.0005903331021886705, + "loss": 2.7145, + "step": 15101 + }, + { + "epoch": 0.44782492660795303, + "grad_norm": 0.13672082126140594, + "learning_rate": 0.0005902868250765084, + "loss": 2.7034, + "step": 15102 + }, + { + "epoch": 0.4478545799602645, + "grad_norm": 0.13311907649040222, + "learning_rate": 0.0005902405471648412, + "loss": 2.6802, + "step": 15103 + }, + { + "epoch": 0.447884233312576, + "grad_norm": 0.13074786961078644, + "learning_rate": 0.0005901942684540785, + "loss": 2.6872, + "step": 15104 + }, + { + "epoch": 0.44791388666488746, + "grad_norm": 0.12567603588104248, + "learning_rate": 0.0005901479889446301, + "loss": 2.6894, + "step": 15105 + }, + { + "epoch": 0.44794354001719894, + "grad_norm": 0.14450418949127197, + "learning_rate": 0.0005901017086369057, + "loss": 2.741, + "step": 15106 + }, + { + "epoch": 0.4479731933695104, + "grad_norm": 0.12562832236289978, + "learning_rate": 0.0005900554275313153, + "loss": 2.6905, + "step": 15107 + }, + { + "epoch": 0.4480028467218219, + "grad_norm": 0.11780579388141632, + "learning_rate": 0.0005900091456282685, + "loss": 2.7072, + "step": 15108 + }, + { + "epoch": 0.44803250007413337, + "grad_norm": 0.12353470176458359, + "learning_rate": 0.0005899628629281753, + "loss": 2.7198, + "step": 15109 + }, + { + "epoch": 0.44806215342644484, + "grad_norm": 0.1193603128194809, + "learning_rate": 0.0005899165794314456, + "loss": 2.6902, + "step": 15110 + }, + { + "epoch": 0.4480918067787563, + "grad_norm": 0.12332431226968765, + "learning_rate": 0.000589870295138489, + "loss": 2.7159, + "step": 15111 + }, + { + "epoch": 0.4481214601310678, + "grad_norm": 0.11291346698999405, + "learning_rate": 0.0005898240100497157, + "loss": 2.703, + "step": 15112 + }, + { + "epoch": 0.44815111348337927, + "grad_norm": 0.11303427815437317, + "learning_rate": 0.0005897777241655353, + "loss": 2.6938, + "step": 15113 + }, + { + "epoch": 0.4481807668356908, + "grad_norm": 0.10816039890050888, + "learning_rate": 0.0005897314374863576, + "loss": 2.6836, + "step": 15114 + }, + { + "epoch": 0.4482104201880023, + "grad_norm": 0.12176726013422012, + "learning_rate": 0.0005896851500125927, + "loss": 2.6906, + "step": 15115 + }, + { + "epoch": 0.44824007354031375, + "grad_norm": 0.11938226222991943, + "learning_rate": 0.0005896388617446504, + "loss": 2.6807, + "step": 15116 + }, + { + "epoch": 0.44826972689262523, + "grad_norm": 0.12516753375530243, + "learning_rate": 0.0005895925726829407, + "loss": 2.7404, + "step": 15117 + }, + { + "epoch": 0.4482993802449367, + "grad_norm": 0.14387567341327667, + "learning_rate": 0.0005895462828278732, + "loss": 2.7098, + "step": 15118 + }, + { + "epoch": 0.4483290335972482, + "grad_norm": 0.1383565217256546, + "learning_rate": 0.0005894999921798582, + "loss": 2.6947, + "step": 15119 + }, + { + "epoch": 0.44835868694955966, + "grad_norm": 0.13219748437404633, + "learning_rate": 0.0005894537007393052, + "loss": 2.6934, + "step": 15120 + }, + { + "epoch": 0.44838834030187114, + "grad_norm": 0.12415970861911774, + "learning_rate": 0.0005894074085066246, + "loss": 2.687, + "step": 15121 + }, + { + "epoch": 0.4484179936541826, + "grad_norm": 0.12325271219015121, + "learning_rate": 0.0005893611154822258, + "loss": 2.668, + "step": 15122 + }, + { + "epoch": 0.4484476470064941, + "grad_norm": 0.13695800304412842, + "learning_rate": 0.0005893148216665191, + "loss": 2.6879, + "step": 15123 + }, + { + "epoch": 0.44847730035880556, + "grad_norm": 0.1336749941110611, + "learning_rate": 0.0005892685270599143, + "loss": 2.6873, + "step": 15124 + }, + { + "epoch": 0.44850695371111704, + "grad_norm": 0.10811935365200043, + "learning_rate": 0.0005892222316628214, + "loss": 2.7069, + "step": 15125 + }, + { + "epoch": 0.4485366070634285, + "grad_norm": 0.1227637529373169, + "learning_rate": 0.0005891759354756503, + "loss": 2.7134, + "step": 15126 + }, + { + "epoch": 0.44856626041574, + "grad_norm": 0.1272011548280716, + "learning_rate": 0.0005891296384988111, + "loss": 2.7132, + "step": 15127 + }, + { + "epoch": 0.44859591376805147, + "grad_norm": 0.11902444064617157, + "learning_rate": 0.0005890833407327134, + "loss": 2.6656, + "step": 15128 + }, + { + "epoch": 0.44862556712036294, + "grad_norm": 0.11068017780780792, + "learning_rate": 0.0005890370421777675, + "loss": 2.6984, + "step": 15129 + }, + { + "epoch": 0.4486552204726744, + "grad_norm": 0.11177652329206467, + "learning_rate": 0.0005889907428343834, + "loss": 2.7257, + "step": 15130 + }, + { + "epoch": 0.4486848738249859, + "grad_norm": 0.12517127394676208, + "learning_rate": 0.000588944442702971, + "loss": 2.6697, + "step": 15131 + }, + { + "epoch": 0.4487145271772974, + "grad_norm": 0.12334506213665009, + "learning_rate": 0.0005888981417839403, + "loss": 2.6776, + "step": 15132 + }, + { + "epoch": 0.44874418052960885, + "grad_norm": 0.12112115323543549, + "learning_rate": 0.0005888518400777012, + "loss": 2.6997, + "step": 15133 + }, + { + "epoch": 0.4487738338819203, + "grad_norm": 0.11317481100559235, + "learning_rate": 0.0005888055375846639, + "loss": 2.7114, + "step": 15134 + }, + { + "epoch": 0.44880348723423186, + "grad_norm": 0.12450491636991501, + "learning_rate": 0.0005887592343052382, + "loss": 2.6652, + "step": 15135 + }, + { + "epoch": 0.44883314058654333, + "grad_norm": 0.12604866921901703, + "learning_rate": 0.0005887129302398343, + "loss": 2.7147, + "step": 15136 + }, + { + "epoch": 0.4488627939388548, + "grad_norm": 0.13366667926311493, + "learning_rate": 0.0005886666253888622, + "loss": 2.7148, + "step": 15137 + }, + { + "epoch": 0.4488924472911663, + "grad_norm": 0.11234920471906662, + "learning_rate": 0.000588620319752732, + "loss": 2.7184, + "step": 15138 + }, + { + "epoch": 0.44892210064347776, + "grad_norm": 0.12092702090740204, + "learning_rate": 0.0005885740133318535, + "loss": 2.6821, + "step": 15139 + }, + { + "epoch": 0.44895175399578924, + "grad_norm": 0.1212923675775528, + "learning_rate": 0.000588527706126637, + "loss": 2.6962, + "step": 15140 + }, + { + "epoch": 0.4489814073481007, + "grad_norm": 0.11912443488836288, + "learning_rate": 0.0005884813981374923, + "loss": 2.6944, + "step": 15141 + }, + { + "epoch": 0.4490110607004122, + "grad_norm": 0.1307583451271057, + "learning_rate": 0.0005884350893648298, + "loss": 2.6837, + "step": 15142 + }, + { + "epoch": 0.44904071405272367, + "grad_norm": 0.12302014231681824, + "learning_rate": 0.0005883887798090595, + "loss": 2.6754, + "step": 15143 + }, + { + "epoch": 0.44907036740503514, + "grad_norm": 0.1467907875776291, + "learning_rate": 0.0005883424694705913, + "loss": 2.7012, + "step": 15144 + }, + { + "epoch": 0.4491000207573466, + "grad_norm": 0.13045158982276917, + "learning_rate": 0.0005882961583498353, + "loss": 2.7143, + "step": 15145 + }, + { + "epoch": 0.4491296741096581, + "grad_norm": 0.15333695709705353, + "learning_rate": 0.0005882498464472017, + "loss": 2.6981, + "step": 15146 + }, + { + "epoch": 0.44915932746196957, + "grad_norm": 0.14704594016075134, + "learning_rate": 0.0005882035337631004, + "loss": 2.6821, + "step": 15147 + }, + { + "epoch": 0.44918898081428105, + "grad_norm": 0.14557796716690063, + "learning_rate": 0.000588157220297942, + "loss": 2.6888, + "step": 15148 + }, + { + "epoch": 0.4492186341665925, + "grad_norm": 0.15334628522396088, + "learning_rate": 0.0005881109060521362, + "loss": 2.6887, + "step": 15149 + }, + { + "epoch": 0.449248287518904, + "grad_norm": 0.16096244752407074, + "learning_rate": 0.000588064591026093, + "loss": 2.7086, + "step": 15150 + }, + { + "epoch": 0.4492779408712155, + "grad_norm": 0.17423681914806366, + "learning_rate": 0.0005880182752202227, + "loss": 2.709, + "step": 15151 + }, + { + "epoch": 0.44930759422352695, + "grad_norm": 0.1806858777999878, + "learning_rate": 0.0005879719586349357, + "loss": 2.7287, + "step": 15152 + }, + { + "epoch": 0.4493372475758384, + "grad_norm": 0.16343095898628235, + "learning_rate": 0.0005879256412706418, + "loss": 2.6851, + "step": 15153 + }, + { + "epoch": 0.4493669009281499, + "grad_norm": 0.1396016627550125, + "learning_rate": 0.0005878793231277513, + "loss": 2.7116, + "step": 15154 + }, + { + "epoch": 0.4493965542804614, + "grad_norm": 0.14747391641139984, + "learning_rate": 0.0005878330042066742, + "loss": 2.6911, + "step": 15155 + }, + { + "epoch": 0.4494262076327729, + "grad_norm": 0.1575438678264618, + "learning_rate": 0.000587786684507821, + "loss": 2.7032, + "step": 15156 + }, + { + "epoch": 0.4494558609850844, + "grad_norm": 0.17210154235363007, + "learning_rate": 0.0005877403640316013, + "loss": 2.7025, + "step": 15157 + }, + { + "epoch": 0.44948551433739586, + "grad_norm": 0.13369140028953552, + "learning_rate": 0.0005876940427784259, + "loss": 2.7043, + "step": 15158 + }, + { + "epoch": 0.44951516768970734, + "grad_norm": 0.12975654006004333, + "learning_rate": 0.0005876477207487045, + "loss": 2.6862, + "step": 15159 + }, + { + "epoch": 0.4495448210420188, + "grad_norm": 0.151548832654953, + "learning_rate": 0.0005876013979428476, + "loss": 2.6948, + "step": 15160 + }, + { + "epoch": 0.4495744743943303, + "grad_norm": 0.15798209607601166, + "learning_rate": 0.0005875550743612653, + "loss": 2.7167, + "step": 15161 + }, + { + "epoch": 0.44960412774664177, + "grad_norm": 0.13901451230049133, + "learning_rate": 0.0005875087500043678, + "loss": 2.7053, + "step": 15162 + }, + { + "epoch": 0.44963378109895324, + "grad_norm": 0.14066793024539948, + "learning_rate": 0.0005874624248725653, + "loss": 2.6718, + "step": 15163 + }, + { + "epoch": 0.4496634344512647, + "grad_norm": 0.14314281940460205, + "learning_rate": 0.0005874160989662679, + "loss": 2.7065, + "step": 15164 + }, + { + "epoch": 0.4496930878035762, + "grad_norm": 0.13346359133720398, + "learning_rate": 0.0005873697722858862, + "loss": 2.7219, + "step": 15165 + }, + { + "epoch": 0.4497227411558877, + "grad_norm": 0.1434287577867508, + "learning_rate": 0.0005873234448318298, + "loss": 2.7015, + "step": 15166 + }, + { + "epoch": 0.44975239450819915, + "grad_norm": 0.14877650141716003, + "learning_rate": 0.0005872771166045094, + "loss": 2.7145, + "step": 15167 + }, + { + "epoch": 0.4497820478605106, + "grad_norm": 0.131316140294075, + "learning_rate": 0.0005872307876043354, + "loss": 2.7152, + "step": 15168 + }, + { + "epoch": 0.4498117012128221, + "grad_norm": 0.13279490172863007, + "learning_rate": 0.0005871844578317176, + "loss": 2.6932, + "step": 15169 + }, + { + "epoch": 0.4498413545651336, + "grad_norm": 0.13375906646251678, + "learning_rate": 0.0005871381272870666, + "loss": 2.696, + "step": 15170 + }, + { + "epoch": 0.44987100791744505, + "grad_norm": 0.10918663442134857, + "learning_rate": 0.0005870917959707924, + "loss": 2.6725, + "step": 15171 + }, + { + "epoch": 0.44990066126975653, + "grad_norm": 0.1274881213903427, + "learning_rate": 0.0005870454638833054, + "loss": 2.6915, + "step": 15172 + }, + { + "epoch": 0.449930314622068, + "grad_norm": 0.11572116613388062, + "learning_rate": 0.0005869991310250158, + "loss": 2.6854, + "step": 15173 + }, + { + "epoch": 0.4499599679743795, + "grad_norm": 0.11167740076780319, + "learning_rate": 0.0005869527973963341, + "loss": 2.6862, + "step": 15174 + }, + { + "epoch": 0.44998962132669096, + "grad_norm": 0.11901998519897461, + "learning_rate": 0.0005869064629976704, + "loss": 2.6849, + "step": 15175 + }, + { + "epoch": 0.45001927467900243, + "grad_norm": 0.11135652661323547, + "learning_rate": 0.0005868601278294352, + "loss": 2.6982, + "step": 15176 + }, + { + "epoch": 0.45004892803131397, + "grad_norm": 0.12778927385807037, + "learning_rate": 0.0005868137918920384, + "loss": 2.6866, + "step": 15177 + }, + { + "epoch": 0.45007858138362544, + "grad_norm": 0.12448592483997345, + "learning_rate": 0.0005867674551858908, + "loss": 2.6922, + "step": 15178 + }, + { + "epoch": 0.4501082347359369, + "grad_norm": 0.11975977569818497, + "learning_rate": 0.0005867211177114024, + "loss": 2.6992, + "step": 15179 + }, + { + "epoch": 0.4501378880882484, + "grad_norm": 0.1234392449259758, + "learning_rate": 0.0005866747794689835, + "loss": 2.6946, + "step": 15180 + }, + { + "epoch": 0.45016754144055987, + "grad_norm": 0.11505566537380219, + "learning_rate": 0.0005866284404590447, + "loss": 2.7215, + "step": 15181 + }, + { + "epoch": 0.45019719479287135, + "grad_norm": 0.13099730014801025, + "learning_rate": 0.0005865821006819963, + "loss": 2.7528, + "step": 15182 + }, + { + "epoch": 0.4502268481451828, + "grad_norm": 0.11240706592798233, + "learning_rate": 0.0005865357601382483, + "loss": 2.677, + "step": 15183 + }, + { + "epoch": 0.4502565014974943, + "grad_norm": 0.11811754107475281, + "learning_rate": 0.0005864894188282114, + "loss": 2.6996, + "step": 15184 + }, + { + "epoch": 0.4502861548498058, + "grad_norm": 0.12182046473026276, + "learning_rate": 0.0005864430767522958, + "loss": 2.6999, + "step": 15185 + }, + { + "epoch": 0.45031580820211725, + "grad_norm": 0.12309897691011429, + "learning_rate": 0.000586396733910912, + "loss": 2.6845, + "step": 15186 + }, + { + "epoch": 0.4503454615544287, + "grad_norm": 0.12536375224590302, + "learning_rate": 0.0005863503903044701, + "loss": 2.6893, + "step": 15187 + }, + { + "epoch": 0.4503751149067402, + "grad_norm": 0.11451449990272522, + "learning_rate": 0.0005863040459333809, + "loss": 2.666, + "step": 15188 + }, + { + "epoch": 0.4504047682590517, + "grad_norm": 0.10593856871128082, + "learning_rate": 0.0005862577007980544, + "loss": 2.6897, + "step": 15189 + }, + { + "epoch": 0.45043442161136316, + "grad_norm": 0.1133338063955307, + "learning_rate": 0.0005862113548989012, + "loss": 2.6992, + "step": 15190 + }, + { + "epoch": 0.45046407496367463, + "grad_norm": 0.11131835728883743, + "learning_rate": 0.0005861650082363317, + "loss": 2.6955, + "step": 15191 + }, + { + "epoch": 0.4504937283159861, + "grad_norm": 0.12920509278774261, + "learning_rate": 0.0005861186608107562, + "loss": 2.7323, + "step": 15192 + }, + { + "epoch": 0.4505233816682976, + "grad_norm": 0.13297389447689056, + "learning_rate": 0.0005860723126225854, + "loss": 2.6917, + "step": 15193 + }, + { + "epoch": 0.45055303502060906, + "grad_norm": 0.13771316409111023, + "learning_rate": 0.0005860259636722291, + "loss": 2.6759, + "step": 15194 + }, + { + "epoch": 0.45058268837292054, + "grad_norm": 0.12069471925497055, + "learning_rate": 0.0005859796139600984, + "loss": 2.6868, + "step": 15195 + }, + { + "epoch": 0.450612341725232, + "grad_norm": 0.11059840768575668, + "learning_rate": 0.0005859332634866034, + "loss": 2.7031, + "step": 15196 + }, + { + "epoch": 0.4506419950775435, + "grad_norm": 0.1345350742340088, + "learning_rate": 0.0005858869122521547, + "loss": 2.6592, + "step": 15197 + }, + { + "epoch": 0.450671648429855, + "grad_norm": 0.13648506999015808, + "learning_rate": 0.0005858405602571626, + "loss": 2.689, + "step": 15198 + }, + { + "epoch": 0.4507013017821665, + "grad_norm": 0.13638001680374146, + "learning_rate": 0.0005857942075020374, + "loss": 2.6825, + "step": 15199 + }, + { + "epoch": 0.450730955134478, + "grad_norm": 0.1278349906206131, + "learning_rate": 0.0005857478539871898, + "loss": 2.7082, + "step": 15200 + }, + { + "epoch": 0.45076060848678945, + "grad_norm": 0.1343628615140915, + "learning_rate": 0.0005857014997130304, + "loss": 2.7103, + "step": 15201 + }, + { + "epoch": 0.4507902618391009, + "grad_norm": 0.12433288246393204, + "learning_rate": 0.0005856551446799695, + "loss": 2.7142, + "step": 15202 + }, + { + "epoch": 0.4508199151914124, + "grad_norm": 0.1135389432311058, + "learning_rate": 0.0005856087888884177, + "loss": 2.68, + "step": 15203 + }, + { + "epoch": 0.4508495685437239, + "grad_norm": 0.12786780297756195, + "learning_rate": 0.0005855624323387853, + "loss": 2.6725, + "step": 15204 + }, + { + "epoch": 0.45087922189603535, + "grad_norm": 0.14648480713367462, + "learning_rate": 0.0005855160750314828, + "loss": 2.7014, + "step": 15205 + }, + { + "epoch": 0.45090887524834683, + "grad_norm": 0.12307187914848328, + "learning_rate": 0.0005854697169669206, + "loss": 2.6616, + "step": 15206 + }, + { + "epoch": 0.4509385286006583, + "grad_norm": 0.12062100321054459, + "learning_rate": 0.0005854233581455096, + "loss": 2.6857, + "step": 15207 + }, + { + "epoch": 0.4509681819529698, + "grad_norm": 0.13045862317085266, + "learning_rate": 0.0005853769985676602, + "loss": 2.7381, + "step": 15208 + }, + { + "epoch": 0.45099783530528126, + "grad_norm": 0.13113053143024445, + "learning_rate": 0.0005853306382337827, + "loss": 2.6755, + "step": 15209 + }, + { + "epoch": 0.45102748865759273, + "grad_norm": 0.13046137988567352, + "learning_rate": 0.0005852842771442877, + "loss": 2.6875, + "step": 15210 + }, + { + "epoch": 0.4510571420099042, + "grad_norm": 0.13415591418743134, + "learning_rate": 0.0005852379152995859, + "loss": 2.7013, + "step": 15211 + }, + { + "epoch": 0.4510867953622157, + "grad_norm": 0.136732816696167, + "learning_rate": 0.0005851915527000875, + "loss": 2.7157, + "step": 15212 + }, + { + "epoch": 0.45111644871452716, + "grad_norm": 0.13554897904396057, + "learning_rate": 0.0005851451893462035, + "loss": 2.7117, + "step": 15213 + }, + { + "epoch": 0.45114610206683864, + "grad_norm": 0.11956052482128143, + "learning_rate": 0.0005850988252383443, + "loss": 2.6713, + "step": 15214 + }, + { + "epoch": 0.4511757554191501, + "grad_norm": 0.12152738124132156, + "learning_rate": 0.0005850524603769201, + "loss": 2.7209, + "step": 15215 + }, + { + "epoch": 0.4512054087714616, + "grad_norm": 0.14139732718467712, + "learning_rate": 0.000585006094762342, + "loss": 2.679, + "step": 15216 + }, + { + "epoch": 0.45123506212377307, + "grad_norm": 0.11259663105010986, + "learning_rate": 0.0005849597283950203, + "loss": 2.7052, + "step": 15217 + }, + { + "epoch": 0.4512647154760846, + "grad_norm": 0.1198209747672081, + "learning_rate": 0.0005849133612753656, + "loss": 2.69, + "step": 15218 + }, + { + "epoch": 0.4512943688283961, + "grad_norm": 0.14945612847805023, + "learning_rate": 0.0005848669934037884, + "loss": 2.7223, + "step": 15219 + }, + { + "epoch": 0.45132402218070755, + "grad_norm": 0.1594494879245758, + "learning_rate": 0.0005848206247806996, + "loss": 2.6951, + "step": 15220 + }, + { + "epoch": 0.451353675533019, + "grad_norm": 0.1425950974225998, + "learning_rate": 0.0005847742554065096, + "loss": 2.6734, + "step": 15221 + }, + { + "epoch": 0.4513833288853305, + "grad_norm": 0.13548927009105682, + "learning_rate": 0.0005847278852816289, + "loss": 2.6879, + "step": 15222 + }, + { + "epoch": 0.451412982237642, + "grad_norm": 0.12232249230146408, + "learning_rate": 0.0005846815144064682, + "loss": 2.6932, + "step": 15223 + }, + { + "epoch": 0.45144263558995346, + "grad_norm": 0.11642807722091675, + "learning_rate": 0.0005846351427814383, + "loss": 2.6946, + "step": 15224 + }, + { + "epoch": 0.45147228894226493, + "grad_norm": 0.12349146604537964, + "learning_rate": 0.0005845887704069495, + "loss": 2.707, + "step": 15225 + }, + { + "epoch": 0.4515019422945764, + "grad_norm": 0.12162335216999054, + "learning_rate": 0.0005845423972834127, + "loss": 2.7143, + "step": 15226 + }, + { + "epoch": 0.4515315956468879, + "grad_norm": 0.11188486963510513, + "learning_rate": 0.0005844960234112385, + "loss": 2.6885, + "step": 15227 + }, + { + "epoch": 0.45156124899919936, + "grad_norm": 0.11770643293857574, + "learning_rate": 0.0005844496487908375, + "loss": 2.6897, + "step": 15228 + }, + { + "epoch": 0.45159090235151084, + "grad_norm": 0.22621065378189087, + "learning_rate": 0.0005844032734226204, + "loss": 2.7075, + "step": 15229 + }, + { + "epoch": 0.4516205557038223, + "grad_norm": 0.11400070041418076, + "learning_rate": 0.0005843568973069978, + "loss": 2.6795, + "step": 15230 + }, + { + "epoch": 0.4516502090561338, + "grad_norm": 0.11044324189424515, + "learning_rate": 0.0005843105204443805, + "loss": 2.6794, + "step": 15231 + }, + { + "epoch": 0.45167986240844527, + "grad_norm": 0.11830291897058487, + "learning_rate": 0.0005842641428351788, + "loss": 2.6979, + "step": 15232 + }, + { + "epoch": 0.45170951576075674, + "grad_norm": 0.12136072665452957, + "learning_rate": 0.0005842177644798038, + "loss": 2.7254, + "step": 15233 + }, + { + "epoch": 0.4517391691130682, + "grad_norm": 0.12371513247489929, + "learning_rate": 0.0005841713853786661, + "loss": 2.7109, + "step": 15234 + }, + { + "epoch": 0.4517688224653797, + "grad_norm": 0.12035038322210312, + "learning_rate": 0.0005841250055321763, + "loss": 2.7398, + "step": 15235 + }, + { + "epoch": 0.45179847581769117, + "grad_norm": 0.11328276246786118, + "learning_rate": 0.0005840786249407451, + "loss": 2.6866, + "step": 15236 + }, + { + "epoch": 0.45182812917000265, + "grad_norm": 0.1006191298365593, + "learning_rate": 0.0005840322436047833, + "loss": 2.6799, + "step": 15237 + }, + { + "epoch": 0.4518577825223141, + "grad_norm": 0.1225922629237175, + "learning_rate": 0.0005839858615247015, + "loss": 2.6992, + "step": 15238 + }, + { + "epoch": 0.45188743587462565, + "grad_norm": 0.09919088333845139, + "learning_rate": 0.0005839394787009105, + "loss": 2.7361, + "step": 15239 + }, + { + "epoch": 0.45191708922693713, + "grad_norm": 0.1104302853345871, + "learning_rate": 0.0005838930951338209, + "loss": 2.6713, + "step": 15240 + }, + { + "epoch": 0.4519467425792486, + "grad_norm": 0.10618731379508972, + "learning_rate": 0.0005838467108238437, + "loss": 2.6524, + "step": 15241 + }, + { + "epoch": 0.4519763959315601, + "grad_norm": 0.10549023002386093, + "learning_rate": 0.0005838003257713896, + "loss": 2.6816, + "step": 15242 + }, + { + "epoch": 0.45200604928387156, + "grad_norm": 0.12401483952999115, + "learning_rate": 0.000583753939976869, + "loss": 2.7106, + "step": 15243 + }, + { + "epoch": 0.45203570263618303, + "grad_norm": 0.1328415721654892, + "learning_rate": 0.0005837075534406928, + "loss": 2.6728, + "step": 15244 + }, + { + "epoch": 0.4520653559884945, + "grad_norm": 0.13562500476837158, + "learning_rate": 0.000583661166163272, + "loss": 2.6891, + "step": 15245 + }, + { + "epoch": 0.452095009340806, + "grad_norm": 0.12529151141643524, + "learning_rate": 0.0005836147781450173, + "loss": 2.7406, + "step": 15246 + }, + { + "epoch": 0.45212466269311746, + "grad_norm": 0.1188388466835022, + "learning_rate": 0.0005835683893863393, + "loss": 2.6788, + "step": 15247 + }, + { + "epoch": 0.45215431604542894, + "grad_norm": 0.1212717816233635, + "learning_rate": 0.0005835219998876488, + "loss": 2.6912, + "step": 15248 + }, + { + "epoch": 0.4521839693977404, + "grad_norm": 0.12612751126289368, + "learning_rate": 0.0005834756096493568, + "loss": 2.711, + "step": 15249 + }, + { + "epoch": 0.4522136227500519, + "grad_norm": 0.12283220142126083, + "learning_rate": 0.0005834292186718738, + "loss": 2.7329, + "step": 15250 + }, + { + "epoch": 0.45224327610236337, + "grad_norm": 0.10880003124475479, + "learning_rate": 0.0005833828269556108, + "loss": 2.6485, + "step": 15251 + }, + { + "epoch": 0.45227292945467484, + "grad_norm": 0.11522421985864639, + "learning_rate": 0.0005833364345009787, + "loss": 2.6847, + "step": 15252 + }, + { + "epoch": 0.4523025828069863, + "grad_norm": 0.12576749920845032, + "learning_rate": 0.0005832900413083879, + "loss": 2.7292, + "step": 15253 + }, + { + "epoch": 0.4523322361592978, + "grad_norm": 0.12738542258739471, + "learning_rate": 0.0005832436473782496, + "loss": 2.6778, + "step": 15254 + }, + { + "epoch": 0.45236188951160927, + "grad_norm": 0.12484809011220932, + "learning_rate": 0.0005831972527109746, + "loss": 2.681, + "step": 15255 + }, + { + "epoch": 0.45239154286392075, + "grad_norm": 0.12803739309310913, + "learning_rate": 0.0005831508573069736, + "loss": 2.6863, + "step": 15256 + }, + { + "epoch": 0.4524211962162322, + "grad_norm": 0.12906253337860107, + "learning_rate": 0.0005831044611666575, + "loss": 2.7043, + "step": 15257 + }, + { + "epoch": 0.4524508495685437, + "grad_norm": 0.14687088131904602, + "learning_rate": 0.000583058064290437, + "loss": 2.6958, + "step": 15258 + }, + { + "epoch": 0.4524805029208552, + "grad_norm": 0.14558923244476318, + "learning_rate": 0.0005830116666787233, + "loss": 2.6577, + "step": 15259 + }, + { + "epoch": 0.4525101562731667, + "grad_norm": 0.10022532194852829, + "learning_rate": 0.0005829652683319268, + "loss": 2.6703, + "step": 15260 + }, + { + "epoch": 0.4525398096254782, + "grad_norm": 0.11419576406478882, + "learning_rate": 0.0005829188692504588, + "loss": 2.7067, + "step": 15261 + }, + { + "epoch": 0.45256946297778966, + "grad_norm": 0.12086372822523117, + "learning_rate": 0.0005828724694347299, + "loss": 2.6961, + "step": 15262 + }, + { + "epoch": 0.45259911633010114, + "grad_norm": 0.11893314123153687, + "learning_rate": 0.0005828260688851511, + "loss": 2.7248, + "step": 15263 + }, + { + "epoch": 0.4526287696824126, + "grad_norm": 0.129214808344841, + "learning_rate": 0.0005827796676021331, + "loss": 2.6974, + "step": 15264 + }, + { + "epoch": 0.4526584230347241, + "grad_norm": 0.12333139032125473, + "learning_rate": 0.0005827332655860872, + "loss": 2.6839, + "step": 15265 + }, + { + "epoch": 0.45268807638703557, + "grad_norm": 0.12222269922494888, + "learning_rate": 0.0005826868628374239, + "loss": 2.7045, + "step": 15266 + }, + { + "epoch": 0.45271772973934704, + "grad_norm": 0.12644174695014954, + "learning_rate": 0.0005826404593565541, + "loss": 2.6926, + "step": 15267 + }, + { + "epoch": 0.4527473830916585, + "grad_norm": 0.11262320727109909, + "learning_rate": 0.0005825940551438891, + "loss": 2.7062, + "step": 15268 + }, + { + "epoch": 0.45277703644397, + "grad_norm": 0.12182863056659698, + "learning_rate": 0.0005825476501998395, + "loss": 2.6964, + "step": 15269 + }, + { + "epoch": 0.45280668979628147, + "grad_norm": 0.12421119958162308, + "learning_rate": 0.0005825012445248161, + "loss": 2.7183, + "step": 15270 + }, + { + "epoch": 0.45283634314859295, + "grad_norm": 0.12189418077468872, + "learning_rate": 0.0005824548381192302, + "loss": 2.7229, + "step": 15271 + }, + { + "epoch": 0.4528659965009044, + "grad_norm": 0.1402861624956131, + "learning_rate": 0.0005824084309834924, + "loss": 2.7136, + "step": 15272 + }, + { + "epoch": 0.4528956498532159, + "grad_norm": 0.16396576166152954, + "learning_rate": 0.0005823620231180139, + "loss": 2.6711, + "step": 15273 + }, + { + "epoch": 0.4529253032055274, + "grad_norm": 0.16059790551662445, + "learning_rate": 0.0005823156145232057, + "loss": 2.7003, + "step": 15274 + }, + { + "epoch": 0.45295495655783885, + "grad_norm": 0.1523250788450241, + "learning_rate": 0.0005822692051994785, + "loss": 2.709, + "step": 15275 + }, + { + "epoch": 0.4529846099101503, + "grad_norm": 0.1443885713815689, + "learning_rate": 0.0005822227951472432, + "loss": 2.6811, + "step": 15276 + }, + { + "epoch": 0.4530142632624618, + "grad_norm": 0.1313745081424713, + "learning_rate": 0.000582176384366911, + "loss": 2.6837, + "step": 15277 + }, + { + "epoch": 0.4530439166147733, + "grad_norm": 0.14543738961219788, + "learning_rate": 0.0005821299728588928, + "loss": 2.7021, + "step": 15278 + }, + { + "epoch": 0.45307356996708475, + "grad_norm": 0.13307256996631622, + "learning_rate": 0.0005820835606235998, + "loss": 2.6923, + "step": 15279 + }, + { + "epoch": 0.45310322331939623, + "grad_norm": 0.11853986233472824, + "learning_rate": 0.0005820371476614425, + "loss": 2.7056, + "step": 15280 + }, + { + "epoch": 0.45313287667170776, + "grad_norm": 0.1349790245294571, + "learning_rate": 0.0005819907339728324, + "loss": 2.6908, + "step": 15281 + }, + { + "epoch": 0.45316253002401924, + "grad_norm": 0.15035361051559448, + "learning_rate": 0.0005819443195581802, + "loss": 2.679, + "step": 15282 + }, + { + "epoch": 0.4531921833763307, + "grad_norm": 0.13456891477108002, + "learning_rate": 0.000581897904417897, + "loss": 2.6915, + "step": 15283 + }, + { + "epoch": 0.4532218367286422, + "grad_norm": 0.12550808489322662, + "learning_rate": 0.0005818514885523938, + "loss": 2.7031, + "step": 15284 + }, + { + "epoch": 0.45325149008095367, + "grad_norm": 0.12751099467277527, + "learning_rate": 0.0005818050719620815, + "loss": 2.6716, + "step": 15285 + }, + { + "epoch": 0.45328114343326514, + "grad_norm": 0.12202005088329315, + "learning_rate": 0.0005817586546473713, + "loss": 2.7095, + "step": 15286 + }, + { + "epoch": 0.4533107967855766, + "grad_norm": 0.13236595690250397, + "learning_rate": 0.0005817122366086742, + "loss": 2.7367, + "step": 15287 + }, + { + "epoch": 0.4533404501378881, + "grad_norm": 0.11493023484945297, + "learning_rate": 0.0005816658178464013, + "loss": 2.6893, + "step": 15288 + }, + { + "epoch": 0.45337010349019957, + "grad_norm": 0.14040008187294006, + "learning_rate": 0.0005816193983609636, + "loss": 2.7503, + "step": 15289 + }, + { + "epoch": 0.45339975684251105, + "grad_norm": 0.12585942447185516, + "learning_rate": 0.0005815729781527719, + "loss": 2.7193, + "step": 15290 + }, + { + "epoch": 0.4534294101948225, + "grad_norm": 0.11458304524421692, + "learning_rate": 0.0005815265572222376, + "loss": 2.6824, + "step": 15291 + }, + { + "epoch": 0.453459063547134, + "grad_norm": 0.12628118693828583, + "learning_rate": 0.0005814801355697717, + "loss": 2.682, + "step": 15292 + }, + { + "epoch": 0.4534887168994455, + "grad_norm": 0.1346714049577713, + "learning_rate": 0.0005814337131957851, + "loss": 2.6794, + "step": 15293 + }, + { + "epoch": 0.45351837025175695, + "grad_norm": 0.13016337156295776, + "learning_rate": 0.0005813872901006891, + "loss": 2.6828, + "step": 15294 + }, + { + "epoch": 0.45354802360406843, + "grad_norm": 0.11416275054216385, + "learning_rate": 0.0005813408662848946, + "loss": 2.6947, + "step": 15295 + }, + { + "epoch": 0.4535776769563799, + "grad_norm": 0.1150490939617157, + "learning_rate": 0.0005812944417488128, + "loss": 2.6979, + "step": 15296 + }, + { + "epoch": 0.4536073303086914, + "grad_norm": 0.12214647978544235, + "learning_rate": 0.0005812480164928546, + "loss": 2.7094, + "step": 15297 + }, + { + "epoch": 0.45363698366100286, + "grad_norm": 0.12938089668750763, + "learning_rate": 0.0005812015905174314, + "loss": 2.6779, + "step": 15298 + }, + { + "epoch": 0.45366663701331433, + "grad_norm": 0.12311570346355438, + "learning_rate": 0.0005811551638229543, + "loss": 2.6754, + "step": 15299 + }, + { + "epoch": 0.4536962903656258, + "grad_norm": 0.11815577745437622, + "learning_rate": 0.0005811087364098341, + "loss": 2.6625, + "step": 15300 + }, + { + "epoch": 0.4537259437179373, + "grad_norm": 0.1053466871380806, + "learning_rate": 0.0005810623082784823, + "loss": 2.6956, + "step": 15301 + }, + { + "epoch": 0.4537555970702488, + "grad_norm": 0.10841479152441025, + "learning_rate": 0.0005810158794293099, + "loss": 2.6947, + "step": 15302 + }, + { + "epoch": 0.4537852504225603, + "grad_norm": 0.11059649288654327, + "learning_rate": 0.0005809694498627277, + "loss": 2.6579, + "step": 15303 + }, + { + "epoch": 0.45381490377487177, + "grad_norm": 0.1162584200501442, + "learning_rate": 0.0005809230195791471, + "loss": 2.7064, + "step": 15304 + }, + { + "epoch": 0.45384455712718325, + "grad_norm": 0.10980261117219925, + "learning_rate": 0.0005808765885789795, + "loss": 2.7168, + "step": 15305 + }, + { + "epoch": 0.4538742104794947, + "grad_norm": 0.12544722855091095, + "learning_rate": 0.0005808301568626358, + "loss": 2.7093, + "step": 15306 + }, + { + "epoch": 0.4539038638318062, + "grad_norm": 0.1245075985789299, + "learning_rate": 0.0005807837244305271, + "loss": 2.6914, + "step": 15307 + }, + { + "epoch": 0.4539335171841177, + "grad_norm": 0.1110542044043541, + "learning_rate": 0.0005807372912830648, + "loss": 2.6846, + "step": 15308 + }, + { + "epoch": 0.45396317053642915, + "grad_norm": 0.12115448713302612, + "learning_rate": 0.0005806908574206598, + "loss": 2.6925, + "step": 15309 + }, + { + "epoch": 0.4539928238887406, + "grad_norm": 0.11295345425605774, + "learning_rate": 0.0005806444228437233, + "loss": 2.684, + "step": 15310 + }, + { + "epoch": 0.4540224772410521, + "grad_norm": 0.11572268605232239, + "learning_rate": 0.0005805979875526668, + "loss": 2.683, + "step": 15311 + }, + { + "epoch": 0.4540521305933636, + "grad_norm": 0.10484588891267776, + "learning_rate": 0.0005805515515479013, + "loss": 2.6806, + "step": 15312 + }, + { + "epoch": 0.45408178394567505, + "grad_norm": 0.1144472062587738, + "learning_rate": 0.000580505114829838, + "loss": 2.7138, + "step": 15313 + }, + { + "epoch": 0.45411143729798653, + "grad_norm": 0.12759456038475037, + "learning_rate": 0.0005804586773988879, + "loss": 2.6692, + "step": 15314 + }, + { + "epoch": 0.454141090650298, + "grad_norm": 0.11706122010946274, + "learning_rate": 0.0005804122392554625, + "loss": 2.6916, + "step": 15315 + }, + { + "epoch": 0.4541707440026095, + "grad_norm": 0.11146461963653564, + "learning_rate": 0.0005803658003999728, + "loss": 2.7094, + "step": 15316 + }, + { + "epoch": 0.45420039735492096, + "grad_norm": 0.1252070516347885, + "learning_rate": 0.0005803193608328303, + "loss": 2.6936, + "step": 15317 + }, + { + "epoch": 0.45423005070723244, + "grad_norm": 0.11832530796527863, + "learning_rate": 0.000580272920554446, + "loss": 2.7183, + "step": 15318 + }, + { + "epoch": 0.4542597040595439, + "grad_norm": 0.13292182981967926, + "learning_rate": 0.0005802264795652313, + "loss": 2.6726, + "step": 15319 + }, + { + "epoch": 0.4542893574118554, + "grad_norm": 0.15463432669639587, + "learning_rate": 0.0005801800378655973, + "loss": 2.7426, + "step": 15320 + }, + { + "epoch": 0.45431901076416686, + "grad_norm": 0.18467840552330017, + "learning_rate": 0.0005801335954559552, + "loss": 2.6853, + "step": 15321 + }, + { + "epoch": 0.45434866411647834, + "grad_norm": 0.1618230789899826, + "learning_rate": 0.0005800871523367163, + "loss": 2.6396, + "step": 15322 + }, + { + "epoch": 0.45437831746878987, + "grad_norm": 0.13576863706111908, + "learning_rate": 0.0005800407085082922, + "loss": 2.6901, + "step": 15323 + }, + { + "epoch": 0.45440797082110135, + "grad_norm": 0.12185214459896088, + "learning_rate": 0.0005799942639710938, + "loss": 2.7375, + "step": 15324 + }, + { + "epoch": 0.4544376241734128, + "grad_norm": 0.1474296897649765, + "learning_rate": 0.0005799478187255324, + "loss": 2.7076, + "step": 15325 + }, + { + "epoch": 0.4544672775257243, + "grad_norm": 0.1463371366262436, + "learning_rate": 0.0005799013727720193, + "loss": 2.708, + "step": 15326 + }, + { + "epoch": 0.4544969308780358, + "grad_norm": 0.12501443922519684, + "learning_rate": 0.0005798549261109659, + "loss": 2.7119, + "step": 15327 + }, + { + "epoch": 0.45452658423034725, + "grad_norm": 0.14374828338623047, + "learning_rate": 0.0005798084787427834, + "loss": 2.6865, + "step": 15328 + }, + { + "epoch": 0.45455623758265873, + "grad_norm": 0.1489933282136917, + "learning_rate": 0.0005797620306678831, + "loss": 2.7195, + "step": 15329 + }, + { + "epoch": 0.4545858909349702, + "grad_norm": 0.1312418133020401, + "learning_rate": 0.0005797155818866764, + "loss": 2.7156, + "step": 15330 + }, + { + "epoch": 0.4546155442872817, + "grad_norm": 0.14071869850158691, + "learning_rate": 0.0005796691323995744, + "loss": 2.6975, + "step": 15331 + }, + { + "epoch": 0.45464519763959316, + "grad_norm": 0.13735860586166382, + "learning_rate": 0.0005796226822069886, + "loss": 2.6877, + "step": 15332 + }, + { + "epoch": 0.45467485099190463, + "grad_norm": 0.14638304710388184, + "learning_rate": 0.0005795762313093305, + "loss": 2.6916, + "step": 15333 + }, + { + "epoch": 0.4547045043442161, + "grad_norm": 0.1362769901752472, + "learning_rate": 0.000579529779707011, + "loss": 2.7299, + "step": 15334 + }, + { + "epoch": 0.4547341576965276, + "grad_norm": 0.14375929534435272, + "learning_rate": 0.0005794833274004416, + "loss": 2.6867, + "step": 15335 + }, + { + "epoch": 0.45476381104883906, + "grad_norm": 0.12168391048908234, + "learning_rate": 0.0005794368743900338, + "loss": 2.7108, + "step": 15336 + }, + { + "epoch": 0.45479346440115054, + "grad_norm": 0.11553416401147842, + "learning_rate": 0.0005793904206761989, + "loss": 2.6863, + "step": 15337 + }, + { + "epoch": 0.454823117753462, + "grad_norm": 0.11464317888021469, + "learning_rate": 0.000579343966259348, + "loss": 2.7108, + "step": 15338 + }, + { + "epoch": 0.4548527711057735, + "grad_norm": 0.11574586480855942, + "learning_rate": 0.0005792975111398928, + "loss": 2.7152, + "step": 15339 + }, + { + "epoch": 0.45488242445808497, + "grad_norm": 0.12250149250030518, + "learning_rate": 0.0005792510553182446, + "loss": 2.7305, + "step": 15340 + }, + { + "epoch": 0.45491207781039644, + "grad_norm": 0.10643994063138962, + "learning_rate": 0.0005792045987948146, + "loss": 2.7029, + "step": 15341 + }, + { + "epoch": 0.4549417311627079, + "grad_norm": 0.12309470027685165, + "learning_rate": 0.0005791581415700143, + "loss": 2.662, + "step": 15342 + }, + { + "epoch": 0.45497138451501945, + "grad_norm": 0.12587356567382812, + "learning_rate": 0.000579111683644255, + "loss": 2.6933, + "step": 15343 + }, + { + "epoch": 0.4550010378673309, + "grad_norm": 0.13001704216003418, + "learning_rate": 0.0005790652250179482, + "loss": 2.719, + "step": 15344 + }, + { + "epoch": 0.4550306912196424, + "grad_norm": 0.12701661884784698, + "learning_rate": 0.0005790187656915055, + "loss": 2.665, + "step": 15345 + }, + { + "epoch": 0.4550603445719539, + "grad_norm": 0.1460881233215332, + "learning_rate": 0.0005789723056653377, + "loss": 2.6978, + "step": 15346 + }, + { + "epoch": 0.45508999792426535, + "grad_norm": 0.12302576750516891, + "learning_rate": 0.0005789258449398569, + "loss": 2.7185, + "step": 15347 + }, + { + "epoch": 0.45511965127657683, + "grad_norm": 0.1099276915192604, + "learning_rate": 0.0005788793835154739, + "loss": 2.7149, + "step": 15348 + }, + { + "epoch": 0.4551493046288883, + "grad_norm": 0.12275715172290802, + "learning_rate": 0.0005788329213926005, + "loss": 2.6753, + "step": 15349 + }, + { + "epoch": 0.4551789579811998, + "grad_norm": 0.12419173121452332, + "learning_rate": 0.0005787864585716483, + "loss": 2.6894, + "step": 15350 + }, + { + "epoch": 0.45520861133351126, + "grad_norm": 0.11169838160276413, + "learning_rate": 0.0005787399950530282, + "loss": 2.667, + "step": 15351 + }, + { + "epoch": 0.45523826468582274, + "grad_norm": 0.11557991057634354, + "learning_rate": 0.000578693530837152, + "loss": 2.699, + "step": 15352 + }, + { + "epoch": 0.4552679180381342, + "grad_norm": 0.12997081875801086, + "learning_rate": 0.000578647065924431, + "loss": 2.7121, + "step": 15353 + }, + { + "epoch": 0.4552975713904457, + "grad_norm": 0.13554517924785614, + "learning_rate": 0.0005786006003152768, + "loss": 2.7006, + "step": 15354 + }, + { + "epoch": 0.45532722474275716, + "grad_norm": 0.1423138976097107, + "learning_rate": 0.000578554134010101, + "loss": 2.6771, + "step": 15355 + }, + { + "epoch": 0.45535687809506864, + "grad_norm": 0.12945549190044403, + "learning_rate": 0.0005785076670093146, + "loss": 2.6759, + "step": 15356 + }, + { + "epoch": 0.4553865314473801, + "grad_norm": 0.12757834792137146, + "learning_rate": 0.0005784611993133295, + "loss": 2.6896, + "step": 15357 + }, + { + "epoch": 0.4554161847996916, + "grad_norm": 0.11489340662956238, + "learning_rate": 0.0005784147309225568, + "loss": 2.6982, + "step": 15358 + }, + { + "epoch": 0.45544583815200307, + "grad_norm": 0.1109471246600151, + "learning_rate": 0.0005783682618374083, + "loss": 2.6826, + "step": 15359 + }, + { + "epoch": 0.45547549150431454, + "grad_norm": 0.11267372965812683, + "learning_rate": 0.0005783217920582954, + "loss": 2.7286, + "step": 15360 + }, + { + "epoch": 0.455505144856626, + "grad_norm": 0.1153002381324768, + "learning_rate": 0.0005782753215856296, + "loss": 2.7391, + "step": 15361 + }, + { + "epoch": 0.4555347982089375, + "grad_norm": 0.11264713108539581, + "learning_rate": 0.0005782288504198224, + "loss": 2.6945, + "step": 15362 + }, + { + "epoch": 0.455564451561249, + "grad_norm": 0.108583003282547, + "learning_rate": 0.0005781823785612853, + "loss": 2.7023, + "step": 15363 + }, + { + "epoch": 0.4555941049135605, + "grad_norm": 0.10886723548173904, + "learning_rate": 0.0005781359060104298, + "loss": 2.6918, + "step": 15364 + }, + { + "epoch": 0.455623758265872, + "grad_norm": 0.11543810367584229, + "learning_rate": 0.0005780894327676675, + "loss": 2.673, + "step": 15365 + }, + { + "epoch": 0.45565341161818346, + "grad_norm": 0.10849791765213013, + "learning_rate": 0.0005780429588334098, + "loss": 2.6881, + "step": 15366 + }, + { + "epoch": 0.45568306497049493, + "grad_norm": 0.12161844968795776, + "learning_rate": 0.0005779964842080683, + "loss": 2.6834, + "step": 15367 + }, + { + "epoch": 0.4557127183228064, + "grad_norm": 0.12005922943353653, + "learning_rate": 0.0005779500088920546, + "loss": 2.7088, + "step": 15368 + }, + { + "epoch": 0.4557423716751179, + "grad_norm": 0.11566849052906036, + "learning_rate": 0.00057790353288578, + "loss": 2.6809, + "step": 15369 + }, + { + "epoch": 0.45577202502742936, + "grad_norm": 0.12447664141654968, + "learning_rate": 0.0005778570561896564, + "loss": 2.6645, + "step": 15370 + }, + { + "epoch": 0.45580167837974084, + "grad_norm": 0.1258784979581833, + "learning_rate": 0.0005778105788040953, + "loss": 2.6768, + "step": 15371 + }, + { + "epoch": 0.4558313317320523, + "grad_norm": 0.12314054369926453, + "learning_rate": 0.0005777641007295081, + "loss": 2.6971, + "step": 15372 + }, + { + "epoch": 0.4558609850843638, + "grad_norm": 0.12620048224925995, + "learning_rate": 0.0005777176219663065, + "loss": 2.7094, + "step": 15373 + }, + { + "epoch": 0.45589063843667527, + "grad_norm": 0.12698669731616974, + "learning_rate": 0.0005776711425149018, + "loss": 2.6937, + "step": 15374 + }, + { + "epoch": 0.45592029178898674, + "grad_norm": 0.14651824533939362, + "learning_rate": 0.0005776246623757059, + "loss": 2.6781, + "step": 15375 + }, + { + "epoch": 0.4559499451412982, + "grad_norm": 0.1373436003923416, + "learning_rate": 0.0005775781815491304, + "loss": 2.718, + "step": 15376 + }, + { + "epoch": 0.4559795984936097, + "grad_norm": 0.11856254190206528, + "learning_rate": 0.0005775317000355866, + "loss": 2.6713, + "step": 15377 + }, + { + "epoch": 0.45600925184592117, + "grad_norm": 0.12381318211555481, + "learning_rate": 0.0005774852178354865, + "loss": 2.6901, + "step": 15378 + }, + { + "epoch": 0.45603890519823265, + "grad_norm": 0.11240560561418533, + "learning_rate": 0.0005774387349492413, + "loss": 2.6708, + "step": 15379 + }, + { + "epoch": 0.4560685585505441, + "grad_norm": 0.1276378035545349, + "learning_rate": 0.0005773922513772629, + "loss": 2.6923, + "step": 15380 + }, + { + "epoch": 0.4560982119028556, + "grad_norm": 0.1308978796005249, + "learning_rate": 0.0005773457671199628, + "loss": 2.6976, + "step": 15381 + }, + { + "epoch": 0.4561278652551671, + "grad_norm": 0.13361608982086182, + "learning_rate": 0.0005772992821777527, + "loss": 2.6898, + "step": 15382 + }, + { + "epoch": 0.45615751860747855, + "grad_norm": 0.13580210506916046, + "learning_rate": 0.0005772527965510442, + "loss": 2.6787, + "step": 15383 + }, + { + "epoch": 0.45618717195979, + "grad_norm": 0.12061753869056702, + "learning_rate": 0.000577206310240249, + "loss": 2.7234, + "step": 15384 + }, + { + "epoch": 0.45621682531210156, + "grad_norm": 0.1031499132514, + "learning_rate": 0.0005771598232457786, + "loss": 2.678, + "step": 15385 + }, + { + "epoch": 0.45624647866441304, + "grad_norm": 0.11393848061561584, + "learning_rate": 0.0005771133355680447, + "loss": 2.6964, + "step": 15386 + }, + { + "epoch": 0.4562761320167245, + "grad_norm": 0.11769901216030121, + "learning_rate": 0.0005770668472074587, + "loss": 2.6849, + "step": 15387 + }, + { + "epoch": 0.456305785369036, + "grad_norm": 0.1179405227303505, + "learning_rate": 0.000577020358164433, + "loss": 2.7169, + "step": 15388 + }, + { + "epoch": 0.45633543872134746, + "grad_norm": 0.12621405720710754, + "learning_rate": 0.0005769738684393786, + "loss": 2.7126, + "step": 15389 + }, + { + "epoch": 0.45636509207365894, + "grad_norm": 0.14850541949272156, + "learning_rate": 0.0005769273780327074, + "loss": 2.7008, + "step": 15390 + }, + { + "epoch": 0.4563947454259704, + "grad_norm": 0.1705314666032791, + "learning_rate": 0.000576880886944831, + "loss": 2.7094, + "step": 15391 + }, + { + "epoch": 0.4564243987782819, + "grad_norm": 0.203673854470253, + "learning_rate": 0.0005768343951761614, + "loss": 2.7307, + "step": 15392 + }, + { + "epoch": 0.45645405213059337, + "grad_norm": 0.19192998111248016, + "learning_rate": 0.0005767879027271097, + "loss": 2.6902, + "step": 15393 + }, + { + "epoch": 0.45648370548290484, + "grad_norm": 0.11903925240039825, + "learning_rate": 0.0005767414095980881, + "loss": 2.6946, + "step": 15394 + }, + { + "epoch": 0.4565133588352163, + "grad_norm": 0.14616048336029053, + "learning_rate": 0.0005766949157895081, + "loss": 2.6922, + "step": 15395 + }, + { + "epoch": 0.4565430121875278, + "grad_norm": 0.13103227317333221, + "learning_rate": 0.0005766484213017816, + "loss": 2.6624, + "step": 15396 + }, + { + "epoch": 0.4565726655398393, + "grad_norm": 0.12240352481603622, + "learning_rate": 0.0005766019261353201, + "loss": 2.698, + "step": 15397 + }, + { + "epoch": 0.45660231889215075, + "grad_norm": 0.14614231884479523, + "learning_rate": 0.0005765554302905353, + "loss": 2.7004, + "step": 15398 + }, + { + "epoch": 0.4566319722444622, + "grad_norm": 0.1507367342710495, + "learning_rate": 0.0005765089337678391, + "loss": 2.684, + "step": 15399 + }, + { + "epoch": 0.4566616255967737, + "grad_norm": 0.14361847937107086, + "learning_rate": 0.0005764624365676431, + "loss": 2.6953, + "step": 15400 + }, + { + "epoch": 0.4566912789490852, + "grad_norm": 0.11765529215335846, + "learning_rate": 0.0005764159386903591, + "loss": 2.693, + "step": 15401 + }, + { + "epoch": 0.45672093230139665, + "grad_norm": 0.11618780344724655, + "learning_rate": 0.0005763694401363989, + "loss": 2.6857, + "step": 15402 + }, + { + "epoch": 0.45675058565370813, + "grad_norm": 0.12226848304271698, + "learning_rate": 0.0005763229409061743, + "loss": 2.691, + "step": 15403 + }, + { + "epoch": 0.4567802390060196, + "grad_norm": 0.12557898461818695, + "learning_rate": 0.0005762764410000968, + "loss": 2.6917, + "step": 15404 + }, + { + "epoch": 0.4568098923583311, + "grad_norm": 0.10952363908290863, + "learning_rate": 0.0005762299404185784, + "loss": 2.6885, + "step": 15405 + }, + { + "epoch": 0.4568395457106426, + "grad_norm": 0.12265777587890625, + "learning_rate": 0.0005761834391620307, + "loss": 2.6896, + "step": 15406 + }, + { + "epoch": 0.4568691990629541, + "grad_norm": 0.12021403759717941, + "learning_rate": 0.0005761369372308657, + "loss": 2.6946, + "step": 15407 + }, + { + "epoch": 0.45689885241526557, + "grad_norm": 0.13368502259254456, + "learning_rate": 0.0005760904346254949, + "loss": 2.7229, + "step": 15408 + }, + { + "epoch": 0.45692850576757704, + "grad_norm": 0.13063210248947144, + "learning_rate": 0.0005760439313463304, + "loss": 2.7202, + "step": 15409 + }, + { + "epoch": 0.4569581591198885, + "grad_norm": 0.14386717975139618, + "learning_rate": 0.0005759974273937839, + "loss": 2.717, + "step": 15410 + }, + { + "epoch": 0.4569878124722, + "grad_norm": 0.16102614998817444, + "learning_rate": 0.0005759509227682668, + "loss": 2.6948, + "step": 15411 + }, + { + "epoch": 0.45701746582451147, + "grad_norm": 0.14337265491485596, + "learning_rate": 0.0005759044174701915, + "loss": 2.6779, + "step": 15412 + }, + { + "epoch": 0.45704711917682295, + "grad_norm": 0.12273697555065155, + "learning_rate": 0.0005758579114999695, + "loss": 2.673, + "step": 15413 + }, + { + "epoch": 0.4570767725291344, + "grad_norm": 0.13261637091636658, + "learning_rate": 0.0005758114048580126, + "loss": 2.7037, + "step": 15414 + }, + { + "epoch": 0.4571064258814459, + "grad_norm": 0.11512736231088638, + "learning_rate": 0.0005757648975447327, + "loss": 2.6886, + "step": 15415 + }, + { + "epoch": 0.4571360792337574, + "grad_norm": 0.12064173072576523, + "learning_rate": 0.0005757183895605419, + "loss": 2.7289, + "step": 15416 + }, + { + "epoch": 0.45716573258606885, + "grad_norm": 0.14686208963394165, + "learning_rate": 0.0005756718809058516, + "loss": 2.6983, + "step": 15417 + }, + { + "epoch": 0.4571953859383803, + "grad_norm": 0.14036273956298828, + "learning_rate": 0.0005756253715810736, + "loss": 2.6787, + "step": 15418 + }, + { + "epoch": 0.4572250392906918, + "grad_norm": 0.12993581593036652, + "learning_rate": 0.0005755788615866201, + "loss": 2.6623, + "step": 15419 + }, + { + "epoch": 0.4572546926430033, + "grad_norm": 0.12541770935058594, + "learning_rate": 0.0005755323509229028, + "loss": 2.6731, + "step": 15420 + }, + { + "epoch": 0.45728434599531476, + "grad_norm": 0.1350421905517578, + "learning_rate": 0.0005754858395903337, + "loss": 2.6972, + "step": 15421 + }, + { + "epoch": 0.45731399934762623, + "grad_norm": 0.13959155976772308, + "learning_rate": 0.0005754393275893243, + "loss": 2.716, + "step": 15422 + }, + { + "epoch": 0.4573436526999377, + "grad_norm": 0.13174936175346375, + "learning_rate": 0.0005753928149202869, + "loss": 2.6872, + "step": 15423 + }, + { + "epoch": 0.4573733060522492, + "grad_norm": 0.13168823719024658, + "learning_rate": 0.0005753463015836331, + "loss": 2.6758, + "step": 15424 + }, + { + "epoch": 0.45740295940456066, + "grad_norm": 0.1229839101433754, + "learning_rate": 0.0005752997875797749, + "loss": 2.7022, + "step": 15425 + }, + { + "epoch": 0.45743261275687214, + "grad_norm": 0.11737003922462463, + "learning_rate": 0.0005752532729091242, + "loss": 2.6641, + "step": 15426 + }, + { + "epoch": 0.45746226610918367, + "grad_norm": 0.1496317833662033, + "learning_rate": 0.0005752067575720927, + "loss": 2.7387, + "step": 15427 + }, + { + "epoch": 0.45749191946149514, + "grad_norm": 0.14249317348003387, + "learning_rate": 0.0005751602415690925, + "loss": 2.6984, + "step": 15428 + }, + { + "epoch": 0.4575215728138066, + "grad_norm": 0.13666945695877075, + "learning_rate": 0.0005751137249005356, + "loss": 2.6962, + "step": 15429 + }, + { + "epoch": 0.4575512261661181, + "grad_norm": 0.15835106372833252, + "learning_rate": 0.0005750672075668336, + "loss": 2.6902, + "step": 15430 + }, + { + "epoch": 0.4575808795184296, + "grad_norm": 0.13006553053855896, + "learning_rate": 0.0005750206895683987, + "loss": 2.6999, + "step": 15431 + }, + { + "epoch": 0.45761053287074105, + "grad_norm": 0.1270969957113266, + "learning_rate": 0.0005749741709056426, + "loss": 2.7113, + "step": 15432 + }, + { + "epoch": 0.4576401862230525, + "grad_norm": 0.12735527753829956, + "learning_rate": 0.0005749276515789775, + "loss": 2.6549, + "step": 15433 + }, + { + "epoch": 0.457669839575364, + "grad_norm": 0.12433088570833206, + "learning_rate": 0.0005748811315888152, + "loss": 2.6696, + "step": 15434 + }, + { + "epoch": 0.4576994929276755, + "grad_norm": 0.12756481766700745, + "learning_rate": 0.0005748346109355674, + "loss": 2.6731, + "step": 15435 + }, + { + "epoch": 0.45772914627998695, + "grad_norm": 0.1153096854686737, + "learning_rate": 0.0005747880896196465, + "loss": 2.6984, + "step": 15436 + }, + { + "epoch": 0.45775879963229843, + "grad_norm": 0.13067342340946198, + "learning_rate": 0.0005747415676414641, + "loss": 2.728, + "step": 15437 + }, + { + "epoch": 0.4577884529846099, + "grad_norm": 0.13326171040534973, + "learning_rate": 0.0005746950450014323, + "loss": 2.6956, + "step": 15438 + }, + { + "epoch": 0.4578181063369214, + "grad_norm": 0.11214754730463028, + "learning_rate": 0.000574648521699963, + "loss": 2.6887, + "step": 15439 + }, + { + "epoch": 0.45784775968923286, + "grad_norm": 0.12165411561727524, + "learning_rate": 0.0005746019977374684, + "loss": 2.7111, + "step": 15440 + }, + { + "epoch": 0.45787741304154433, + "grad_norm": 0.13154858350753784, + "learning_rate": 0.0005745554731143602, + "loss": 2.6831, + "step": 15441 + }, + { + "epoch": 0.4579070663938558, + "grad_norm": 0.11941319704055786, + "learning_rate": 0.0005745089478310506, + "loss": 2.6723, + "step": 15442 + }, + { + "epoch": 0.4579367197461673, + "grad_norm": 0.11743209511041641, + "learning_rate": 0.0005744624218879514, + "loss": 2.6935, + "step": 15443 + }, + { + "epoch": 0.45796637309847876, + "grad_norm": 0.12381511926651001, + "learning_rate": 0.0005744158952854747, + "loss": 2.7001, + "step": 15444 + }, + { + "epoch": 0.45799602645079024, + "grad_norm": 0.12922020256519318, + "learning_rate": 0.0005743693680240323, + "loss": 2.6951, + "step": 15445 + }, + { + "epoch": 0.4580256798031017, + "grad_norm": 0.12418412417173386, + "learning_rate": 0.0005743228401040364, + "loss": 2.6631, + "step": 15446 + }, + { + "epoch": 0.45805533315541325, + "grad_norm": 0.11585704237222672, + "learning_rate": 0.0005742763115258992, + "loss": 2.6787, + "step": 15447 + }, + { + "epoch": 0.4580849865077247, + "grad_norm": 0.12009349465370178, + "learning_rate": 0.0005742297822900326, + "loss": 2.6765, + "step": 15448 + }, + { + "epoch": 0.4581146398600362, + "grad_norm": 0.12432834506034851, + "learning_rate": 0.0005741832523968484, + "loss": 2.6585, + "step": 15449 + }, + { + "epoch": 0.4581442932123477, + "grad_norm": 0.11723790317773819, + "learning_rate": 0.0005741367218467586, + "loss": 2.6614, + "step": 15450 + }, + { + "epoch": 0.45817394656465915, + "grad_norm": 0.12185726314783096, + "learning_rate": 0.0005740901906401754, + "loss": 2.691, + "step": 15451 + }, + { + "epoch": 0.4582035999169706, + "grad_norm": 0.13501344621181488, + "learning_rate": 0.0005740436587775109, + "loss": 2.6869, + "step": 15452 + }, + { + "epoch": 0.4582332532692821, + "grad_norm": 0.13536947965621948, + "learning_rate": 0.0005739971262591772, + "loss": 2.6564, + "step": 15453 + }, + { + "epoch": 0.4582629066215936, + "grad_norm": 0.12609441578388214, + "learning_rate": 0.0005739505930855864, + "loss": 2.6768, + "step": 15454 + }, + { + "epoch": 0.45829255997390506, + "grad_norm": 0.1104409471154213, + "learning_rate": 0.0005739040592571502, + "loss": 2.6802, + "step": 15455 + }, + { + "epoch": 0.45832221332621653, + "grad_norm": 0.13405178487300873, + "learning_rate": 0.0005738575247742808, + "loss": 2.7251, + "step": 15456 + }, + { + "epoch": 0.458351866678528, + "grad_norm": 0.14415563642978668, + "learning_rate": 0.0005738109896373904, + "loss": 2.6876, + "step": 15457 + }, + { + "epoch": 0.4583815200308395, + "grad_norm": 0.12846912443637848, + "learning_rate": 0.000573764453846891, + "loss": 2.7188, + "step": 15458 + }, + { + "epoch": 0.45841117338315096, + "grad_norm": 0.10776834189891815, + "learning_rate": 0.0005737179174031948, + "loss": 2.6852, + "step": 15459 + }, + { + "epoch": 0.45844082673546244, + "grad_norm": 0.12353713810443878, + "learning_rate": 0.0005736713803067137, + "loss": 2.6751, + "step": 15460 + }, + { + "epoch": 0.4584704800877739, + "grad_norm": 0.14317725598812103, + "learning_rate": 0.00057362484255786, + "loss": 2.7161, + "step": 15461 + }, + { + "epoch": 0.4585001334400854, + "grad_norm": 0.14462321996688843, + "learning_rate": 0.0005735783041570455, + "loss": 2.6813, + "step": 15462 + }, + { + "epoch": 0.45852978679239687, + "grad_norm": 0.12296454608440399, + "learning_rate": 0.0005735317651046827, + "loss": 2.6711, + "step": 15463 + }, + { + "epoch": 0.45855944014470834, + "grad_norm": 0.11621315777301788, + "learning_rate": 0.0005734852254011833, + "loss": 2.7191, + "step": 15464 + }, + { + "epoch": 0.4585890934970198, + "grad_norm": 0.11896377056837082, + "learning_rate": 0.0005734386850469596, + "loss": 2.6912, + "step": 15465 + }, + { + "epoch": 0.4586187468493313, + "grad_norm": 0.13329504430294037, + "learning_rate": 0.0005733921440424239, + "loss": 2.6725, + "step": 15466 + }, + { + "epoch": 0.45864840020164277, + "grad_norm": 0.12105554342269897, + "learning_rate": 0.0005733456023879881, + "loss": 2.7001, + "step": 15467 + }, + { + "epoch": 0.4586780535539543, + "grad_norm": 0.1313350647687912, + "learning_rate": 0.0005732990600840644, + "loss": 2.6981, + "step": 15468 + }, + { + "epoch": 0.4587077069062658, + "grad_norm": 0.13090163469314575, + "learning_rate": 0.000573252517131065, + "loss": 2.6631, + "step": 15469 + }, + { + "epoch": 0.45873736025857725, + "grad_norm": 0.14658884704113007, + "learning_rate": 0.0005732059735294019, + "loss": 2.696, + "step": 15470 + }, + { + "epoch": 0.45876701361088873, + "grad_norm": 0.13506528735160828, + "learning_rate": 0.0005731594292794872, + "loss": 2.6916, + "step": 15471 + }, + { + "epoch": 0.4587966669632002, + "grad_norm": 0.11317766457796097, + "learning_rate": 0.0005731128843817335, + "loss": 2.7203, + "step": 15472 + }, + { + "epoch": 0.4588263203155117, + "grad_norm": 0.1230865940451622, + "learning_rate": 0.0005730663388365525, + "loss": 2.7042, + "step": 15473 + }, + { + "epoch": 0.45885597366782316, + "grad_norm": 0.12491126358509064, + "learning_rate": 0.0005730197926443565, + "loss": 2.7117, + "step": 15474 + }, + { + "epoch": 0.45888562702013463, + "grad_norm": 0.13198654353618622, + "learning_rate": 0.0005729732458055577, + "loss": 2.7271, + "step": 15475 + }, + { + "epoch": 0.4589152803724461, + "grad_norm": 0.13984104990959167, + "learning_rate": 0.0005729266983205685, + "loss": 2.6629, + "step": 15476 + }, + { + "epoch": 0.4589449337247576, + "grad_norm": 0.13585463166236877, + "learning_rate": 0.0005728801501898006, + "loss": 2.7104, + "step": 15477 + }, + { + "epoch": 0.45897458707706906, + "grad_norm": 0.11207230389118195, + "learning_rate": 0.0005728336014136666, + "loss": 2.6761, + "step": 15478 + }, + { + "epoch": 0.45900424042938054, + "grad_norm": 0.12135394662618637, + "learning_rate": 0.0005727870519925784, + "loss": 2.7137, + "step": 15479 + }, + { + "epoch": 0.459033893781692, + "grad_norm": 0.14121057093143463, + "learning_rate": 0.0005727405019269485, + "loss": 2.6929, + "step": 15480 + }, + { + "epoch": 0.4590635471340035, + "grad_norm": 0.1462133675813675, + "learning_rate": 0.0005726939512171891, + "loss": 2.6679, + "step": 15481 + }, + { + "epoch": 0.45909320048631497, + "grad_norm": 0.11910039931535721, + "learning_rate": 0.000572647399863712, + "loss": 2.7188, + "step": 15482 + }, + { + "epoch": 0.45912285383862644, + "grad_norm": 0.13036765158176422, + "learning_rate": 0.00057260084786693, + "loss": 2.693, + "step": 15483 + }, + { + "epoch": 0.4591525071909379, + "grad_norm": 0.12248044461011887, + "learning_rate": 0.0005725542952272546, + "loss": 2.6875, + "step": 15484 + }, + { + "epoch": 0.4591821605432494, + "grad_norm": 0.11970439553260803, + "learning_rate": 0.0005725077419450988, + "loss": 2.6964, + "step": 15485 + }, + { + "epoch": 0.45921181389556087, + "grad_norm": 0.12265349179506302, + "learning_rate": 0.0005724611880208745, + "loss": 2.6927, + "step": 15486 + }, + { + "epoch": 0.45924146724787235, + "grad_norm": 0.1135469451546669, + "learning_rate": 0.000572414633454994, + "loss": 2.6791, + "step": 15487 + }, + { + "epoch": 0.4592711206001838, + "grad_norm": 0.11626578122377396, + "learning_rate": 0.0005723680782478693, + "loss": 2.6805, + "step": 15488 + }, + { + "epoch": 0.45930077395249536, + "grad_norm": 0.11250367760658264, + "learning_rate": 0.0005723215223999129, + "loss": 2.6744, + "step": 15489 + }, + { + "epoch": 0.45933042730480683, + "grad_norm": 0.13071894645690918, + "learning_rate": 0.000572274965911537, + "loss": 2.7159, + "step": 15490 + }, + { + "epoch": 0.4593600806571183, + "grad_norm": 0.12330298125743866, + "learning_rate": 0.0005722284087831537, + "loss": 2.6704, + "step": 15491 + }, + { + "epoch": 0.4593897340094298, + "grad_norm": 0.11717148125171661, + "learning_rate": 0.0005721818510151758, + "loss": 2.6948, + "step": 15492 + }, + { + "epoch": 0.45941938736174126, + "grad_norm": 0.1264788657426834, + "learning_rate": 0.0005721352926080152, + "loss": 2.6718, + "step": 15493 + }, + { + "epoch": 0.45944904071405274, + "grad_norm": 0.11148641258478165, + "learning_rate": 0.0005720887335620839, + "loss": 2.6938, + "step": 15494 + }, + { + "epoch": 0.4594786940663642, + "grad_norm": 0.12392911314964294, + "learning_rate": 0.0005720421738777947, + "loss": 2.6678, + "step": 15495 + }, + { + "epoch": 0.4595083474186757, + "grad_norm": 0.13057133555412292, + "learning_rate": 0.0005719956135555595, + "loss": 2.7089, + "step": 15496 + }, + { + "epoch": 0.45953800077098717, + "grad_norm": 0.12274220585823059, + "learning_rate": 0.000571949052595791, + "loss": 2.6949, + "step": 15497 + }, + { + "epoch": 0.45956765412329864, + "grad_norm": 0.11772841960191727, + "learning_rate": 0.0005719024909989012, + "loss": 2.6706, + "step": 15498 + }, + { + "epoch": 0.4595973074756101, + "grad_norm": 0.14048384130001068, + "learning_rate": 0.0005718559287653024, + "loss": 2.7127, + "step": 15499 + }, + { + "epoch": 0.4596269608279216, + "grad_norm": 0.13024401664733887, + "learning_rate": 0.0005718093658954072, + "loss": 2.693, + "step": 15500 + }, + { + "epoch": 0.45965661418023307, + "grad_norm": 0.11770226061344147, + "learning_rate": 0.0005717628023896277, + "loss": 2.6769, + "step": 15501 + }, + { + "epoch": 0.45968626753254455, + "grad_norm": 0.15000787377357483, + "learning_rate": 0.0005717162382483761, + "loss": 2.7011, + "step": 15502 + }, + { + "epoch": 0.459715920884856, + "grad_norm": 0.17472098767757416, + "learning_rate": 0.000571669673472065, + "loss": 2.6914, + "step": 15503 + }, + { + "epoch": 0.4597455742371675, + "grad_norm": 0.15573081374168396, + "learning_rate": 0.0005716231080611068, + "loss": 2.7072, + "step": 15504 + }, + { + "epoch": 0.459775227589479, + "grad_norm": 0.10336846858263016, + "learning_rate": 0.0005715765420159135, + "loss": 2.6675, + "step": 15505 + }, + { + "epoch": 0.45980488094179045, + "grad_norm": 0.11921931058168411, + "learning_rate": 0.0005715299753368977, + "loss": 2.6825, + "step": 15506 + }, + { + "epoch": 0.4598345342941019, + "grad_norm": 0.1325802356004715, + "learning_rate": 0.0005714834080244716, + "loss": 2.7188, + "step": 15507 + }, + { + "epoch": 0.4598641876464134, + "grad_norm": 0.1305783987045288, + "learning_rate": 0.0005714368400790477, + "loss": 2.6816, + "step": 15508 + }, + { + "epoch": 0.4598938409987249, + "grad_norm": 0.11982107162475586, + "learning_rate": 0.0005713902715010385, + "loss": 2.6989, + "step": 15509 + }, + { + "epoch": 0.4599234943510364, + "grad_norm": 0.12687726318836212, + "learning_rate": 0.0005713437022908559, + "loss": 2.6799, + "step": 15510 + }, + { + "epoch": 0.4599531477033479, + "grad_norm": 0.12139669060707092, + "learning_rate": 0.0005712971324489126, + "loss": 2.6922, + "step": 15511 + }, + { + "epoch": 0.45998280105565936, + "grad_norm": 0.1315091848373413, + "learning_rate": 0.0005712505619756212, + "loss": 2.678, + "step": 15512 + }, + { + "epoch": 0.46001245440797084, + "grad_norm": 0.12699320912361145, + "learning_rate": 0.0005712039908713937, + "loss": 2.6245, + "step": 15513 + }, + { + "epoch": 0.4600421077602823, + "grad_norm": 0.12078925222158432, + "learning_rate": 0.0005711574191366427, + "loss": 2.6974, + "step": 15514 + }, + { + "epoch": 0.4600717611125938, + "grad_norm": 0.1340922713279724, + "learning_rate": 0.0005711108467717805, + "loss": 2.7005, + "step": 15515 + }, + { + "epoch": 0.46010141446490527, + "grad_norm": 0.12777218222618103, + "learning_rate": 0.0005710642737772194, + "loss": 2.6861, + "step": 15516 + }, + { + "epoch": 0.46013106781721674, + "grad_norm": 0.13344928622245789, + "learning_rate": 0.0005710177001533721, + "loss": 2.6758, + "step": 15517 + }, + { + "epoch": 0.4601607211695282, + "grad_norm": 0.12962068617343903, + "learning_rate": 0.0005709711259006508, + "loss": 2.703, + "step": 15518 + }, + { + "epoch": 0.4601903745218397, + "grad_norm": 0.12152943760156631, + "learning_rate": 0.0005709245510194681, + "loss": 2.7105, + "step": 15519 + }, + { + "epoch": 0.46022002787415117, + "grad_norm": 0.11593476682901382, + "learning_rate": 0.0005708779755102363, + "loss": 2.6654, + "step": 15520 + }, + { + "epoch": 0.46024968122646265, + "grad_norm": 0.11457652598619461, + "learning_rate": 0.0005708313993733679, + "loss": 2.6607, + "step": 15521 + }, + { + "epoch": 0.4602793345787741, + "grad_norm": 0.10261833667755127, + "learning_rate": 0.0005707848226092751, + "loss": 2.6827, + "step": 15522 + }, + { + "epoch": 0.4603089879310856, + "grad_norm": 0.12110278010368347, + "learning_rate": 0.0005707382452183707, + "loss": 2.7005, + "step": 15523 + }, + { + "epoch": 0.4603386412833971, + "grad_norm": 0.12140904366970062, + "learning_rate": 0.0005706916672010671, + "loss": 2.6734, + "step": 15524 + }, + { + "epoch": 0.46036829463570855, + "grad_norm": 0.11173191666603088, + "learning_rate": 0.0005706450885577765, + "loss": 2.6887, + "step": 15525 + }, + { + "epoch": 0.46039794798802003, + "grad_norm": 0.12261085957288742, + "learning_rate": 0.0005705985092889116, + "loss": 2.6674, + "step": 15526 + }, + { + "epoch": 0.4604276013403315, + "grad_norm": 0.12748605012893677, + "learning_rate": 0.0005705519293948846, + "loss": 2.6802, + "step": 15527 + }, + { + "epoch": 0.460457254692643, + "grad_norm": 0.12235990166664124, + "learning_rate": 0.0005705053488761084, + "loss": 2.6849, + "step": 15528 + }, + { + "epoch": 0.46048690804495446, + "grad_norm": 0.13551770150661469, + "learning_rate": 0.0005704587677329949, + "loss": 2.7206, + "step": 15529 + }, + { + "epoch": 0.46051656139726593, + "grad_norm": 0.12351011484861374, + "learning_rate": 0.0005704121859659573, + "loss": 2.6948, + "step": 15530 + }, + { + "epoch": 0.46054621474957746, + "grad_norm": 0.13811247050762177, + "learning_rate": 0.0005703656035754075, + "loss": 2.6798, + "step": 15531 + }, + { + "epoch": 0.46057586810188894, + "grad_norm": 0.11286737769842148, + "learning_rate": 0.0005703190205617584, + "loss": 2.7062, + "step": 15532 + }, + { + "epoch": 0.4606055214542004, + "grad_norm": 0.13441814482212067, + "learning_rate": 0.0005702724369254221, + "loss": 2.6888, + "step": 15533 + }, + { + "epoch": 0.4606351748065119, + "grad_norm": 0.13194456696510315, + "learning_rate": 0.0005702258526668113, + "loss": 2.6727, + "step": 15534 + }, + { + "epoch": 0.46066482815882337, + "grad_norm": 0.11863712221384048, + "learning_rate": 0.0005701792677863387, + "loss": 2.6919, + "step": 15535 + }, + { + "epoch": 0.46069448151113485, + "grad_norm": 0.13025736808776855, + "learning_rate": 0.0005701326822844164, + "loss": 2.7007, + "step": 15536 + }, + { + "epoch": 0.4607241348634463, + "grad_norm": 0.12827236950397491, + "learning_rate": 0.0005700860961614573, + "loss": 2.6715, + "step": 15537 + }, + { + "epoch": 0.4607537882157578, + "grad_norm": 0.12074259668588638, + "learning_rate": 0.0005700395094178738, + "loss": 2.6823, + "step": 15538 + }, + { + "epoch": 0.4607834415680693, + "grad_norm": 0.1280970424413681, + "learning_rate": 0.0005699929220540783, + "loss": 2.7068, + "step": 15539 + }, + { + "epoch": 0.46081309492038075, + "grad_norm": 0.12748323380947113, + "learning_rate": 0.0005699463340704837, + "loss": 2.6824, + "step": 15540 + }, + { + "epoch": 0.4608427482726922, + "grad_norm": 0.1238439753651619, + "learning_rate": 0.0005698997454675021, + "loss": 2.638, + "step": 15541 + }, + { + "epoch": 0.4608724016250037, + "grad_norm": 0.1205439567565918, + "learning_rate": 0.0005698531562455464, + "loss": 2.6992, + "step": 15542 + }, + { + "epoch": 0.4609020549773152, + "grad_norm": 0.11876382678747177, + "learning_rate": 0.0005698065664050288, + "loss": 2.6804, + "step": 15543 + }, + { + "epoch": 0.46093170832962665, + "grad_norm": 0.13533490896224976, + "learning_rate": 0.0005697599759463622, + "loss": 2.6505, + "step": 15544 + }, + { + "epoch": 0.46096136168193813, + "grad_norm": 0.11385789513587952, + "learning_rate": 0.000569713384869959, + "loss": 2.6699, + "step": 15545 + }, + { + "epoch": 0.4609910150342496, + "grad_norm": 0.12214947491884232, + "learning_rate": 0.000569666793176232, + "loss": 2.6557, + "step": 15546 + }, + { + "epoch": 0.4610206683865611, + "grad_norm": 0.13245850801467896, + "learning_rate": 0.0005696202008655934, + "loss": 2.7207, + "step": 15547 + }, + { + "epoch": 0.46105032173887256, + "grad_norm": 0.105262391269207, + "learning_rate": 0.000569573607938456, + "loss": 2.6971, + "step": 15548 + }, + { + "epoch": 0.46107997509118404, + "grad_norm": 0.11704564094543457, + "learning_rate": 0.0005695270143952322, + "loss": 2.6676, + "step": 15549 + }, + { + "epoch": 0.4611096284434955, + "grad_norm": 0.12041330337524414, + "learning_rate": 0.000569480420236335, + "loss": 2.7362, + "step": 15550 + }, + { + "epoch": 0.46113928179580704, + "grad_norm": 0.13219261169433594, + "learning_rate": 0.0005694338254621767, + "loss": 2.7115, + "step": 15551 + }, + { + "epoch": 0.4611689351481185, + "grad_norm": 0.1329759806394577, + "learning_rate": 0.00056938723007317, + "loss": 2.7164, + "step": 15552 + }, + { + "epoch": 0.46119858850043, + "grad_norm": 0.12519308924674988, + "learning_rate": 0.0005693406340697274, + "loss": 2.75, + "step": 15553 + }, + { + "epoch": 0.46122824185274147, + "grad_norm": 0.15081754326820374, + "learning_rate": 0.0005692940374522616, + "loss": 2.7239, + "step": 15554 + }, + { + "epoch": 0.46125789520505295, + "grad_norm": 0.19161121547222137, + "learning_rate": 0.0005692474402211851, + "loss": 2.7134, + "step": 15555 + }, + { + "epoch": 0.4612875485573644, + "grad_norm": 0.20262442529201508, + "learning_rate": 0.0005692008423769107, + "loss": 2.6814, + "step": 15556 + }, + { + "epoch": 0.4613172019096759, + "grad_norm": 0.1573057621717453, + "learning_rate": 0.000569154243919851, + "loss": 2.6887, + "step": 15557 + }, + { + "epoch": 0.4613468552619874, + "grad_norm": 0.13059823215007782, + "learning_rate": 0.0005691076448504186, + "loss": 2.6931, + "step": 15558 + }, + { + "epoch": 0.46137650861429885, + "grad_norm": 0.1360275000333786, + "learning_rate": 0.0005690610451690261, + "loss": 2.689, + "step": 15559 + }, + { + "epoch": 0.46140616196661033, + "grad_norm": 0.13621123135089874, + "learning_rate": 0.0005690144448760862, + "loss": 2.7028, + "step": 15560 + }, + { + "epoch": 0.4614358153189218, + "grad_norm": 0.13310295343399048, + "learning_rate": 0.0005689678439720115, + "loss": 2.6897, + "step": 15561 + }, + { + "epoch": 0.4614654686712333, + "grad_norm": 0.1372753530740738, + "learning_rate": 0.0005689212424572149, + "loss": 2.6881, + "step": 15562 + }, + { + "epoch": 0.46149512202354476, + "grad_norm": 0.13794027268886566, + "learning_rate": 0.0005688746403321087, + "loss": 2.7236, + "step": 15563 + }, + { + "epoch": 0.46152477537585623, + "grad_norm": 0.12310393154621124, + "learning_rate": 0.0005688280375971057, + "loss": 2.6511, + "step": 15564 + }, + { + "epoch": 0.4615544287281677, + "grad_norm": 0.1342664361000061, + "learning_rate": 0.0005687814342526188, + "loss": 2.651, + "step": 15565 + }, + { + "epoch": 0.4615840820804792, + "grad_norm": 0.10743137449026108, + "learning_rate": 0.0005687348302990603, + "loss": 2.6875, + "step": 15566 + }, + { + "epoch": 0.46161373543279066, + "grad_norm": 0.13162720203399658, + "learning_rate": 0.0005686882257368431, + "loss": 2.7398, + "step": 15567 + }, + { + "epoch": 0.46164338878510214, + "grad_norm": 0.131172314286232, + "learning_rate": 0.00056864162056638, + "loss": 2.6593, + "step": 15568 + }, + { + "epoch": 0.4616730421374136, + "grad_norm": 0.11613064259290695, + "learning_rate": 0.0005685950147880834, + "loss": 2.688, + "step": 15569 + }, + { + "epoch": 0.4617026954897251, + "grad_norm": 0.11401692777872086, + "learning_rate": 0.0005685484084023663, + "loss": 2.7255, + "step": 15570 + }, + { + "epoch": 0.46173234884203657, + "grad_norm": 0.12498222291469574, + "learning_rate": 0.0005685018014096412, + "loss": 2.679, + "step": 15571 + }, + { + "epoch": 0.4617620021943481, + "grad_norm": 0.11720104515552521, + "learning_rate": 0.0005684551938103209, + "loss": 2.6771, + "step": 15572 + }, + { + "epoch": 0.4617916555466596, + "grad_norm": 0.12977948784828186, + "learning_rate": 0.0005684085856048183, + "loss": 2.7111, + "step": 15573 + }, + { + "epoch": 0.46182130889897105, + "grad_norm": 0.13197949528694153, + "learning_rate": 0.0005683619767935457, + "loss": 2.7348, + "step": 15574 + }, + { + "epoch": 0.4618509622512825, + "grad_norm": 0.12777432799339294, + "learning_rate": 0.0005683153673769161, + "loss": 2.7088, + "step": 15575 + }, + { + "epoch": 0.461880615603594, + "grad_norm": 0.1641756296157837, + "learning_rate": 0.0005682687573553422, + "loss": 2.691, + "step": 15576 + }, + { + "epoch": 0.4619102689559055, + "grad_norm": 0.1458699107170105, + "learning_rate": 0.0005682221467292368, + "loss": 2.7124, + "step": 15577 + }, + { + "epoch": 0.46193992230821695, + "grad_norm": 0.15301863849163055, + "learning_rate": 0.0005681755354990125, + "loss": 2.6701, + "step": 15578 + }, + { + "epoch": 0.46196957566052843, + "grad_norm": 0.1332598328590393, + "learning_rate": 0.0005681289236650821, + "loss": 2.6881, + "step": 15579 + }, + { + "epoch": 0.4619992290128399, + "grad_norm": 0.13056457042694092, + "learning_rate": 0.0005680823112278586, + "loss": 2.6741, + "step": 15580 + }, + { + "epoch": 0.4620288823651514, + "grad_norm": 0.12704025208950043, + "learning_rate": 0.0005680356981877544, + "loss": 2.6537, + "step": 15581 + }, + { + "epoch": 0.46205853571746286, + "grad_norm": 0.12972381711006165, + "learning_rate": 0.0005679890845451824, + "loss": 2.6722, + "step": 15582 + }, + { + "epoch": 0.46208818906977434, + "grad_norm": 0.11494565010070801, + "learning_rate": 0.0005679424703005553, + "loss": 2.6842, + "step": 15583 + }, + { + "epoch": 0.4621178424220858, + "grad_norm": 0.12502677738666534, + "learning_rate": 0.0005678958554542861, + "loss": 2.7128, + "step": 15584 + }, + { + "epoch": 0.4621474957743973, + "grad_norm": 0.13647779822349548, + "learning_rate": 0.0005678492400067875, + "loss": 2.6841, + "step": 15585 + }, + { + "epoch": 0.46217714912670876, + "grad_norm": 0.12322335690259933, + "learning_rate": 0.0005678026239584722, + "loss": 2.6692, + "step": 15586 + }, + { + "epoch": 0.46220680247902024, + "grad_norm": 0.10621479153633118, + "learning_rate": 0.0005677560073097528, + "loss": 2.6957, + "step": 15587 + }, + { + "epoch": 0.4622364558313317, + "grad_norm": 0.11461099982261658, + "learning_rate": 0.0005677093900610426, + "loss": 2.7079, + "step": 15588 + }, + { + "epoch": 0.4622661091836432, + "grad_norm": 0.11367768794298172, + "learning_rate": 0.000567662772212754, + "loss": 2.6777, + "step": 15589 + }, + { + "epoch": 0.46229576253595467, + "grad_norm": 0.11932674050331116, + "learning_rate": 0.0005676161537653, + "loss": 2.7011, + "step": 15590 + }, + { + "epoch": 0.46232541588826614, + "grad_norm": 0.14195220172405243, + "learning_rate": 0.0005675695347190933, + "loss": 2.7155, + "step": 15591 + }, + { + "epoch": 0.4623550692405776, + "grad_norm": 0.11923494189977646, + "learning_rate": 0.0005675229150745469, + "loss": 2.689, + "step": 15592 + }, + { + "epoch": 0.46238472259288915, + "grad_norm": 0.10971973091363907, + "learning_rate": 0.0005674762948320733, + "loss": 2.7138, + "step": 15593 + }, + { + "epoch": 0.46241437594520063, + "grad_norm": 0.10928457975387573, + "learning_rate": 0.0005674296739920856, + "loss": 2.7118, + "step": 15594 + }, + { + "epoch": 0.4624440292975121, + "grad_norm": 0.12410306185483932, + "learning_rate": 0.0005673830525549967, + "loss": 2.6921, + "step": 15595 + }, + { + "epoch": 0.4624736826498236, + "grad_norm": 0.13268157839775085, + "learning_rate": 0.000567336430521219, + "loss": 2.685, + "step": 15596 + }, + { + "epoch": 0.46250333600213506, + "grad_norm": 0.12635810673236847, + "learning_rate": 0.0005672898078911659, + "loss": 2.6778, + "step": 15597 + }, + { + "epoch": 0.46253298935444653, + "grad_norm": 0.12450132519006729, + "learning_rate": 0.0005672431846652499, + "loss": 2.6787, + "step": 15598 + }, + { + "epoch": 0.462562642706758, + "grad_norm": 0.11828521639108658, + "learning_rate": 0.0005671965608438841, + "loss": 2.6816, + "step": 15599 + }, + { + "epoch": 0.4625922960590695, + "grad_norm": 0.10405566543340683, + "learning_rate": 0.000567149936427481, + "loss": 2.6965, + "step": 15600 + }, + { + "epoch": 0.46262194941138096, + "grad_norm": 0.13104651868343353, + "learning_rate": 0.0005671033114164538, + "loss": 2.6569, + "step": 15601 + }, + { + "epoch": 0.46265160276369244, + "grad_norm": 0.13463588058948517, + "learning_rate": 0.0005670566858112152, + "loss": 2.6942, + "step": 15602 + }, + { + "epoch": 0.4626812561160039, + "grad_norm": 0.10684479773044586, + "learning_rate": 0.0005670100596121783, + "loss": 2.7161, + "step": 15603 + }, + { + "epoch": 0.4627109094683154, + "grad_norm": 0.1185295581817627, + "learning_rate": 0.0005669634328197557, + "loss": 2.6803, + "step": 15604 + }, + { + "epoch": 0.46274056282062687, + "grad_norm": 0.12934735417366028, + "learning_rate": 0.0005669168054343604, + "loss": 2.6952, + "step": 15605 + }, + { + "epoch": 0.46277021617293834, + "grad_norm": 0.11548599600791931, + "learning_rate": 0.0005668701774564054, + "loss": 2.7131, + "step": 15606 + }, + { + "epoch": 0.4627998695252498, + "grad_norm": 0.12035642564296722, + "learning_rate": 0.0005668235488863034, + "loss": 2.6775, + "step": 15607 + }, + { + "epoch": 0.4628295228775613, + "grad_norm": 0.11065944284200668, + "learning_rate": 0.0005667769197244674, + "loss": 2.7003, + "step": 15608 + }, + { + "epoch": 0.46285917622987277, + "grad_norm": 0.1380174607038498, + "learning_rate": 0.0005667302899713104, + "loss": 2.6995, + "step": 15609 + }, + { + "epoch": 0.46288882958218425, + "grad_norm": 0.1484110951423645, + "learning_rate": 0.0005666836596272451, + "loss": 2.6919, + "step": 15610 + }, + { + "epoch": 0.4629184829344957, + "grad_norm": 0.13078735768795013, + "learning_rate": 0.0005666370286926847, + "loss": 2.6485, + "step": 15611 + }, + { + "epoch": 0.4629481362868072, + "grad_norm": 0.12540240585803986, + "learning_rate": 0.0005665903971680419, + "loss": 2.6863, + "step": 15612 + }, + { + "epoch": 0.4629777896391187, + "grad_norm": 0.1191975399851799, + "learning_rate": 0.0005665437650537297, + "loss": 2.7042, + "step": 15613 + }, + { + "epoch": 0.4630074429914302, + "grad_norm": 0.11702880263328552, + "learning_rate": 0.000566497132350161, + "loss": 2.6687, + "step": 15614 + }, + { + "epoch": 0.4630370963437417, + "grad_norm": 0.1173066720366478, + "learning_rate": 0.0005664504990577487, + "loss": 2.6943, + "step": 15615 + }, + { + "epoch": 0.46306674969605316, + "grad_norm": 0.12306632101535797, + "learning_rate": 0.000566403865176906, + "loss": 2.7219, + "step": 15616 + }, + { + "epoch": 0.46309640304836464, + "grad_norm": 0.1191793829202652, + "learning_rate": 0.0005663572307080455, + "loss": 2.6916, + "step": 15617 + }, + { + "epoch": 0.4631260564006761, + "grad_norm": 0.11248692125082016, + "learning_rate": 0.0005663105956515806, + "loss": 2.7022, + "step": 15618 + }, + { + "epoch": 0.4631557097529876, + "grad_norm": 0.11852660030126572, + "learning_rate": 0.0005662639600079238, + "loss": 2.65, + "step": 15619 + }, + { + "epoch": 0.46318536310529906, + "grad_norm": 0.1439061164855957, + "learning_rate": 0.0005662173237774883, + "loss": 2.6858, + "step": 15620 + }, + { + "epoch": 0.46321501645761054, + "grad_norm": 0.1292315274477005, + "learning_rate": 0.000566170686960687, + "loss": 2.7266, + "step": 15621 + }, + { + "epoch": 0.463244669809922, + "grad_norm": 0.11634072661399841, + "learning_rate": 0.0005661240495579329, + "loss": 2.6958, + "step": 15622 + }, + { + "epoch": 0.4632743231622335, + "grad_norm": 0.12643961608409882, + "learning_rate": 0.0005660774115696392, + "loss": 2.6865, + "step": 15623 + }, + { + "epoch": 0.46330397651454497, + "grad_norm": 0.14470958709716797, + "learning_rate": 0.0005660307729962184, + "loss": 2.6941, + "step": 15624 + }, + { + "epoch": 0.46333362986685644, + "grad_norm": 0.14464722573757172, + "learning_rate": 0.0005659841338380839, + "loss": 2.6921, + "step": 15625 + }, + { + "epoch": 0.4633632832191679, + "grad_norm": 0.12726330757141113, + "learning_rate": 0.0005659374940956485, + "loss": 2.6638, + "step": 15626 + }, + { + "epoch": 0.4633929365714794, + "grad_norm": 0.12220247834920883, + "learning_rate": 0.0005658908537693253, + "loss": 2.7071, + "step": 15627 + }, + { + "epoch": 0.4634225899237909, + "grad_norm": 0.12122119963169098, + "learning_rate": 0.0005658442128595273, + "loss": 2.6567, + "step": 15628 + }, + { + "epoch": 0.46345224327610235, + "grad_norm": 0.1412816345691681, + "learning_rate": 0.0005657975713666676, + "loss": 2.724, + "step": 15629 + }, + { + "epoch": 0.4634818966284138, + "grad_norm": 0.15885260701179504, + "learning_rate": 0.0005657509292911591, + "loss": 2.7035, + "step": 15630 + }, + { + "epoch": 0.4635115499807253, + "grad_norm": 0.13452036678791046, + "learning_rate": 0.0005657042866334147, + "loss": 2.6998, + "step": 15631 + }, + { + "epoch": 0.4635412033330368, + "grad_norm": 0.13724875450134277, + "learning_rate": 0.0005656576433938474, + "loss": 2.6808, + "step": 15632 + }, + { + "epoch": 0.46357085668534825, + "grad_norm": 0.13167992234230042, + "learning_rate": 0.0005656109995728708, + "loss": 2.6843, + "step": 15633 + }, + { + "epoch": 0.46360051003765973, + "grad_norm": 0.12771457433700562, + "learning_rate": 0.0005655643551708972, + "loss": 2.6904, + "step": 15634 + }, + { + "epoch": 0.46363016338997126, + "grad_norm": 0.1405685693025589, + "learning_rate": 0.0005655177101883402, + "loss": 2.6882, + "step": 15635 + }, + { + "epoch": 0.46365981674228274, + "grad_norm": 0.1519746333360672, + "learning_rate": 0.0005654710646256125, + "loss": 2.6844, + "step": 15636 + }, + { + "epoch": 0.4636894700945942, + "grad_norm": 0.14853918552398682, + "learning_rate": 0.0005654244184831273, + "loss": 2.6417, + "step": 15637 + }, + { + "epoch": 0.4637191234469057, + "grad_norm": 0.13984952867031097, + "learning_rate": 0.0005653777717612977, + "loss": 2.6875, + "step": 15638 + }, + { + "epoch": 0.46374877679921717, + "grad_norm": 0.13102932274341583, + "learning_rate": 0.0005653311244605367, + "loss": 2.6667, + "step": 15639 + }, + { + "epoch": 0.46377843015152864, + "grad_norm": 0.1499485820531845, + "learning_rate": 0.0005652844765812574, + "loss": 2.6634, + "step": 15640 + }, + { + "epoch": 0.4638080835038401, + "grad_norm": 0.12633712589740753, + "learning_rate": 0.0005652378281238728, + "loss": 2.7266, + "step": 15641 + }, + { + "epoch": 0.4638377368561516, + "grad_norm": 0.11846770346164703, + "learning_rate": 0.000565191179088796, + "loss": 2.6355, + "step": 15642 + }, + { + "epoch": 0.46386739020846307, + "grad_norm": 0.13732662796974182, + "learning_rate": 0.0005651445294764402, + "loss": 2.6803, + "step": 15643 + }, + { + "epoch": 0.46389704356077455, + "grad_norm": 0.13121627271175385, + "learning_rate": 0.0005650978792872183, + "loss": 2.6702, + "step": 15644 + }, + { + "epoch": 0.463926696913086, + "grad_norm": 0.13169103860855103, + "learning_rate": 0.0005650512285215437, + "loss": 2.7017, + "step": 15645 + }, + { + "epoch": 0.4639563502653975, + "grad_norm": 0.12328982353210449, + "learning_rate": 0.0005650045771798291, + "loss": 2.65, + "step": 15646 + }, + { + "epoch": 0.463986003617709, + "grad_norm": 0.1309627741575241, + "learning_rate": 0.0005649579252624879, + "loss": 2.6989, + "step": 15647 + }, + { + "epoch": 0.46401565697002045, + "grad_norm": 0.1257719099521637, + "learning_rate": 0.0005649112727699331, + "loss": 2.7066, + "step": 15648 + }, + { + "epoch": 0.4640453103223319, + "grad_norm": 0.1136118620634079, + "learning_rate": 0.000564864619702578, + "loss": 2.6654, + "step": 15649 + }, + { + "epoch": 0.4640749636746434, + "grad_norm": 0.11129003763198853, + "learning_rate": 0.0005648179660608355, + "loss": 2.6982, + "step": 15650 + }, + { + "epoch": 0.4641046170269549, + "grad_norm": 0.11831843852996826, + "learning_rate": 0.0005647713118451187, + "loss": 2.7151, + "step": 15651 + }, + { + "epoch": 0.46413427037926636, + "grad_norm": 0.1200881078839302, + "learning_rate": 0.0005647246570558407, + "loss": 2.6853, + "step": 15652 + }, + { + "epoch": 0.46416392373157783, + "grad_norm": 0.11714829504489899, + "learning_rate": 0.0005646780016934149, + "loss": 2.7065, + "step": 15653 + }, + { + "epoch": 0.4641935770838893, + "grad_norm": 0.11978410184383392, + "learning_rate": 0.0005646313457582543, + "loss": 2.6703, + "step": 15654 + }, + { + "epoch": 0.46422323043620084, + "grad_norm": 0.12495137006044388, + "learning_rate": 0.0005645846892507719, + "loss": 2.6818, + "step": 15655 + }, + { + "epoch": 0.4642528837885123, + "grad_norm": 0.12507042288780212, + "learning_rate": 0.0005645380321713812, + "loss": 2.7353, + "step": 15656 + }, + { + "epoch": 0.4642825371408238, + "grad_norm": 0.12639746069908142, + "learning_rate": 0.000564491374520495, + "loss": 2.6811, + "step": 15657 + }, + { + "epoch": 0.46431219049313527, + "grad_norm": 0.1275819092988968, + "learning_rate": 0.0005644447162985267, + "loss": 2.6786, + "step": 15658 + }, + { + "epoch": 0.46434184384544674, + "grad_norm": 0.11344682425260544, + "learning_rate": 0.0005643980575058893, + "loss": 2.6501, + "step": 15659 + }, + { + "epoch": 0.4643714971977582, + "grad_norm": 0.13412298262119293, + "learning_rate": 0.000564351398142996, + "loss": 2.668, + "step": 15660 + }, + { + "epoch": 0.4644011505500697, + "grad_norm": 0.14087021350860596, + "learning_rate": 0.0005643047382102601, + "loss": 2.6885, + "step": 15661 + }, + { + "epoch": 0.4644308039023812, + "grad_norm": 0.14166629314422607, + "learning_rate": 0.0005642580777080948, + "loss": 2.6602, + "step": 15662 + }, + { + "epoch": 0.46446045725469265, + "grad_norm": 0.11904051899909973, + "learning_rate": 0.0005642114166369131, + "loss": 2.6487, + "step": 15663 + }, + { + "epoch": 0.4644901106070041, + "grad_norm": 0.12554305791854858, + "learning_rate": 0.0005641647549971283, + "loss": 2.7231, + "step": 15664 + }, + { + "epoch": 0.4645197639593156, + "grad_norm": 0.1565365195274353, + "learning_rate": 0.0005641180927891535, + "loss": 2.6775, + "step": 15665 + }, + { + "epoch": 0.4645494173116271, + "grad_norm": 0.12798114120960236, + "learning_rate": 0.0005640714300134021, + "loss": 2.6567, + "step": 15666 + }, + { + "epoch": 0.46457907066393855, + "grad_norm": 0.1252363920211792, + "learning_rate": 0.0005640247666702871, + "loss": 2.6757, + "step": 15667 + }, + { + "epoch": 0.46460872401625003, + "grad_norm": 0.12799997627735138, + "learning_rate": 0.0005639781027602219, + "loss": 2.6874, + "step": 15668 + }, + { + "epoch": 0.4646383773685615, + "grad_norm": 0.1172819435596466, + "learning_rate": 0.0005639314382836196, + "loss": 2.6622, + "step": 15669 + }, + { + "epoch": 0.464668030720873, + "grad_norm": 0.11396484076976776, + "learning_rate": 0.0005638847732408934, + "loss": 2.6937, + "step": 15670 + }, + { + "epoch": 0.46469768407318446, + "grad_norm": 0.12442424148321152, + "learning_rate": 0.0005638381076324564, + "loss": 2.7168, + "step": 15671 + }, + { + "epoch": 0.46472733742549593, + "grad_norm": 0.13349302113056183, + "learning_rate": 0.0005637914414587222, + "loss": 2.6876, + "step": 15672 + }, + { + "epoch": 0.4647569907778074, + "grad_norm": 0.12854501605033875, + "learning_rate": 0.0005637447747201039, + "loss": 2.6702, + "step": 15673 + }, + { + "epoch": 0.4647866441301189, + "grad_norm": 0.12336437404155731, + "learning_rate": 0.0005636981074170146, + "loss": 2.7006, + "step": 15674 + }, + { + "epoch": 0.46481629748243036, + "grad_norm": 0.12302912771701813, + "learning_rate": 0.0005636514395498675, + "loss": 2.7255, + "step": 15675 + }, + { + "epoch": 0.4648459508347419, + "grad_norm": 0.12411647289991379, + "learning_rate": 0.000563604771119076, + "loss": 2.6702, + "step": 15676 + }, + { + "epoch": 0.46487560418705337, + "grad_norm": 0.12630054354667664, + "learning_rate": 0.0005635581021250536, + "loss": 2.72, + "step": 15677 + }, + { + "epoch": 0.46490525753936485, + "grad_norm": 0.1095905527472496, + "learning_rate": 0.0005635114325682131, + "loss": 2.6957, + "step": 15678 + }, + { + "epoch": 0.4649349108916763, + "grad_norm": 0.13173560798168182, + "learning_rate": 0.0005634647624489679, + "loss": 2.687, + "step": 15679 + }, + { + "epoch": 0.4649645642439878, + "grad_norm": 0.11913455277681351, + "learning_rate": 0.0005634180917677314, + "loss": 2.7174, + "step": 15680 + }, + { + "epoch": 0.4649942175962993, + "grad_norm": 0.10914632678031921, + "learning_rate": 0.0005633714205249168, + "loss": 2.6801, + "step": 15681 + }, + { + "epoch": 0.46502387094861075, + "grad_norm": 0.13287755846977234, + "learning_rate": 0.0005633247487209374, + "loss": 2.7183, + "step": 15682 + }, + { + "epoch": 0.4650535243009222, + "grad_norm": 0.11313576996326447, + "learning_rate": 0.0005632780763562067, + "loss": 2.6541, + "step": 15683 + }, + { + "epoch": 0.4650831776532337, + "grad_norm": 0.1284051388502121, + "learning_rate": 0.0005632314034311373, + "loss": 2.69, + "step": 15684 + }, + { + "epoch": 0.4651128310055452, + "grad_norm": 0.14550994336605072, + "learning_rate": 0.0005631847299461432, + "loss": 2.6955, + "step": 15685 + }, + { + "epoch": 0.46514248435785666, + "grad_norm": 0.14739802479743958, + "learning_rate": 0.0005631380559016376, + "loss": 2.6861, + "step": 15686 + }, + { + "epoch": 0.46517213771016813, + "grad_norm": 0.15820538997650146, + "learning_rate": 0.0005630913812980336, + "loss": 2.7011, + "step": 15687 + }, + { + "epoch": 0.4652017910624796, + "grad_norm": 0.12831993401050568, + "learning_rate": 0.0005630447061357447, + "loss": 2.688, + "step": 15688 + }, + { + "epoch": 0.4652314444147911, + "grad_norm": 0.14362552762031555, + "learning_rate": 0.0005629980304151839, + "loss": 2.7146, + "step": 15689 + }, + { + "epoch": 0.46526109776710256, + "grad_norm": 0.13141334056854248, + "learning_rate": 0.0005629513541367648, + "loss": 2.6584, + "step": 15690 + }, + { + "epoch": 0.46529075111941404, + "grad_norm": 0.15587839484214783, + "learning_rate": 0.0005629046773009005, + "loss": 2.6971, + "step": 15691 + }, + { + "epoch": 0.4653204044717255, + "grad_norm": 0.16832974553108215, + "learning_rate": 0.0005628579999080046, + "loss": 2.7211, + "step": 15692 + }, + { + "epoch": 0.465350057824037, + "grad_norm": 0.11887028068304062, + "learning_rate": 0.0005628113219584906, + "loss": 2.7203, + "step": 15693 + }, + { + "epoch": 0.46537971117634847, + "grad_norm": 0.1473207324743271, + "learning_rate": 0.0005627646434527713, + "loss": 2.704, + "step": 15694 + }, + { + "epoch": 0.46540936452865994, + "grad_norm": 0.12738488614559174, + "learning_rate": 0.0005627179643912603, + "loss": 2.6873, + "step": 15695 + }, + { + "epoch": 0.4654390178809714, + "grad_norm": 0.1235371008515358, + "learning_rate": 0.000562671284774371, + "loss": 2.6653, + "step": 15696 + }, + { + "epoch": 0.46546867123328295, + "grad_norm": 0.12092168629169464, + "learning_rate": 0.0005626246046025167, + "loss": 2.6924, + "step": 15697 + }, + { + "epoch": 0.4654983245855944, + "grad_norm": 0.11509603261947632, + "learning_rate": 0.0005625779238761107, + "loss": 2.6789, + "step": 15698 + }, + { + "epoch": 0.4655279779379059, + "grad_norm": 0.12346048653125763, + "learning_rate": 0.0005625312425955667, + "loss": 2.7027, + "step": 15699 + }, + { + "epoch": 0.4655576312902174, + "grad_norm": 0.11129086464643478, + "learning_rate": 0.0005624845607612976, + "loss": 2.6552, + "step": 15700 + }, + { + "epoch": 0.46558728464252885, + "grad_norm": 0.12854766845703125, + "learning_rate": 0.000562437878373717, + "loss": 2.6911, + "step": 15701 + }, + { + "epoch": 0.46561693799484033, + "grad_norm": 0.1304795891046524, + "learning_rate": 0.0005623911954332384, + "loss": 2.6695, + "step": 15702 + }, + { + "epoch": 0.4656465913471518, + "grad_norm": 0.12648044526576996, + "learning_rate": 0.0005623445119402748, + "loss": 2.6657, + "step": 15703 + }, + { + "epoch": 0.4656762446994633, + "grad_norm": 0.12544658780097961, + "learning_rate": 0.0005622978278952401, + "loss": 2.6785, + "step": 15704 + }, + { + "epoch": 0.46570589805177476, + "grad_norm": 0.15962688624858856, + "learning_rate": 0.0005622511432985473, + "loss": 2.6803, + "step": 15705 + }, + { + "epoch": 0.46573555140408623, + "grad_norm": 0.10862820595502853, + "learning_rate": 0.0005622044581506101, + "loss": 2.6704, + "step": 15706 + }, + { + "epoch": 0.4657652047563977, + "grad_norm": 0.11625996232032776, + "learning_rate": 0.0005621577724518415, + "loss": 2.6866, + "step": 15707 + }, + { + "epoch": 0.4657948581087092, + "grad_norm": 0.1287102997303009, + "learning_rate": 0.0005621110862026553, + "loss": 2.6514, + "step": 15708 + }, + { + "epoch": 0.46582451146102066, + "grad_norm": 0.1136668473482132, + "learning_rate": 0.0005620643994034648, + "loss": 2.6683, + "step": 15709 + }, + { + "epoch": 0.46585416481333214, + "grad_norm": 0.12183310836553574, + "learning_rate": 0.0005620177120546833, + "loss": 2.6863, + "step": 15710 + }, + { + "epoch": 0.4658838181656436, + "grad_norm": 0.11470190435647964, + "learning_rate": 0.0005619710241567244, + "loss": 2.7255, + "step": 15711 + }, + { + "epoch": 0.4659134715179551, + "grad_norm": 0.10462626814842224, + "learning_rate": 0.0005619243357100014, + "loss": 2.6734, + "step": 15712 + }, + { + "epoch": 0.46594312487026657, + "grad_norm": 0.10835868865251541, + "learning_rate": 0.0005618776467149278, + "loss": 2.7067, + "step": 15713 + }, + { + "epoch": 0.46597277822257804, + "grad_norm": 0.1162109524011612, + "learning_rate": 0.0005618309571719171, + "loss": 2.6927, + "step": 15714 + }, + { + "epoch": 0.4660024315748895, + "grad_norm": 0.11973553150892258, + "learning_rate": 0.0005617842670813825, + "loss": 2.6868, + "step": 15715 + }, + { + "epoch": 0.466032084927201, + "grad_norm": 0.12054307013750076, + "learning_rate": 0.0005617375764437376, + "loss": 2.7093, + "step": 15716 + }, + { + "epoch": 0.46606173827951247, + "grad_norm": 0.11162158846855164, + "learning_rate": 0.000561690885259396, + "loss": 2.6821, + "step": 15717 + }, + { + "epoch": 0.466091391631824, + "grad_norm": 0.10671168565750122, + "learning_rate": 0.0005616441935287708, + "loss": 2.6715, + "step": 15718 + }, + { + "epoch": 0.4661210449841355, + "grad_norm": 0.10967544466257095, + "learning_rate": 0.000561597501252276, + "loss": 2.677, + "step": 15719 + }, + { + "epoch": 0.46615069833644696, + "grad_norm": 0.1178821474313736, + "learning_rate": 0.0005615508084303245, + "loss": 2.7042, + "step": 15720 + }, + { + "epoch": 0.46618035168875843, + "grad_norm": 0.11392955482006073, + "learning_rate": 0.0005615041150633302, + "loss": 2.6763, + "step": 15721 + }, + { + "epoch": 0.4662100050410699, + "grad_norm": 0.13125479221343994, + "learning_rate": 0.0005614574211517064, + "loss": 2.6769, + "step": 15722 + }, + { + "epoch": 0.4662396583933814, + "grad_norm": 0.13822387158870697, + "learning_rate": 0.0005614107266958664, + "loss": 2.6777, + "step": 15723 + }, + { + "epoch": 0.46626931174569286, + "grad_norm": 0.15120083093643188, + "learning_rate": 0.000561364031696224, + "loss": 2.7133, + "step": 15724 + }, + { + "epoch": 0.46629896509800434, + "grad_norm": 0.12785570323467255, + "learning_rate": 0.0005613173361531925, + "loss": 2.7043, + "step": 15725 + }, + { + "epoch": 0.4663286184503158, + "grad_norm": 0.10877612978219986, + "learning_rate": 0.0005612706400671857, + "loss": 2.6624, + "step": 15726 + }, + { + "epoch": 0.4663582718026273, + "grad_norm": 0.12084439396858215, + "learning_rate": 0.0005612239434386166, + "loss": 2.6666, + "step": 15727 + }, + { + "epoch": 0.46638792515493877, + "grad_norm": 0.13188622891902924, + "learning_rate": 0.000561177246267899, + "loss": 2.6733, + "step": 15728 + }, + { + "epoch": 0.46641757850725024, + "grad_norm": 0.12284345924854279, + "learning_rate": 0.0005611305485554465, + "loss": 2.6626, + "step": 15729 + }, + { + "epoch": 0.4664472318595617, + "grad_norm": 0.1289760023355484, + "learning_rate": 0.0005610838503016723, + "loss": 2.6956, + "step": 15730 + }, + { + "epoch": 0.4664768852118732, + "grad_norm": 0.13158005475997925, + "learning_rate": 0.0005610371515069903, + "loss": 2.6798, + "step": 15731 + }, + { + "epoch": 0.46650653856418467, + "grad_norm": 0.12868571281433105, + "learning_rate": 0.0005609904521718139, + "loss": 2.6786, + "step": 15732 + }, + { + "epoch": 0.46653619191649615, + "grad_norm": 0.12253676354885101, + "learning_rate": 0.0005609437522965565, + "loss": 2.7105, + "step": 15733 + }, + { + "epoch": 0.4665658452688076, + "grad_norm": 0.13514935970306396, + "learning_rate": 0.0005608970518816317, + "loss": 2.707, + "step": 15734 + }, + { + "epoch": 0.4665954986211191, + "grad_norm": 0.12251254916191101, + "learning_rate": 0.000560850350927453, + "loss": 2.6661, + "step": 15735 + }, + { + "epoch": 0.4666251519734306, + "grad_norm": 0.11701387912034988, + "learning_rate": 0.000560803649434434, + "loss": 2.7072, + "step": 15736 + }, + { + "epoch": 0.46665480532574205, + "grad_norm": 0.10821178555488586, + "learning_rate": 0.0005607569474029883, + "loss": 2.6913, + "step": 15737 + }, + { + "epoch": 0.4666844586780535, + "grad_norm": 0.11557308584451675, + "learning_rate": 0.0005607102448335294, + "loss": 2.6933, + "step": 15738 + }, + { + "epoch": 0.46671411203036506, + "grad_norm": 0.11541730165481567, + "learning_rate": 0.000560663541726471, + "loss": 2.7003, + "step": 15739 + }, + { + "epoch": 0.46674376538267653, + "grad_norm": 0.12528710067272186, + "learning_rate": 0.0005606168380822263, + "loss": 2.6925, + "step": 15740 + }, + { + "epoch": 0.466773418734988, + "grad_norm": 0.13931839168071747, + "learning_rate": 0.0005605701339012092, + "loss": 2.6868, + "step": 15741 + }, + { + "epoch": 0.4668030720872995, + "grad_norm": 0.1354435682296753, + "learning_rate": 0.0005605234291838331, + "loss": 2.6843, + "step": 15742 + }, + { + "epoch": 0.46683272543961096, + "grad_norm": 0.11903202533721924, + "learning_rate": 0.0005604767239305116, + "loss": 2.6446, + "step": 15743 + }, + { + "epoch": 0.46686237879192244, + "grad_norm": 0.12991930544376373, + "learning_rate": 0.0005604300181416585, + "loss": 2.7272, + "step": 15744 + }, + { + "epoch": 0.4668920321442339, + "grad_norm": 0.12773482501506805, + "learning_rate": 0.0005603833118176871, + "loss": 2.7313, + "step": 15745 + }, + { + "epoch": 0.4669216854965454, + "grad_norm": 0.1365203708410263, + "learning_rate": 0.0005603366049590112, + "loss": 2.6717, + "step": 15746 + }, + { + "epoch": 0.46695133884885687, + "grad_norm": 0.13729587197303772, + "learning_rate": 0.0005602898975660442, + "loss": 2.6871, + "step": 15747 + }, + { + "epoch": 0.46698099220116834, + "grad_norm": 0.13659627735614777, + "learning_rate": 0.0005602431896391998, + "loss": 2.6766, + "step": 15748 + }, + { + "epoch": 0.4670106455534798, + "grad_norm": 0.1484692543745041, + "learning_rate": 0.0005601964811788918, + "loss": 2.7056, + "step": 15749 + }, + { + "epoch": 0.4670402989057913, + "grad_norm": 0.1418425589799881, + "learning_rate": 0.0005601497721855334, + "loss": 2.682, + "step": 15750 + }, + { + "epoch": 0.46706995225810277, + "grad_norm": 0.13770386576652527, + "learning_rate": 0.0005601030626595386, + "loss": 2.643, + "step": 15751 + }, + { + "epoch": 0.46709960561041425, + "grad_norm": 0.13950738310813904, + "learning_rate": 0.0005600563526013207, + "loss": 2.695, + "step": 15752 + }, + { + "epoch": 0.4671292589627257, + "grad_norm": 0.13679872453212738, + "learning_rate": 0.0005600096420112937, + "loss": 2.7073, + "step": 15753 + }, + { + "epoch": 0.4671589123150372, + "grad_norm": 0.1279759556055069, + "learning_rate": 0.0005599629308898709, + "loss": 2.701, + "step": 15754 + }, + { + "epoch": 0.4671885656673487, + "grad_norm": 0.14481458067893982, + "learning_rate": 0.000559916219237466, + "loss": 2.7107, + "step": 15755 + }, + { + "epoch": 0.46721821901966015, + "grad_norm": 0.13605688512325287, + "learning_rate": 0.0005598695070544929, + "loss": 2.6745, + "step": 15756 + }, + { + "epoch": 0.46724787237197163, + "grad_norm": 0.11383494734764099, + "learning_rate": 0.0005598227943413648, + "loss": 2.724, + "step": 15757 + }, + { + "epoch": 0.4672775257242831, + "grad_norm": 0.10438815504312515, + "learning_rate": 0.0005597760810984957, + "loss": 2.6797, + "step": 15758 + }, + { + "epoch": 0.46730717907659464, + "grad_norm": 0.11315164715051651, + "learning_rate": 0.0005597293673262992, + "loss": 2.6773, + "step": 15759 + }, + { + "epoch": 0.4673368324289061, + "grad_norm": 0.12048520147800446, + "learning_rate": 0.0005596826530251889, + "loss": 2.6616, + "step": 15760 + }, + { + "epoch": 0.4673664857812176, + "grad_norm": 0.15471887588500977, + "learning_rate": 0.0005596359381955784, + "loss": 2.7067, + "step": 15761 + }, + { + "epoch": 0.46739613913352906, + "grad_norm": 0.12416849285364151, + "learning_rate": 0.0005595892228378813, + "loss": 2.7078, + "step": 15762 + }, + { + "epoch": 0.46742579248584054, + "grad_norm": 0.12219968438148499, + "learning_rate": 0.0005595425069525115, + "loss": 2.7045, + "step": 15763 + }, + { + "epoch": 0.467455445838152, + "grad_norm": 0.11899371445178986, + "learning_rate": 0.0005594957905398826, + "loss": 2.7221, + "step": 15764 + }, + { + "epoch": 0.4674850991904635, + "grad_norm": 0.11957992613315582, + "learning_rate": 0.0005594490736004083, + "loss": 2.68, + "step": 15765 + }, + { + "epoch": 0.46751475254277497, + "grad_norm": 0.12633159756660461, + "learning_rate": 0.0005594023561345023, + "loss": 2.7034, + "step": 15766 + }, + { + "epoch": 0.46754440589508645, + "grad_norm": 0.12037906795740128, + "learning_rate": 0.0005593556381425782, + "loss": 2.7164, + "step": 15767 + }, + { + "epoch": 0.4675740592473979, + "grad_norm": 0.10852665454149246, + "learning_rate": 0.0005593089196250495, + "loss": 2.7048, + "step": 15768 + }, + { + "epoch": 0.4676037125997094, + "grad_norm": 0.11574019491672516, + "learning_rate": 0.0005592622005823303, + "loss": 2.7107, + "step": 15769 + }, + { + "epoch": 0.4676333659520209, + "grad_norm": 0.11949644982814789, + "learning_rate": 0.0005592154810148344, + "loss": 2.6823, + "step": 15770 + }, + { + "epoch": 0.46766301930433235, + "grad_norm": 0.11401154845952988, + "learning_rate": 0.000559168760922975, + "loss": 2.6988, + "step": 15771 + }, + { + "epoch": 0.4676926726566438, + "grad_norm": 0.1182461827993393, + "learning_rate": 0.000559122040307166, + "loss": 2.6755, + "step": 15772 + }, + { + "epoch": 0.4677223260089553, + "grad_norm": 0.11639250069856644, + "learning_rate": 0.0005590753191678213, + "loss": 2.6995, + "step": 15773 + }, + { + "epoch": 0.4677519793612668, + "grad_norm": 0.12154679745435715, + "learning_rate": 0.0005590285975053545, + "loss": 2.6608, + "step": 15774 + }, + { + "epoch": 0.46778163271357825, + "grad_norm": 0.11261550337076187, + "learning_rate": 0.0005589818753201792, + "loss": 2.7237, + "step": 15775 + }, + { + "epoch": 0.46781128606588973, + "grad_norm": 0.11138519644737244, + "learning_rate": 0.0005589351526127096, + "loss": 2.6994, + "step": 15776 + }, + { + "epoch": 0.4678409394182012, + "grad_norm": 0.11292453855276108, + "learning_rate": 0.0005588884293833588, + "loss": 2.698, + "step": 15777 + }, + { + "epoch": 0.4678705927705127, + "grad_norm": 0.13065707683563232, + "learning_rate": 0.000558841705632541, + "loss": 2.6743, + "step": 15778 + }, + { + "epoch": 0.46790024612282416, + "grad_norm": 0.14375139772891998, + "learning_rate": 0.0005587949813606696, + "loss": 2.7052, + "step": 15779 + }, + { + "epoch": 0.4679298994751357, + "grad_norm": 0.1431117057800293, + "learning_rate": 0.0005587482565681587, + "loss": 2.6882, + "step": 15780 + }, + { + "epoch": 0.46795955282744717, + "grad_norm": 0.1409274786710739, + "learning_rate": 0.000558701531255422, + "loss": 2.6912, + "step": 15781 + }, + { + "epoch": 0.46798920617975864, + "grad_norm": 0.11661559343338013, + "learning_rate": 0.000558654805422873, + "loss": 2.7017, + "step": 15782 + }, + { + "epoch": 0.4680188595320701, + "grad_norm": 0.12566344439983368, + "learning_rate": 0.0005586080790709257, + "loss": 2.6869, + "step": 15783 + }, + { + "epoch": 0.4680485128843816, + "grad_norm": 0.12558791041374207, + "learning_rate": 0.0005585613521999938, + "loss": 2.7104, + "step": 15784 + }, + { + "epoch": 0.46807816623669307, + "grad_norm": 0.12090504914522171, + "learning_rate": 0.0005585146248104911, + "loss": 2.6854, + "step": 15785 + }, + { + "epoch": 0.46810781958900455, + "grad_norm": 0.13365590572357178, + "learning_rate": 0.0005584678969028313, + "loss": 2.7255, + "step": 15786 + }, + { + "epoch": 0.468137472941316, + "grad_norm": 0.11646012961864471, + "learning_rate": 0.0005584211684774283, + "loss": 2.6821, + "step": 15787 + }, + { + "epoch": 0.4681671262936275, + "grad_norm": 0.13337084650993347, + "learning_rate": 0.0005583744395346957, + "loss": 2.7101, + "step": 15788 + }, + { + "epoch": 0.468196779645939, + "grad_norm": 0.14541804790496826, + "learning_rate": 0.0005583277100750475, + "loss": 2.6772, + "step": 15789 + }, + { + "epoch": 0.46822643299825045, + "grad_norm": 0.12329237163066864, + "learning_rate": 0.0005582809800988974, + "loss": 2.697, + "step": 15790 + }, + { + "epoch": 0.46825608635056193, + "grad_norm": 0.1262902468442917, + "learning_rate": 0.0005582342496066592, + "loss": 2.682, + "step": 15791 + }, + { + "epoch": 0.4682857397028734, + "grad_norm": 0.1136397272348404, + "learning_rate": 0.000558187518598747, + "loss": 2.7024, + "step": 15792 + }, + { + "epoch": 0.4683153930551849, + "grad_norm": 0.1330835372209549, + "learning_rate": 0.000558140787075574, + "loss": 2.6931, + "step": 15793 + }, + { + "epoch": 0.46834504640749636, + "grad_norm": 0.1288449764251709, + "learning_rate": 0.0005580940550375543, + "loss": 2.7006, + "step": 15794 + }, + { + "epoch": 0.46837469975980783, + "grad_norm": 0.13191847503185272, + "learning_rate": 0.0005580473224851019, + "loss": 2.6903, + "step": 15795 + }, + { + "epoch": 0.4684043531121193, + "grad_norm": 0.12884576618671417, + "learning_rate": 0.0005580005894186306, + "loss": 2.6778, + "step": 15796 + }, + { + "epoch": 0.4684340064644308, + "grad_norm": 0.11049173027276993, + "learning_rate": 0.000557953855838554, + "loss": 2.7117, + "step": 15797 + }, + { + "epoch": 0.46846365981674226, + "grad_norm": 0.1140960156917572, + "learning_rate": 0.0005579071217452862, + "loss": 2.6989, + "step": 15798 + }, + { + "epoch": 0.46849331316905374, + "grad_norm": 0.12122656404972076, + "learning_rate": 0.0005578603871392408, + "loss": 2.6584, + "step": 15799 + }, + { + "epoch": 0.4685229665213652, + "grad_norm": 0.11570771783590317, + "learning_rate": 0.0005578136520208315, + "loss": 2.6952, + "step": 15800 + }, + { + "epoch": 0.46855261987367675, + "grad_norm": 0.12924204766750336, + "learning_rate": 0.0005577669163904727, + "loss": 2.7027, + "step": 15801 + }, + { + "epoch": 0.4685822732259882, + "grad_norm": 0.1238047257065773, + "learning_rate": 0.0005577201802485779, + "loss": 2.6996, + "step": 15802 + }, + { + "epoch": 0.4686119265782997, + "grad_norm": 0.1250598132610321, + "learning_rate": 0.0005576734435955611, + "loss": 2.7249, + "step": 15803 + }, + { + "epoch": 0.4686415799306112, + "grad_norm": 0.11565719544887543, + "learning_rate": 0.0005576267064318359, + "loss": 2.6716, + "step": 15804 + }, + { + "epoch": 0.46867123328292265, + "grad_norm": 0.11092893034219742, + "learning_rate": 0.0005575799687578163, + "loss": 2.7119, + "step": 15805 + }, + { + "epoch": 0.4687008866352341, + "grad_norm": 0.14433348178863525, + "learning_rate": 0.0005575332305739162, + "loss": 2.6828, + "step": 15806 + }, + { + "epoch": 0.4687305399875456, + "grad_norm": 0.1505097895860672, + "learning_rate": 0.0005574864918805494, + "loss": 2.6899, + "step": 15807 + }, + { + "epoch": 0.4687601933398571, + "grad_norm": 0.12555912137031555, + "learning_rate": 0.0005574397526781301, + "loss": 2.6813, + "step": 15808 + }, + { + "epoch": 0.46878984669216855, + "grad_norm": 0.14777985215187073, + "learning_rate": 0.0005573930129670716, + "loss": 2.6923, + "step": 15809 + }, + { + "epoch": 0.46881950004448003, + "grad_norm": 0.1382037252187729, + "learning_rate": 0.0005573462727477883, + "loss": 2.6969, + "step": 15810 + }, + { + "epoch": 0.4688491533967915, + "grad_norm": 0.12804976105690002, + "learning_rate": 0.000557299532020694, + "loss": 2.6938, + "step": 15811 + }, + { + "epoch": 0.468878806749103, + "grad_norm": 0.1347881555557251, + "learning_rate": 0.0005572527907862024, + "loss": 2.6614, + "step": 15812 + }, + { + "epoch": 0.46890846010141446, + "grad_norm": 0.14964520931243896, + "learning_rate": 0.0005572060490447275, + "loss": 2.6882, + "step": 15813 + }, + { + "epoch": 0.46893811345372594, + "grad_norm": 0.1240617036819458, + "learning_rate": 0.0005571593067966832, + "loss": 2.6862, + "step": 15814 + }, + { + "epoch": 0.4689677668060374, + "grad_norm": 0.11453460901975632, + "learning_rate": 0.0005571125640424835, + "loss": 2.663, + "step": 15815 + }, + { + "epoch": 0.4689974201583489, + "grad_norm": 0.1276465505361557, + "learning_rate": 0.0005570658207825423, + "loss": 2.7094, + "step": 15816 + }, + { + "epoch": 0.46902707351066036, + "grad_norm": 0.12765420973300934, + "learning_rate": 0.0005570190770172733, + "loss": 2.6912, + "step": 15817 + }, + { + "epoch": 0.46905672686297184, + "grad_norm": 0.11303291469812393, + "learning_rate": 0.0005569723327470907, + "loss": 2.6658, + "step": 15818 + }, + { + "epoch": 0.4690863802152833, + "grad_norm": 0.1254103183746338, + "learning_rate": 0.0005569255879724082, + "loss": 2.6802, + "step": 15819 + }, + { + "epoch": 0.4691160335675948, + "grad_norm": 0.10625496506690979, + "learning_rate": 0.0005568788426936399, + "loss": 2.6504, + "step": 15820 + }, + { + "epoch": 0.46914568691990627, + "grad_norm": 0.11790056526660919, + "learning_rate": 0.0005568320969111997, + "loss": 2.6712, + "step": 15821 + }, + { + "epoch": 0.4691753402722178, + "grad_norm": 0.1339646726846695, + "learning_rate": 0.0005567853506255016, + "loss": 2.6947, + "step": 15822 + }, + { + "epoch": 0.4692049936245293, + "grad_norm": 0.13268089294433594, + "learning_rate": 0.0005567386038369594, + "loss": 2.7058, + "step": 15823 + }, + { + "epoch": 0.46923464697684075, + "grad_norm": 0.10987760126590729, + "learning_rate": 0.0005566918565459871, + "loss": 2.6865, + "step": 15824 + }, + { + "epoch": 0.46926430032915223, + "grad_norm": 0.10869105160236359, + "learning_rate": 0.0005566451087529988, + "loss": 2.6547, + "step": 15825 + }, + { + "epoch": 0.4692939536814637, + "grad_norm": 0.1294746845960617, + "learning_rate": 0.0005565983604584082, + "loss": 2.6875, + "step": 15826 + }, + { + "epoch": 0.4693236070337752, + "grad_norm": 0.13401541113853455, + "learning_rate": 0.0005565516116626294, + "loss": 2.6956, + "step": 15827 + }, + { + "epoch": 0.46935326038608666, + "grad_norm": 0.1307556927204132, + "learning_rate": 0.0005565048623660764, + "loss": 2.6667, + "step": 15828 + }, + { + "epoch": 0.46938291373839813, + "grad_norm": 0.12403254956007004, + "learning_rate": 0.0005564581125691632, + "loss": 2.6891, + "step": 15829 + }, + { + "epoch": 0.4694125670907096, + "grad_norm": 0.13555008172988892, + "learning_rate": 0.0005564113622723038, + "loss": 2.7095, + "step": 15830 + }, + { + "epoch": 0.4694422204430211, + "grad_norm": 0.12803888320922852, + "learning_rate": 0.000556364611475912, + "loss": 2.6915, + "step": 15831 + }, + { + "epoch": 0.46947187379533256, + "grad_norm": 0.11680880934000015, + "learning_rate": 0.0005563178601804019, + "loss": 2.684, + "step": 15832 + }, + { + "epoch": 0.46950152714764404, + "grad_norm": 0.12466494739055634, + "learning_rate": 0.0005562711083861873, + "loss": 2.7009, + "step": 15833 + }, + { + "epoch": 0.4695311804999555, + "grad_norm": 0.12204070389270782, + "learning_rate": 0.0005562243560936824, + "loss": 2.6755, + "step": 15834 + }, + { + "epoch": 0.469560833852267, + "grad_norm": 0.1234201192855835, + "learning_rate": 0.0005561776033033016, + "loss": 2.6935, + "step": 15835 + }, + { + "epoch": 0.46959048720457847, + "grad_norm": 0.11698756366968155, + "learning_rate": 0.0005561308500154581, + "loss": 2.6783, + "step": 15836 + }, + { + "epoch": 0.46962014055688994, + "grad_norm": 0.13312788307666779, + "learning_rate": 0.0005560840962305665, + "loss": 2.6841, + "step": 15837 + }, + { + "epoch": 0.4696497939092014, + "grad_norm": 0.13351771235466003, + "learning_rate": 0.0005560373419490405, + "loss": 2.6893, + "step": 15838 + }, + { + "epoch": 0.4696794472615129, + "grad_norm": 0.11358878016471863, + "learning_rate": 0.000555990587171294, + "loss": 2.6996, + "step": 15839 + }, + { + "epoch": 0.46970910061382437, + "grad_norm": 0.11830619722604752, + "learning_rate": 0.0005559438318977416, + "loss": 2.6894, + "step": 15840 + }, + { + "epoch": 0.46973875396613585, + "grad_norm": 0.13577032089233398, + "learning_rate": 0.0005558970761287968, + "loss": 2.666, + "step": 15841 + }, + { + "epoch": 0.4697684073184473, + "grad_norm": 0.13934066891670227, + "learning_rate": 0.0005558503198648737, + "loss": 2.6508, + "step": 15842 + }, + { + "epoch": 0.46979806067075885, + "grad_norm": 0.12389899790287018, + "learning_rate": 0.0005558035631063866, + "loss": 2.6826, + "step": 15843 + }, + { + "epoch": 0.46982771402307033, + "grad_norm": 0.12852200865745544, + "learning_rate": 0.0005557568058537493, + "loss": 2.6886, + "step": 15844 + }, + { + "epoch": 0.4698573673753818, + "grad_norm": 0.11483006179332733, + "learning_rate": 0.0005557100481073759, + "loss": 2.6817, + "step": 15845 + }, + { + "epoch": 0.4698870207276933, + "grad_norm": 0.13399606943130493, + "learning_rate": 0.0005556632898676806, + "loss": 2.6538, + "step": 15846 + }, + { + "epoch": 0.46991667408000476, + "grad_norm": 0.13635282218456268, + "learning_rate": 0.0005556165311350772, + "loss": 2.6969, + "step": 15847 + }, + { + "epoch": 0.46994632743231624, + "grad_norm": 0.14093990623950958, + "learning_rate": 0.0005555697719099797, + "loss": 2.6855, + "step": 15848 + }, + { + "epoch": 0.4699759807846277, + "grad_norm": 0.13618971407413483, + "learning_rate": 0.0005555230121928026, + "loss": 2.7095, + "step": 15849 + }, + { + "epoch": 0.4700056341369392, + "grad_norm": 0.14558148384094238, + "learning_rate": 0.0005554762519839596, + "loss": 2.7215, + "step": 15850 + }, + { + "epoch": 0.47003528748925066, + "grad_norm": 0.11544130742549896, + "learning_rate": 0.0005554294912838648, + "loss": 2.6653, + "step": 15851 + }, + { + "epoch": 0.47006494084156214, + "grad_norm": 0.1113428995013237, + "learning_rate": 0.0005553827300929324, + "loss": 2.6951, + "step": 15852 + }, + { + "epoch": 0.4700945941938736, + "grad_norm": 0.10483714938163757, + "learning_rate": 0.0005553359684115764, + "loss": 2.6554, + "step": 15853 + }, + { + "epoch": 0.4701242475461851, + "grad_norm": 0.12558895349502563, + "learning_rate": 0.0005552892062402109, + "loss": 2.7047, + "step": 15854 + }, + { + "epoch": 0.47015390089849657, + "grad_norm": 0.15001076459884644, + "learning_rate": 0.00055524244357925, + "loss": 2.6826, + "step": 15855 + }, + { + "epoch": 0.47018355425080804, + "grad_norm": 0.13896873593330383, + "learning_rate": 0.0005551956804291079, + "loss": 2.6976, + "step": 15856 + }, + { + "epoch": 0.4702132076031195, + "grad_norm": 0.1168035939335823, + "learning_rate": 0.0005551489167901984, + "loss": 2.6612, + "step": 15857 + }, + { + "epoch": 0.470242860955431, + "grad_norm": 0.13256138563156128, + "learning_rate": 0.0005551021526629359, + "loss": 2.6615, + "step": 15858 + }, + { + "epoch": 0.4702725143077425, + "grad_norm": 0.136540949344635, + "learning_rate": 0.0005550553880477343, + "loss": 2.6871, + "step": 15859 + }, + { + "epoch": 0.47030216766005395, + "grad_norm": 0.13351230323314667, + "learning_rate": 0.0005550086229450077, + "loss": 2.6837, + "step": 15860 + }, + { + "epoch": 0.4703318210123654, + "grad_norm": 0.12117020040750504, + "learning_rate": 0.0005549618573551704, + "loss": 2.7375, + "step": 15861 + }, + { + "epoch": 0.4703614743646769, + "grad_norm": 0.1183413490653038, + "learning_rate": 0.0005549150912786365, + "loss": 2.7083, + "step": 15862 + }, + { + "epoch": 0.47039112771698843, + "grad_norm": 0.15782862901687622, + "learning_rate": 0.00055486832471582, + "loss": 2.6582, + "step": 15863 + }, + { + "epoch": 0.4704207810692999, + "grad_norm": 0.1454073041677475, + "learning_rate": 0.0005548215576671352, + "loss": 2.6484, + "step": 15864 + }, + { + "epoch": 0.4704504344216114, + "grad_norm": 0.16235136985778809, + "learning_rate": 0.0005547747901329957, + "loss": 2.6656, + "step": 15865 + }, + { + "epoch": 0.47048008777392286, + "grad_norm": 0.14535854756832123, + "learning_rate": 0.0005547280221138162, + "loss": 2.7113, + "step": 15866 + }, + { + "epoch": 0.47050974112623434, + "grad_norm": 0.13119357824325562, + "learning_rate": 0.0005546812536100107, + "loss": 2.681, + "step": 15867 + }, + { + "epoch": 0.4705393944785458, + "grad_norm": 0.13298049569129944, + "learning_rate": 0.0005546344846219936, + "loss": 2.6804, + "step": 15868 + }, + { + "epoch": 0.4705690478308573, + "grad_norm": 0.14568740129470825, + "learning_rate": 0.0005545877151501785, + "loss": 2.6858, + "step": 15869 + }, + { + "epoch": 0.47059870118316877, + "grad_norm": 0.1337296962738037, + "learning_rate": 0.0005545409451949798, + "loss": 2.6919, + "step": 15870 + }, + { + "epoch": 0.47062835453548024, + "grad_norm": 0.11636729538440704, + "learning_rate": 0.0005544941747568118, + "loss": 2.6695, + "step": 15871 + }, + { + "epoch": 0.4706580078877917, + "grad_norm": 0.12907157838344574, + "learning_rate": 0.0005544474038360882, + "loss": 2.6743, + "step": 15872 + }, + { + "epoch": 0.4706876612401032, + "grad_norm": 0.12961646914482117, + "learning_rate": 0.0005544006324332238, + "loss": 2.6955, + "step": 15873 + }, + { + "epoch": 0.47071731459241467, + "grad_norm": 0.1192798912525177, + "learning_rate": 0.0005543538605486323, + "loss": 2.6819, + "step": 15874 + }, + { + "epoch": 0.47074696794472615, + "grad_norm": 0.12206538766622543, + "learning_rate": 0.0005543070881827282, + "loss": 2.7259, + "step": 15875 + }, + { + "epoch": 0.4707766212970376, + "grad_norm": 0.12356381863355637, + "learning_rate": 0.0005542603153359253, + "loss": 2.6586, + "step": 15876 + }, + { + "epoch": 0.4708062746493491, + "grad_norm": 0.11801353096961975, + "learning_rate": 0.0005542135420086383, + "loss": 2.687, + "step": 15877 + }, + { + "epoch": 0.4708359280016606, + "grad_norm": 0.11606782674789429, + "learning_rate": 0.0005541667682012807, + "loss": 2.6726, + "step": 15878 + }, + { + "epoch": 0.47086558135397205, + "grad_norm": 0.10806920379400253, + "learning_rate": 0.0005541199939142673, + "loss": 2.6965, + "step": 15879 + }, + { + "epoch": 0.4708952347062835, + "grad_norm": 0.12080247700214386, + "learning_rate": 0.0005540732191480121, + "loss": 2.7026, + "step": 15880 + }, + { + "epoch": 0.470924888058595, + "grad_norm": 0.11887671798467636, + "learning_rate": 0.0005540264439029292, + "loss": 2.7069, + "step": 15881 + }, + { + "epoch": 0.4709545414109065, + "grad_norm": 0.11139769107103348, + "learning_rate": 0.0005539796681794329, + "loss": 2.6645, + "step": 15882 + }, + { + "epoch": 0.47098419476321796, + "grad_norm": 0.12399216741323471, + "learning_rate": 0.0005539328919779373, + "loss": 2.6962, + "step": 15883 + }, + { + "epoch": 0.4710138481155295, + "grad_norm": 0.10921774804592133, + "learning_rate": 0.0005538861152988567, + "loss": 2.669, + "step": 15884 + }, + { + "epoch": 0.47104350146784096, + "grad_norm": 0.10626573860645294, + "learning_rate": 0.0005538393381426054, + "loss": 2.6734, + "step": 15885 + }, + { + "epoch": 0.47107315482015244, + "grad_norm": 0.11125999689102173, + "learning_rate": 0.0005537925605095974, + "loss": 2.7004, + "step": 15886 + }, + { + "epoch": 0.4711028081724639, + "grad_norm": 0.10393189638853073, + "learning_rate": 0.0005537457824002469, + "loss": 2.6914, + "step": 15887 + }, + { + "epoch": 0.4711324615247754, + "grad_norm": 0.1195514127612114, + "learning_rate": 0.0005536990038149685, + "loss": 2.6899, + "step": 15888 + }, + { + "epoch": 0.47116211487708687, + "grad_norm": 0.10581782460212708, + "learning_rate": 0.0005536522247541762, + "loss": 2.7055, + "step": 15889 + }, + { + "epoch": 0.47119176822939834, + "grad_norm": 0.40088850259780884, + "learning_rate": 0.0005536054452182844, + "loss": 2.7183, + "step": 15890 + }, + { + "epoch": 0.4712214215817098, + "grad_norm": 0.10513004660606384, + "learning_rate": 0.0005535586652077068, + "loss": 2.7026, + "step": 15891 + }, + { + "epoch": 0.4712510749340213, + "grad_norm": 0.10116010904312134, + "learning_rate": 0.0005535118847228581, + "loss": 2.7074, + "step": 15892 + }, + { + "epoch": 0.4712807282863328, + "grad_norm": 0.11086121946573257, + "learning_rate": 0.0005534651037641527, + "loss": 2.6586, + "step": 15893 + }, + { + "epoch": 0.47131038163864425, + "grad_norm": 0.11490973085165024, + "learning_rate": 0.0005534183223320045, + "loss": 2.7304, + "step": 15894 + }, + { + "epoch": 0.4713400349909557, + "grad_norm": 0.11929216235876083, + "learning_rate": 0.0005533715404268278, + "loss": 2.6613, + "step": 15895 + }, + { + "epoch": 0.4713696883432672, + "grad_norm": 0.09634111076593399, + "learning_rate": 0.0005533247580490373, + "loss": 2.6781, + "step": 15896 + }, + { + "epoch": 0.4713993416955787, + "grad_norm": 0.11359479278326035, + "learning_rate": 0.0005532779751990464, + "loss": 2.6591, + "step": 15897 + }, + { + "epoch": 0.47142899504789015, + "grad_norm": 0.11355246603488922, + "learning_rate": 0.0005532311918772702, + "loss": 2.6888, + "step": 15898 + }, + { + "epoch": 0.47145864840020163, + "grad_norm": 0.11236201971769333, + "learning_rate": 0.0005531844080841227, + "loss": 2.656, + "step": 15899 + }, + { + "epoch": 0.4714883017525131, + "grad_norm": 0.10361641645431519, + "learning_rate": 0.0005531376238200179, + "loss": 2.6995, + "step": 15900 + }, + { + "epoch": 0.4715179551048246, + "grad_norm": 0.11895284801721573, + "learning_rate": 0.0005530908390853706, + "loss": 2.7321, + "step": 15901 + }, + { + "epoch": 0.47154760845713606, + "grad_norm": 0.12239470332860947, + "learning_rate": 0.0005530440538805947, + "loss": 2.6716, + "step": 15902 + }, + { + "epoch": 0.47157726180944753, + "grad_norm": 0.11147688329219818, + "learning_rate": 0.0005529972682061045, + "loss": 2.6981, + "step": 15903 + }, + { + "epoch": 0.471606915161759, + "grad_norm": 0.12738843262195587, + "learning_rate": 0.0005529504820623144, + "loss": 2.6823, + "step": 15904 + }, + { + "epoch": 0.47163656851407054, + "grad_norm": 0.12354633957147598, + "learning_rate": 0.0005529036954496387, + "loss": 2.6917, + "step": 15905 + }, + { + "epoch": 0.471666221866382, + "grad_norm": 0.11110604554414749, + "learning_rate": 0.0005528569083684918, + "loss": 2.6637, + "step": 15906 + }, + { + "epoch": 0.4716958752186935, + "grad_norm": 0.1261332482099533, + "learning_rate": 0.000552810120819288, + "loss": 2.6819, + "step": 15907 + }, + { + "epoch": 0.47172552857100497, + "grad_norm": 0.12342555820941925, + "learning_rate": 0.0005527633328024413, + "loss": 2.6647, + "step": 15908 + }, + { + "epoch": 0.47175518192331645, + "grad_norm": 0.11484295129776001, + "learning_rate": 0.0005527165443183663, + "loss": 2.7375, + "step": 15909 + }, + { + "epoch": 0.4717848352756279, + "grad_norm": 0.11163606494665146, + "learning_rate": 0.0005526697553674771, + "loss": 2.6826, + "step": 15910 + }, + { + "epoch": 0.4718144886279394, + "grad_norm": 0.11302078515291214, + "learning_rate": 0.0005526229659501883, + "loss": 2.6647, + "step": 15911 + }, + { + "epoch": 0.4718441419802509, + "grad_norm": 0.11982408910989761, + "learning_rate": 0.0005525761760669142, + "loss": 2.7264, + "step": 15912 + }, + { + "epoch": 0.47187379533256235, + "grad_norm": 0.11421723663806915, + "learning_rate": 0.000552529385718069, + "loss": 2.685, + "step": 15913 + }, + { + "epoch": 0.4719034486848738, + "grad_norm": 0.12730443477630615, + "learning_rate": 0.0005524825949040671, + "loss": 2.6826, + "step": 15914 + }, + { + "epoch": 0.4719331020371853, + "grad_norm": 0.13881510496139526, + "learning_rate": 0.0005524358036253226, + "loss": 2.6799, + "step": 15915 + }, + { + "epoch": 0.4719627553894968, + "grad_norm": 0.11743230372667313, + "learning_rate": 0.0005523890118822502, + "loss": 2.7205, + "step": 15916 + }, + { + "epoch": 0.47199240874180826, + "grad_norm": 0.09974388778209686, + "learning_rate": 0.0005523422196752642, + "loss": 2.6751, + "step": 15917 + }, + { + "epoch": 0.47202206209411973, + "grad_norm": 0.12463928759098053, + "learning_rate": 0.0005522954270047787, + "loss": 2.7091, + "step": 15918 + }, + { + "epoch": 0.4720517154464312, + "grad_norm": 0.1299014389514923, + "learning_rate": 0.0005522486338712083, + "loss": 2.6952, + "step": 15919 + }, + { + "epoch": 0.4720813687987427, + "grad_norm": 0.12753520905971527, + "learning_rate": 0.0005522018402749673, + "loss": 2.6764, + "step": 15920 + }, + { + "epoch": 0.47211102215105416, + "grad_norm": 0.15379616618156433, + "learning_rate": 0.00055215504621647, + "loss": 2.6806, + "step": 15921 + }, + { + "epoch": 0.47214067550336564, + "grad_norm": 0.14127983152866364, + "learning_rate": 0.0005521082516961309, + "loss": 2.706, + "step": 15922 + }, + { + "epoch": 0.4721703288556771, + "grad_norm": 0.11711406707763672, + "learning_rate": 0.0005520614567143641, + "loss": 2.6753, + "step": 15923 + }, + { + "epoch": 0.4721999822079886, + "grad_norm": 0.11994726955890656, + "learning_rate": 0.0005520146612715842, + "loss": 2.6878, + "step": 15924 + }, + { + "epoch": 0.47222963556030007, + "grad_norm": 0.11848017573356628, + "learning_rate": 0.0005519678653682058, + "loss": 2.6924, + "step": 15925 + }, + { + "epoch": 0.4722592889126116, + "grad_norm": 0.11285015940666199, + "learning_rate": 0.0005519210690046427, + "loss": 2.6909, + "step": 15926 + }, + { + "epoch": 0.4722889422649231, + "grad_norm": 0.11873724311590195, + "learning_rate": 0.0005518742721813099, + "loss": 2.6995, + "step": 15927 + }, + { + "epoch": 0.47231859561723455, + "grad_norm": 0.13516615331172943, + "learning_rate": 0.0005518274748986215, + "loss": 2.6851, + "step": 15928 + }, + { + "epoch": 0.472348248969546, + "grad_norm": 0.14170533418655396, + "learning_rate": 0.0005517806771569918, + "loss": 2.714, + "step": 15929 + }, + { + "epoch": 0.4723779023218575, + "grad_norm": 0.11202305555343628, + "learning_rate": 0.0005517338789568353, + "loss": 2.6716, + "step": 15930 + }, + { + "epoch": 0.472407555674169, + "grad_norm": 0.131858229637146, + "learning_rate": 0.0005516870802985663, + "loss": 2.6788, + "step": 15931 + }, + { + "epoch": 0.47243720902648045, + "grad_norm": 0.1505223661661148, + "learning_rate": 0.0005516402811825996, + "loss": 2.7099, + "step": 15932 + }, + { + "epoch": 0.47246686237879193, + "grad_norm": 0.17549370229244232, + "learning_rate": 0.000551593481609349, + "loss": 2.7485, + "step": 15933 + }, + { + "epoch": 0.4724965157311034, + "grad_norm": 0.16403034329414368, + "learning_rate": 0.0005515466815792297, + "loss": 2.6498, + "step": 15934 + }, + { + "epoch": 0.4725261690834149, + "grad_norm": 0.13396580517292023, + "learning_rate": 0.0005514998810926554, + "loss": 2.7227, + "step": 15935 + }, + { + "epoch": 0.47255582243572636, + "grad_norm": 0.13560916483402252, + "learning_rate": 0.0005514530801500409, + "loss": 2.6692, + "step": 15936 + }, + { + "epoch": 0.47258547578803783, + "grad_norm": 0.13097985088825226, + "learning_rate": 0.0005514062787518004, + "loss": 2.6968, + "step": 15937 + }, + { + "epoch": 0.4726151291403493, + "grad_norm": 0.14025895297527313, + "learning_rate": 0.0005513594768983488, + "loss": 2.676, + "step": 15938 + }, + { + "epoch": 0.4726447824926608, + "grad_norm": 0.10702547430992126, + "learning_rate": 0.0005513126745901, + "loss": 2.662, + "step": 15939 + }, + { + "epoch": 0.47267443584497226, + "grad_norm": 0.12372145801782608, + "learning_rate": 0.0005512658718274688, + "loss": 2.6508, + "step": 15940 + }, + { + "epoch": 0.47270408919728374, + "grad_norm": 0.1194198727607727, + "learning_rate": 0.0005512190686108693, + "loss": 2.6577, + "step": 15941 + }, + { + "epoch": 0.4727337425495952, + "grad_norm": 0.11740956455469131, + "learning_rate": 0.0005511722649407162, + "loss": 2.6977, + "step": 15942 + }, + { + "epoch": 0.4727633959019067, + "grad_norm": 0.09941116720438004, + "learning_rate": 0.0005511254608174239, + "loss": 2.7078, + "step": 15943 + }, + { + "epoch": 0.47279304925421817, + "grad_norm": 0.1116442009806633, + "learning_rate": 0.0005510786562414071, + "loss": 2.7014, + "step": 15944 + }, + { + "epoch": 0.47282270260652964, + "grad_norm": 0.09840545803308487, + "learning_rate": 0.0005510318512130797, + "loss": 2.6672, + "step": 15945 + }, + { + "epoch": 0.4728523559588411, + "grad_norm": 0.10582675784826279, + "learning_rate": 0.0005509850457328568, + "loss": 2.6486, + "step": 15946 + }, + { + "epoch": 0.47288200931115265, + "grad_norm": 0.13660821318626404, + "learning_rate": 0.0005509382398011523, + "loss": 2.6729, + "step": 15947 + }, + { + "epoch": 0.4729116626634641, + "grad_norm": 0.1357470601797104, + "learning_rate": 0.0005508914334183811, + "loss": 2.6707, + "step": 15948 + }, + { + "epoch": 0.4729413160157756, + "grad_norm": 0.13002541661262512, + "learning_rate": 0.0005508446265849575, + "loss": 2.6974, + "step": 15949 + }, + { + "epoch": 0.4729709693680871, + "grad_norm": 0.1363563984632492, + "learning_rate": 0.000550797819301296, + "loss": 2.6821, + "step": 15950 + }, + { + "epoch": 0.47300062272039856, + "grad_norm": 0.12575316429138184, + "learning_rate": 0.0005507510115678111, + "loss": 2.6562, + "step": 15951 + }, + { + "epoch": 0.47303027607271003, + "grad_norm": 0.12015286833047867, + "learning_rate": 0.0005507042033849173, + "loss": 2.7027, + "step": 15952 + }, + { + "epoch": 0.4730599294250215, + "grad_norm": 0.1405213326215744, + "learning_rate": 0.000550657394753029, + "loss": 2.6769, + "step": 15953 + }, + { + "epoch": 0.473089582777333, + "grad_norm": 0.14945773780345917, + "learning_rate": 0.0005506105856725608, + "loss": 2.6748, + "step": 15954 + }, + { + "epoch": 0.47311923612964446, + "grad_norm": 0.14690415561199188, + "learning_rate": 0.0005505637761439271, + "loss": 2.6587, + "step": 15955 + }, + { + "epoch": 0.47314888948195594, + "grad_norm": 0.12904678285121918, + "learning_rate": 0.0005505169661675427, + "loss": 2.7103, + "step": 15956 + }, + { + "epoch": 0.4731785428342674, + "grad_norm": 0.13011863827705383, + "learning_rate": 0.0005504701557438218, + "loss": 2.6781, + "step": 15957 + }, + { + "epoch": 0.4732081961865789, + "grad_norm": 0.13619135320186615, + "learning_rate": 0.0005504233448731789, + "loss": 2.7096, + "step": 15958 + }, + { + "epoch": 0.47323784953889036, + "grad_norm": 0.1331629604101181, + "learning_rate": 0.0005503765335560287, + "loss": 2.6958, + "step": 15959 + }, + { + "epoch": 0.47326750289120184, + "grad_norm": 0.11978556960821152, + "learning_rate": 0.0005503297217927856, + "loss": 2.6925, + "step": 15960 + }, + { + "epoch": 0.4732971562435133, + "grad_norm": 0.14539118111133575, + "learning_rate": 0.0005502829095838644, + "loss": 2.677, + "step": 15961 + }, + { + "epoch": 0.4733268095958248, + "grad_norm": 0.13946516811847687, + "learning_rate": 0.0005502360969296791, + "loss": 2.6889, + "step": 15962 + }, + { + "epoch": 0.47335646294813627, + "grad_norm": 0.13071809709072113, + "learning_rate": 0.0005501892838306446, + "loss": 2.7205, + "step": 15963 + }, + { + "epoch": 0.47338611630044775, + "grad_norm": 0.1278570592403412, + "learning_rate": 0.0005501424702871754, + "loss": 2.7003, + "step": 15964 + }, + { + "epoch": 0.4734157696527592, + "grad_norm": 0.10652238875627518, + "learning_rate": 0.0005500956562996862, + "loss": 2.6979, + "step": 15965 + }, + { + "epoch": 0.4734454230050707, + "grad_norm": 0.11216775327920914, + "learning_rate": 0.0005500488418685913, + "loss": 2.7106, + "step": 15966 + }, + { + "epoch": 0.47347507635738223, + "grad_norm": 0.11927995830774307, + "learning_rate": 0.0005500020269943052, + "loss": 2.6631, + "step": 15967 + }, + { + "epoch": 0.4735047297096937, + "grad_norm": 0.13454850018024445, + "learning_rate": 0.0005499552116772427, + "loss": 2.7108, + "step": 15968 + }, + { + "epoch": 0.4735343830620052, + "grad_norm": 0.11588682234287262, + "learning_rate": 0.000549908395917818, + "loss": 2.6829, + "step": 15969 + }, + { + "epoch": 0.47356403641431666, + "grad_norm": 0.10393142700195312, + "learning_rate": 0.000549861579716446, + "loss": 2.6672, + "step": 15970 + }, + { + "epoch": 0.47359368976662813, + "grad_norm": 0.1159626841545105, + "learning_rate": 0.0005498147630735411, + "loss": 2.7108, + "step": 15971 + }, + { + "epoch": 0.4736233431189396, + "grad_norm": 0.12282992899417877, + "learning_rate": 0.0005497679459895182, + "loss": 2.7248, + "step": 15972 + }, + { + "epoch": 0.4736529964712511, + "grad_norm": 0.1302793323993683, + "learning_rate": 0.0005497211284647914, + "loss": 2.6725, + "step": 15973 + }, + { + "epoch": 0.47368264982356256, + "grad_norm": 0.13104334473609924, + "learning_rate": 0.0005496743104997754, + "loss": 2.7014, + "step": 15974 + }, + { + "epoch": 0.47371230317587404, + "grad_norm": 0.1412917971611023, + "learning_rate": 0.0005496274920948848, + "loss": 2.6789, + "step": 15975 + }, + { + "epoch": 0.4737419565281855, + "grad_norm": 0.1271863877773285, + "learning_rate": 0.0005495806732505343, + "loss": 2.6961, + "step": 15976 + }, + { + "epoch": 0.473771609880497, + "grad_norm": 0.11830489337444305, + "learning_rate": 0.0005495338539671387, + "loss": 2.6942, + "step": 15977 + }, + { + "epoch": 0.47380126323280847, + "grad_norm": 0.11518619954586029, + "learning_rate": 0.0005494870342451122, + "loss": 2.6819, + "step": 15978 + }, + { + "epoch": 0.47383091658511994, + "grad_norm": 0.10211265832185745, + "learning_rate": 0.0005494402140848693, + "loss": 2.6838, + "step": 15979 + }, + { + "epoch": 0.4738605699374314, + "grad_norm": 0.11579041182994843, + "learning_rate": 0.000549393393486825, + "loss": 2.6925, + "step": 15980 + }, + { + "epoch": 0.4738902232897429, + "grad_norm": 0.12192004173994064, + "learning_rate": 0.0005493465724513935, + "loss": 2.685, + "step": 15981 + }, + { + "epoch": 0.47391987664205437, + "grad_norm": 0.11605266481637955, + "learning_rate": 0.0005492997509789899, + "loss": 2.7396, + "step": 15982 + }, + { + "epoch": 0.47394952999436585, + "grad_norm": 0.12480947375297546, + "learning_rate": 0.0005492529290700284, + "loss": 2.6919, + "step": 15983 + }, + { + "epoch": 0.4739791833466773, + "grad_norm": 0.11738178133964539, + "learning_rate": 0.0005492061067249239, + "loss": 2.6769, + "step": 15984 + }, + { + "epoch": 0.4740088366989888, + "grad_norm": 0.11876222491264343, + "learning_rate": 0.0005491592839440908, + "loss": 2.6651, + "step": 15985 + }, + { + "epoch": 0.4740384900513003, + "grad_norm": 0.11640463769435883, + "learning_rate": 0.0005491124607279438, + "loss": 2.6958, + "step": 15986 + }, + { + "epoch": 0.47406814340361175, + "grad_norm": 0.14079450070858002, + "learning_rate": 0.0005490656370768974, + "loss": 2.6931, + "step": 15987 + }, + { + "epoch": 0.4740977967559233, + "grad_norm": 0.14350242912769318, + "learning_rate": 0.0005490188129913665, + "loss": 2.7064, + "step": 15988 + }, + { + "epoch": 0.47412745010823476, + "grad_norm": 0.13953320682048798, + "learning_rate": 0.0005489719884717656, + "loss": 2.7068, + "step": 15989 + }, + { + "epoch": 0.47415710346054624, + "grad_norm": 0.18745160102844238, + "learning_rate": 0.0005489251635185093, + "loss": 2.6675, + "step": 15990 + }, + { + "epoch": 0.4741867568128577, + "grad_norm": 0.1527748852968216, + "learning_rate": 0.0005488783381320122, + "loss": 2.6854, + "step": 15991 + }, + { + "epoch": 0.4742164101651692, + "grad_norm": 0.11416557431221008, + "learning_rate": 0.0005488315123126892, + "loss": 2.6562, + "step": 15992 + }, + { + "epoch": 0.47424606351748066, + "grad_norm": 0.12755823135375977, + "learning_rate": 0.0005487846860609547, + "loss": 2.6648, + "step": 15993 + }, + { + "epoch": 0.47427571686979214, + "grad_norm": 0.13109315931797028, + "learning_rate": 0.0005487378593772233, + "loss": 2.7035, + "step": 15994 + }, + { + "epoch": 0.4743053702221036, + "grad_norm": 0.13834501802921295, + "learning_rate": 0.00054869103226191, + "loss": 2.7098, + "step": 15995 + }, + { + "epoch": 0.4743350235744151, + "grad_norm": 0.12186508625745773, + "learning_rate": 0.0005486442047154291, + "loss": 2.6841, + "step": 15996 + }, + { + "epoch": 0.47436467692672657, + "grad_norm": 0.1229935884475708, + "learning_rate": 0.0005485973767381955, + "loss": 2.6355, + "step": 15997 + }, + { + "epoch": 0.47439433027903805, + "grad_norm": 0.1284867525100708, + "learning_rate": 0.0005485505483306238, + "loss": 2.7025, + "step": 15998 + }, + { + "epoch": 0.4744239836313495, + "grad_norm": 0.1285523772239685, + "learning_rate": 0.0005485037194931287, + "loss": 2.7099, + "step": 15999 + }, + { + "epoch": 0.474453636983661, + "grad_norm": 0.1179775521159172, + "learning_rate": 0.0005484568902261248, + "loss": 2.7012, + "step": 16000 + }, + { + "epoch": 0.4744832903359725, + "grad_norm": 0.12143150717020035, + "learning_rate": 0.0005484100605300267, + "loss": 2.687, + "step": 16001 + }, + { + "epoch": 0.47451294368828395, + "grad_norm": 0.10721399635076523, + "learning_rate": 0.0005483632304052493, + "loss": 2.6756, + "step": 16002 + }, + { + "epoch": 0.4745425970405954, + "grad_norm": 0.13439232110977173, + "learning_rate": 0.0005483163998522071, + "loss": 2.6603, + "step": 16003 + }, + { + "epoch": 0.4745722503929069, + "grad_norm": 0.15796512365341187, + "learning_rate": 0.0005482695688713151, + "loss": 2.7045, + "step": 16004 + }, + { + "epoch": 0.4746019037452184, + "grad_norm": 0.11758309602737427, + "learning_rate": 0.0005482227374629877, + "loss": 2.7099, + "step": 16005 + }, + { + "epoch": 0.47463155709752985, + "grad_norm": 0.13677820563316345, + "learning_rate": 0.0005481759056276395, + "loss": 2.6722, + "step": 16006 + }, + { + "epoch": 0.47466121044984133, + "grad_norm": 0.14101673662662506, + "learning_rate": 0.0005481290733656854, + "loss": 2.6744, + "step": 16007 + }, + { + "epoch": 0.4746908638021528, + "grad_norm": 0.15193630754947662, + "learning_rate": 0.0005480822406775403, + "loss": 2.6561, + "step": 16008 + }, + { + "epoch": 0.47472051715446434, + "grad_norm": 0.12945732474327087, + "learning_rate": 0.0005480354075636186, + "loss": 2.6466, + "step": 16009 + }, + { + "epoch": 0.4747501705067758, + "grad_norm": 0.1305534988641739, + "learning_rate": 0.0005479885740243352, + "loss": 2.6945, + "step": 16010 + }, + { + "epoch": 0.4747798238590873, + "grad_norm": 0.1459541767835617, + "learning_rate": 0.0005479417400601047, + "loss": 2.7277, + "step": 16011 + }, + { + "epoch": 0.47480947721139877, + "grad_norm": 0.13210485875606537, + "learning_rate": 0.0005478949056713418, + "loss": 2.7111, + "step": 16012 + }, + { + "epoch": 0.47483913056371024, + "grad_norm": 0.1329151690006256, + "learning_rate": 0.0005478480708584613, + "loss": 2.696, + "step": 16013 + }, + { + "epoch": 0.4748687839160217, + "grad_norm": 0.11767967790365219, + "learning_rate": 0.0005478012356218779, + "loss": 2.7011, + "step": 16014 + }, + { + "epoch": 0.4748984372683332, + "grad_norm": 0.13214965164661407, + "learning_rate": 0.0005477543999620065, + "loss": 2.7148, + "step": 16015 + }, + { + "epoch": 0.47492809062064467, + "grad_norm": 0.11812695115804672, + "learning_rate": 0.0005477075638792617, + "loss": 2.6822, + "step": 16016 + }, + { + "epoch": 0.47495774397295615, + "grad_norm": 0.12558677792549133, + "learning_rate": 0.0005476607273740581, + "loss": 2.6926, + "step": 16017 + }, + { + "epoch": 0.4749873973252676, + "grad_norm": 0.11117634177207947, + "learning_rate": 0.0005476138904468105, + "loss": 2.6685, + "step": 16018 + }, + { + "epoch": 0.4750170506775791, + "grad_norm": 0.11248968541622162, + "learning_rate": 0.0005475670530979338, + "loss": 2.6763, + "step": 16019 + }, + { + "epoch": 0.4750467040298906, + "grad_norm": 0.1052083671092987, + "learning_rate": 0.0005475202153278428, + "loss": 2.6985, + "step": 16020 + }, + { + "epoch": 0.47507635738220205, + "grad_norm": 0.12187503278255463, + "learning_rate": 0.000547473377136952, + "loss": 2.6886, + "step": 16021 + }, + { + "epoch": 0.47510601073451353, + "grad_norm": 0.11619498580694199, + "learning_rate": 0.0005474265385256764, + "loss": 2.6984, + "step": 16022 + }, + { + "epoch": 0.475135664086825, + "grad_norm": 0.11577104032039642, + "learning_rate": 0.0005473796994944306, + "loss": 2.6604, + "step": 16023 + }, + { + "epoch": 0.4751653174391365, + "grad_norm": 0.1152738481760025, + "learning_rate": 0.0005473328600436295, + "loss": 2.7124, + "step": 16024 + }, + { + "epoch": 0.47519497079144796, + "grad_norm": 0.11605104058980942, + "learning_rate": 0.0005472860201736877, + "loss": 2.6637, + "step": 16025 + }, + { + "epoch": 0.47522462414375943, + "grad_norm": 0.141420379281044, + "learning_rate": 0.00054723917988502, + "loss": 2.7007, + "step": 16026 + }, + { + "epoch": 0.4752542774960709, + "grad_norm": 0.11718704551458359, + "learning_rate": 0.0005471923391780415, + "loss": 2.6936, + "step": 16027 + }, + { + "epoch": 0.4752839308483824, + "grad_norm": 0.11709713935852051, + "learning_rate": 0.0005471454980531665, + "loss": 2.6496, + "step": 16028 + }, + { + "epoch": 0.47531358420069386, + "grad_norm": 0.12611214816570282, + "learning_rate": 0.0005470986565108101, + "loss": 2.7035, + "step": 16029 + }, + { + "epoch": 0.4753432375530054, + "grad_norm": 0.1373940259218216, + "learning_rate": 0.0005470518145513871, + "loss": 2.6778, + "step": 16030 + }, + { + "epoch": 0.47537289090531687, + "grad_norm": 0.14763498306274414, + "learning_rate": 0.0005470049721753121, + "loss": 2.7087, + "step": 16031 + }, + { + "epoch": 0.47540254425762835, + "grad_norm": 0.11952131241559982, + "learning_rate": 0.0005469581293830001, + "loss": 2.6691, + "step": 16032 + }, + { + "epoch": 0.4754321976099398, + "grad_norm": 0.11380992829799652, + "learning_rate": 0.0005469112861748658, + "loss": 2.6837, + "step": 16033 + }, + { + "epoch": 0.4754618509622513, + "grad_norm": 0.12098567187786102, + "learning_rate": 0.0005468644425513238, + "loss": 2.6879, + "step": 16034 + }, + { + "epoch": 0.4754915043145628, + "grad_norm": 0.1327340304851532, + "learning_rate": 0.0005468175985127894, + "loss": 2.6886, + "step": 16035 + }, + { + "epoch": 0.47552115766687425, + "grad_norm": 0.12401571124792099, + "learning_rate": 0.0005467707540596769, + "loss": 2.6844, + "step": 16036 + }, + { + "epoch": 0.4755508110191857, + "grad_norm": 0.10316328704357147, + "learning_rate": 0.0005467239091924015, + "loss": 2.655, + "step": 16037 + }, + { + "epoch": 0.4755804643714972, + "grad_norm": 0.11994794756174088, + "learning_rate": 0.0005466770639113779, + "loss": 2.6807, + "step": 16038 + }, + { + "epoch": 0.4756101177238087, + "grad_norm": 0.14256510138511658, + "learning_rate": 0.0005466302182170208, + "loss": 2.6686, + "step": 16039 + }, + { + "epoch": 0.47563977107612015, + "grad_norm": 0.12555643916130066, + "learning_rate": 0.000546583372109745, + "loss": 2.7113, + "step": 16040 + }, + { + "epoch": 0.47566942442843163, + "grad_norm": 0.12440531700849533, + "learning_rate": 0.0005465365255899656, + "loss": 2.6816, + "step": 16041 + }, + { + "epoch": 0.4756990777807431, + "grad_norm": 0.12401864677667618, + "learning_rate": 0.0005464896786580973, + "loss": 2.6538, + "step": 16042 + }, + { + "epoch": 0.4757287311330546, + "grad_norm": 0.12339864671230316, + "learning_rate": 0.0005464428313145548, + "loss": 2.6916, + "step": 16043 + }, + { + "epoch": 0.47575838448536606, + "grad_norm": 0.1259462833404541, + "learning_rate": 0.0005463959835597531, + "loss": 2.6886, + "step": 16044 + }, + { + "epoch": 0.47578803783767754, + "grad_norm": 0.14492273330688477, + "learning_rate": 0.0005463491353941071, + "loss": 2.7009, + "step": 16045 + }, + { + "epoch": 0.475817691189989, + "grad_norm": 0.1396251618862152, + "learning_rate": 0.0005463022868180313, + "loss": 2.7087, + "step": 16046 + }, + { + "epoch": 0.4758473445423005, + "grad_norm": 0.12010505050420761, + "learning_rate": 0.000546255437831941, + "loss": 2.6678, + "step": 16047 + }, + { + "epoch": 0.47587699789461196, + "grad_norm": 0.12579287588596344, + "learning_rate": 0.0005462085884362509, + "loss": 2.7029, + "step": 16048 + }, + { + "epoch": 0.47590665124692344, + "grad_norm": 0.12772004306316376, + "learning_rate": 0.0005461617386313758, + "loss": 2.6874, + "step": 16049 + }, + { + "epoch": 0.4759363045992349, + "grad_norm": 0.13006924092769623, + "learning_rate": 0.0005461148884177305, + "loss": 2.7242, + "step": 16050 + }, + { + "epoch": 0.47596595795154645, + "grad_norm": 0.13486191630363464, + "learning_rate": 0.0005460680377957299, + "loss": 2.7175, + "step": 16051 + }, + { + "epoch": 0.4759956113038579, + "grad_norm": 0.13766774535179138, + "learning_rate": 0.0005460211867657891, + "loss": 2.6619, + "step": 16052 + }, + { + "epoch": 0.4760252646561694, + "grad_norm": 0.13309651613235474, + "learning_rate": 0.0005459743353283226, + "loss": 2.7157, + "step": 16053 + }, + { + "epoch": 0.4760549180084809, + "grad_norm": 0.11442718654870987, + "learning_rate": 0.0005459274834837455, + "loss": 2.713, + "step": 16054 + }, + { + "epoch": 0.47608457136079235, + "grad_norm": 0.1118854507803917, + "learning_rate": 0.0005458806312324726, + "loss": 2.6639, + "step": 16055 + }, + { + "epoch": 0.47611422471310383, + "grad_norm": 0.1239977702498436, + "learning_rate": 0.000545833778574919, + "loss": 2.6994, + "step": 16056 + }, + { + "epoch": 0.4761438780654153, + "grad_norm": 0.12664933502674103, + "learning_rate": 0.0005457869255114994, + "loss": 2.691, + "step": 16057 + }, + { + "epoch": 0.4761735314177268, + "grad_norm": 0.11505164951086044, + "learning_rate": 0.0005457400720426287, + "loss": 2.6806, + "step": 16058 + }, + { + "epoch": 0.47620318477003826, + "grad_norm": 0.12120329588651657, + "learning_rate": 0.0005456932181687218, + "loss": 2.7027, + "step": 16059 + }, + { + "epoch": 0.47623283812234973, + "grad_norm": 0.10100623965263367, + "learning_rate": 0.0005456463638901935, + "loss": 2.6894, + "step": 16060 + }, + { + "epoch": 0.4762624914746612, + "grad_norm": 0.10389877110719681, + "learning_rate": 0.000545599509207459, + "loss": 2.6733, + "step": 16061 + }, + { + "epoch": 0.4762921448269727, + "grad_norm": 0.11761151254177094, + "learning_rate": 0.0005455526541209328, + "loss": 2.7023, + "step": 16062 + }, + { + "epoch": 0.47632179817928416, + "grad_norm": 0.13157857954502106, + "learning_rate": 0.0005455057986310302, + "loss": 2.688, + "step": 16063 + }, + { + "epoch": 0.47635145153159564, + "grad_norm": 0.11734515428543091, + "learning_rate": 0.0005454589427381659, + "loss": 2.681, + "step": 16064 + }, + { + "epoch": 0.4763811048839071, + "grad_norm": 0.13757912814617157, + "learning_rate": 0.0005454120864427549, + "loss": 2.6452, + "step": 16065 + }, + { + "epoch": 0.4764107582362186, + "grad_norm": 0.1350664645433426, + "learning_rate": 0.0005453652297452119, + "loss": 2.6824, + "step": 16066 + }, + { + "epoch": 0.47644041158853007, + "grad_norm": 0.1572771817445755, + "learning_rate": 0.0005453183726459522, + "loss": 2.6673, + "step": 16067 + }, + { + "epoch": 0.47647006494084154, + "grad_norm": 0.15251897275447845, + "learning_rate": 0.0005452715151453904, + "loss": 2.6827, + "step": 16068 + }, + { + "epoch": 0.476499718293153, + "grad_norm": 0.10973645746707916, + "learning_rate": 0.0005452246572439416, + "loss": 2.6815, + "step": 16069 + }, + { + "epoch": 0.4765293716454645, + "grad_norm": 0.1401277333498001, + "learning_rate": 0.0005451777989420209, + "loss": 2.6931, + "step": 16070 + }, + { + "epoch": 0.476559024997776, + "grad_norm": 0.15295252203941345, + "learning_rate": 0.0005451309402400428, + "loss": 2.6779, + "step": 16071 + }, + { + "epoch": 0.4765886783500875, + "grad_norm": 0.11697595566511154, + "learning_rate": 0.0005450840811384225, + "loss": 2.6818, + "step": 16072 + }, + { + "epoch": 0.476618331702399, + "grad_norm": 0.12230023741722107, + "learning_rate": 0.000545037221637575, + "loss": 2.6851, + "step": 16073 + }, + { + "epoch": 0.47664798505471045, + "grad_norm": 0.1406746655702591, + "learning_rate": 0.0005449903617379151, + "loss": 2.6704, + "step": 16074 + }, + { + "epoch": 0.47667763840702193, + "grad_norm": 0.12373074889183044, + "learning_rate": 0.0005449435014398579, + "loss": 2.6749, + "step": 16075 + }, + { + "epoch": 0.4767072917593334, + "grad_norm": 0.15098834037780762, + "learning_rate": 0.0005448966407438183, + "loss": 2.6723, + "step": 16076 + }, + { + "epoch": 0.4767369451116449, + "grad_norm": 0.1326572299003601, + "learning_rate": 0.0005448497796502112, + "loss": 2.6703, + "step": 16077 + }, + { + "epoch": 0.47676659846395636, + "grad_norm": 0.13061803579330444, + "learning_rate": 0.0005448029181594515, + "loss": 2.6796, + "step": 16078 + }, + { + "epoch": 0.47679625181626784, + "grad_norm": 0.14223678410053253, + "learning_rate": 0.0005447560562719543, + "loss": 2.6755, + "step": 16079 + }, + { + "epoch": 0.4768259051685793, + "grad_norm": 0.12930426001548767, + "learning_rate": 0.0005447091939881346, + "loss": 2.6932, + "step": 16080 + }, + { + "epoch": 0.4768555585208908, + "grad_norm": 0.1357986479997635, + "learning_rate": 0.0005446623313084074, + "loss": 2.6894, + "step": 16081 + }, + { + "epoch": 0.47688521187320226, + "grad_norm": 0.14850933849811554, + "learning_rate": 0.0005446154682331875, + "loss": 2.6899, + "step": 16082 + }, + { + "epoch": 0.47691486522551374, + "grad_norm": 0.12585562467575073, + "learning_rate": 0.00054456860476289, + "loss": 2.6953, + "step": 16083 + }, + { + "epoch": 0.4769445185778252, + "grad_norm": 0.10679658502340317, + "learning_rate": 0.0005445217408979299, + "loss": 2.6678, + "step": 16084 + }, + { + "epoch": 0.4769741719301367, + "grad_norm": 0.1081419587135315, + "learning_rate": 0.0005444748766387219, + "loss": 2.7107, + "step": 16085 + }, + { + "epoch": 0.47700382528244817, + "grad_norm": 0.12610465288162231, + "learning_rate": 0.0005444280119856814, + "loss": 2.7057, + "step": 16086 + }, + { + "epoch": 0.47703347863475964, + "grad_norm": 0.12671852111816406, + "learning_rate": 0.0005443811469392233, + "loss": 2.646, + "step": 16087 + }, + { + "epoch": 0.4770631319870711, + "grad_norm": 0.13000787794589996, + "learning_rate": 0.0005443342814997624, + "loss": 2.6945, + "step": 16088 + }, + { + "epoch": 0.4770927853393826, + "grad_norm": 0.12226465344429016, + "learning_rate": 0.0005442874156677139, + "loss": 2.6883, + "step": 16089 + }, + { + "epoch": 0.4771224386916941, + "grad_norm": 0.11187521368265152, + "learning_rate": 0.0005442405494434926, + "loss": 2.7387, + "step": 16090 + }, + { + "epoch": 0.47715209204400555, + "grad_norm": 0.12868422269821167, + "learning_rate": 0.0005441936828275138, + "loss": 2.6716, + "step": 16091 + }, + { + "epoch": 0.4771817453963171, + "grad_norm": 0.13316377997398376, + "learning_rate": 0.0005441468158201923, + "loss": 2.6417, + "step": 16092 + }, + { + "epoch": 0.47721139874862856, + "grad_norm": 0.12502740323543549, + "learning_rate": 0.0005440999484219432, + "loss": 2.6938, + "step": 16093 + }, + { + "epoch": 0.47724105210094003, + "grad_norm": 0.13843265175819397, + "learning_rate": 0.0005440530806331815, + "loss": 2.6889, + "step": 16094 + }, + { + "epoch": 0.4772707054532515, + "grad_norm": 0.12306073307991028, + "learning_rate": 0.0005440062124543221, + "loss": 2.6805, + "step": 16095 + }, + { + "epoch": 0.477300358805563, + "grad_norm": 0.12379013746976852, + "learning_rate": 0.0005439593438857803, + "loss": 2.7039, + "step": 16096 + }, + { + "epoch": 0.47733001215787446, + "grad_norm": 0.13368913531303406, + "learning_rate": 0.0005439124749279708, + "loss": 2.6784, + "step": 16097 + }, + { + "epoch": 0.47735966551018594, + "grad_norm": 0.15256887674331665, + "learning_rate": 0.0005438656055813088, + "loss": 2.6669, + "step": 16098 + }, + { + "epoch": 0.4773893188624974, + "grad_norm": 0.16876384615898132, + "learning_rate": 0.0005438187358462094, + "loss": 2.6615, + "step": 16099 + }, + { + "epoch": 0.4774189722148089, + "grad_norm": 0.15352323651313782, + "learning_rate": 0.0005437718657230876, + "loss": 2.6632, + "step": 16100 + }, + { + "epoch": 0.47744862556712037, + "grad_norm": 0.13148580491542816, + "learning_rate": 0.0005437249952123584, + "loss": 2.6727, + "step": 16101 + }, + { + "epoch": 0.47747827891943184, + "grad_norm": 0.12770815193653107, + "learning_rate": 0.0005436781243144369, + "loss": 2.6888, + "step": 16102 + }, + { + "epoch": 0.4775079322717433, + "grad_norm": 0.13282839953899384, + "learning_rate": 0.0005436312530297382, + "loss": 2.6897, + "step": 16103 + }, + { + "epoch": 0.4775375856240548, + "grad_norm": 0.13601407408714294, + "learning_rate": 0.0005435843813586769, + "loss": 2.6358, + "step": 16104 + }, + { + "epoch": 0.47756723897636627, + "grad_norm": 0.1362665593624115, + "learning_rate": 0.0005435375093016686, + "loss": 2.6568, + "step": 16105 + }, + { + "epoch": 0.47759689232867775, + "grad_norm": 0.13132533431053162, + "learning_rate": 0.0005434906368591282, + "loss": 2.6872, + "step": 16106 + }, + { + "epoch": 0.4776265456809892, + "grad_norm": 0.1440475732088089, + "learning_rate": 0.0005434437640314709, + "loss": 2.6623, + "step": 16107 + }, + { + "epoch": 0.4776561990333007, + "grad_norm": 0.1152184009552002, + "learning_rate": 0.0005433968908191115, + "loss": 2.6833, + "step": 16108 + }, + { + "epoch": 0.4776858523856122, + "grad_norm": 0.11672315746545792, + "learning_rate": 0.0005433500172224653, + "loss": 2.668, + "step": 16109 + }, + { + "epoch": 0.47771550573792365, + "grad_norm": 0.1231796070933342, + "learning_rate": 0.000543303143241947, + "loss": 2.6844, + "step": 16110 + }, + { + "epoch": 0.4777451590902351, + "grad_norm": 0.1303715705871582, + "learning_rate": 0.000543256268877972, + "loss": 2.6852, + "step": 16111 + }, + { + "epoch": 0.4777748124425466, + "grad_norm": 0.1211719959974289, + "learning_rate": 0.0005432093941309554, + "loss": 2.6746, + "step": 16112 + }, + { + "epoch": 0.47780446579485814, + "grad_norm": 0.12594451010227203, + "learning_rate": 0.0005431625190013123, + "loss": 2.6646, + "step": 16113 + }, + { + "epoch": 0.4778341191471696, + "grad_norm": 0.11830569058656693, + "learning_rate": 0.0005431156434894575, + "loss": 2.6702, + "step": 16114 + }, + { + "epoch": 0.4778637724994811, + "grad_norm": 0.11251033842563629, + "learning_rate": 0.0005430687675958063, + "loss": 2.6814, + "step": 16115 + }, + { + "epoch": 0.47789342585179256, + "grad_norm": 0.11096775531768799, + "learning_rate": 0.0005430218913207739, + "loss": 2.6579, + "step": 16116 + }, + { + "epoch": 0.47792307920410404, + "grad_norm": 0.10160257667303085, + "learning_rate": 0.0005429750146647749, + "loss": 2.6794, + "step": 16117 + }, + { + "epoch": 0.4779527325564155, + "grad_norm": 0.1041390672326088, + "learning_rate": 0.0005429281376282251, + "loss": 2.6362, + "step": 16118 + }, + { + "epoch": 0.477982385908727, + "grad_norm": 0.11636275053024292, + "learning_rate": 0.0005428812602115394, + "loss": 2.6678, + "step": 16119 + }, + { + "epoch": 0.47801203926103847, + "grad_norm": 0.11600664258003235, + "learning_rate": 0.0005428343824151325, + "loss": 2.7397, + "step": 16120 + }, + { + "epoch": 0.47804169261334994, + "grad_norm": 0.11083071678876877, + "learning_rate": 0.0005427875042394199, + "loss": 2.6916, + "step": 16121 + }, + { + "epoch": 0.4780713459656614, + "grad_norm": 0.11816651374101639, + "learning_rate": 0.0005427406256848167, + "loss": 2.7243, + "step": 16122 + }, + { + "epoch": 0.4781009993179729, + "grad_norm": 0.11609338223934174, + "learning_rate": 0.0005426937467517377, + "loss": 2.7072, + "step": 16123 + }, + { + "epoch": 0.4781306526702844, + "grad_norm": 0.10926847904920578, + "learning_rate": 0.0005426468674405984, + "loss": 2.6857, + "step": 16124 + }, + { + "epoch": 0.47816030602259585, + "grad_norm": 0.11865701526403427, + "learning_rate": 0.0005425999877518138, + "loss": 2.6719, + "step": 16125 + }, + { + "epoch": 0.4781899593749073, + "grad_norm": 0.12113665044307709, + "learning_rate": 0.0005425531076857988, + "loss": 2.6805, + "step": 16126 + }, + { + "epoch": 0.4782196127272188, + "grad_norm": 0.10529319196939468, + "learning_rate": 0.0005425062272429688, + "loss": 2.7018, + "step": 16127 + }, + { + "epoch": 0.4782492660795303, + "grad_norm": 0.11014265567064285, + "learning_rate": 0.000542459346423739, + "loss": 2.6812, + "step": 16128 + }, + { + "epoch": 0.47827891943184175, + "grad_norm": 0.11529695242643356, + "learning_rate": 0.0005424124652285243, + "loss": 2.6929, + "step": 16129 + }, + { + "epoch": 0.47830857278415323, + "grad_norm": 0.14210598170757294, + "learning_rate": 0.0005423655836577399, + "loss": 2.6743, + "step": 16130 + }, + { + "epoch": 0.4783382261364647, + "grad_norm": 0.17894774675369263, + "learning_rate": 0.000542318701711801, + "loss": 2.7078, + "step": 16131 + }, + { + "epoch": 0.4783678794887762, + "grad_norm": 0.1511698067188263, + "learning_rate": 0.0005422718193911228, + "loss": 2.6797, + "step": 16132 + }, + { + "epoch": 0.47839753284108766, + "grad_norm": 0.11728758364915848, + "learning_rate": 0.0005422249366961204, + "loss": 2.6678, + "step": 16133 + }, + { + "epoch": 0.4784271861933992, + "grad_norm": 0.1427457332611084, + "learning_rate": 0.0005421780536272088, + "loss": 2.6744, + "step": 16134 + }, + { + "epoch": 0.47845683954571067, + "grad_norm": 0.13150475919246674, + "learning_rate": 0.0005421311701848035, + "loss": 2.6679, + "step": 16135 + }, + { + "epoch": 0.47848649289802214, + "grad_norm": 0.13129951059818268, + "learning_rate": 0.0005420842863693194, + "loss": 2.649, + "step": 16136 + }, + { + "epoch": 0.4785161462503336, + "grad_norm": 0.13166502118110657, + "learning_rate": 0.0005420374021811716, + "loss": 2.7027, + "step": 16137 + }, + { + "epoch": 0.4785457996026451, + "grad_norm": 0.12715578079223633, + "learning_rate": 0.0005419905176207755, + "loss": 2.63, + "step": 16138 + }, + { + "epoch": 0.47857545295495657, + "grad_norm": 0.12930285930633545, + "learning_rate": 0.0005419436326885461, + "loss": 2.7105, + "step": 16139 + }, + { + "epoch": 0.47860510630726805, + "grad_norm": 0.12933319807052612, + "learning_rate": 0.0005418967473848986, + "loss": 2.6677, + "step": 16140 + }, + { + "epoch": 0.4786347596595795, + "grad_norm": 0.12544091045856476, + "learning_rate": 0.0005418498617102483, + "loss": 2.7063, + "step": 16141 + }, + { + "epoch": 0.478664413011891, + "grad_norm": 0.1132710725069046, + "learning_rate": 0.0005418029756650102, + "loss": 2.6698, + "step": 16142 + }, + { + "epoch": 0.4786940663642025, + "grad_norm": 0.11038841307163239, + "learning_rate": 0.0005417560892495996, + "loss": 2.715, + "step": 16143 + }, + { + "epoch": 0.47872371971651395, + "grad_norm": 0.11923336982727051, + "learning_rate": 0.0005417092024644316, + "loss": 2.7016, + "step": 16144 + }, + { + "epoch": 0.4787533730688254, + "grad_norm": 0.13383649289608002, + "learning_rate": 0.0005416623153099216, + "loss": 2.6472, + "step": 16145 + }, + { + "epoch": 0.4787830264211369, + "grad_norm": 0.14318718016147614, + "learning_rate": 0.0005416154277864847, + "loss": 2.7306, + "step": 16146 + }, + { + "epoch": 0.4788126797734484, + "grad_norm": 0.14916200935840607, + "learning_rate": 0.000541568539894536, + "loss": 2.6923, + "step": 16147 + }, + { + "epoch": 0.47884233312575986, + "grad_norm": 0.15699823200702667, + "learning_rate": 0.0005415216516344905, + "loss": 2.723, + "step": 16148 + }, + { + "epoch": 0.47887198647807133, + "grad_norm": 0.15201450884342194, + "learning_rate": 0.0005414747630067639, + "loss": 2.6724, + "step": 16149 + }, + { + "epoch": 0.4789016398303828, + "grad_norm": 0.11698874086141586, + "learning_rate": 0.000541427874011771, + "loss": 2.686, + "step": 16150 + }, + { + "epoch": 0.4789312931826943, + "grad_norm": 0.12939006090164185, + "learning_rate": 0.0005413809846499273, + "loss": 2.7349, + "step": 16151 + }, + { + "epoch": 0.47896094653500576, + "grad_norm": 0.12163544446229935, + "learning_rate": 0.0005413340949216478, + "loss": 2.6701, + "step": 16152 + }, + { + "epoch": 0.47899059988731724, + "grad_norm": 0.13248078525066376, + "learning_rate": 0.0005412872048273478, + "loss": 2.6729, + "step": 16153 + }, + { + "epoch": 0.4790202532396287, + "grad_norm": 0.12924182415008545, + "learning_rate": 0.0005412403143674425, + "loss": 2.7073, + "step": 16154 + }, + { + "epoch": 0.47904990659194024, + "grad_norm": 0.11458705365657806, + "learning_rate": 0.0005411934235423472, + "loss": 2.6888, + "step": 16155 + }, + { + "epoch": 0.4790795599442517, + "grad_norm": 0.1245526522397995, + "learning_rate": 0.0005411465323524769, + "loss": 2.6933, + "step": 16156 + }, + { + "epoch": 0.4791092132965632, + "grad_norm": 0.10914372652769089, + "learning_rate": 0.0005410996407982472, + "loss": 2.7063, + "step": 16157 + }, + { + "epoch": 0.4791388666488747, + "grad_norm": 0.10778187960386276, + "learning_rate": 0.0005410527488800731, + "loss": 2.725, + "step": 16158 + }, + { + "epoch": 0.47916852000118615, + "grad_norm": 0.11286794394254684, + "learning_rate": 0.0005410058565983697, + "loss": 2.6937, + "step": 16159 + }, + { + "epoch": 0.4791981733534976, + "grad_norm": 0.12113740295171738, + "learning_rate": 0.0005409589639535526, + "loss": 2.6436, + "step": 16160 + }, + { + "epoch": 0.4792278267058091, + "grad_norm": 0.10521133989095688, + "learning_rate": 0.0005409120709460366, + "loss": 2.6857, + "step": 16161 + }, + { + "epoch": 0.4792574800581206, + "grad_norm": 0.12748929858207703, + "learning_rate": 0.0005408651775762374, + "loss": 2.7194, + "step": 16162 + }, + { + "epoch": 0.47928713341043205, + "grad_norm": 0.09938734024763107, + "learning_rate": 0.00054081828384457, + "loss": 2.6597, + "step": 16163 + }, + { + "epoch": 0.47931678676274353, + "grad_norm": 0.12112177908420563, + "learning_rate": 0.0005407713897514497, + "loss": 2.7084, + "step": 16164 + }, + { + "epoch": 0.479346440115055, + "grad_norm": 0.11373473703861237, + "learning_rate": 0.0005407244952972917, + "loss": 2.7173, + "step": 16165 + }, + { + "epoch": 0.4793760934673665, + "grad_norm": 0.12329603731632233, + "learning_rate": 0.0005406776004825112, + "loss": 2.6878, + "step": 16166 + }, + { + "epoch": 0.47940574681967796, + "grad_norm": 0.11835646629333496, + "learning_rate": 0.0005406307053075238, + "loss": 2.6846, + "step": 16167 + }, + { + "epoch": 0.47943540017198943, + "grad_norm": 0.1436992585659027, + "learning_rate": 0.0005405838097727445, + "loss": 2.6791, + "step": 16168 + }, + { + "epoch": 0.4794650535243009, + "grad_norm": 0.12753254175186157, + "learning_rate": 0.0005405369138785884, + "loss": 2.6582, + "step": 16169 + }, + { + "epoch": 0.4794947068766124, + "grad_norm": 0.1383557915687561, + "learning_rate": 0.0005404900176254711, + "loss": 2.6776, + "step": 16170 + }, + { + "epoch": 0.47952436022892386, + "grad_norm": 0.12832403182983398, + "learning_rate": 0.0005404431210138076, + "loss": 2.7046, + "step": 16171 + }, + { + "epoch": 0.47955401358123534, + "grad_norm": 0.12338251620531082, + "learning_rate": 0.0005403962240440135, + "loss": 2.6773, + "step": 16172 + }, + { + "epoch": 0.4795836669335468, + "grad_norm": 0.14136897027492523, + "learning_rate": 0.0005403493267165039, + "loss": 2.6588, + "step": 16173 + }, + { + "epoch": 0.4796133202858583, + "grad_norm": 0.1290488988161087, + "learning_rate": 0.0005403024290316942, + "loss": 2.6805, + "step": 16174 + }, + { + "epoch": 0.4796429736381698, + "grad_norm": 0.11763060837984085, + "learning_rate": 0.0005402555309899993, + "loss": 2.7015, + "step": 16175 + }, + { + "epoch": 0.4796726269904813, + "grad_norm": 0.13236212730407715, + "learning_rate": 0.0005402086325918348, + "loss": 2.6867, + "step": 16176 + }, + { + "epoch": 0.4797022803427928, + "grad_norm": 0.12738457322120667, + "learning_rate": 0.000540161733837616, + "loss": 2.6858, + "step": 16177 + }, + { + "epoch": 0.47973193369510425, + "grad_norm": 0.12216255068778992, + "learning_rate": 0.0005401148347277582, + "loss": 2.6681, + "step": 16178 + }, + { + "epoch": 0.4797615870474157, + "grad_norm": 0.12645690143108368, + "learning_rate": 0.0005400679352626767, + "loss": 2.673, + "step": 16179 + }, + { + "epoch": 0.4797912403997272, + "grad_norm": 0.13647037744522095, + "learning_rate": 0.0005400210354427867, + "loss": 2.6781, + "step": 16180 + }, + { + "epoch": 0.4798208937520387, + "grad_norm": 0.13400453329086304, + "learning_rate": 0.0005399741352685036, + "loss": 2.6602, + "step": 16181 + }, + { + "epoch": 0.47985054710435016, + "grad_norm": 0.1244468092918396, + "learning_rate": 0.0005399272347402425, + "loss": 2.6762, + "step": 16182 + }, + { + "epoch": 0.47988020045666163, + "grad_norm": 0.10637235641479492, + "learning_rate": 0.000539880333858419, + "loss": 2.6706, + "step": 16183 + }, + { + "epoch": 0.4799098538089731, + "grad_norm": 0.1278664469718933, + "learning_rate": 0.0005398334326234484, + "loss": 2.691, + "step": 16184 + }, + { + "epoch": 0.4799395071612846, + "grad_norm": 0.113523930311203, + "learning_rate": 0.0005397865310357459, + "loss": 2.7225, + "step": 16185 + }, + { + "epoch": 0.47996916051359606, + "grad_norm": 0.12567253410816193, + "learning_rate": 0.0005397396290957266, + "loss": 2.6857, + "step": 16186 + }, + { + "epoch": 0.47999881386590754, + "grad_norm": 0.12496987730264664, + "learning_rate": 0.0005396927268038063, + "loss": 2.6845, + "step": 16187 + }, + { + "epoch": 0.480028467218219, + "grad_norm": 0.13901744782924652, + "learning_rate": 0.0005396458241603999, + "loss": 2.6764, + "step": 16188 + }, + { + "epoch": 0.4800581205705305, + "grad_norm": 0.13612987101078033, + "learning_rate": 0.0005395989211659231, + "loss": 2.6796, + "step": 16189 + }, + { + "epoch": 0.48008777392284196, + "grad_norm": 0.1574770212173462, + "learning_rate": 0.000539552017820791, + "loss": 2.6797, + "step": 16190 + }, + { + "epoch": 0.48011742727515344, + "grad_norm": 0.15023666620254517, + "learning_rate": 0.000539505114125419, + "loss": 2.6816, + "step": 16191 + }, + { + "epoch": 0.4801470806274649, + "grad_norm": 0.13386103510856628, + "learning_rate": 0.0005394582100802225, + "loss": 2.6994, + "step": 16192 + }, + { + "epoch": 0.4801767339797764, + "grad_norm": 0.10611562430858612, + "learning_rate": 0.0005394113056856166, + "loss": 2.6841, + "step": 16193 + }, + { + "epoch": 0.48020638733208787, + "grad_norm": 0.13195879757404327, + "learning_rate": 0.0005393644009420169, + "loss": 2.6919, + "step": 16194 + }, + { + "epoch": 0.48023604068439935, + "grad_norm": 0.13945506513118744, + "learning_rate": 0.0005393174958498387, + "loss": 2.7135, + "step": 16195 + }, + { + "epoch": 0.4802656940367109, + "grad_norm": 0.14843180775642395, + "learning_rate": 0.0005392705904094973, + "loss": 2.6803, + "step": 16196 + }, + { + "epoch": 0.48029534738902235, + "grad_norm": 0.14846879243850708, + "learning_rate": 0.0005392236846214081, + "loss": 2.6856, + "step": 16197 + }, + { + "epoch": 0.48032500074133383, + "grad_norm": 0.12593820691108704, + "learning_rate": 0.0005391767784859864, + "loss": 2.6976, + "step": 16198 + }, + { + "epoch": 0.4803546540936453, + "grad_norm": 0.1327340304851532, + "learning_rate": 0.0005391298720036477, + "loss": 2.7138, + "step": 16199 + }, + { + "epoch": 0.4803843074459568, + "grad_norm": 0.14557111263275146, + "learning_rate": 0.0005390829651748071, + "loss": 2.7036, + "step": 16200 + }, + { + "epoch": 0.48041396079826826, + "grad_norm": 0.13121524453163147, + "learning_rate": 0.0005390360579998803, + "loss": 2.6912, + "step": 16201 + }, + { + "epoch": 0.48044361415057973, + "grad_norm": 0.13497351109981537, + "learning_rate": 0.0005389891504792824, + "loss": 2.6759, + "step": 16202 + }, + { + "epoch": 0.4804732675028912, + "grad_norm": 0.11724510043859482, + "learning_rate": 0.000538942242613429, + "loss": 2.6964, + "step": 16203 + }, + { + "epoch": 0.4805029208552027, + "grad_norm": 0.11106390506029129, + "learning_rate": 0.0005388953344027353, + "loss": 2.6761, + "step": 16204 + }, + { + "epoch": 0.48053257420751416, + "grad_norm": 0.11865375190973282, + "learning_rate": 0.0005388484258476167, + "loss": 2.706, + "step": 16205 + }, + { + "epoch": 0.48056222755982564, + "grad_norm": 0.11402293294668198, + "learning_rate": 0.0005388015169484888, + "loss": 2.6982, + "step": 16206 + }, + { + "epoch": 0.4805918809121371, + "grad_norm": 0.11432032287120819, + "learning_rate": 0.0005387546077057666, + "loss": 2.6932, + "step": 16207 + }, + { + "epoch": 0.4806215342644486, + "grad_norm": 0.1057506576180458, + "learning_rate": 0.0005387076981198657, + "loss": 2.7059, + "step": 16208 + }, + { + "epoch": 0.48065118761676007, + "grad_norm": 0.11191700398921967, + "learning_rate": 0.0005386607881912015, + "loss": 2.7082, + "step": 16209 + }, + { + "epoch": 0.48068084096907154, + "grad_norm": 0.136494979262352, + "learning_rate": 0.0005386138779201893, + "loss": 2.7021, + "step": 16210 + }, + { + "epoch": 0.480710494321383, + "grad_norm": 0.13485883176326752, + "learning_rate": 0.0005385669673072447, + "loss": 2.6964, + "step": 16211 + }, + { + "epoch": 0.4807401476736945, + "grad_norm": 0.1388121098279953, + "learning_rate": 0.0005385200563527831, + "loss": 2.7125, + "step": 16212 + }, + { + "epoch": 0.48076980102600597, + "grad_norm": 0.146112859249115, + "learning_rate": 0.0005384731450572196, + "loss": 2.6737, + "step": 16213 + }, + { + "epoch": 0.48079945437831745, + "grad_norm": 0.12870338559150696, + "learning_rate": 0.0005384262334209699, + "loss": 2.6846, + "step": 16214 + }, + { + "epoch": 0.4808291077306289, + "grad_norm": 0.12445753067731857, + "learning_rate": 0.0005383793214444493, + "loss": 2.6885, + "step": 16215 + }, + { + "epoch": 0.4808587610829404, + "grad_norm": 0.12682828307151794, + "learning_rate": 0.0005383324091280731, + "loss": 2.6579, + "step": 16216 + }, + { + "epoch": 0.48088841443525193, + "grad_norm": 0.12334617227315903, + "learning_rate": 0.0005382854964722571, + "loss": 2.6748, + "step": 16217 + }, + { + "epoch": 0.4809180677875634, + "grad_norm": 0.12949615716934204, + "learning_rate": 0.0005382385834774163, + "loss": 2.6551, + "step": 16218 + }, + { + "epoch": 0.4809477211398749, + "grad_norm": 0.1221383810043335, + "learning_rate": 0.0005381916701439663, + "loss": 2.6774, + "step": 16219 + }, + { + "epoch": 0.48097737449218636, + "grad_norm": 0.1306857019662857, + "learning_rate": 0.0005381447564723224, + "loss": 2.7204, + "step": 16220 + }, + { + "epoch": 0.48100702784449784, + "grad_norm": 0.12079314142465591, + "learning_rate": 0.0005380978424629002, + "loss": 2.6562, + "step": 16221 + }, + { + "epoch": 0.4810366811968093, + "grad_norm": 0.12429002672433853, + "learning_rate": 0.0005380509281161151, + "loss": 2.6971, + "step": 16222 + }, + { + "epoch": 0.4810663345491208, + "grad_norm": 0.12342879176139832, + "learning_rate": 0.0005380040134323825, + "loss": 2.6974, + "step": 16223 + }, + { + "epoch": 0.48109598790143226, + "grad_norm": 0.11740649491548538, + "learning_rate": 0.0005379570984121178, + "loss": 2.6869, + "step": 16224 + }, + { + "epoch": 0.48112564125374374, + "grad_norm": 0.11398938298225403, + "learning_rate": 0.0005379101830557364, + "loss": 2.6861, + "step": 16225 + }, + { + "epoch": 0.4811552946060552, + "grad_norm": 0.11703451722860336, + "learning_rate": 0.000537863267363654, + "loss": 2.6744, + "step": 16226 + }, + { + "epoch": 0.4811849479583667, + "grad_norm": 0.12743788957595825, + "learning_rate": 0.0005378163513362858, + "loss": 2.6428, + "step": 16227 + }, + { + "epoch": 0.48121460131067817, + "grad_norm": 0.11827497184276581, + "learning_rate": 0.0005377694349740472, + "loss": 2.7021, + "step": 16228 + }, + { + "epoch": 0.48124425466298965, + "grad_norm": 0.11444864422082901, + "learning_rate": 0.000537722518277354, + "loss": 2.6867, + "step": 16229 + }, + { + "epoch": 0.4812739080153011, + "grad_norm": 0.12309153378009796, + "learning_rate": 0.0005376756012466213, + "loss": 2.7081, + "step": 16230 + }, + { + "epoch": 0.4813035613676126, + "grad_norm": 0.13040822744369507, + "learning_rate": 0.0005376286838822647, + "loss": 2.6937, + "step": 16231 + }, + { + "epoch": 0.4813332147199241, + "grad_norm": 0.13275246322155, + "learning_rate": 0.0005375817661846995, + "loss": 2.6833, + "step": 16232 + }, + { + "epoch": 0.48136286807223555, + "grad_norm": 0.1281968504190445, + "learning_rate": 0.0005375348481543415, + "loss": 2.6601, + "step": 16233 + }, + { + "epoch": 0.481392521424547, + "grad_norm": 0.11620988696813583, + "learning_rate": 0.000537487929791606, + "loss": 2.6735, + "step": 16234 + }, + { + "epoch": 0.4814221747768585, + "grad_norm": 0.10115765780210495, + "learning_rate": 0.0005374410110969084, + "loss": 2.6532, + "step": 16235 + }, + { + "epoch": 0.48145182812917, + "grad_norm": 0.12139761447906494, + "learning_rate": 0.0005373940920706641, + "loss": 2.6671, + "step": 16236 + }, + { + "epoch": 0.48148148148148145, + "grad_norm": 0.1342024803161621, + "learning_rate": 0.0005373471727132889, + "loss": 2.655, + "step": 16237 + }, + { + "epoch": 0.481511134833793, + "grad_norm": 0.11631223559379578, + "learning_rate": 0.0005373002530251978, + "loss": 2.6463, + "step": 16238 + }, + { + "epoch": 0.48154078818610446, + "grad_norm": 0.13570979237556458, + "learning_rate": 0.0005372533330068068, + "loss": 2.687, + "step": 16239 + }, + { + "epoch": 0.48157044153841594, + "grad_norm": 0.12504224479198456, + "learning_rate": 0.000537206412658531, + "loss": 2.6512, + "step": 16240 + }, + { + "epoch": 0.4816000948907274, + "grad_norm": 0.11920852214097977, + "learning_rate": 0.000537159491980786, + "loss": 2.6857, + "step": 16241 + }, + { + "epoch": 0.4816297482430389, + "grad_norm": 0.10892195999622345, + "learning_rate": 0.0005371125709739875, + "loss": 2.6838, + "step": 16242 + }, + { + "epoch": 0.48165940159535037, + "grad_norm": 0.11850874125957489, + "learning_rate": 0.0005370656496385506, + "loss": 2.6647, + "step": 16243 + }, + { + "epoch": 0.48168905494766184, + "grad_norm": 0.11936180293560028, + "learning_rate": 0.0005370187279748913, + "loss": 2.6722, + "step": 16244 + }, + { + "epoch": 0.4817187082999733, + "grad_norm": 0.13473322987556458, + "learning_rate": 0.0005369718059834247, + "loss": 2.6624, + "step": 16245 + }, + { + "epoch": 0.4817483616522848, + "grad_norm": 0.1266491413116455, + "learning_rate": 0.0005369248836645663, + "loss": 2.7019, + "step": 16246 + }, + { + "epoch": 0.48177801500459627, + "grad_norm": 0.11904601007699966, + "learning_rate": 0.0005368779610187317, + "loss": 2.691, + "step": 16247 + }, + { + "epoch": 0.48180766835690775, + "grad_norm": 0.11507517099380493, + "learning_rate": 0.0005368310380463364, + "loss": 2.6671, + "step": 16248 + }, + { + "epoch": 0.4818373217092192, + "grad_norm": 0.11177773028612137, + "learning_rate": 0.0005367841147477961, + "loss": 2.6836, + "step": 16249 + }, + { + "epoch": 0.4818669750615307, + "grad_norm": 0.11122893542051315, + "learning_rate": 0.0005367371911235261, + "loss": 2.7124, + "step": 16250 + }, + { + "epoch": 0.4818966284138422, + "grad_norm": 0.09830452501773834, + "learning_rate": 0.000536690267173942, + "loss": 2.6466, + "step": 16251 + }, + { + "epoch": 0.48192628176615365, + "grad_norm": 0.12551207840442657, + "learning_rate": 0.0005366433428994591, + "loss": 2.6782, + "step": 16252 + }, + { + "epoch": 0.48195593511846513, + "grad_norm": 0.11412134021520615, + "learning_rate": 0.0005365964183004932, + "loss": 2.6764, + "step": 16253 + }, + { + "epoch": 0.4819855884707766, + "grad_norm": 0.11872732639312744, + "learning_rate": 0.0005365494933774599, + "loss": 2.6846, + "step": 16254 + }, + { + "epoch": 0.4820152418230881, + "grad_norm": 0.1199948638677597, + "learning_rate": 0.0005365025681307744, + "loss": 2.6841, + "step": 16255 + }, + { + "epoch": 0.48204489517539956, + "grad_norm": 0.12266243994235992, + "learning_rate": 0.0005364556425608524, + "loss": 2.7118, + "step": 16256 + }, + { + "epoch": 0.48207454852771103, + "grad_norm": 0.1336844563484192, + "learning_rate": 0.0005364087166681096, + "loss": 2.6789, + "step": 16257 + }, + { + "epoch": 0.4821042018800225, + "grad_norm": 0.1585523635149002, + "learning_rate": 0.0005363617904529612, + "loss": 2.6321, + "step": 16258 + }, + { + "epoch": 0.48213385523233404, + "grad_norm": 0.14001378417015076, + "learning_rate": 0.0005363148639158228, + "loss": 2.6819, + "step": 16259 + }, + { + "epoch": 0.4821635085846455, + "grad_norm": 0.13490445911884308, + "learning_rate": 0.0005362679370571102, + "loss": 2.6939, + "step": 16260 + }, + { + "epoch": 0.482193161936957, + "grad_norm": 0.13524554669857025, + "learning_rate": 0.0005362210098772388, + "loss": 2.6773, + "step": 16261 + }, + { + "epoch": 0.48222281528926847, + "grad_norm": 0.14650769531726837, + "learning_rate": 0.0005361740823766241, + "loss": 2.6795, + "step": 16262 + }, + { + "epoch": 0.48225246864157995, + "grad_norm": 0.12778949737548828, + "learning_rate": 0.0005361271545556817, + "loss": 2.6763, + "step": 16263 + }, + { + "epoch": 0.4822821219938914, + "grad_norm": 0.13295692205429077, + "learning_rate": 0.0005360802264148271, + "loss": 2.6646, + "step": 16264 + }, + { + "epoch": 0.4823117753462029, + "grad_norm": 0.11676135659217834, + "learning_rate": 0.000536033297954476, + "loss": 2.6817, + "step": 16265 + }, + { + "epoch": 0.4823414286985144, + "grad_norm": 0.12102721631526947, + "learning_rate": 0.0005359863691750437, + "loss": 2.6868, + "step": 16266 + }, + { + "epoch": 0.48237108205082585, + "grad_norm": 0.12021135538816452, + "learning_rate": 0.0005359394400769461, + "loss": 2.6942, + "step": 16267 + }, + { + "epoch": 0.4824007354031373, + "grad_norm": 0.11395756155252457, + "learning_rate": 0.0005358925106605985, + "loss": 2.6766, + "step": 16268 + }, + { + "epoch": 0.4824303887554488, + "grad_norm": 0.11662913113832474, + "learning_rate": 0.0005358455809264165, + "loss": 2.684, + "step": 16269 + }, + { + "epoch": 0.4824600421077603, + "grad_norm": 0.12648700177669525, + "learning_rate": 0.0005357986508748158, + "loss": 2.6336, + "step": 16270 + }, + { + "epoch": 0.48248969546007175, + "grad_norm": 0.111479252576828, + "learning_rate": 0.000535751720506212, + "loss": 2.691, + "step": 16271 + }, + { + "epoch": 0.48251934881238323, + "grad_norm": 0.11953005939722061, + "learning_rate": 0.0005357047898210205, + "loss": 2.6967, + "step": 16272 + }, + { + "epoch": 0.4825490021646947, + "grad_norm": 0.13541917502880096, + "learning_rate": 0.0005356578588196569, + "loss": 2.6939, + "step": 16273 + }, + { + "epoch": 0.4825786555170062, + "grad_norm": 0.11919247359037399, + "learning_rate": 0.000535610927502537, + "loss": 2.66, + "step": 16274 + }, + { + "epoch": 0.48260830886931766, + "grad_norm": 0.11613312363624573, + "learning_rate": 0.0005355639958700759, + "loss": 2.6594, + "step": 16275 + }, + { + "epoch": 0.48263796222162914, + "grad_norm": 0.14501790702342987, + "learning_rate": 0.0005355170639226898, + "loss": 2.6887, + "step": 16276 + }, + { + "epoch": 0.4826676155739406, + "grad_norm": 0.1202756017446518, + "learning_rate": 0.000535470131660794, + "loss": 2.6863, + "step": 16277 + }, + { + "epoch": 0.4826972689262521, + "grad_norm": 0.12351898849010468, + "learning_rate": 0.0005354231990848041, + "loss": 2.6618, + "step": 16278 + }, + { + "epoch": 0.4827269222785636, + "grad_norm": 0.12756569683551788, + "learning_rate": 0.0005353762661951354, + "loss": 2.6811, + "step": 16279 + }, + { + "epoch": 0.4827565756308751, + "grad_norm": 0.12564699351787567, + "learning_rate": 0.000535329332992204, + "loss": 2.6935, + "step": 16280 + }, + { + "epoch": 0.48278622898318657, + "grad_norm": 0.12785950303077698, + "learning_rate": 0.0005352823994764253, + "loss": 2.6883, + "step": 16281 + }, + { + "epoch": 0.48281588233549805, + "grad_norm": 0.1264794021844864, + "learning_rate": 0.000535235465648215, + "loss": 2.6767, + "step": 16282 + }, + { + "epoch": 0.4828455356878095, + "grad_norm": 0.1250910758972168, + "learning_rate": 0.0005351885315079885, + "loss": 2.7283, + "step": 16283 + }, + { + "epoch": 0.482875189040121, + "grad_norm": 0.1253638118505478, + "learning_rate": 0.0005351415970561615, + "loss": 2.6656, + "step": 16284 + }, + { + "epoch": 0.4829048423924325, + "grad_norm": 0.11606168001890182, + "learning_rate": 0.0005350946622931495, + "loss": 2.6731, + "step": 16285 + }, + { + "epoch": 0.48293449574474395, + "grad_norm": 0.12699490785598755, + "learning_rate": 0.0005350477272193684, + "loss": 2.6826, + "step": 16286 + }, + { + "epoch": 0.48296414909705543, + "grad_norm": 0.125941202044487, + "learning_rate": 0.0005350007918352336, + "loss": 2.6897, + "step": 16287 + }, + { + "epoch": 0.4829938024493669, + "grad_norm": 0.11832831054925919, + "learning_rate": 0.0005349538561411609, + "loss": 2.6435, + "step": 16288 + }, + { + "epoch": 0.4830234558016784, + "grad_norm": 0.10120267421007156, + "learning_rate": 0.0005349069201375657, + "loss": 2.6815, + "step": 16289 + }, + { + "epoch": 0.48305310915398986, + "grad_norm": 0.11653069406747818, + "learning_rate": 0.0005348599838248637, + "loss": 2.7035, + "step": 16290 + }, + { + "epoch": 0.48308276250630133, + "grad_norm": 0.12154360115528107, + "learning_rate": 0.0005348130472034707, + "loss": 2.7139, + "step": 16291 + }, + { + "epoch": 0.4831124158586128, + "grad_norm": 0.11884314566850662, + "learning_rate": 0.0005347661102738019, + "loss": 2.6923, + "step": 16292 + }, + { + "epoch": 0.4831420692109243, + "grad_norm": 0.11683948338031769, + "learning_rate": 0.0005347191730362736, + "loss": 2.6958, + "step": 16293 + }, + { + "epoch": 0.48317172256323576, + "grad_norm": 0.12652502954006195, + "learning_rate": 0.0005346722354913009, + "loss": 2.6933, + "step": 16294 + }, + { + "epoch": 0.48320137591554724, + "grad_norm": 0.12654612958431244, + "learning_rate": 0.0005346252976392995, + "loss": 2.6926, + "step": 16295 + }, + { + "epoch": 0.4832310292678587, + "grad_norm": 0.13510707020759583, + "learning_rate": 0.0005345783594806852, + "loss": 2.6512, + "step": 16296 + }, + { + "epoch": 0.4832606826201702, + "grad_norm": 0.12207337468862534, + "learning_rate": 0.0005345314210158737, + "loss": 2.6694, + "step": 16297 + }, + { + "epoch": 0.48329033597248167, + "grad_norm": 0.12673021852970123, + "learning_rate": 0.0005344844822452805, + "loss": 2.6845, + "step": 16298 + }, + { + "epoch": 0.48331998932479314, + "grad_norm": 0.13381248712539673, + "learning_rate": 0.0005344375431693213, + "loss": 2.6653, + "step": 16299 + }, + { + "epoch": 0.4833496426771047, + "grad_norm": 0.14085286855697632, + "learning_rate": 0.0005343906037884117, + "loss": 2.715, + "step": 16300 + }, + { + "epoch": 0.48337929602941615, + "grad_norm": 0.1256152242422104, + "learning_rate": 0.0005343436641029673, + "loss": 2.6704, + "step": 16301 + }, + { + "epoch": 0.4834089493817276, + "grad_norm": 0.11653783172369003, + "learning_rate": 0.000534296724113404, + "loss": 2.6842, + "step": 16302 + }, + { + "epoch": 0.4834386027340391, + "grad_norm": 0.11782057583332062, + "learning_rate": 0.0005342497838201373, + "loss": 2.705, + "step": 16303 + }, + { + "epoch": 0.4834682560863506, + "grad_norm": 0.11718530207872391, + "learning_rate": 0.0005342028432235828, + "loss": 2.6804, + "step": 16304 + }, + { + "epoch": 0.48349790943866205, + "grad_norm": 0.12146604061126709, + "learning_rate": 0.0005341559023241564, + "loss": 2.7171, + "step": 16305 + }, + { + "epoch": 0.48352756279097353, + "grad_norm": 0.1437203288078308, + "learning_rate": 0.0005341089611222735, + "loss": 2.7047, + "step": 16306 + }, + { + "epoch": 0.483557216143285, + "grad_norm": 0.15838930010795593, + "learning_rate": 0.0005340620196183499, + "loss": 2.7124, + "step": 16307 + }, + { + "epoch": 0.4835868694955965, + "grad_norm": 0.1480066478252411, + "learning_rate": 0.0005340150778128013, + "loss": 2.7077, + "step": 16308 + }, + { + "epoch": 0.48361652284790796, + "grad_norm": 0.13640962541103363, + "learning_rate": 0.0005339681357060433, + "loss": 2.672, + "step": 16309 + }, + { + "epoch": 0.48364617620021944, + "grad_norm": 0.1467674821615219, + "learning_rate": 0.0005339211932984918, + "loss": 2.7304, + "step": 16310 + }, + { + "epoch": 0.4836758295525309, + "grad_norm": 0.12416906654834747, + "learning_rate": 0.0005338742505905621, + "loss": 2.6371, + "step": 16311 + }, + { + "epoch": 0.4837054829048424, + "grad_norm": 0.10458739846944809, + "learning_rate": 0.0005338273075826702, + "loss": 2.69, + "step": 16312 + }, + { + "epoch": 0.48373513625715386, + "grad_norm": 0.1209997832775116, + "learning_rate": 0.0005337803642752317, + "loss": 2.6576, + "step": 16313 + }, + { + "epoch": 0.48376478960946534, + "grad_norm": 0.10708818584680557, + "learning_rate": 0.0005337334206686622, + "loss": 2.704, + "step": 16314 + }, + { + "epoch": 0.4837944429617768, + "grad_norm": 0.1172490268945694, + "learning_rate": 0.0005336864767633777, + "loss": 2.7051, + "step": 16315 + }, + { + "epoch": 0.4838240963140883, + "grad_norm": 0.11761656403541565, + "learning_rate": 0.0005336395325597935, + "loss": 2.6684, + "step": 16316 + }, + { + "epoch": 0.48385374966639977, + "grad_norm": 0.12530061602592468, + "learning_rate": 0.0005335925880583253, + "loss": 2.6514, + "step": 16317 + }, + { + "epoch": 0.48388340301871124, + "grad_norm": 0.13953912258148193, + "learning_rate": 0.0005335456432593891, + "loss": 2.6367, + "step": 16318 + }, + { + "epoch": 0.4839130563710227, + "grad_norm": 0.10726761072874069, + "learning_rate": 0.0005334986981634004, + "loss": 2.6929, + "step": 16319 + }, + { + "epoch": 0.4839427097233342, + "grad_norm": 0.11250460147857666, + "learning_rate": 0.0005334517527707752, + "loss": 2.6522, + "step": 16320 + }, + { + "epoch": 0.48397236307564573, + "grad_norm": 0.1163206696510315, + "learning_rate": 0.0005334048070819289, + "loss": 2.6943, + "step": 16321 + }, + { + "epoch": 0.4840020164279572, + "grad_norm": 0.12222614884376526, + "learning_rate": 0.0005333578610972773, + "loss": 2.662, + "step": 16322 + }, + { + "epoch": 0.4840316697802687, + "grad_norm": 0.11820466071367264, + "learning_rate": 0.000533310914817236, + "loss": 2.6765, + "step": 16323 + }, + { + "epoch": 0.48406132313258016, + "grad_norm": 0.15263791382312775, + "learning_rate": 0.0005332639682422207, + "loss": 2.6774, + "step": 16324 + }, + { + "epoch": 0.48409097648489163, + "grad_norm": 0.14584262669086456, + "learning_rate": 0.0005332170213726475, + "loss": 2.6854, + "step": 16325 + }, + { + "epoch": 0.4841206298372031, + "grad_norm": 0.11600663512945175, + "learning_rate": 0.0005331700742089319, + "loss": 2.6968, + "step": 16326 + }, + { + "epoch": 0.4841502831895146, + "grad_norm": 0.12486912310123444, + "learning_rate": 0.0005331231267514896, + "loss": 2.7086, + "step": 16327 + }, + { + "epoch": 0.48417993654182606, + "grad_norm": 0.1316758692264557, + "learning_rate": 0.0005330761790007363, + "loss": 2.6785, + "step": 16328 + }, + { + "epoch": 0.48420958989413754, + "grad_norm": 0.15208280086517334, + "learning_rate": 0.0005330292309570876, + "loss": 2.6766, + "step": 16329 + }, + { + "epoch": 0.484239243246449, + "grad_norm": 0.13685816526412964, + "learning_rate": 0.0005329822826209595, + "loss": 2.6636, + "step": 16330 + }, + { + "epoch": 0.4842688965987605, + "grad_norm": 0.12500016391277313, + "learning_rate": 0.0005329353339927676, + "loss": 2.6639, + "step": 16331 + }, + { + "epoch": 0.48429854995107197, + "grad_norm": 0.12949568033218384, + "learning_rate": 0.0005328883850729277, + "loss": 2.7113, + "step": 16332 + }, + { + "epoch": 0.48432820330338344, + "grad_norm": 0.12972776591777802, + "learning_rate": 0.0005328414358618555, + "loss": 2.68, + "step": 16333 + }, + { + "epoch": 0.4843578566556949, + "grad_norm": 0.1588086038827896, + "learning_rate": 0.0005327944863599668, + "loss": 2.6927, + "step": 16334 + }, + { + "epoch": 0.4843875100080064, + "grad_norm": 0.14779815077781677, + "learning_rate": 0.0005327475365676772, + "loss": 2.6242, + "step": 16335 + }, + { + "epoch": 0.48441716336031787, + "grad_norm": 0.11838354915380478, + "learning_rate": 0.0005327005864854026, + "loss": 2.6848, + "step": 16336 + }, + { + "epoch": 0.48444681671262935, + "grad_norm": 0.11834205687046051, + "learning_rate": 0.0005326536361135586, + "loss": 2.6683, + "step": 16337 + }, + { + "epoch": 0.4844764700649408, + "grad_norm": 0.12031041830778122, + "learning_rate": 0.0005326066854525612, + "loss": 2.6801, + "step": 16338 + }, + { + "epoch": 0.4845061234172523, + "grad_norm": 0.11558765918016434, + "learning_rate": 0.0005325597345028259, + "loss": 2.6973, + "step": 16339 + }, + { + "epoch": 0.4845357767695638, + "grad_norm": 0.11680158227682114, + "learning_rate": 0.0005325127832647687, + "loss": 2.6983, + "step": 16340 + }, + { + "epoch": 0.48456543012187525, + "grad_norm": 0.12156539410352707, + "learning_rate": 0.0005324658317388049, + "loss": 2.6823, + "step": 16341 + }, + { + "epoch": 0.4845950834741868, + "grad_norm": 0.11080078780651093, + "learning_rate": 0.0005324188799253509, + "loss": 2.6735, + "step": 16342 + }, + { + "epoch": 0.48462473682649826, + "grad_norm": 0.1216401681303978, + "learning_rate": 0.000532371927824822, + "loss": 2.6774, + "step": 16343 + }, + { + "epoch": 0.48465439017880974, + "grad_norm": 0.11120451241731644, + "learning_rate": 0.0005323249754376341, + "loss": 2.6825, + "step": 16344 + }, + { + "epoch": 0.4846840435311212, + "grad_norm": 0.1294967085123062, + "learning_rate": 0.0005322780227642031, + "loss": 2.6766, + "step": 16345 + }, + { + "epoch": 0.4847136968834327, + "grad_norm": 0.12034770101308823, + "learning_rate": 0.0005322310698049446, + "loss": 2.7048, + "step": 16346 + }, + { + "epoch": 0.48474335023574416, + "grad_norm": 0.12708286941051483, + "learning_rate": 0.0005321841165602746, + "loss": 2.6586, + "step": 16347 + }, + { + "epoch": 0.48477300358805564, + "grad_norm": 0.11800234764814377, + "learning_rate": 0.0005321371630306087, + "loss": 2.6812, + "step": 16348 + }, + { + "epoch": 0.4848026569403671, + "grad_norm": 0.11036945134401321, + "learning_rate": 0.0005320902092163625, + "loss": 2.6924, + "step": 16349 + }, + { + "epoch": 0.4848323102926786, + "grad_norm": 0.10510902851819992, + "learning_rate": 0.000532043255117952, + "loss": 2.6733, + "step": 16350 + }, + { + "epoch": 0.48486196364499007, + "grad_norm": 0.13088449835777283, + "learning_rate": 0.0005319963007357931, + "loss": 2.7098, + "step": 16351 + }, + { + "epoch": 0.48489161699730154, + "grad_norm": 0.12737436592578888, + "learning_rate": 0.0005319493460703014, + "loss": 2.7034, + "step": 16352 + }, + { + "epoch": 0.484921270349613, + "grad_norm": 0.13569295406341553, + "learning_rate": 0.000531902391121893, + "loss": 2.6724, + "step": 16353 + }, + { + "epoch": 0.4849509237019245, + "grad_norm": 0.14292480051517487, + "learning_rate": 0.0005318554358909832, + "loss": 2.6901, + "step": 16354 + }, + { + "epoch": 0.484980577054236, + "grad_norm": 0.12360561639070511, + "learning_rate": 0.000531808480377988, + "loss": 2.6638, + "step": 16355 + }, + { + "epoch": 0.48501023040654745, + "grad_norm": 0.11767850071191788, + "learning_rate": 0.0005317615245833232, + "loss": 2.6915, + "step": 16356 + }, + { + "epoch": 0.4850398837588589, + "grad_norm": 0.12631385028362274, + "learning_rate": 0.0005317145685074049, + "loss": 2.7044, + "step": 16357 + }, + { + "epoch": 0.4850695371111704, + "grad_norm": 0.12683087587356567, + "learning_rate": 0.0005316676121506485, + "loss": 2.67, + "step": 16358 + }, + { + "epoch": 0.4850991904634819, + "grad_norm": 0.11157191544771194, + "learning_rate": 0.0005316206555134701, + "loss": 2.6713, + "step": 16359 + }, + { + "epoch": 0.48512884381579335, + "grad_norm": 0.11043563485145569, + "learning_rate": 0.0005315736985962852, + "loss": 2.6662, + "step": 16360 + }, + { + "epoch": 0.48515849716810483, + "grad_norm": 0.1255379319190979, + "learning_rate": 0.00053152674139951, + "loss": 2.7199, + "step": 16361 + }, + { + "epoch": 0.4851881505204163, + "grad_norm": 0.119609534740448, + "learning_rate": 0.0005314797839235599, + "loss": 2.6806, + "step": 16362 + }, + { + "epoch": 0.48521780387272784, + "grad_norm": 0.12513108551502228, + "learning_rate": 0.0005314328261688508, + "loss": 2.6663, + "step": 16363 + }, + { + "epoch": 0.4852474572250393, + "grad_norm": 0.13464385271072388, + "learning_rate": 0.0005313858681357988, + "loss": 2.6344, + "step": 16364 + }, + { + "epoch": 0.4852771105773508, + "grad_norm": 0.16208486258983612, + "learning_rate": 0.0005313389098248196, + "loss": 2.6919, + "step": 16365 + }, + { + "epoch": 0.48530676392966227, + "grad_norm": 0.1612870842218399, + "learning_rate": 0.0005312919512363289, + "loss": 2.723, + "step": 16366 + }, + { + "epoch": 0.48533641728197374, + "grad_norm": 0.14010031521320343, + "learning_rate": 0.0005312449923707425, + "loss": 2.6838, + "step": 16367 + }, + { + "epoch": 0.4853660706342852, + "grad_norm": 0.1264878213405609, + "learning_rate": 0.0005311980332284765, + "loss": 2.6602, + "step": 16368 + }, + { + "epoch": 0.4853957239865967, + "grad_norm": 0.1165078729391098, + "learning_rate": 0.0005311510738099465, + "loss": 2.6955, + "step": 16369 + }, + { + "epoch": 0.48542537733890817, + "grad_norm": 0.14400693774223328, + "learning_rate": 0.0005311041141155684, + "loss": 2.707, + "step": 16370 + }, + { + "epoch": 0.48545503069121965, + "grad_norm": 0.14980822801589966, + "learning_rate": 0.0005310571541457579, + "loss": 2.7024, + "step": 16371 + }, + { + "epoch": 0.4854846840435311, + "grad_norm": 0.12457037717103958, + "learning_rate": 0.0005310101939009312, + "loss": 2.6745, + "step": 16372 + }, + { + "epoch": 0.4855143373958426, + "grad_norm": 0.14287549257278442, + "learning_rate": 0.0005309632333815038, + "loss": 2.6655, + "step": 16373 + }, + { + "epoch": 0.4855439907481541, + "grad_norm": 0.1289505958557129, + "learning_rate": 0.0005309162725878917, + "loss": 2.7097, + "step": 16374 + }, + { + "epoch": 0.48557364410046555, + "grad_norm": 0.12867425382137299, + "learning_rate": 0.0005308693115205106, + "loss": 2.7135, + "step": 16375 + }, + { + "epoch": 0.485603297452777, + "grad_norm": 0.15178096294403076, + "learning_rate": 0.0005308223501797765, + "loss": 2.6953, + "step": 16376 + }, + { + "epoch": 0.4856329508050885, + "grad_norm": 0.1374509334564209, + "learning_rate": 0.0005307753885661052, + "loss": 2.7172, + "step": 16377 + }, + { + "epoch": 0.4856626041574, + "grad_norm": 0.13556157052516937, + "learning_rate": 0.0005307284266799125, + "loss": 2.6816, + "step": 16378 + }, + { + "epoch": 0.48569225750971146, + "grad_norm": 0.12656109035015106, + "learning_rate": 0.0005306814645216144, + "loss": 2.6721, + "step": 16379 + }, + { + "epoch": 0.48572191086202293, + "grad_norm": 0.13554497063159943, + "learning_rate": 0.0005306345020916265, + "loss": 2.6925, + "step": 16380 + }, + { + "epoch": 0.4857515642143344, + "grad_norm": 0.1416948139667511, + "learning_rate": 0.0005305875393903651, + "loss": 2.6817, + "step": 16381 + }, + { + "epoch": 0.4857812175666459, + "grad_norm": 0.14121825993061066, + "learning_rate": 0.0005305405764182455, + "loss": 2.6405, + "step": 16382 + }, + { + "epoch": 0.48581087091895736, + "grad_norm": 0.1233873963356018, + "learning_rate": 0.000530493613175684, + "loss": 2.7146, + "step": 16383 + }, + { + "epoch": 0.4858405242712689, + "grad_norm": 0.12637050449848175, + "learning_rate": 0.0005304466496630963, + "loss": 2.6695, + "step": 16384 + }, + { + "epoch": 0.48587017762358037, + "grad_norm": 0.13879409432411194, + "learning_rate": 0.0005303996858808983, + "loss": 2.659, + "step": 16385 + }, + { + "epoch": 0.48589983097589184, + "grad_norm": 0.1159442588686943, + "learning_rate": 0.000530352721829506, + "loss": 2.6845, + "step": 16386 + }, + { + "epoch": 0.4859294843282033, + "grad_norm": 0.11535979062318802, + "learning_rate": 0.000530305757509335, + "loss": 2.6976, + "step": 16387 + }, + { + "epoch": 0.4859591376805148, + "grad_norm": 0.13783234357833862, + "learning_rate": 0.0005302587929208012, + "loss": 2.655, + "step": 16388 + }, + { + "epoch": 0.4859887910328263, + "grad_norm": 0.14978908002376556, + "learning_rate": 0.0005302118280643206, + "loss": 2.6773, + "step": 16389 + }, + { + "epoch": 0.48601844438513775, + "grad_norm": 0.11770045757293701, + "learning_rate": 0.0005301648629403093, + "loss": 2.6546, + "step": 16390 + }, + { + "epoch": 0.4860480977374492, + "grad_norm": 0.1299421340227127, + "learning_rate": 0.000530117897549183, + "loss": 2.6627, + "step": 16391 + }, + { + "epoch": 0.4860777510897607, + "grad_norm": 0.13317285478115082, + "learning_rate": 0.0005300709318913574, + "loss": 2.7247, + "step": 16392 + }, + { + "epoch": 0.4861074044420722, + "grad_norm": 0.13499382138252258, + "learning_rate": 0.0005300239659672485, + "loss": 2.6824, + "step": 16393 + }, + { + "epoch": 0.48613705779438365, + "grad_norm": 0.12062566727399826, + "learning_rate": 0.0005299769997772722, + "loss": 2.7288, + "step": 16394 + }, + { + "epoch": 0.48616671114669513, + "grad_norm": 0.11978999525308609, + "learning_rate": 0.0005299300333218444, + "loss": 2.6765, + "step": 16395 + }, + { + "epoch": 0.4861963644990066, + "grad_norm": 0.12640872597694397, + "learning_rate": 0.0005298830666013811, + "loss": 2.7041, + "step": 16396 + }, + { + "epoch": 0.4862260178513181, + "grad_norm": 0.11971946805715561, + "learning_rate": 0.0005298360996162982, + "loss": 2.679, + "step": 16397 + }, + { + "epoch": 0.48625567120362956, + "grad_norm": 0.12416648119688034, + "learning_rate": 0.0005297891323670115, + "loss": 2.6978, + "step": 16398 + }, + { + "epoch": 0.48628532455594103, + "grad_norm": 0.11686888337135315, + "learning_rate": 0.0005297421648539367, + "loss": 2.7036, + "step": 16399 + }, + { + "epoch": 0.4863149779082525, + "grad_norm": 0.13100723922252655, + "learning_rate": 0.0005296951970774901, + "loss": 2.651, + "step": 16400 + }, + { + "epoch": 0.486344631260564, + "grad_norm": 0.11423388123512268, + "learning_rate": 0.0005296482290380874, + "loss": 2.6756, + "step": 16401 + }, + { + "epoch": 0.48637428461287546, + "grad_norm": 0.1257484257221222, + "learning_rate": 0.0005296012607361446, + "loss": 2.7015, + "step": 16402 + }, + { + "epoch": 0.48640393796518694, + "grad_norm": 0.13991859555244446, + "learning_rate": 0.0005295542921720776, + "loss": 2.6627, + "step": 16403 + }, + { + "epoch": 0.48643359131749847, + "grad_norm": 0.13081829249858856, + "learning_rate": 0.000529507323346302, + "loss": 2.7057, + "step": 16404 + }, + { + "epoch": 0.48646324466980995, + "grad_norm": 0.14375796914100647, + "learning_rate": 0.0005294603542592342, + "loss": 2.7231, + "step": 16405 + }, + { + "epoch": 0.4864928980221214, + "grad_norm": 0.12476266920566559, + "learning_rate": 0.0005294133849112899, + "loss": 2.6822, + "step": 16406 + }, + { + "epoch": 0.4865225513744329, + "grad_norm": 0.11669453233480453, + "learning_rate": 0.0005293664153028849, + "loss": 2.6835, + "step": 16407 + }, + { + "epoch": 0.4865522047267444, + "grad_norm": 0.12800633907318115, + "learning_rate": 0.0005293194454344354, + "loss": 2.6705, + "step": 16408 + }, + { + "epoch": 0.48658185807905585, + "grad_norm": 0.12448237836360931, + "learning_rate": 0.0005292724753063571, + "loss": 2.6761, + "step": 16409 + }, + { + "epoch": 0.4866115114313673, + "grad_norm": 0.10934650897979736, + "learning_rate": 0.0005292255049190661, + "loss": 2.654, + "step": 16410 + }, + { + "epoch": 0.4866411647836788, + "grad_norm": 0.12018801271915436, + "learning_rate": 0.0005291785342729781, + "loss": 2.6693, + "step": 16411 + }, + { + "epoch": 0.4866708181359903, + "grad_norm": 0.11825084686279297, + "learning_rate": 0.0005291315633685094, + "loss": 2.6738, + "step": 16412 + }, + { + "epoch": 0.48670047148830176, + "grad_norm": 0.1163141131401062, + "learning_rate": 0.0005290845922060754, + "loss": 2.6422, + "step": 16413 + }, + { + "epoch": 0.48673012484061323, + "grad_norm": 0.11213621497154236, + "learning_rate": 0.0005290376207860927, + "loss": 2.703, + "step": 16414 + }, + { + "epoch": 0.4867597781929247, + "grad_norm": 0.1214684396982193, + "learning_rate": 0.0005289906491089765, + "loss": 2.7019, + "step": 16415 + }, + { + "epoch": 0.4867894315452362, + "grad_norm": 0.12254076451063156, + "learning_rate": 0.0005289436771751434, + "loss": 2.6875, + "step": 16416 + }, + { + "epoch": 0.48681908489754766, + "grad_norm": 0.1084083691239357, + "learning_rate": 0.0005288967049850089, + "loss": 2.6759, + "step": 16417 + }, + { + "epoch": 0.48684873824985914, + "grad_norm": 0.11206736415624619, + "learning_rate": 0.0005288497325389892, + "loss": 2.692, + "step": 16418 + }, + { + "epoch": 0.4868783916021706, + "grad_norm": 0.10648388415575027, + "learning_rate": 0.0005288027598375003, + "loss": 2.6696, + "step": 16419 + }, + { + "epoch": 0.4869080449544821, + "grad_norm": 0.11272355914115906, + "learning_rate": 0.0005287557868809579, + "loss": 2.7125, + "step": 16420 + }, + { + "epoch": 0.48693769830679356, + "grad_norm": 0.11563627421855927, + "learning_rate": 0.0005287088136697778, + "loss": 2.6747, + "step": 16421 + }, + { + "epoch": 0.48696735165910504, + "grad_norm": 0.12039876729249954, + "learning_rate": 0.0005286618402043765, + "loss": 2.6756, + "step": 16422 + }, + { + "epoch": 0.4869970050114165, + "grad_norm": 0.10569090396165848, + "learning_rate": 0.0005286148664851697, + "loss": 2.7036, + "step": 16423 + }, + { + "epoch": 0.487026658363728, + "grad_norm": 0.11325668543577194, + "learning_rate": 0.0005285678925125735, + "loss": 2.6606, + "step": 16424 + }, + { + "epoch": 0.4870563117160395, + "grad_norm": 0.13565607368946075, + "learning_rate": 0.0005285209182870034, + "loss": 2.6993, + "step": 16425 + }, + { + "epoch": 0.487085965068351, + "grad_norm": 0.13939426839351654, + "learning_rate": 0.0005284739438088758, + "loss": 2.6968, + "step": 16426 + }, + { + "epoch": 0.4871156184206625, + "grad_norm": 0.13239537179470062, + "learning_rate": 0.0005284269690786066, + "loss": 2.6883, + "step": 16427 + }, + { + "epoch": 0.48714527177297395, + "grad_norm": 0.10031478852033615, + "learning_rate": 0.0005283799940966114, + "loss": 2.6959, + "step": 16428 + }, + { + "epoch": 0.48717492512528543, + "grad_norm": 0.12035021930932999, + "learning_rate": 0.0005283330188633068, + "loss": 2.7154, + "step": 16429 + }, + { + "epoch": 0.4872045784775969, + "grad_norm": 0.13803695142269135, + "learning_rate": 0.0005282860433791083, + "loss": 2.715, + "step": 16430 + }, + { + "epoch": 0.4872342318299084, + "grad_norm": 0.12120247632265091, + "learning_rate": 0.0005282390676444322, + "loss": 2.6956, + "step": 16431 + }, + { + "epoch": 0.48726388518221986, + "grad_norm": 0.10911356657743454, + "learning_rate": 0.0005281920916596942, + "loss": 2.6909, + "step": 16432 + }, + { + "epoch": 0.48729353853453133, + "grad_norm": 0.11961270868778229, + "learning_rate": 0.0005281451154253104, + "loss": 2.6603, + "step": 16433 + }, + { + "epoch": 0.4873231918868428, + "grad_norm": 0.11574705690145493, + "learning_rate": 0.0005280981389416966, + "loss": 2.7023, + "step": 16434 + }, + { + "epoch": 0.4873528452391543, + "grad_norm": 0.11670735478401184, + "learning_rate": 0.0005280511622092693, + "loss": 2.6571, + "step": 16435 + }, + { + "epoch": 0.48738249859146576, + "grad_norm": 0.12336771190166473, + "learning_rate": 0.0005280041852284439, + "loss": 2.6827, + "step": 16436 + }, + { + "epoch": 0.48741215194377724, + "grad_norm": 0.16042859852313995, + "learning_rate": 0.0005279572079996367, + "loss": 2.7017, + "step": 16437 + }, + { + "epoch": 0.4874418052960887, + "grad_norm": 0.1589667946100235, + "learning_rate": 0.0005279102305232637, + "loss": 2.6949, + "step": 16438 + }, + { + "epoch": 0.4874714586484002, + "grad_norm": 0.13386116921901703, + "learning_rate": 0.0005278632527997407, + "loss": 2.691, + "step": 16439 + }, + { + "epoch": 0.48750111200071167, + "grad_norm": 0.14213211834430695, + "learning_rate": 0.0005278162748294839, + "loss": 2.6333, + "step": 16440 + }, + { + "epoch": 0.48753076535302314, + "grad_norm": 0.15452457964420319, + "learning_rate": 0.0005277692966129091, + "loss": 2.6589, + "step": 16441 + }, + { + "epoch": 0.4875604187053346, + "grad_norm": 0.13808061182498932, + "learning_rate": 0.0005277223181504324, + "loss": 2.6633, + "step": 16442 + }, + { + "epoch": 0.4875900720576461, + "grad_norm": 0.1359531432390213, + "learning_rate": 0.00052767533944247, + "loss": 2.6737, + "step": 16443 + }, + { + "epoch": 0.48761972540995757, + "grad_norm": 0.12544265389442444, + "learning_rate": 0.0005276283604894376, + "loss": 2.7119, + "step": 16444 + }, + { + "epoch": 0.48764937876226905, + "grad_norm": 0.12959349155426025, + "learning_rate": 0.0005275813812917514, + "loss": 2.6632, + "step": 16445 + }, + { + "epoch": 0.4876790321145806, + "grad_norm": 0.14007090032100677, + "learning_rate": 0.0005275344018498274, + "loss": 2.6964, + "step": 16446 + }, + { + "epoch": 0.48770868546689206, + "grad_norm": 0.1330588012933731, + "learning_rate": 0.0005274874221640813, + "loss": 2.6869, + "step": 16447 + }, + { + "epoch": 0.48773833881920353, + "grad_norm": 0.11279942095279694, + "learning_rate": 0.0005274404422349295, + "loss": 2.6596, + "step": 16448 + }, + { + "epoch": 0.487767992171515, + "grad_norm": 0.1324022263288498, + "learning_rate": 0.0005273934620627878, + "loss": 2.7031, + "step": 16449 + }, + { + "epoch": 0.4877976455238265, + "grad_norm": 0.10804242640733719, + "learning_rate": 0.0005273464816480723, + "loss": 2.6745, + "step": 16450 + }, + { + "epoch": 0.48782729887613796, + "grad_norm": 0.12099061161279678, + "learning_rate": 0.0005272995009911991, + "loss": 2.7142, + "step": 16451 + }, + { + "epoch": 0.48785695222844944, + "grad_norm": 0.11827746033668518, + "learning_rate": 0.0005272525200925842, + "loss": 2.6545, + "step": 16452 + }, + { + "epoch": 0.4878866055807609, + "grad_norm": 0.1213391125202179, + "learning_rate": 0.0005272055389526434, + "loss": 2.648, + "step": 16453 + }, + { + "epoch": 0.4879162589330724, + "grad_norm": 0.11682162433862686, + "learning_rate": 0.0005271585575717929, + "loss": 2.654, + "step": 16454 + }, + { + "epoch": 0.48794591228538386, + "grad_norm": 0.10073983669281006, + "learning_rate": 0.0005271115759504487, + "loss": 2.6715, + "step": 16455 + }, + { + "epoch": 0.48797556563769534, + "grad_norm": 0.10750997066497803, + "learning_rate": 0.0005270645940890268, + "loss": 2.6655, + "step": 16456 + }, + { + "epoch": 0.4880052189900068, + "grad_norm": 0.12024112790822983, + "learning_rate": 0.0005270176119879435, + "loss": 2.7202, + "step": 16457 + }, + { + "epoch": 0.4880348723423183, + "grad_norm": 0.11448010057210922, + "learning_rate": 0.0005269706296476144, + "loss": 2.6606, + "step": 16458 + }, + { + "epoch": 0.48806452569462977, + "grad_norm": 0.11520732194185257, + "learning_rate": 0.0005269236470684559, + "loss": 2.7017, + "step": 16459 + }, + { + "epoch": 0.48809417904694125, + "grad_norm": 0.11270280927419662, + "learning_rate": 0.0005268766642508837, + "loss": 2.6756, + "step": 16460 + }, + { + "epoch": 0.4881238323992527, + "grad_norm": 0.11987046152353287, + "learning_rate": 0.0005268296811953141, + "loss": 2.6877, + "step": 16461 + }, + { + "epoch": 0.4881534857515642, + "grad_norm": 0.10963115096092224, + "learning_rate": 0.0005267826979021632, + "loss": 2.6672, + "step": 16462 + }, + { + "epoch": 0.4881831391038757, + "grad_norm": 0.11400225013494492, + "learning_rate": 0.0005267357143718468, + "loss": 2.6671, + "step": 16463 + }, + { + "epoch": 0.48821279245618715, + "grad_norm": 0.11442715674638748, + "learning_rate": 0.0005266887306047811, + "loss": 2.6849, + "step": 16464 + }, + { + "epoch": 0.4882424458084986, + "grad_norm": 0.12385961413383484, + "learning_rate": 0.000526641746601382, + "loss": 2.7038, + "step": 16465 + }, + { + "epoch": 0.4882720991608101, + "grad_norm": 0.11914391070604324, + "learning_rate": 0.0005265947623620656, + "loss": 2.7187, + "step": 16466 + }, + { + "epoch": 0.48830175251312163, + "grad_norm": 0.15011529624462128, + "learning_rate": 0.0005265477778872483, + "loss": 2.6292, + "step": 16467 + }, + { + "epoch": 0.4883314058654331, + "grad_norm": 0.14943881332874298, + "learning_rate": 0.0005265007931773457, + "loss": 2.6906, + "step": 16468 + }, + { + "epoch": 0.4883610592177446, + "grad_norm": 0.1441083550453186, + "learning_rate": 0.000526453808232774, + "loss": 2.6926, + "step": 16469 + }, + { + "epoch": 0.48839071257005606, + "grad_norm": 0.14557765424251556, + "learning_rate": 0.0005264068230539494, + "loss": 2.6868, + "step": 16470 + }, + { + "epoch": 0.48842036592236754, + "grad_norm": 0.12986382842063904, + "learning_rate": 0.0005263598376412877, + "loss": 2.7053, + "step": 16471 + }, + { + "epoch": 0.488450019274679, + "grad_norm": 0.13371595740318298, + "learning_rate": 0.0005263128519952052, + "loss": 2.6683, + "step": 16472 + }, + { + "epoch": 0.4884796726269905, + "grad_norm": 0.14444772899150848, + "learning_rate": 0.0005262658661161178, + "loss": 2.6838, + "step": 16473 + }, + { + "epoch": 0.48850932597930197, + "grad_norm": 0.1343407779932022, + "learning_rate": 0.0005262188800044418, + "loss": 2.6996, + "step": 16474 + }, + { + "epoch": 0.48853897933161344, + "grad_norm": 0.11724133789539337, + "learning_rate": 0.0005261718936605931, + "loss": 2.7045, + "step": 16475 + }, + { + "epoch": 0.4885686326839249, + "grad_norm": 0.15029792487621307, + "learning_rate": 0.0005261249070849876, + "loss": 2.7131, + "step": 16476 + }, + { + "epoch": 0.4885982860362364, + "grad_norm": 0.13698425889015198, + "learning_rate": 0.0005260779202780417, + "loss": 2.7027, + "step": 16477 + }, + { + "epoch": 0.48862793938854787, + "grad_norm": 0.12723900377750397, + "learning_rate": 0.0005260309332401714, + "loss": 2.6997, + "step": 16478 + }, + { + "epoch": 0.48865759274085935, + "grad_norm": 0.13522924482822418, + "learning_rate": 0.0005259839459717926, + "loss": 2.7061, + "step": 16479 + }, + { + "epoch": 0.4886872460931708, + "grad_norm": 0.1418585628271103, + "learning_rate": 0.0005259369584733215, + "loss": 2.6934, + "step": 16480 + }, + { + "epoch": 0.4887168994454823, + "grad_norm": 0.13503876328468323, + "learning_rate": 0.0005258899707451742, + "loss": 2.6807, + "step": 16481 + }, + { + "epoch": 0.4887465527977938, + "grad_norm": 0.13541853427886963, + "learning_rate": 0.0005258429827877667, + "loss": 2.7026, + "step": 16482 + }, + { + "epoch": 0.48877620615010525, + "grad_norm": 0.13343073427677155, + "learning_rate": 0.0005257959946015154, + "loss": 2.6825, + "step": 16483 + }, + { + "epoch": 0.48880585950241673, + "grad_norm": 0.14573995769023895, + "learning_rate": 0.000525749006186836, + "loss": 2.6963, + "step": 16484 + }, + { + "epoch": 0.4888355128547282, + "grad_norm": 0.14213018119335175, + "learning_rate": 0.0005257020175441447, + "loss": 2.6652, + "step": 16485 + }, + { + "epoch": 0.4888651662070397, + "grad_norm": 0.14261652529239655, + "learning_rate": 0.0005256550286738575, + "loss": 2.6963, + "step": 16486 + }, + { + "epoch": 0.48889481955935116, + "grad_norm": 0.13892817497253418, + "learning_rate": 0.0005256080395763908, + "loss": 2.6787, + "step": 16487 + }, + { + "epoch": 0.4889244729116627, + "grad_norm": 0.12899544835090637, + "learning_rate": 0.0005255610502521605, + "loss": 2.7276, + "step": 16488 + }, + { + "epoch": 0.48895412626397416, + "grad_norm": 0.11870996654033661, + "learning_rate": 0.0005255140607015826, + "loss": 2.6841, + "step": 16489 + }, + { + "epoch": 0.48898377961628564, + "grad_norm": 0.12889407575130463, + "learning_rate": 0.0005254670709250735, + "loss": 2.6682, + "step": 16490 + }, + { + "epoch": 0.4890134329685971, + "grad_norm": 0.13168801367282867, + "learning_rate": 0.000525420080923049, + "loss": 2.6957, + "step": 16491 + }, + { + "epoch": 0.4890430863209086, + "grad_norm": 0.09854231029748917, + "learning_rate": 0.0005253730906959253, + "loss": 2.7102, + "step": 16492 + }, + { + "epoch": 0.48907273967322007, + "grad_norm": 0.13958217203617096, + "learning_rate": 0.0005253261002441186, + "loss": 2.7024, + "step": 16493 + }, + { + "epoch": 0.48910239302553155, + "grad_norm": 0.1367279440164566, + "learning_rate": 0.0005252791095680448, + "loss": 2.6524, + "step": 16494 + }, + { + "epoch": 0.489132046377843, + "grad_norm": 0.139836847782135, + "learning_rate": 0.0005252321186681204, + "loss": 2.6598, + "step": 16495 + }, + { + "epoch": 0.4891616997301545, + "grad_norm": 0.11842381209135056, + "learning_rate": 0.000525185127544761, + "loss": 2.6834, + "step": 16496 + }, + { + "epoch": 0.489191353082466, + "grad_norm": 0.1086762472987175, + "learning_rate": 0.0005251381361983831, + "loss": 2.6919, + "step": 16497 + }, + { + "epoch": 0.48922100643477745, + "grad_norm": 0.1265522837638855, + "learning_rate": 0.0005250911446294026, + "loss": 2.6907, + "step": 16498 + }, + { + "epoch": 0.4892506597870889, + "grad_norm": 0.1170874834060669, + "learning_rate": 0.0005250441528382357, + "loss": 2.6277, + "step": 16499 + }, + { + "epoch": 0.4892803131394004, + "grad_norm": 0.12412950396537781, + "learning_rate": 0.0005249971608252987, + "loss": 2.6893, + "step": 16500 + }, + { + "epoch": 0.4893099664917119, + "grad_norm": 0.13660438358783722, + "learning_rate": 0.0005249501685910074, + "loss": 2.641, + "step": 16501 + }, + { + "epoch": 0.48933961984402335, + "grad_norm": 0.12530483305454254, + "learning_rate": 0.0005249031761357781, + "loss": 2.6939, + "step": 16502 + }, + { + "epoch": 0.48936927319633483, + "grad_norm": 0.11015310883522034, + "learning_rate": 0.000524856183460027, + "loss": 2.703, + "step": 16503 + }, + { + "epoch": 0.4893989265486463, + "grad_norm": 0.11361132562160492, + "learning_rate": 0.00052480919056417, + "loss": 2.6825, + "step": 16504 + }, + { + "epoch": 0.4894285799009578, + "grad_norm": 0.11792714148759842, + "learning_rate": 0.0005247621974486233, + "loss": 2.6655, + "step": 16505 + }, + { + "epoch": 0.48945823325326926, + "grad_norm": 0.1200794205069542, + "learning_rate": 0.0005247152041138033, + "loss": 2.6539, + "step": 16506 + }, + { + "epoch": 0.48948788660558074, + "grad_norm": 0.12277314066886902, + "learning_rate": 0.0005246682105601257, + "loss": 2.6742, + "step": 16507 + }, + { + "epoch": 0.48951753995789227, + "grad_norm": 0.11625451594591141, + "learning_rate": 0.000524621216788007, + "loss": 2.6825, + "step": 16508 + }, + { + "epoch": 0.48954719331020374, + "grad_norm": 0.10038936883211136, + "learning_rate": 0.0005245742227978631, + "loss": 2.6331, + "step": 16509 + }, + { + "epoch": 0.4895768466625152, + "grad_norm": 0.11206018924713135, + "learning_rate": 0.0005245272285901104, + "loss": 2.6743, + "step": 16510 + }, + { + "epoch": 0.4896065000148267, + "grad_norm": 0.10023144632577896, + "learning_rate": 0.0005244802341651648, + "loss": 2.6583, + "step": 16511 + }, + { + "epoch": 0.48963615336713817, + "grad_norm": 0.10987704992294312, + "learning_rate": 0.0005244332395234426, + "loss": 2.7116, + "step": 16512 + }, + { + "epoch": 0.48966580671944965, + "grad_norm": 0.11326179653406143, + "learning_rate": 0.0005243862446653596, + "loss": 2.6441, + "step": 16513 + }, + { + "epoch": 0.4896954600717611, + "grad_norm": 0.1001773253083229, + "learning_rate": 0.0005243392495913325, + "loss": 2.6795, + "step": 16514 + }, + { + "epoch": 0.4897251134240726, + "grad_norm": 0.12217799574136734, + "learning_rate": 0.0005242922543017771, + "loss": 2.7085, + "step": 16515 + }, + { + "epoch": 0.4897547667763841, + "grad_norm": 0.12588313221931458, + "learning_rate": 0.0005242452587971096, + "loss": 2.6978, + "step": 16516 + }, + { + "epoch": 0.48978442012869555, + "grad_norm": 0.1516772359609604, + "learning_rate": 0.0005241982630777464, + "loss": 2.6645, + "step": 16517 + }, + { + "epoch": 0.48981407348100703, + "grad_norm": 0.1318078190088272, + "learning_rate": 0.000524151267144103, + "loss": 2.675, + "step": 16518 + }, + { + "epoch": 0.4898437268333185, + "grad_norm": 0.1329222023487091, + "learning_rate": 0.0005241042709965961, + "loss": 2.6947, + "step": 16519 + }, + { + "epoch": 0.48987338018563, + "grad_norm": 0.12728430330753326, + "learning_rate": 0.0005240572746356418, + "loss": 2.6851, + "step": 16520 + }, + { + "epoch": 0.48990303353794146, + "grad_norm": 0.11314556002616882, + "learning_rate": 0.0005240102780616563, + "loss": 2.6799, + "step": 16521 + }, + { + "epoch": 0.48993268689025293, + "grad_norm": 0.12455397844314575, + "learning_rate": 0.0005239632812750556, + "loss": 2.6873, + "step": 16522 + }, + { + "epoch": 0.4899623402425644, + "grad_norm": 0.12112626433372498, + "learning_rate": 0.000523916284276256, + "loss": 2.6892, + "step": 16523 + }, + { + "epoch": 0.4899919935948759, + "grad_norm": 0.11453201621770859, + "learning_rate": 0.0005238692870656735, + "loss": 2.6499, + "step": 16524 + }, + { + "epoch": 0.49002164694718736, + "grad_norm": 0.13662602007389069, + "learning_rate": 0.0005238222896437242, + "loss": 2.6645, + "step": 16525 + }, + { + "epoch": 0.49005130029949884, + "grad_norm": 0.14313501119613647, + "learning_rate": 0.0005237752920108248, + "loss": 2.6802, + "step": 16526 + }, + { + "epoch": 0.4900809536518103, + "grad_norm": 0.13418415188789368, + "learning_rate": 0.0005237282941673909, + "loss": 2.7018, + "step": 16527 + }, + { + "epoch": 0.4901106070041218, + "grad_norm": 0.134318545460701, + "learning_rate": 0.000523681296113839, + "loss": 2.6853, + "step": 16528 + }, + { + "epoch": 0.4901402603564333, + "grad_norm": 0.11687346547842026, + "learning_rate": 0.000523634297850585, + "loss": 2.6625, + "step": 16529 + }, + { + "epoch": 0.4901699137087448, + "grad_norm": 0.11805341392755508, + "learning_rate": 0.0005235872993780453, + "loss": 2.6862, + "step": 16530 + }, + { + "epoch": 0.4901995670610563, + "grad_norm": 0.12266641110181808, + "learning_rate": 0.000523540300696636, + "loss": 2.7004, + "step": 16531 + }, + { + "epoch": 0.49022922041336775, + "grad_norm": 0.13007047772407532, + "learning_rate": 0.0005234933018067732, + "loss": 2.6577, + "step": 16532 + }, + { + "epoch": 0.4902588737656792, + "grad_norm": 0.11726938188076019, + "learning_rate": 0.0005234463027088734, + "loss": 2.6822, + "step": 16533 + }, + { + "epoch": 0.4902885271179907, + "grad_norm": 0.1178465411067009, + "learning_rate": 0.0005233993034033525, + "loss": 2.7151, + "step": 16534 + }, + { + "epoch": 0.4903181804703022, + "grad_norm": 0.12685567140579224, + "learning_rate": 0.0005233523038906267, + "loss": 2.685, + "step": 16535 + }, + { + "epoch": 0.49034783382261365, + "grad_norm": 0.13570979237556458, + "learning_rate": 0.0005233053041711122, + "loss": 2.6829, + "step": 16536 + }, + { + "epoch": 0.49037748717492513, + "grad_norm": 0.13256308436393738, + "learning_rate": 0.0005232583042452252, + "loss": 2.7193, + "step": 16537 + }, + { + "epoch": 0.4904071405272366, + "grad_norm": 0.14703160524368286, + "learning_rate": 0.0005232113041133821, + "loss": 2.6821, + "step": 16538 + }, + { + "epoch": 0.4904367938795481, + "grad_norm": 0.13810962438583374, + "learning_rate": 0.0005231643037759989, + "loss": 2.6908, + "step": 16539 + }, + { + "epoch": 0.49046644723185956, + "grad_norm": 0.12233929336071014, + "learning_rate": 0.0005231173032334917, + "loss": 2.6767, + "step": 16540 + }, + { + "epoch": 0.49049610058417104, + "grad_norm": 0.13248422741889954, + "learning_rate": 0.0005230703024862768, + "loss": 2.6782, + "step": 16541 + }, + { + "epoch": 0.4905257539364825, + "grad_norm": 0.12790684401988983, + "learning_rate": 0.0005230233015347705, + "loss": 2.6633, + "step": 16542 + }, + { + "epoch": 0.490555407288794, + "grad_norm": 0.12224182486534119, + "learning_rate": 0.0005229763003793889, + "loss": 2.6863, + "step": 16543 + }, + { + "epoch": 0.49058506064110546, + "grad_norm": 0.11856256425380707, + "learning_rate": 0.0005229292990205482, + "loss": 2.7217, + "step": 16544 + }, + { + "epoch": 0.49061471399341694, + "grad_norm": 0.12665340304374695, + "learning_rate": 0.0005228822974586647, + "loss": 2.7209, + "step": 16545 + }, + { + "epoch": 0.4906443673457284, + "grad_norm": 0.11318831890821457, + "learning_rate": 0.0005228352956941543, + "loss": 2.6751, + "step": 16546 + }, + { + "epoch": 0.4906740206980399, + "grad_norm": 0.12126615643501282, + "learning_rate": 0.0005227882937274336, + "loss": 2.6259, + "step": 16547 + }, + { + "epoch": 0.49070367405035137, + "grad_norm": 0.11943982541561127, + "learning_rate": 0.0005227412915589187, + "loss": 2.6857, + "step": 16548 + }, + { + "epoch": 0.49073332740266284, + "grad_norm": 0.1339782327413559, + "learning_rate": 0.0005226942891890256, + "loss": 2.6947, + "step": 16549 + }, + { + "epoch": 0.4907629807549744, + "grad_norm": 0.11650598049163818, + "learning_rate": 0.0005226472866181708, + "loss": 2.6511, + "step": 16550 + }, + { + "epoch": 0.49079263410728585, + "grad_norm": 0.12382553517818451, + "learning_rate": 0.0005226002838467704, + "loss": 2.7031, + "step": 16551 + }, + { + "epoch": 0.49082228745959733, + "grad_norm": 0.13544194400310516, + "learning_rate": 0.0005225532808752405, + "loss": 2.6853, + "step": 16552 + }, + { + "epoch": 0.4908519408119088, + "grad_norm": 0.13323178887367249, + "learning_rate": 0.0005225062777039975, + "loss": 2.6866, + "step": 16553 + }, + { + "epoch": 0.4908815941642203, + "grad_norm": 0.1204165443778038, + "learning_rate": 0.0005224592743334575, + "loss": 2.6796, + "step": 16554 + }, + { + "epoch": 0.49091124751653176, + "grad_norm": 0.12627039849758148, + "learning_rate": 0.0005224122707640369, + "loss": 2.6952, + "step": 16555 + }, + { + "epoch": 0.49094090086884323, + "grad_norm": 0.15452063083648682, + "learning_rate": 0.0005223652669961518, + "loss": 2.6925, + "step": 16556 + }, + { + "epoch": 0.4909705542211547, + "grad_norm": 0.14693838357925415, + "learning_rate": 0.0005223182630302181, + "loss": 2.6643, + "step": 16557 + }, + { + "epoch": 0.4910002075734662, + "grad_norm": 0.1580502688884735, + "learning_rate": 0.0005222712588666527, + "loss": 2.6932, + "step": 16558 + }, + { + "epoch": 0.49102986092577766, + "grad_norm": 0.13194867968559265, + "learning_rate": 0.0005222242545058713, + "loss": 2.6712, + "step": 16559 + }, + { + "epoch": 0.49105951427808914, + "grad_norm": 0.12859265506267548, + "learning_rate": 0.0005221772499482903, + "loss": 2.6712, + "step": 16560 + }, + { + "epoch": 0.4910891676304006, + "grad_norm": 0.14770670235157013, + "learning_rate": 0.0005221302451943262, + "loss": 2.6904, + "step": 16561 + }, + { + "epoch": 0.4911188209827121, + "grad_norm": 0.1302672028541565, + "learning_rate": 0.0005220832402443947, + "loss": 2.6973, + "step": 16562 + }, + { + "epoch": 0.49114847433502357, + "grad_norm": 0.13083571195602417, + "learning_rate": 0.0005220362350989123, + "loss": 2.6591, + "step": 16563 + }, + { + "epoch": 0.49117812768733504, + "grad_norm": 0.1412125825881958, + "learning_rate": 0.0005219892297582954, + "loss": 2.6858, + "step": 16564 + }, + { + "epoch": 0.4912077810396465, + "grad_norm": 0.14184735715389252, + "learning_rate": 0.00052194222422296, + "loss": 2.6633, + "step": 16565 + }, + { + "epoch": 0.491237434391958, + "grad_norm": 0.1502881944179535, + "learning_rate": 0.0005218952184933227, + "loss": 2.6902, + "step": 16566 + }, + { + "epoch": 0.49126708774426947, + "grad_norm": 0.13784722983837128, + "learning_rate": 0.0005218482125697992, + "loss": 2.6884, + "step": 16567 + }, + { + "epoch": 0.49129674109658095, + "grad_norm": 0.12518523633480072, + "learning_rate": 0.0005218012064528061, + "loss": 2.6562, + "step": 16568 + }, + { + "epoch": 0.4913263944488924, + "grad_norm": 0.12026318907737732, + "learning_rate": 0.0005217542001427596, + "loss": 2.6513, + "step": 16569 + }, + { + "epoch": 0.4913560478012039, + "grad_norm": 0.15203574299812317, + "learning_rate": 0.0005217071936400758, + "loss": 2.6868, + "step": 16570 + }, + { + "epoch": 0.49138570115351543, + "grad_norm": 0.11661098152399063, + "learning_rate": 0.0005216601869451712, + "loss": 2.6593, + "step": 16571 + }, + { + "epoch": 0.4914153545058269, + "grad_norm": 0.14003778994083405, + "learning_rate": 0.000521613180058462, + "loss": 2.6809, + "step": 16572 + }, + { + "epoch": 0.4914450078581384, + "grad_norm": 0.1471789926290512, + "learning_rate": 0.0005215661729803642, + "loss": 2.6833, + "step": 16573 + }, + { + "epoch": 0.49147466121044986, + "grad_norm": 0.14066654443740845, + "learning_rate": 0.0005215191657112944, + "loss": 2.7169, + "step": 16574 + }, + { + "epoch": 0.49150431456276134, + "grad_norm": 0.12852396070957184, + "learning_rate": 0.0005214721582516686, + "loss": 2.69, + "step": 16575 + }, + { + "epoch": 0.4915339679150728, + "grad_norm": 0.12476583570241928, + "learning_rate": 0.0005214251506019032, + "loss": 2.7002, + "step": 16576 + }, + { + "epoch": 0.4915636212673843, + "grad_norm": 0.12524159252643585, + "learning_rate": 0.0005213781427624144, + "loss": 2.6822, + "step": 16577 + }, + { + "epoch": 0.49159327461969576, + "grad_norm": 0.11454949527978897, + "learning_rate": 0.0005213311347336183, + "loss": 2.6515, + "step": 16578 + }, + { + "epoch": 0.49162292797200724, + "grad_norm": 0.12114231288433075, + "learning_rate": 0.0005212841265159316, + "loss": 2.7038, + "step": 16579 + }, + { + "epoch": 0.4916525813243187, + "grad_norm": 0.11068665236234665, + "learning_rate": 0.0005212371181097701, + "loss": 2.679, + "step": 16580 + }, + { + "epoch": 0.4916822346766302, + "grad_norm": 0.11861854791641235, + "learning_rate": 0.0005211901095155504, + "loss": 2.6925, + "step": 16581 + }, + { + "epoch": 0.49171188802894167, + "grad_norm": 0.1295577585697174, + "learning_rate": 0.0005211431007336886, + "loss": 2.6805, + "step": 16582 + }, + { + "epoch": 0.49174154138125314, + "grad_norm": 0.12960538268089294, + "learning_rate": 0.0005210960917646012, + "loss": 2.6939, + "step": 16583 + }, + { + "epoch": 0.4917711947335646, + "grad_norm": 0.11908017098903656, + "learning_rate": 0.000521049082608704, + "loss": 2.6805, + "step": 16584 + }, + { + "epoch": 0.4918008480858761, + "grad_norm": 0.11476389318704605, + "learning_rate": 0.0005210020732664137, + "loss": 2.6774, + "step": 16585 + }, + { + "epoch": 0.4918305014381876, + "grad_norm": 0.11408746242523193, + "learning_rate": 0.0005209550637381465, + "loss": 2.6779, + "step": 16586 + }, + { + "epoch": 0.49186015479049905, + "grad_norm": 0.11009883880615234, + "learning_rate": 0.0005209080540243185, + "loss": 2.6779, + "step": 16587 + }, + { + "epoch": 0.4918898081428105, + "grad_norm": 0.12263675779104233, + "learning_rate": 0.0005208610441253461, + "loss": 2.6792, + "step": 16588 + }, + { + "epoch": 0.491919461495122, + "grad_norm": 0.1061636358499527, + "learning_rate": 0.0005208140340416457, + "loss": 2.6975, + "step": 16589 + }, + { + "epoch": 0.4919491148474335, + "grad_norm": 0.12523706257343292, + "learning_rate": 0.0005207670237736332, + "loss": 2.6962, + "step": 16590 + }, + { + "epoch": 0.49197876819974495, + "grad_norm": 0.12013889104127884, + "learning_rate": 0.0005207200133217254, + "loss": 2.6775, + "step": 16591 + }, + { + "epoch": 0.4920084215520565, + "grad_norm": 0.12166930735111237, + "learning_rate": 0.0005206730026863382, + "loss": 2.6736, + "step": 16592 + }, + { + "epoch": 0.49203807490436796, + "grad_norm": 0.16426171362400055, + "learning_rate": 0.0005206259918678881, + "loss": 2.6849, + "step": 16593 + }, + { + "epoch": 0.49206772825667944, + "grad_norm": 0.14197789132595062, + "learning_rate": 0.0005205789808667913, + "loss": 2.7073, + "step": 16594 + }, + { + "epoch": 0.4920973816089909, + "grad_norm": 0.11691757291555405, + "learning_rate": 0.0005205319696834639, + "loss": 2.6594, + "step": 16595 + }, + { + "epoch": 0.4921270349613024, + "grad_norm": 0.11464399844408035, + "learning_rate": 0.0005204849583183225, + "loss": 2.675, + "step": 16596 + }, + { + "epoch": 0.49215668831361387, + "grad_norm": 0.12591610848903656, + "learning_rate": 0.0005204379467717833, + "loss": 2.6821, + "step": 16597 + }, + { + "epoch": 0.49218634166592534, + "grad_norm": 0.11994093656539917, + "learning_rate": 0.0005203909350442625, + "loss": 2.6634, + "step": 16598 + }, + { + "epoch": 0.4922159950182368, + "grad_norm": 0.1258372962474823, + "learning_rate": 0.0005203439231361766, + "loss": 2.6307, + "step": 16599 + }, + { + "epoch": 0.4922456483705483, + "grad_norm": 0.1085912361741066, + "learning_rate": 0.0005202969110479418, + "loss": 2.6995, + "step": 16600 + }, + { + "epoch": 0.49227530172285977, + "grad_norm": 0.11137787997722626, + "learning_rate": 0.0005202498987799742, + "loss": 2.6913, + "step": 16601 + }, + { + "epoch": 0.49230495507517125, + "grad_norm": 0.11626095324754715, + "learning_rate": 0.0005202028863326902, + "loss": 2.7064, + "step": 16602 + }, + { + "epoch": 0.4923346084274827, + "grad_norm": 0.1464705765247345, + "learning_rate": 0.0005201558737065065, + "loss": 2.681, + "step": 16603 + }, + { + "epoch": 0.4923642617797942, + "grad_norm": 0.14573535323143005, + "learning_rate": 0.0005201088609018389, + "loss": 2.6783, + "step": 16604 + }, + { + "epoch": 0.4923939151321057, + "grad_norm": 0.1363861858844757, + "learning_rate": 0.0005200618479191038, + "loss": 2.7076, + "step": 16605 + }, + { + "epoch": 0.49242356848441715, + "grad_norm": 0.1305897831916809, + "learning_rate": 0.0005200148347587177, + "loss": 2.6954, + "step": 16606 + }, + { + "epoch": 0.4924532218367286, + "grad_norm": 0.1271933615207672, + "learning_rate": 0.0005199678214210968, + "loss": 2.6642, + "step": 16607 + }, + { + "epoch": 0.4924828751890401, + "grad_norm": 0.1107618659734726, + "learning_rate": 0.0005199208079066573, + "loss": 2.6551, + "step": 16608 + }, + { + "epoch": 0.4925125285413516, + "grad_norm": 0.12068657577037811, + "learning_rate": 0.0005198737942158158, + "loss": 2.6689, + "step": 16609 + }, + { + "epoch": 0.49254218189366306, + "grad_norm": 0.11661351472139359, + "learning_rate": 0.0005198267803489884, + "loss": 2.6627, + "step": 16610 + }, + { + "epoch": 0.49257183524597453, + "grad_norm": 0.09625567495822906, + "learning_rate": 0.0005197797663065913, + "loss": 2.6736, + "step": 16611 + }, + { + "epoch": 0.49260148859828606, + "grad_norm": 0.10538890957832336, + "learning_rate": 0.0005197327520890412, + "loss": 2.6901, + "step": 16612 + }, + { + "epoch": 0.49263114195059754, + "grad_norm": 0.10909564793109894, + "learning_rate": 0.0005196857376967539, + "loss": 2.6668, + "step": 16613 + }, + { + "epoch": 0.492660795302909, + "grad_norm": 0.11257901042699814, + "learning_rate": 0.0005196387231301463, + "loss": 2.7119, + "step": 16614 + }, + { + "epoch": 0.4926904486552205, + "grad_norm": 0.1089152917265892, + "learning_rate": 0.0005195917083896343, + "loss": 2.7034, + "step": 16615 + }, + { + "epoch": 0.49272010200753197, + "grad_norm": 0.112105593085289, + "learning_rate": 0.0005195446934756344, + "loss": 2.6661, + "step": 16616 + }, + { + "epoch": 0.49274975535984344, + "grad_norm": 0.11619263887405396, + "learning_rate": 0.0005194976783885628, + "loss": 2.65, + "step": 16617 + }, + { + "epoch": 0.4927794087121549, + "grad_norm": 0.10673786699771881, + "learning_rate": 0.000519450663128836, + "loss": 2.6756, + "step": 16618 + }, + { + "epoch": 0.4928090620644664, + "grad_norm": 0.11744236201047897, + "learning_rate": 0.0005194036476968702, + "loss": 2.6949, + "step": 16619 + }, + { + "epoch": 0.4928387154167779, + "grad_norm": 0.1137922927737236, + "learning_rate": 0.0005193566320930818, + "loss": 2.6926, + "step": 16620 + }, + { + "epoch": 0.49286836876908935, + "grad_norm": 0.11544155329465866, + "learning_rate": 0.000519309616317887, + "loss": 2.7134, + "step": 16621 + }, + { + "epoch": 0.4928980221214008, + "grad_norm": 0.13062343001365662, + "learning_rate": 0.0005192626003717025, + "loss": 2.7073, + "step": 16622 + }, + { + "epoch": 0.4929276754737123, + "grad_norm": 0.13455583155155182, + "learning_rate": 0.0005192155842549442, + "loss": 2.6968, + "step": 16623 + }, + { + "epoch": 0.4929573288260238, + "grad_norm": 0.13970813155174255, + "learning_rate": 0.0005191685679680286, + "loss": 2.7136, + "step": 16624 + }, + { + "epoch": 0.49298698217833525, + "grad_norm": 0.12719738483428955, + "learning_rate": 0.0005191215515113719, + "loss": 2.686, + "step": 16625 + }, + { + "epoch": 0.49301663553064673, + "grad_norm": 0.12883742153644562, + "learning_rate": 0.0005190745348853909, + "loss": 2.6807, + "step": 16626 + }, + { + "epoch": 0.4930462888829582, + "grad_norm": 0.11982931941747665, + "learning_rate": 0.0005190275180905014, + "loss": 2.7098, + "step": 16627 + }, + { + "epoch": 0.4930759422352697, + "grad_norm": 0.12235387414693832, + "learning_rate": 0.0005189805011271199, + "loss": 2.7059, + "step": 16628 + }, + { + "epoch": 0.49310559558758116, + "grad_norm": 0.1274203211069107, + "learning_rate": 0.0005189334839956628, + "loss": 2.7147, + "step": 16629 + }, + { + "epoch": 0.49313524893989263, + "grad_norm": 0.12069645524024963, + "learning_rate": 0.0005188864666965467, + "loss": 2.6869, + "step": 16630 + }, + { + "epoch": 0.4931649022922041, + "grad_norm": 0.11144667863845825, + "learning_rate": 0.0005188394492301877, + "loss": 2.7215, + "step": 16631 + }, + { + "epoch": 0.4931945556445156, + "grad_norm": 0.11986508220434189, + "learning_rate": 0.000518792431597002, + "loss": 2.6926, + "step": 16632 + }, + { + "epoch": 0.4932242089968271, + "grad_norm": 0.10965774208307266, + "learning_rate": 0.000518745413797406, + "loss": 2.6809, + "step": 16633 + }, + { + "epoch": 0.4932538623491386, + "grad_norm": 0.11725788563489914, + "learning_rate": 0.0005186983958318161, + "loss": 2.6625, + "step": 16634 + }, + { + "epoch": 0.49328351570145007, + "grad_norm": 0.12683065235614777, + "learning_rate": 0.0005186513777006488, + "loss": 2.712, + "step": 16635 + }, + { + "epoch": 0.49331316905376155, + "grad_norm": 0.12128695100545883, + "learning_rate": 0.0005186043594043204, + "loss": 2.6323, + "step": 16636 + }, + { + "epoch": 0.493342822406073, + "grad_norm": 0.13453246653079987, + "learning_rate": 0.0005185573409432473, + "loss": 2.6837, + "step": 16637 + }, + { + "epoch": 0.4933724757583845, + "grad_norm": 0.11144515872001648, + "learning_rate": 0.0005185103223178456, + "loss": 2.704, + "step": 16638 + }, + { + "epoch": 0.493402129110696, + "grad_norm": 0.13292859494686127, + "learning_rate": 0.0005184633035285319, + "loss": 2.6821, + "step": 16639 + }, + { + "epoch": 0.49343178246300745, + "grad_norm": 0.1441275179386139, + "learning_rate": 0.0005184162845757223, + "loss": 2.6626, + "step": 16640 + }, + { + "epoch": 0.4934614358153189, + "grad_norm": 0.10640197992324829, + "learning_rate": 0.0005183692654598334, + "loss": 2.6377, + "step": 16641 + }, + { + "epoch": 0.4934910891676304, + "grad_norm": 0.13694752752780914, + "learning_rate": 0.0005183222461812816, + "loss": 2.6994, + "step": 16642 + }, + { + "epoch": 0.4935207425199419, + "grad_norm": 0.11461708694696426, + "learning_rate": 0.0005182752267404832, + "loss": 2.6704, + "step": 16643 + }, + { + "epoch": 0.49355039587225336, + "grad_norm": 0.12810926139354706, + "learning_rate": 0.0005182282071378544, + "loss": 2.6788, + "step": 16644 + }, + { + "epoch": 0.49358004922456483, + "grad_norm": 0.1382599174976349, + "learning_rate": 0.0005181811873738118, + "loss": 2.6839, + "step": 16645 + }, + { + "epoch": 0.4936097025768763, + "grad_norm": 0.12143491953611374, + "learning_rate": 0.0005181341674487717, + "loss": 2.6711, + "step": 16646 + }, + { + "epoch": 0.4936393559291878, + "grad_norm": 0.13909490406513214, + "learning_rate": 0.0005180871473631503, + "loss": 2.7133, + "step": 16647 + }, + { + "epoch": 0.49366900928149926, + "grad_norm": 0.11221159994602203, + "learning_rate": 0.0005180401271173643, + "loss": 2.6951, + "step": 16648 + }, + { + "epoch": 0.49369866263381074, + "grad_norm": 0.12028029561042786, + "learning_rate": 0.0005179931067118296, + "loss": 2.7226, + "step": 16649 + }, + { + "epoch": 0.4937283159861222, + "grad_norm": 0.11099976301193237, + "learning_rate": 0.0005179460861469631, + "loss": 2.6996, + "step": 16650 + }, + { + "epoch": 0.4937579693384337, + "grad_norm": 0.11953508108854294, + "learning_rate": 0.0005178990654231808, + "loss": 2.66, + "step": 16651 + }, + { + "epoch": 0.49378762269074516, + "grad_norm": 0.10501113533973694, + "learning_rate": 0.0005178520445408991, + "loss": 2.6812, + "step": 16652 + }, + { + "epoch": 0.49381727604305664, + "grad_norm": 0.1081913411617279, + "learning_rate": 0.0005178050235005347, + "loss": 2.6798, + "step": 16653 + }, + { + "epoch": 0.4938469293953682, + "grad_norm": 0.10005804896354675, + "learning_rate": 0.0005177580023025037, + "loss": 2.6566, + "step": 16654 + }, + { + "epoch": 0.49387658274767965, + "grad_norm": 0.1213977187871933, + "learning_rate": 0.0005177109809472224, + "loss": 2.6843, + "step": 16655 + }, + { + "epoch": 0.4939062360999911, + "grad_norm": 0.12089546769857407, + "learning_rate": 0.0005176639594351074, + "loss": 2.6707, + "step": 16656 + }, + { + "epoch": 0.4939358894523026, + "grad_norm": 0.12823927402496338, + "learning_rate": 0.0005176169377665752, + "loss": 2.6455, + "step": 16657 + }, + { + "epoch": 0.4939655428046141, + "grad_norm": 0.12140055000782013, + "learning_rate": 0.0005175699159420419, + "loss": 2.689, + "step": 16658 + }, + { + "epoch": 0.49399519615692555, + "grad_norm": 0.1247585192322731, + "learning_rate": 0.0005175228939619239, + "loss": 2.7074, + "step": 16659 + }, + { + "epoch": 0.49402484950923703, + "grad_norm": 0.1272733360528946, + "learning_rate": 0.0005174758718266376, + "loss": 2.6935, + "step": 16660 + }, + { + "epoch": 0.4940545028615485, + "grad_norm": 0.1254805624485016, + "learning_rate": 0.0005174288495365995, + "loss": 2.6589, + "step": 16661 + }, + { + "epoch": 0.49408415621386, + "grad_norm": 0.12540395557880402, + "learning_rate": 0.000517381827092226, + "loss": 2.6531, + "step": 16662 + }, + { + "epoch": 0.49411380956617146, + "grad_norm": 0.15427127480506897, + "learning_rate": 0.0005173348044939334, + "loss": 2.6989, + "step": 16663 + }, + { + "epoch": 0.49414346291848293, + "grad_norm": 0.13670563697814941, + "learning_rate": 0.0005172877817421382, + "loss": 2.6969, + "step": 16664 + }, + { + "epoch": 0.4941731162707944, + "grad_norm": 0.1580635905265808, + "learning_rate": 0.0005172407588372568, + "loss": 2.7062, + "step": 16665 + }, + { + "epoch": 0.4942027696231059, + "grad_norm": 0.12868860363960266, + "learning_rate": 0.0005171937357797053, + "loss": 2.6893, + "step": 16666 + }, + { + "epoch": 0.49423242297541736, + "grad_norm": 0.11691179126501083, + "learning_rate": 0.0005171467125699003, + "loss": 2.6741, + "step": 16667 + }, + { + "epoch": 0.49426207632772884, + "grad_norm": 0.11772197484970093, + "learning_rate": 0.0005170996892082583, + "loss": 2.6921, + "step": 16668 + }, + { + "epoch": 0.4942917296800403, + "grad_norm": 0.12728004157543182, + "learning_rate": 0.0005170526656951958, + "loss": 2.6351, + "step": 16669 + }, + { + "epoch": 0.4943213830323518, + "grad_norm": 0.11561140418052673, + "learning_rate": 0.0005170056420311289, + "loss": 2.6741, + "step": 16670 + }, + { + "epoch": 0.49435103638466327, + "grad_norm": 0.11430960893630981, + "learning_rate": 0.0005169586182164741, + "loss": 2.6586, + "step": 16671 + }, + { + "epoch": 0.49438068973697474, + "grad_norm": 0.11857818067073822, + "learning_rate": 0.0005169115942516478, + "loss": 2.6853, + "step": 16672 + }, + { + "epoch": 0.4944103430892862, + "grad_norm": 0.10827571898698807, + "learning_rate": 0.0005168645701370663, + "loss": 2.6994, + "step": 16673 + }, + { + "epoch": 0.4944399964415977, + "grad_norm": 0.10579981654882431, + "learning_rate": 0.0005168175458731462, + "loss": 2.6537, + "step": 16674 + }, + { + "epoch": 0.4944696497939092, + "grad_norm": 0.12323254346847534, + "learning_rate": 0.0005167705214603041, + "loss": 2.6859, + "step": 16675 + }, + { + "epoch": 0.4944993031462207, + "grad_norm": 0.12104246765375137, + "learning_rate": 0.000516723496898956, + "loss": 2.69, + "step": 16676 + }, + { + "epoch": 0.4945289564985322, + "grad_norm": 0.14545848965644836, + "learning_rate": 0.0005166764721895184, + "loss": 2.7081, + "step": 16677 + }, + { + "epoch": 0.49455860985084366, + "grad_norm": 0.14388182759284973, + "learning_rate": 0.0005166294473324078, + "loss": 2.6641, + "step": 16678 + }, + { + "epoch": 0.49458826320315513, + "grad_norm": 0.13420403003692627, + "learning_rate": 0.0005165824223280406, + "loss": 2.7263, + "step": 16679 + }, + { + "epoch": 0.4946179165554666, + "grad_norm": 0.13054737448692322, + "learning_rate": 0.0005165353971768331, + "loss": 2.6724, + "step": 16680 + }, + { + "epoch": 0.4946475699077781, + "grad_norm": 0.1538151353597641, + "learning_rate": 0.0005164883718792021, + "loss": 2.695, + "step": 16681 + }, + { + "epoch": 0.49467722326008956, + "grad_norm": 0.10205541551113129, + "learning_rate": 0.0005164413464355635, + "loss": 2.6787, + "step": 16682 + }, + { + "epoch": 0.49470687661240104, + "grad_norm": 0.11279722303152084, + "learning_rate": 0.0005163943208463341, + "loss": 2.6852, + "step": 16683 + }, + { + "epoch": 0.4947365299647125, + "grad_norm": 0.12236396223306656, + "learning_rate": 0.00051634729511193, + "loss": 2.6879, + "step": 16684 + }, + { + "epoch": 0.494766183317024, + "grad_norm": 0.12376459687948227, + "learning_rate": 0.0005163002692327679, + "loss": 2.6811, + "step": 16685 + }, + { + "epoch": 0.49479583666933546, + "grad_norm": 0.12320605665445328, + "learning_rate": 0.0005162532432092642, + "loss": 2.6803, + "step": 16686 + }, + { + "epoch": 0.49482549002164694, + "grad_norm": 0.11668186634778976, + "learning_rate": 0.0005162062170418351, + "loss": 2.6787, + "step": 16687 + }, + { + "epoch": 0.4948551433739584, + "grad_norm": 0.12631729245185852, + "learning_rate": 0.0005161591907308972, + "loss": 2.6868, + "step": 16688 + }, + { + "epoch": 0.4948847967262699, + "grad_norm": 0.1301165521144867, + "learning_rate": 0.000516112164276867, + "loss": 2.711, + "step": 16689 + }, + { + "epoch": 0.49491445007858137, + "grad_norm": 0.11500942707061768, + "learning_rate": 0.0005160651376801606, + "loss": 2.6413, + "step": 16690 + }, + { + "epoch": 0.49494410343089285, + "grad_norm": 0.12504324316978455, + "learning_rate": 0.0005160181109411947, + "loss": 2.6713, + "step": 16691 + }, + { + "epoch": 0.4949737567832043, + "grad_norm": 0.12349139153957367, + "learning_rate": 0.000515971084060386, + "loss": 2.6469, + "step": 16692 + }, + { + "epoch": 0.4950034101355158, + "grad_norm": 0.12654711306095123, + "learning_rate": 0.0005159240570381503, + "loss": 2.6393, + "step": 16693 + }, + { + "epoch": 0.4950330634878273, + "grad_norm": 0.11575499922037125, + "learning_rate": 0.0005158770298749044, + "loss": 2.7018, + "step": 16694 + }, + { + "epoch": 0.49506271684013875, + "grad_norm": 0.11463535577058792, + "learning_rate": 0.0005158300025710646, + "loss": 2.6809, + "step": 16695 + }, + { + "epoch": 0.4950923701924503, + "grad_norm": 0.10730211436748505, + "learning_rate": 0.0005157829751270476, + "loss": 2.6929, + "step": 16696 + }, + { + "epoch": 0.49512202354476176, + "grad_norm": 0.13237448036670685, + "learning_rate": 0.0005157359475432696, + "loss": 2.6747, + "step": 16697 + }, + { + "epoch": 0.49515167689707323, + "grad_norm": 0.12113963067531586, + "learning_rate": 0.000515688919820147, + "loss": 2.6252, + "step": 16698 + }, + { + "epoch": 0.4951813302493847, + "grad_norm": 0.1089148223400116, + "learning_rate": 0.0005156418919580962, + "loss": 2.6747, + "step": 16699 + }, + { + "epoch": 0.4952109836016962, + "grad_norm": 0.11578615754842758, + "learning_rate": 0.0005155948639575338, + "loss": 2.6842, + "step": 16700 + }, + { + "epoch": 0.49524063695400766, + "grad_norm": 0.11199012398719788, + "learning_rate": 0.0005155478358188764, + "loss": 2.6735, + "step": 16701 + }, + { + "epoch": 0.49527029030631914, + "grad_norm": 0.1226915642619133, + "learning_rate": 0.0005155008075425402, + "loss": 2.6858, + "step": 16702 + }, + { + "epoch": 0.4952999436586306, + "grad_norm": 0.12512663006782532, + "learning_rate": 0.0005154537791289417, + "loss": 2.6527, + "step": 16703 + }, + { + "epoch": 0.4953295970109421, + "grad_norm": 0.09940052032470703, + "learning_rate": 0.0005154067505784973, + "loss": 2.6869, + "step": 16704 + }, + { + "epoch": 0.49535925036325357, + "grad_norm": 0.1138511672616005, + "learning_rate": 0.0005153597218916234, + "loss": 2.7238, + "step": 16705 + }, + { + "epoch": 0.49538890371556504, + "grad_norm": 0.10897696018218994, + "learning_rate": 0.0005153126930687365, + "loss": 2.6652, + "step": 16706 + }, + { + "epoch": 0.4954185570678765, + "grad_norm": 0.12263714522123337, + "learning_rate": 0.0005152656641102532, + "loss": 2.6959, + "step": 16707 + }, + { + "epoch": 0.495448210420188, + "grad_norm": 0.13056957721710205, + "learning_rate": 0.0005152186350165898, + "loss": 2.6632, + "step": 16708 + }, + { + "epoch": 0.49547786377249947, + "grad_norm": 0.14342539012432098, + "learning_rate": 0.0005151716057881628, + "loss": 2.6933, + "step": 16709 + }, + { + "epoch": 0.49550751712481095, + "grad_norm": 0.1583380401134491, + "learning_rate": 0.0005151245764253886, + "loss": 2.6303, + "step": 16710 + }, + { + "epoch": 0.4955371704771224, + "grad_norm": 0.14724013209342957, + "learning_rate": 0.0005150775469286836, + "loss": 2.69, + "step": 16711 + }, + { + "epoch": 0.4955668238294339, + "grad_norm": 0.11904368549585342, + "learning_rate": 0.0005150305172984642, + "loss": 2.6541, + "step": 16712 + }, + { + "epoch": 0.4955964771817454, + "grad_norm": 0.165262833237648, + "learning_rate": 0.0005149834875351475, + "loss": 2.6871, + "step": 16713 + }, + { + "epoch": 0.49562613053405685, + "grad_norm": 0.1841980218887329, + "learning_rate": 0.0005149364576391491, + "loss": 2.6834, + "step": 16714 + }, + { + "epoch": 0.49565578388636833, + "grad_norm": 0.15791068971157074, + "learning_rate": 0.0005148894276108858, + "loss": 2.6637, + "step": 16715 + }, + { + "epoch": 0.49568543723867986, + "grad_norm": 0.1248803436756134, + "learning_rate": 0.0005148423974507741, + "loss": 2.6641, + "step": 16716 + }, + { + "epoch": 0.49571509059099134, + "grad_norm": 0.1529565304517746, + "learning_rate": 0.0005147953671592304, + "loss": 2.6967, + "step": 16717 + }, + { + "epoch": 0.4957447439433028, + "grad_norm": 0.1457832157611847, + "learning_rate": 0.0005147483367366712, + "loss": 2.6835, + "step": 16718 + }, + { + "epoch": 0.4957743972956143, + "grad_norm": 0.14194615185260773, + "learning_rate": 0.000514701306183513, + "loss": 2.6688, + "step": 16719 + }, + { + "epoch": 0.49580405064792576, + "grad_norm": 0.14646798372268677, + "learning_rate": 0.0005146542755001721, + "loss": 2.6756, + "step": 16720 + }, + { + "epoch": 0.49583370400023724, + "grad_norm": 0.14858879148960114, + "learning_rate": 0.0005146072446870651, + "loss": 2.6764, + "step": 16721 + }, + { + "epoch": 0.4958633573525487, + "grad_norm": 0.14594577252864838, + "learning_rate": 0.0005145602137446084, + "loss": 2.68, + "step": 16722 + }, + { + "epoch": 0.4958930107048602, + "grad_norm": 0.15599946677684784, + "learning_rate": 0.0005145131826732186, + "loss": 2.6884, + "step": 16723 + }, + { + "epoch": 0.49592266405717167, + "grad_norm": 0.15255752205848694, + "learning_rate": 0.0005144661514733122, + "loss": 2.643, + "step": 16724 + }, + { + "epoch": 0.49595231740948315, + "grad_norm": 0.12385639548301697, + "learning_rate": 0.0005144191201453054, + "loss": 2.6857, + "step": 16725 + }, + { + "epoch": 0.4959819707617946, + "grad_norm": 0.13306371867656708, + "learning_rate": 0.0005143720886896147, + "loss": 2.6922, + "step": 16726 + }, + { + "epoch": 0.4960116241141061, + "grad_norm": 0.11443604528903961, + "learning_rate": 0.0005143250571066569, + "loss": 2.6553, + "step": 16727 + }, + { + "epoch": 0.4960412774664176, + "grad_norm": 0.12467783689498901, + "learning_rate": 0.0005142780253968481, + "loss": 2.6814, + "step": 16728 + }, + { + "epoch": 0.49607093081872905, + "grad_norm": 0.11323763430118561, + "learning_rate": 0.000514230993560605, + "loss": 2.6951, + "step": 16729 + }, + { + "epoch": 0.4961005841710405, + "grad_norm": 0.12031274288892746, + "learning_rate": 0.0005141839615983441, + "loss": 2.6707, + "step": 16730 + }, + { + "epoch": 0.496130237523352, + "grad_norm": 0.11062777042388916, + "learning_rate": 0.0005141369295104816, + "loss": 2.6905, + "step": 16731 + }, + { + "epoch": 0.4961598908756635, + "grad_norm": 0.1325007826089859, + "learning_rate": 0.0005140898972974343, + "loss": 2.6694, + "step": 16732 + }, + { + "epoch": 0.49618954422797495, + "grad_norm": 0.11593493819236755, + "learning_rate": 0.0005140428649596185, + "loss": 2.6523, + "step": 16733 + }, + { + "epoch": 0.49621919758028643, + "grad_norm": 0.1279183328151703, + "learning_rate": 0.0005139958324974507, + "loss": 2.6968, + "step": 16734 + }, + { + "epoch": 0.4962488509325979, + "grad_norm": 0.10821101814508438, + "learning_rate": 0.0005139487999113476, + "loss": 2.6576, + "step": 16735 + }, + { + "epoch": 0.4962785042849094, + "grad_norm": 0.12205834686756134, + "learning_rate": 0.0005139017672017253, + "loss": 2.6678, + "step": 16736 + }, + { + "epoch": 0.4963081576372209, + "grad_norm": 0.116561658680439, + "learning_rate": 0.0005138547343690004, + "loss": 2.6881, + "step": 16737 + }, + { + "epoch": 0.4963378109895324, + "grad_norm": 0.11257728934288025, + "learning_rate": 0.0005138077014135895, + "loss": 2.6693, + "step": 16738 + }, + { + "epoch": 0.49636746434184387, + "grad_norm": 0.12515616416931152, + "learning_rate": 0.0005137606683359089, + "loss": 2.6896, + "step": 16739 + }, + { + "epoch": 0.49639711769415534, + "grad_norm": 0.12414601445198059, + "learning_rate": 0.0005137136351363756, + "loss": 2.6714, + "step": 16740 + }, + { + "epoch": 0.4964267710464668, + "grad_norm": 0.13333985209465027, + "learning_rate": 0.0005136666018154054, + "loss": 2.6598, + "step": 16741 + }, + { + "epoch": 0.4964564243987783, + "grad_norm": 0.11002403497695923, + "learning_rate": 0.0005136195683734153, + "loss": 2.6673, + "step": 16742 + }, + { + "epoch": 0.49648607775108977, + "grad_norm": 0.11410760134458542, + "learning_rate": 0.0005135725348108214, + "loss": 2.6937, + "step": 16743 + }, + { + "epoch": 0.49651573110340125, + "grad_norm": 0.11765129864215851, + "learning_rate": 0.0005135255011280404, + "loss": 2.636, + "step": 16744 + }, + { + "epoch": 0.4965453844557127, + "grad_norm": 0.10197549313306808, + "learning_rate": 0.0005134784673254889, + "loss": 2.6614, + "step": 16745 + }, + { + "epoch": 0.4965750378080242, + "grad_norm": 0.09503956139087677, + "learning_rate": 0.0005134314334035832, + "loss": 2.7025, + "step": 16746 + }, + { + "epoch": 0.4966046911603357, + "grad_norm": 0.10620693117380142, + "learning_rate": 0.0005133843993627397, + "loss": 2.6648, + "step": 16747 + }, + { + "epoch": 0.49663434451264715, + "grad_norm": 0.11933857202529907, + "learning_rate": 0.0005133373652033751, + "loss": 2.6735, + "step": 16748 + }, + { + "epoch": 0.49666399786495863, + "grad_norm": 0.12553632259368896, + "learning_rate": 0.0005132903309259059, + "loss": 2.6521, + "step": 16749 + }, + { + "epoch": 0.4966936512172701, + "grad_norm": 0.1314152479171753, + "learning_rate": 0.0005132432965307487, + "loss": 2.7009, + "step": 16750 + }, + { + "epoch": 0.4967233045695816, + "grad_norm": 0.11822950094938278, + "learning_rate": 0.0005131962620183196, + "loss": 2.6677, + "step": 16751 + }, + { + "epoch": 0.49675295792189306, + "grad_norm": 0.13712334632873535, + "learning_rate": 0.0005131492273890354, + "loss": 2.708, + "step": 16752 + }, + { + "epoch": 0.49678261127420453, + "grad_norm": 0.13606572151184082, + "learning_rate": 0.0005131021926433125, + "loss": 2.6887, + "step": 16753 + }, + { + "epoch": 0.496812264626516, + "grad_norm": 0.12452220916748047, + "learning_rate": 0.0005130551577815675, + "loss": 2.6889, + "step": 16754 + }, + { + "epoch": 0.4968419179788275, + "grad_norm": 0.13797034323215485, + "learning_rate": 0.0005130081228042168, + "loss": 2.6847, + "step": 16755 + }, + { + "epoch": 0.49687157133113896, + "grad_norm": 0.14346244931221008, + "learning_rate": 0.0005129610877116769, + "loss": 2.6441, + "step": 16756 + }, + { + "epoch": 0.49690122468345044, + "grad_norm": 0.12077134847640991, + "learning_rate": 0.0005129140525043644, + "loss": 2.6636, + "step": 16757 + }, + { + "epoch": 0.49693087803576197, + "grad_norm": 0.11454568058252335, + "learning_rate": 0.0005128670171826958, + "loss": 2.7145, + "step": 16758 + }, + { + "epoch": 0.49696053138807345, + "grad_norm": 0.11299407482147217, + "learning_rate": 0.0005128199817470876, + "loss": 2.6856, + "step": 16759 + }, + { + "epoch": 0.4969901847403849, + "grad_norm": 0.11675015091896057, + "learning_rate": 0.000512772946197956, + "loss": 2.6816, + "step": 16760 + }, + { + "epoch": 0.4970198380926964, + "grad_norm": 0.11532001942396164, + "learning_rate": 0.0005127259105357179, + "loss": 2.6685, + "step": 16761 + }, + { + "epoch": 0.4970494914450079, + "grad_norm": 0.11518601328134537, + "learning_rate": 0.0005126788747607898, + "loss": 2.6616, + "step": 16762 + }, + { + "epoch": 0.49707914479731935, + "grad_norm": 0.12141097337007523, + "learning_rate": 0.0005126318388735882, + "loss": 2.6656, + "step": 16763 + }, + { + "epoch": 0.4971087981496308, + "grad_norm": 0.12659335136413574, + "learning_rate": 0.0005125848028745292, + "loss": 2.6546, + "step": 16764 + }, + { + "epoch": 0.4971384515019423, + "grad_norm": 0.12426985055208206, + "learning_rate": 0.0005125377667640296, + "loss": 2.6687, + "step": 16765 + }, + { + "epoch": 0.4971681048542538, + "grad_norm": 0.12049739807844162, + "learning_rate": 0.000512490730542506, + "loss": 2.6841, + "step": 16766 + }, + { + "epoch": 0.49719775820656525, + "grad_norm": 0.12884464859962463, + "learning_rate": 0.0005124436942103749, + "loss": 2.644, + "step": 16767 + }, + { + "epoch": 0.49722741155887673, + "grad_norm": 0.12312661111354828, + "learning_rate": 0.0005123966577680527, + "loss": 2.688, + "step": 16768 + }, + { + "epoch": 0.4972570649111882, + "grad_norm": 0.12155808508396149, + "learning_rate": 0.0005123496212159561, + "loss": 2.6733, + "step": 16769 + }, + { + "epoch": 0.4972867182634997, + "grad_norm": 0.11674650758504868, + "learning_rate": 0.0005123025845545013, + "loss": 2.6977, + "step": 16770 + }, + { + "epoch": 0.49731637161581116, + "grad_norm": 0.13380172848701477, + "learning_rate": 0.000512255547784105, + "loss": 2.646, + "step": 16771 + }, + { + "epoch": 0.49734602496812264, + "grad_norm": 0.1460011601448059, + "learning_rate": 0.0005122085109051838, + "loss": 2.6625, + "step": 16772 + }, + { + "epoch": 0.4973756783204341, + "grad_norm": 0.1774093508720398, + "learning_rate": 0.0005121614739181543, + "loss": 2.6648, + "step": 16773 + }, + { + "epoch": 0.4974053316727456, + "grad_norm": 0.1735108345746994, + "learning_rate": 0.0005121144368234326, + "loss": 2.6836, + "step": 16774 + }, + { + "epoch": 0.49743498502505706, + "grad_norm": 0.13205453753471375, + "learning_rate": 0.0005120673996214356, + "loss": 2.6667, + "step": 16775 + }, + { + "epoch": 0.49746463837736854, + "grad_norm": 0.12096665799617767, + "learning_rate": 0.0005120203623125796, + "loss": 2.6428, + "step": 16776 + }, + { + "epoch": 0.49749429172968, + "grad_norm": 0.13517245650291443, + "learning_rate": 0.0005119733248972814, + "loss": 2.697, + "step": 16777 + }, + { + "epoch": 0.4975239450819915, + "grad_norm": 0.13206398487091064, + "learning_rate": 0.0005119262873759572, + "loss": 2.6486, + "step": 16778 + }, + { + "epoch": 0.497553598434303, + "grad_norm": 0.12172359973192215, + "learning_rate": 0.0005118792497490238, + "loss": 2.6479, + "step": 16779 + }, + { + "epoch": 0.4975832517866145, + "grad_norm": 0.12107216566801071, + "learning_rate": 0.0005118322120168976, + "loss": 2.702, + "step": 16780 + }, + { + "epoch": 0.497612905138926, + "grad_norm": 0.12521490454673767, + "learning_rate": 0.000511785174179995, + "loss": 2.6948, + "step": 16781 + }, + { + "epoch": 0.49764255849123745, + "grad_norm": 0.13016755878925323, + "learning_rate": 0.0005117381362387327, + "loss": 2.6588, + "step": 16782 + }, + { + "epoch": 0.49767221184354893, + "grad_norm": 0.14371047914028168, + "learning_rate": 0.0005116910981935273, + "loss": 2.6595, + "step": 16783 + }, + { + "epoch": 0.4977018651958604, + "grad_norm": 0.11091239750385284, + "learning_rate": 0.0005116440600447951, + "loss": 2.6878, + "step": 16784 + }, + { + "epoch": 0.4977315185481719, + "grad_norm": 0.12087429314851761, + "learning_rate": 0.0005115970217929529, + "loss": 2.708, + "step": 16785 + }, + { + "epoch": 0.49776117190048336, + "grad_norm": 0.1157628744840622, + "learning_rate": 0.0005115499834384169, + "loss": 2.688, + "step": 16786 + }, + { + "epoch": 0.49779082525279483, + "grad_norm": 0.11705494672060013, + "learning_rate": 0.0005115029449816039, + "loss": 2.7016, + "step": 16787 + }, + { + "epoch": 0.4978204786051063, + "grad_norm": 0.1007084921002388, + "learning_rate": 0.0005114559064229303, + "loss": 2.6888, + "step": 16788 + }, + { + "epoch": 0.4978501319574178, + "grad_norm": 0.10538238286972046, + "learning_rate": 0.0005114088677628126, + "loss": 2.6653, + "step": 16789 + }, + { + "epoch": 0.49787978530972926, + "grad_norm": 0.12182681262493134, + "learning_rate": 0.0005113618290016677, + "loss": 2.6284, + "step": 16790 + }, + { + "epoch": 0.49790943866204074, + "grad_norm": 0.13115954399108887, + "learning_rate": 0.0005113147901399116, + "loss": 2.6752, + "step": 16791 + }, + { + "epoch": 0.4979390920143522, + "grad_norm": 0.13801860809326172, + "learning_rate": 0.0005112677511779612, + "loss": 2.6902, + "step": 16792 + }, + { + "epoch": 0.4979687453666637, + "grad_norm": 0.1434447318315506, + "learning_rate": 0.0005112207121162329, + "loss": 2.653, + "step": 16793 + }, + { + "epoch": 0.49799839871897517, + "grad_norm": 0.1304210126399994, + "learning_rate": 0.0005111736729551433, + "loss": 2.6878, + "step": 16794 + }, + { + "epoch": 0.49802805207128664, + "grad_norm": 0.10355173796415329, + "learning_rate": 0.0005111266336951091, + "loss": 2.6804, + "step": 16795 + }, + { + "epoch": 0.4980577054235981, + "grad_norm": 0.12344467639923096, + "learning_rate": 0.0005110795943365462, + "loss": 2.6719, + "step": 16796 + }, + { + "epoch": 0.4980873587759096, + "grad_norm": 0.14772622287273407, + "learning_rate": 0.0005110325548798719, + "loss": 2.6865, + "step": 16797 + }, + { + "epoch": 0.49811701212822107, + "grad_norm": 0.13754796981811523, + "learning_rate": 0.0005109855153255023, + "loss": 2.6985, + "step": 16798 + }, + { + "epoch": 0.49814666548053255, + "grad_norm": 0.11851643770933151, + "learning_rate": 0.0005109384756738542, + "loss": 2.6687, + "step": 16799 + }, + { + "epoch": 0.4981763188328441, + "grad_norm": 0.13131527602672577, + "learning_rate": 0.000510891435925344, + "loss": 2.7196, + "step": 16800 + }, + { + "epoch": 0.49820597218515555, + "grad_norm": 0.10929446667432785, + "learning_rate": 0.0005108443960803882, + "loss": 2.6799, + "step": 16801 + }, + { + "epoch": 0.49823562553746703, + "grad_norm": 0.11314524710178375, + "learning_rate": 0.0005107973561394034, + "loss": 2.6689, + "step": 16802 + }, + { + "epoch": 0.4982652788897785, + "grad_norm": 0.11828546971082687, + "learning_rate": 0.0005107503161028062, + "loss": 2.6989, + "step": 16803 + }, + { + "epoch": 0.49829493224209, + "grad_norm": 0.12718625366687775, + "learning_rate": 0.0005107032759710131, + "loss": 2.7188, + "step": 16804 + }, + { + "epoch": 0.49832458559440146, + "grad_norm": 0.10256683081388474, + "learning_rate": 0.0005106562357444406, + "loss": 2.6829, + "step": 16805 + }, + { + "epoch": 0.49835423894671294, + "grad_norm": 0.11452264338731766, + "learning_rate": 0.0005106091954235055, + "loss": 2.6734, + "step": 16806 + }, + { + "epoch": 0.4983838922990244, + "grad_norm": 0.12454157322645187, + "learning_rate": 0.000510562155008624, + "loss": 2.7016, + "step": 16807 + }, + { + "epoch": 0.4984135456513359, + "grad_norm": 0.12074814736843109, + "learning_rate": 0.0005105151145002128, + "loss": 2.674, + "step": 16808 + }, + { + "epoch": 0.49844319900364736, + "grad_norm": 0.11503241211175919, + "learning_rate": 0.0005104680738986883, + "loss": 2.7039, + "step": 16809 + }, + { + "epoch": 0.49847285235595884, + "grad_norm": 0.11098195612430573, + "learning_rate": 0.0005104210332044674, + "loss": 2.6691, + "step": 16810 + }, + { + "epoch": 0.4985025057082703, + "grad_norm": 0.12026266753673553, + "learning_rate": 0.0005103739924179665, + "loss": 2.6998, + "step": 16811 + }, + { + "epoch": 0.4985321590605818, + "grad_norm": 0.12279558181762695, + "learning_rate": 0.0005103269515396021, + "loss": 2.7004, + "step": 16812 + }, + { + "epoch": 0.49856181241289327, + "grad_norm": 0.11857197433710098, + "learning_rate": 0.0005102799105697908, + "loss": 2.6679, + "step": 16813 + }, + { + "epoch": 0.49859146576520474, + "grad_norm": 0.1359100639820099, + "learning_rate": 0.000510232869508949, + "loss": 2.6931, + "step": 16814 + }, + { + "epoch": 0.4986211191175162, + "grad_norm": 0.11788256466388702, + "learning_rate": 0.0005101858283574933, + "loss": 2.6676, + "step": 16815 + }, + { + "epoch": 0.4986507724698277, + "grad_norm": 0.12071844935417175, + "learning_rate": 0.0005101387871158406, + "loss": 2.6842, + "step": 16816 + }, + { + "epoch": 0.4986804258221392, + "grad_norm": 0.127965047955513, + "learning_rate": 0.0005100917457844071, + "loss": 2.6354, + "step": 16817 + }, + { + "epoch": 0.49871007917445065, + "grad_norm": 0.11085529625415802, + "learning_rate": 0.0005100447043636094, + "loss": 2.6781, + "step": 16818 + }, + { + "epoch": 0.4987397325267621, + "grad_norm": 0.11814185976982117, + "learning_rate": 0.0005099976628538641, + "loss": 2.6816, + "step": 16819 + }, + { + "epoch": 0.49876938587907366, + "grad_norm": 0.12008688598871231, + "learning_rate": 0.0005099506212555879, + "loss": 2.6702, + "step": 16820 + }, + { + "epoch": 0.49879903923138513, + "grad_norm": 0.11478546261787415, + "learning_rate": 0.0005099035795691972, + "loss": 2.6711, + "step": 16821 + }, + { + "epoch": 0.4988286925836966, + "grad_norm": 0.11431136727333069, + "learning_rate": 0.0005098565377951085, + "loss": 2.6922, + "step": 16822 + }, + { + "epoch": 0.4988583459360081, + "grad_norm": 0.12930503487586975, + "learning_rate": 0.0005098094959337386, + "loss": 2.6834, + "step": 16823 + }, + { + "epoch": 0.49888799928831956, + "grad_norm": 0.12795425951480865, + "learning_rate": 0.0005097624539855039, + "loss": 2.6655, + "step": 16824 + }, + { + "epoch": 0.49891765264063104, + "grad_norm": 0.11923009902238846, + "learning_rate": 0.0005097154119508209, + "loss": 2.6901, + "step": 16825 + }, + { + "epoch": 0.4989473059929425, + "grad_norm": 0.12481091916561127, + "learning_rate": 0.0005096683698301063, + "loss": 2.669, + "step": 16826 + }, + { + "epoch": 0.498976959345254, + "grad_norm": 0.11969957500696182, + "learning_rate": 0.0005096213276237768, + "loss": 2.6554, + "step": 16827 + }, + { + "epoch": 0.49900661269756547, + "grad_norm": 0.11785203218460083, + "learning_rate": 0.0005095742853322485, + "loss": 2.6841, + "step": 16828 + }, + { + "epoch": 0.49903626604987694, + "grad_norm": 0.12095232307910919, + "learning_rate": 0.0005095272429559384, + "loss": 2.6477, + "step": 16829 + }, + { + "epoch": 0.4990659194021884, + "grad_norm": 0.12620633840560913, + "learning_rate": 0.000509480200495263, + "loss": 2.6882, + "step": 16830 + }, + { + "epoch": 0.4990955727544999, + "grad_norm": 0.12028178572654724, + "learning_rate": 0.0005094331579506387, + "loss": 2.6627, + "step": 16831 + }, + { + "epoch": 0.49912522610681137, + "grad_norm": 0.11789801716804504, + "learning_rate": 0.0005093861153224823, + "loss": 2.6856, + "step": 16832 + }, + { + "epoch": 0.49915487945912285, + "grad_norm": 0.11009851098060608, + "learning_rate": 0.0005093390726112102, + "loss": 2.6823, + "step": 16833 + }, + { + "epoch": 0.4991845328114343, + "grad_norm": 0.1286674439907074, + "learning_rate": 0.000509292029817239, + "loss": 2.683, + "step": 16834 + }, + { + "epoch": 0.4992141861637458, + "grad_norm": 0.10263433307409286, + "learning_rate": 0.000509244986940985, + "loss": 2.6595, + "step": 16835 + }, + { + "epoch": 0.4992438395160573, + "grad_norm": 0.11293025314807892, + "learning_rate": 0.0005091979439828654, + "loss": 2.6795, + "step": 16836 + }, + { + "epoch": 0.49927349286836875, + "grad_norm": 0.12994904816150665, + "learning_rate": 0.0005091509009432962, + "loss": 2.6838, + "step": 16837 + }, + { + "epoch": 0.4993031462206802, + "grad_norm": 0.1068863645195961, + "learning_rate": 0.0005091038578226943, + "loss": 2.6803, + "step": 16838 + }, + { + "epoch": 0.4993327995729917, + "grad_norm": 0.12104498594999313, + "learning_rate": 0.0005090568146214763, + "loss": 2.7085, + "step": 16839 + }, + { + "epoch": 0.4993624529253032, + "grad_norm": 0.1087111383676529, + "learning_rate": 0.0005090097713400585, + "loss": 2.6514, + "step": 16840 + }, + { + "epoch": 0.4993921062776147, + "grad_norm": 0.13653554022312164, + "learning_rate": 0.0005089627279788577, + "loss": 2.6862, + "step": 16841 + }, + { + "epoch": 0.4994217596299262, + "grad_norm": 0.13709621131420135, + "learning_rate": 0.0005089156845382903, + "loss": 2.6975, + "step": 16842 + }, + { + "epoch": 0.49945141298223766, + "grad_norm": 0.15589278936386108, + "learning_rate": 0.0005088686410187731, + "loss": 2.6738, + "step": 16843 + }, + { + "epoch": 0.49948106633454914, + "grad_norm": 0.15351830422878265, + "learning_rate": 0.0005088215974207226, + "loss": 2.6481, + "step": 16844 + }, + { + "epoch": 0.4995107196868606, + "grad_norm": 0.14560680091381073, + "learning_rate": 0.0005087745537445552, + "loss": 2.6924, + "step": 16845 + }, + { + "epoch": 0.4995403730391721, + "grad_norm": 0.13515612483024597, + "learning_rate": 0.0005087275099906878, + "loss": 2.6521, + "step": 16846 + }, + { + "epoch": 0.49957002639148357, + "grad_norm": 0.15340811014175415, + "learning_rate": 0.0005086804661595366, + "loss": 2.6724, + "step": 16847 + }, + { + "epoch": 0.49959967974379504, + "grad_norm": 0.1506919264793396, + "learning_rate": 0.0005086334222515183, + "loss": 2.6838, + "step": 16848 + }, + { + "epoch": 0.4996293330961065, + "grad_norm": 0.14203453063964844, + "learning_rate": 0.00050858637826705, + "loss": 2.6583, + "step": 16849 + }, + { + "epoch": 0.499658986448418, + "grad_norm": 0.1567792445421219, + "learning_rate": 0.0005085393342065474, + "loss": 2.6848, + "step": 16850 + }, + { + "epoch": 0.4996886398007295, + "grad_norm": 0.11471454054117203, + "learning_rate": 0.0005084922900704278, + "loss": 2.6448, + "step": 16851 + }, + { + "epoch": 0.49971829315304095, + "grad_norm": 0.13252133131027222, + "learning_rate": 0.0005084452458591073, + "loss": 2.6941, + "step": 16852 + }, + { + "epoch": 0.4997479465053524, + "grad_norm": 0.112532839179039, + "learning_rate": 0.0005083982015730028, + "loss": 2.6818, + "step": 16853 + }, + { + "epoch": 0.4997775998576639, + "grad_norm": 0.10709644109010696, + "learning_rate": 0.0005083511572125308, + "loss": 2.6696, + "step": 16854 + }, + { + "epoch": 0.4998072532099754, + "grad_norm": 0.10403194278478622, + "learning_rate": 0.0005083041127781079, + "loss": 2.7109, + "step": 16855 + }, + { + "epoch": 0.49983690656228685, + "grad_norm": 0.11935108155012131, + "learning_rate": 0.0005082570682701506, + "loss": 2.6796, + "step": 16856 + }, + { + "epoch": 0.49986655991459833, + "grad_norm": 0.11530816555023193, + "learning_rate": 0.0005082100236890757, + "loss": 2.6948, + "step": 16857 + }, + { + "epoch": 0.4998962132669098, + "grad_norm": 0.1218784973025322, + "learning_rate": 0.0005081629790352994, + "loss": 2.7051, + "step": 16858 + }, + { + "epoch": 0.4999258666192213, + "grad_norm": 0.11814261227846146, + "learning_rate": 0.0005081159343092387, + "loss": 2.6966, + "step": 16859 + }, + { + "epoch": 0.49995551997153276, + "grad_norm": 0.12161396443843842, + "learning_rate": 0.00050806888951131, + "loss": 2.6924, + "step": 16860 + }, + { + "epoch": 0.49998517332384423, + "grad_norm": 0.10619913041591644, + "learning_rate": 0.0005080218446419296, + "loss": 2.6577, + "step": 16861 + }, + { + "epoch": 0.5000148266761557, + "grad_norm": 0.1208387091755867, + "learning_rate": 0.0005079747997015148, + "loss": 2.6904, + "step": 16862 + }, + { + "epoch": 0.5000444800284672, + "grad_norm": 0.14104807376861572, + "learning_rate": 0.0005079277546904815, + "loss": 2.6891, + "step": 16863 + }, + { + "epoch": 0.5000741333807787, + "grad_norm": 0.1716943085193634, + "learning_rate": 0.0005078807096092466, + "loss": 2.6782, + "step": 16864 + }, + { + "epoch": 0.5001037867330902, + "grad_norm": 0.17243506014347076, + "learning_rate": 0.0005078336644582268, + "loss": 2.684, + "step": 16865 + }, + { + "epoch": 0.5001334400854016, + "grad_norm": 0.1525782346725464, + "learning_rate": 0.0005077866192378385, + "loss": 2.6645, + "step": 16866 + }, + { + "epoch": 0.5001630934377131, + "grad_norm": 0.1310882717370987, + "learning_rate": 0.0005077395739484982, + "loss": 2.6906, + "step": 16867 + }, + { + "epoch": 0.5001927467900246, + "grad_norm": 0.15023283660411835, + "learning_rate": 0.0005076925285906229, + "loss": 2.6961, + "step": 16868 + }, + { + "epoch": 0.5002224001423361, + "grad_norm": 0.13508152961730957, + "learning_rate": 0.0005076454831646288, + "loss": 2.7089, + "step": 16869 + }, + { + "epoch": 0.5002520534946475, + "grad_norm": 0.12215477973222733, + "learning_rate": 0.0005075984376709326, + "loss": 2.6643, + "step": 16870 + }, + { + "epoch": 0.500281706846959, + "grad_norm": 0.13043220341205597, + "learning_rate": 0.0005075513921099511, + "loss": 2.6874, + "step": 16871 + }, + { + "epoch": 0.5003113601992706, + "grad_norm": 0.11722278594970703, + "learning_rate": 0.0005075043464821006, + "loss": 2.6951, + "step": 16872 + }, + { + "epoch": 0.500341013551582, + "grad_norm": 0.14204873144626617, + "learning_rate": 0.0005074573007877979, + "loss": 2.6864, + "step": 16873 + }, + { + "epoch": 0.5003706669038935, + "grad_norm": 0.11777882277965546, + "learning_rate": 0.0005074102550274594, + "loss": 2.6654, + "step": 16874 + }, + { + "epoch": 0.500400320256205, + "grad_norm": 0.11877686530351639, + "learning_rate": 0.0005073632092015018, + "loss": 2.6697, + "step": 16875 + }, + { + "epoch": 0.5004299736085165, + "grad_norm": 0.11087887734174728, + "learning_rate": 0.0005073161633103418, + "loss": 2.6686, + "step": 16876 + }, + { + "epoch": 0.5004596269608279, + "grad_norm": 0.12239687144756317, + "learning_rate": 0.0005072691173543959, + "loss": 2.6553, + "step": 16877 + }, + { + "epoch": 0.5004892803131394, + "grad_norm": 0.12623059749603271, + "learning_rate": 0.0005072220713340808, + "loss": 2.6451, + "step": 16878 + }, + { + "epoch": 0.5005189336654509, + "grad_norm": 0.12307028472423553, + "learning_rate": 0.000507175025249813, + "loss": 2.6899, + "step": 16879 + }, + { + "epoch": 0.5005485870177624, + "grad_norm": 0.11661161482334137, + "learning_rate": 0.0005071279791020089, + "loss": 2.7129, + "step": 16880 + }, + { + "epoch": 0.5005782403700738, + "grad_norm": 0.11396939307451248, + "learning_rate": 0.0005070809328910855, + "loss": 2.6506, + "step": 16881 + }, + { + "epoch": 0.5006078937223853, + "grad_norm": 0.11842692643404007, + "learning_rate": 0.0005070338866174593, + "loss": 2.682, + "step": 16882 + }, + { + "epoch": 0.5006375470746968, + "grad_norm": 0.12128590792417526, + "learning_rate": 0.0005069868402815468, + "loss": 2.6503, + "step": 16883 + }, + { + "epoch": 0.5006672004270083, + "grad_norm": 0.11900575459003448, + "learning_rate": 0.0005069397938837646, + "loss": 2.6613, + "step": 16884 + }, + { + "epoch": 0.5006968537793197, + "grad_norm": 0.10131581127643585, + "learning_rate": 0.0005068927474245292, + "loss": 2.6698, + "step": 16885 + }, + { + "epoch": 0.5007265071316312, + "grad_norm": 0.1117701306939125, + "learning_rate": 0.0005068457009042574, + "loss": 2.671, + "step": 16886 + }, + { + "epoch": 0.5007561604839427, + "grad_norm": 0.12033241242170334, + "learning_rate": 0.0005067986543233658, + "loss": 2.6585, + "step": 16887 + }, + { + "epoch": 0.5007858138362542, + "grad_norm": 0.09966861456632614, + "learning_rate": 0.000506751607682271, + "loss": 2.6903, + "step": 16888 + }, + { + "epoch": 0.5008154671885656, + "grad_norm": 0.10752367973327637, + "learning_rate": 0.0005067045609813895, + "loss": 2.6371, + "step": 16889 + }, + { + "epoch": 0.5008451205408772, + "grad_norm": 0.1057889312505722, + "learning_rate": 0.0005066575142211379, + "loss": 2.6494, + "step": 16890 + }, + { + "epoch": 0.5008747738931886, + "grad_norm": 0.1231812983751297, + "learning_rate": 0.0005066104674019329, + "loss": 2.6557, + "step": 16891 + }, + { + "epoch": 0.5009044272455001, + "grad_norm": 0.12003368884325027, + "learning_rate": 0.0005065634205241911, + "loss": 2.6834, + "step": 16892 + }, + { + "epoch": 0.5009340805978116, + "grad_norm": 0.12432152032852173, + "learning_rate": 0.0005065163735883291, + "loss": 2.6652, + "step": 16893 + }, + { + "epoch": 0.5009637339501231, + "grad_norm": 0.1347716599702835, + "learning_rate": 0.0005064693265947636, + "loss": 2.6782, + "step": 16894 + }, + { + "epoch": 0.5009933873024346, + "grad_norm": 0.11487718671560287, + "learning_rate": 0.0005064222795439109, + "loss": 2.6925, + "step": 16895 + }, + { + "epoch": 0.501023040654746, + "grad_norm": 0.1280720829963684, + "learning_rate": 0.0005063752324361879, + "loss": 2.6485, + "step": 16896 + }, + { + "epoch": 0.5010526940070575, + "grad_norm": 0.1384177803993225, + "learning_rate": 0.0005063281852720111, + "loss": 2.6818, + "step": 16897 + }, + { + "epoch": 0.501082347359369, + "grad_norm": 0.11400045454502106, + "learning_rate": 0.0005062811380517971, + "loss": 2.6833, + "step": 16898 + }, + { + "epoch": 0.5011120007116805, + "grad_norm": 0.12833480536937714, + "learning_rate": 0.0005062340907759626, + "loss": 2.6906, + "step": 16899 + }, + { + "epoch": 0.5011416540639919, + "grad_norm": 0.11980083584785461, + "learning_rate": 0.000506187043444924, + "loss": 2.7049, + "step": 16900 + }, + { + "epoch": 0.5011713074163034, + "grad_norm": 0.11961197108030319, + "learning_rate": 0.0005061399960590983, + "loss": 2.6905, + "step": 16901 + }, + { + "epoch": 0.5012009607686149, + "grad_norm": 0.12188002467155457, + "learning_rate": 0.0005060929486189017, + "loss": 2.6415, + "step": 16902 + }, + { + "epoch": 0.5012306141209264, + "grad_norm": 0.12233742326498032, + "learning_rate": 0.0005060459011247512, + "loss": 2.6787, + "step": 16903 + }, + { + "epoch": 0.5012602674732378, + "grad_norm": 0.12461377680301666, + "learning_rate": 0.0005059988535770632, + "loss": 2.699, + "step": 16904 + }, + { + "epoch": 0.5012899208255494, + "grad_norm": 0.11751040071249008, + "learning_rate": 0.0005059518059762542, + "loss": 2.6976, + "step": 16905 + }, + { + "epoch": 0.5013195741778608, + "grad_norm": 0.11197633296251297, + "learning_rate": 0.0005059047583227408, + "loss": 2.7052, + "step": 16906 + }, + { + "epoch": 0.5013492275301723, + "grad_norm": 0.12390635907649994, + "learning_rate": 0.0005058577106169399, + "loss": 2.6329, + "step": 16907 + }, + { + "epoch": 0.5013788808824837, + "grad_norm": 0.10341516137123108, + "learning_rate": 0.000505810662859268, + "loss": 2.6783, + "step": 16908 + }, + { + "epoch": 0.5014085342347953, + "grad_norm": 0.11930757761001587, + "learning_rate": 0.0005057636150501418, + "loss": 2.6529, + "step": 16909 + }, + { + "epoch": 0.5014381875871067, + "grad_norm": 0.10401397198438644, + "learning_rate": 0.0005057165671899776, + "loss": 2.6813, + "step": 16910 + }, + { + "epoch": 0.5014678409394182, + "grad_norm": 0.11338827759027481, + "learning_rate": 0.0005056695192791924, + "loss": 2.7093, + "step": 16911 + }, + { + "epoch": 0.5014974942917296, + "grad_norm": 0.12001418322324753, + "learning_rate": 0.0005056224713182023, + "loss": 2.6636, + "step": 16912 + }, + { + "epoch": 0.5015271476440412, + "grad_norm": 0.1415896862745285, + "learning_rate": 0.0005055754233074245, + "loss": 2.6717, + "step": 16913 + }, + { + "epoch": 0.5015568009963527, + "grad_norm": 0.12504452466964722, + "learning_rate": 0.0005055283752472753, + "loss": 2.6766, + "step": 16914 + }, + { + "epoch": 0.5015864543486641, + "grad_norm": 0.11360836774110794, + "learning_rate": 0.0005054813271381715, + "loss": 2.679, + "step": 16915 + }, + { + "epoch": 0.5016161077009756, + "grad_norm": 0.12344074994325638, + "learning_rate": 0.0005054342789805296, + "loss": 2.7197, + "step": 16916 + }, + { + "epoch": 0.5016457610532871, + "grad_norm": 0.12521955370903015, + "learning_rate": 0.0005053872307747661, + "loss": 2.6984, + "step": 16917 + }, + { + "epoch": 0.5016754144055986, + "grad_norm": 0.1133720651268959, + "learning_rate": 0.0005053401825212977, + "loss": 2.6872, + "step": 16918 + }, + { + "epoch": 0.50170506775791, + "grad_norm": 0.12328055500984192, + "learning_rate": 0.0005052931342205411, + "loss": 2.6798, + "step": 16919 + }, + { + "epoch": 0.5017347211102215, + "grad_norm": 0.12174059450626373, + "learning_rate": 0.000505246085872913, + "loss": 2.6644, + "step": 16920 + }, + { + "epoch": 0.501764374462533, + "grad_norm": 0.11784340441226959, + "learning_rate": 0.0005051990374788301, + "loss": 2.6342, + "step": 16921 + }, + { + "epoch": 0.5017940278148445, + "grad_norm": 0.12086832523345947, + "learning_rate": 0.0005051519890387084, + "loss": 2.6876, + "step": 16922 + }, + { + "epoch": 0.5018236811671559, + "grad_norm": 0.12931716442108154, + "learning_rate": 0.0005051049405529652, + "loss": 2.6935, + "step": 16923 + }, + { + "epoch": 0.5018533345194675, + "grad_norm": 0.11923052370548248, + "learning_rate": 0.0005050578920220167, + "loss": 2.6452, + "step": 16924 + }, + { + "epoch": 0.5018829878717789, + "grad_norm": 0.11042547971010208, + "learning_rate": 0.0005050108434462799, + "loss": 2.6797, + "step": 16925 + }, + { + "epoch": 0.5019126412240904, + "grad_norm": 0.12578007578849792, + "learning_rate": 0.0005049637948261711, + "loss": 2.7002, + "step": 16926 + }, + { + "epoch": 0.5019422945764018, + "grad_norm": 0.12347178906202316, + "learning_rate": 0.0005049167461621071, + "loss": 2.7001, + "step": 16927 + }, + { + "epoch": 0.5019719479287134, + "grad_norm": 0.11766272038221359, + "learning_rate": 0.0005048696974545045, + "loss": 2.6531, + "step": 16928 + }, + { + "epoch": 0.5020016012810248, + "grad_norm": 0.1303972750902176, + "learning_rate": 0.0005048226487037799, + "loss": 2.6603, + "step": 16929 + }, + { + "epoch": 0.5020312546333363, + "grad_norm": 0.11770857125520706, + "learning_rate": 0.0005047755999103499, + "loss": 2.6904, + "step": 16930 + }, + { + "epoch": 0.5020609079856477, + "grad_norm": 0.12718193233013153, + "learning_rate": 0.000504728551074631, + "loss": 2.6699, + "step": 16931 + }, + { + "epoch": 0.5020905613379593, + "grad_norm": 0.11896134912967682, + "learning_rate": 0.00050468150219704, + "loss": 2.6723, + "step": 16932 + }, + { + "epoch": 0.5021202146902707, + "grad_norm": 0.12389472126960754, + "learning_rate": 0.0005046344532779936, + "loss": 2.7105, + "step": 16933 + }, + { + "epoch": 0.5021498680425822, + "grad_norm": 0.12356869876384735, + "learning_rate": 0.0005045874043179083, + "loss": 2.677, + "step": 16934 + }, + { + "epoch": 0.5021795213948937, + "grad_norm": 0.11316057294607162, + "learning_rate": 0.0005045403553172007, + "loss": 2.656, + "step": 16935 + }, + { + "epoch": 0.5022091747472052, + "grad_norm": 0.10074913501739502, + "learning_rate": 0.0005044933062762875, + "loss": 2.6754, + "step": 16936 + }, + { + "epoch": 0.5022388280995167, + "grad_norm": 0.11922753602266312, + "learning_rate": 0.0005044462571955854, + "loss": 2.6751, + "step": 16937 + }, + { + "epoch": 0.5022684814518281, + "grad_norm": 0.1405619978904724, + "learning_rate": 0.0005043992080755108, + "loss": 2.6804, + "step": 16938 + }, + { + "epoch": 0.5022981348041397, + "grad_norm": 0.15351374447345734, + "learning_rate": 0.0005043521589164805, + "loss": 2.694, + "step": 16939 + }, + { + "epoch": 0.5023277881564511, + "grad_norm": 0.14028751850128174, + "learning_rate": 0.0005043051097189111, + "loss": 2.6871, + "step": 16940 + }, + { + "epoch": 0.5023574415087626, + "grad_norm": 0.13676141202449799, + "learning_rate": 0.0005042580604832192, + "loss": 2.7034, + "step": 16941 + }, + { + "epoch": 0.502387094861074, + "grad_norm": 0.11861121654510498, + "learning_rate": 0.0005042110112098214, + "loss": 2.6932, + "step": 16942 + }, + { + "epoch": 0.5024167482133856, + "grad_norm": 0.11785180121660233, + "learning_rate": 0.0005041639618991345, + "loss": 2.6843, + "step": 16943 + }, + { + "epoch": 0.502446401565697, + "grad_norm": 0.12559156119823456, + "learning_rate": 0.000504116912551575, + "loss": 2.6744, + "step": 16944 + }, + { + "epoch": 0.5024760549180085, + "grad_norm": 0.12794311344623566, + "learning_rate": 0.0005040698631675593, + "loss": 2.6444, + "step": 16945 + }, + { + "epoch": 0.5025057082703199, + "grad_norm": 0.11975590139627457, + "learning_rate": 0.0005040228137475044, + "loss": 2.665, + "step": 16946 + }, + { + "epoch": 0.5025353616226315, + "grad_norm": 0.12398206442594528, + "learning_rate": 0.0005039757642918269, + "loss": 2.6744, + "step": 16947 + }, + { + "epoch": 0.5025650149749429, + "grad_norm": 0.13885927200317383, + "learning_rate": 0.0005039287148009433, + "loss": 2.6545, + "step": 16948 + }, + { + "epoch": 0.5025946683272544, + "grad_norm": 0.1275317668914795, + "learning_rate": 0.0005038816652752702, + "loss": 2.6807, + "step": 16949 + }, + { + "epoch": 0.5026243216795658, + "grad_norm": 0.0958879366517067, + "learning_rate": 0.0005038346157152242, + "loss": 2.6642, + "step": 16950 + }, + { + "epoch": 0.5026539750318774, + "grad_norm": 0.1123296394944191, + "learning_rate": 0.000503787566121222, + "loss": 2.6719, + "step": 16951 + }, + { + "epoch": 0.5026836283841888, + "grad_norm": 0.1287948042154312, + "learning_rate": 0.0005037405164936803, + "loss": 2.6822, + "step": 16952 + }, + { + "epoch": 0.5027132817365003, + "grad_norm": 0.1266554445028305, + "learning_rate": 0.0005036934668330159, + "loss": 2.7058, + "step": 16953 + }, + { + "epoch": 0.5027429350888117, + "grad_norm": 0.13130125403404236, + "learning_rate": 0.0005036464171396449, + "loss": 2.6846, + "step": 16954 + }, + { + "epoch": 0.5027725884411233, + "grad_norm": 0.1460190862417221, + "learning_rate": 0.0005035993674139845, + "loss": 2.6704, + "step": 16955 + }, + { + "epoch": 0.5028022417934348, + "grad_norm": 0.13540560007095337, + "learning_rate": 0.000503552317656451, + "loss": 2.6645, + "step": 16956 + }, + { + "epoch": 0.5028318951457462, + "grad_norm": 0.11618704348802567, + "learning_rate": 0.0005035052678674611, + "loss": 2.6682, + "step": 16957 + }, + { + "epoch": 0.5028615484980578, + "grad_norm": 0.11914270371198654, + "learning_rate": 0.0005034582180474314, + "loss": 2.6513, + "step": 16958 + }, + { + "epoch": 0.5028912018503692, + "grad_norm": 0.13027969002723694, + "learning_rate": 0.0005034111681967787, + "loss": 2.6411, + "step": 16959 + }, + { + "epoch": 0.5029208552026807, + "grad_norm": 0.12389210611581802, + "learning_rate": 0.0005033641183159194, + "loss": 2.6952, + "step": 16960 + }, + { + "epoch": 0.5029505085549921, + "grad_norm": 0.11925418674945831, + "learning_rate": 0.0005033170684052704, + "loss": 2.6844, + "step": 16961 + }, + { + "epoch": 0.5029801619073037, + "grad_norm": 0.11700312048196793, + "learning_rate": 0.0005032700184652481, + "loss": 2.6524, + "step": 16962 + }, + { + "epoch": 0.5030098152596151, + "grad_norm": 0.11809860169887543, + "learning_rate": 0.0005032229684962692, + "loss": 2.6935, + "step": 16963 + }, + { + "epoch": 0.5030394686119266, + "grad_norm": 0.12922944128513336, + "learning_rate": 0.0005031759184987503, + "loss": 2.6898, + "step": 16964 + }, + { + "epoch": 0.503069121964238, + "grad_norm": 0.1255643367767334, + "learning_rate": 0.0005031288684731082, + "loss": 2.6775, + "step": 16965 + }, + { + "epoch": 0.5030987753165496, + "grad_norm": 0.13126152753829956, + "learning_rate": 0.0005030818184197595, + "loss": 2.6496, + "step": 16966 + }, + { + "epoch": 0.503128428668861, + "grad_norm": 0.13177922368049622, + "learning_rate": 0.0005030347683391207, + "loss": 2.6592, + "step": 16967 + }, + { + "epoch": 0.5031580820211725, + "grad_norm": 0.12036074697971344, + "learning_rate": 0.0005029877182316085, + "loss": 2.6857, + "step": 16968 + }, + { + "epoch": 0.5031877353734839, + "grad_norm": 0.10930531471967697, + "learning_rate": 0.0005029406680976395, + "loss": 2.6822, + "step": 16969 + }, + { + "epoch": 0.5032173887257955, + "grad_norm": 0.13089771568775177, + "learning_rate": 0.0005028936179376306, + "loss": 2.6661, + "step": 16970 + }, + { + "epoch": 0.5032470420781069, + "grad_norm": 0.11794596910476685, + "learning_rate": 0.0005028465677519978, + "loss": 2.6835, + "step": 16971 + }, + { + "epoch": 0.5032766954304184, + "grad_norm": 0.10709419846534729, + "learning_rate": 0.0005027995175411584, + "loss": 2.6979, + "step": 16972 + }, + { + "epoch": 0.5033063487827298, + "grad_norm": 0.12942802906036377, + "learning_rate": 0.0005027524673055288, + "loss": 2.6785, + "step": 16973 + }, + { + "epoch": 0.5033360021350414, + "grad_norm": 0.1076449304819107, + "learning_rate": 0.0005027054170455256, + "loss": 2.6067, + "step": 16974 + }, + { + "epoch": 0.5033656554873528, + "grad_norm": 0.12326955795288086, + "learning_rate": 0.0005026583667615656, + "loss": 2.6688, + "step": 16975 + }, + { + "epoch": 0.5033953088396643, + "grad_norm": 0.11672666668891907, + "learning_rate": 0.0005026113164540651, + "loss": 2.6747, + "step": 16976 + }, + { + "epoch": 0.5034249621919759, + "grad_norm": 0.11003459990024567, + "learning_rate": 0.0005025642661234409, + "loss": 2.6577, + "step": 16977 + }, + { + "epoch": 0.5034546155442873, + "grad_norm": 0.11494449526071548, + "learning_rate": 0.0005025172157701099, + "loss": 2.6908, + "step": 16978 + }, + { + "epoch": 0.5034842688965988, + "grad_norm": 0.10027584433555603, + "learning_rate": 0.0005024701653944884, + "loss": 2.6752, + "step": 16979 + }, + { + "epoch": 0.5035139222489102, + "grad_norm": 0.09736651927232742, + "learning_rate": 0.0005024231149969934, + "loss": 2.6774, + "step": 16980 + }, + { + "epoch": 0.5035435756012218, + "grad_norm": 0.109122633934021, + "learning_rate": 0.000502376064578041, + "loss": 2.6725, + "step": 16981 + }, + { + "epoch": 0.5035732289535332, + "grad_norm": 0.11547990143299103, + "learning_rate": 0.0005023290141380482, + "loss": 2.6657, + "step": 16982 + }, + { + "epoch": 0.5036028823058447, + "grad_norm": 0.11360621452331543, + "learning_rate": 0.0005022819636774316, + "loss": 2.6892, + "step": 16983 + }, + { + "epoch": 0.5036325356581561, + "grad_norm": 0.11906509101390839, + "learning_rate": 0.0005022349131966078, + "loss": 2.701, + "step": 16984 + }, + { + "epoch": 0.5036621890104677, + "grad_norm": 0.11824361979961395, + "learning_rate": 0.0005021878626959936, + "loss": 2.6968, + "step": 16985 + }, + { + "epoch": 0.5036918423627791, + "grad_norm": 0.11377482861280441, + "learning_rate": 0.0005021408121760054, + "loss": 2.6746, + "step": 16986 + }, + { + "epoch": 0.5037214957150906, + "grad_norm": 0.13911710679531097, + "learning_rate": 0.0005020937616370598, + "loss": 2.7284, + "step": 16987 + }, + { + "epoch": 0.503751149067402, + "grad_norm": 0.12816758453845978, + "learning_rate": 0.0005020467110795738, + "loss": 2.6929, + "step": 16988 + }, + { + "epoch": 0.5037808024197136, + "grad_norm": 0.12259472161531448, + "learning_rate": 0.0005019996605039637, + "loss": 2.6943, + "step": 16989 + }, + { + "epoch": 0.503810455772025, + "grad_norm": 0.11483798176050186, + "learning_rate": 0.0005019526099106461, + "loss": 2.7051, + "step": 16990 + }, + { + "epoch": 0.5038401091243365, + "grad_norm": 0.1252516359090805, + "learning_rate": 0.0005019055593000382, + "loss": 2.6886, + "step": 16991 + }, + { + "epoch": 0.5038697624766479, + "grad_norm": 0.1402147114276886, + "learning_rate": 0.000501858508672556, + "loss": 2.6923, + "step": 16992 + }, + { + "epoch": 0.5038994158289595, + "grad_norm": 0.11929120868444443, + "learning_rate": 0.0005018114580286165, + "loss": 2.6678, + "step": 16993 + }, + { + "epoch": 0.5039290691812709, + "grad_norm": 0.1603543609380722, + "learning_rate": 0.0005017644073686361, + "loss": 2.6671, + "step": 16994 + }, + { + "epoch": 0.5039587225335824, + "grad_norm": 0.18267132341861725, + "learning_rate": 0.0005017173566930316, + "loss": 2.6925, + "step": 16995 + }, + { + "epoch": 0.5039883758858938, + "grad_norm": 0.17828653752803802, + "learning_rate": 0.0005016703060022197, + "loss": 2.6861, + "step": 16996 + }, + { + "epoch": 0.5040180292382054, + "grad_norm": 0.16190920770168304, + "learning_rate": 0.0005016232552966169, + "loss": 2.6757, + "step": 16997 + }, + { + "epoch": 0.5040476825905169, + "grad_norm": 0.1562177538871765, + "learning_rate": 0.0005015762045766399, + "loss": 2.7097, + "step": 16998 + }, + { + "epoch": 0.5040773359428283, + "grad_norm": 0.11792650073766708, + "learning_rate": 0.0005015291538427054, + "loss": 2.6866, + "step": 16999 + }, + { + "epoch": 0.5041069892951399, + "grad_norm": 0.13660672307014465, + "learning_rate": 0.0005014821030952299, + "loss": 2.6771, + "step": 17000 + }, + { + "epoch": 0.5041366426474513, + "grad_norm": 0.12790094316005707, + "learning_rate": 0.0005014350523346301, + "loss": 2.6767, + "step": 17001 + }, + { + "epoch": 0.5041662959997628, + "grad_norm": 0.10989658534526825, + "learning_rate": 0.0005013880015613226, + "loss": 2.7062, + "step": 17002 + }, + { + "epoch": 0.5041959493520742, + "grad_norm": 0.1387295424938202, + "learning_rate": 0.0005013409507757243, + "loss": 2.7391, + "step": 17003 + }, + { + "epoch": 0.5042256027043858, + "grad_norm": 0.11780396848917007, + "learning_rate": 0.0005012938999782516, + "loss": 2.6795, + "step": 17004 + }, + { + "epoch": 0.5042552560566972, + "grad_norm": 0.10703809559345245, + "learning_rate": 0.000501246849169321, + "loss": 2.6448, + "step": 17005 + }, + { + "epoch": 0.5042849094090087, + "grad_norm": 0.1184384748339653, + "learning_rate": 0.0005011997983493495, + "loss": 2.6921, + "step": 17006 + }, + { + "epoch": 0.5043145627613201, + "grad_norm": 0.1111118420958519, + "learning_rate": 0.0005011527475187536, + "loss": 2.6934, + "step": 17007 + }, + { + "epoch": 0.5043442161136317, + "grad_norm": 0.11367016285657883, + "learning_rate": 0.00050110569667795, + "loss": 2.6783, + "step": 17008 + }, + { + "epoch": 0.5043738694659431, + "grad_norm": 0.10185383260250092, + "learning_rate": 0.000501058645827355, + "loss": 2.6556, + "step": 17009 + }, + { + "epoch": 0.5044035228182546, + "grad_norm": 0.10474792122840881, + "learning_rate": 0.0005010115949673858, + "loss": 2.6871, + "step": 17010 + }, + { + "epoch": 0.504433176170566, + "grad_norm": 0.11467048525810242, + "learning_rate": 0.0005009645440984586, + "loss": 2.6968, + "step": 17011 + }, + { + "epoch": 0.5044628295228776, + "grad_norm": 0.09750672429800034, + "learning_rate": 0.0005009174932209902, + "loss": 2.71, + "step": 17012 + }, + { + "epoch": 0.504492482875189, + "grad_norm": 0.11296017467975616, + "learning_rate": 0.0005008704423353973, + "loss": 2.6811, + "step": 17013 + }, + { + "epoch": 0.5045221362275005, + "grad_norm": 0.1018158346414566, + "learning_rate": 0.0005008233914420966, + "loss": 2.6837, + "step": 17014 + }, + { + "epoch": 0.504551789579812, + "grad_norm": 0.11466182768344879, + "learning_rate": 0.0005007763405415044, + "loss": 2.6189, + "step": 17015 + }, + { + "epoch": 0.5045814429321235, + "grad_norm": 0.13458849489688873, + "learning_rate": 0.0005007292896340376, + "loss": 2.6879, + "step": 17016 + }, + { + "epoch": 0.5046110962844349, + "grad_norm": 0.12801778316497803, + "learning_rate": 0.0005006822387201128, + "loss": 2.6502, + "step": 17017 + }, + { + "epoch": 0.5046407496367464, + "grad_norm": 0.13142794370651245, + "learning_rate": 0.0005006351878001467, + "loss": 2.6774, + "step": 17018 + }, + { + "epoch": 0.504670402989058, + "grad_norm": 0.13156431913375854, + "learning_rate": 0.0005005881368745559, + "loss": 2.6897, + "step": 17019 + }, + { + "epoch": 0.5047000563413694, + "grad_norm": 0.11041755974292755, + "learning_rate": 0.0005005410859437572, + "loss": 2.686, + "step": 17020 + }, + { + "epoch": 0.5047297096936809, + "grad_norm": 0.12067767232656479, + "learning_rate": 0.0005004940350081669, + "loss": 2.6898, + "step": 17021 + }, + { + "epoch": 0.5047593630459923, + "grad_norm": 0.13103778660297394, + "learning_rate": 0.0005004469840682018, + "loss": 2.6633, + "step": 17022 + }, + { + "epoch": 0.5047890163983039, + "grad_norm": 0.11720278859138489, + "learning_rate": 0.0005003999331242788, + "loss": 2.6528, + "step": 17023 + }, + { + "epoch": 0.5048186697506153, + "grad_norm": 0.13855117559432983, + "learning_rate": 0.0005003528821768142, + "loss": 2.6199, + "step": 17024 + }, + { + "epoch": 0.5048483231029268, + "grad_norm": 0.12683787941932678, + "learning_rate": 0.0005003058312262247, + "loss": 2.6747, + "step": 17025 + }, + { + "epoch": 0.5048779764552382, + "grad_norm": 0.1295173019170761, + "learning_rate": 0.0005002587802729271, + "loss": 2.6947, + "step": 17026 + }, + { + "epoch": 0.5049076298075498, + "grad_norm": 0.12630347907543182, + "learning_rate": 0.0005002117293173379, + "loss": 2.6924, + "step": 17027 + }, + { + "epoch": 0.5049372831598612, + "grad_norm": 0.1543494164943695, + "learning_rate": 0.0005001646783598738, + "loss": 2.693, + "step": 17028 + }, + { + "epoch": 0.5049669365121727, + "grad_norm": 0.13882854580879211, + "learning_rate": 0.0005001176274009514, + "loss": 2.6688, + "step": 17029 + }, + { + "epoch": 0.5049965898644841, + "grad_norm": 0.11493266373872757, + "learning_rate": 0.0005000705764409875, + "loss": 2.6791, + "step": 17030 + }, + { + "epoch": 0.5050262432167957, + "grad_norm": 0.12659905850887299, + "learning_rate": 0.0005000235254803986, + "loss": 2.675, + "step": 17031 + }, + { + "epoch": 0.5050558965691071, + "grad_norm": 0.11266981065273285, + "learning_rate": 0.0004999764745196014, + "loss": 2.6857, + "step": 17032 + }, + { + "epoch": 0.5050855499214186, + "grad_norm": 0.1226486787199974, + "learning_rate": 0.0004999294235590125, + "loss": 2.6747, + "step": 17033 + }, + { + "epoch": 0.50511520327373, + "grad_norm": 0.11936298757791519, + "learning_rate": 0.0004998823725990486, + "loss": 2.6796, + "step": 17034 + }, + { + "epoch": 0.5051448566260416, + "grad_norm": 0.10333066433668137, + "learning_rate": 0.0004998353216401263, + "loss": 2.6686, + "step": 17035 + }, + { + "epoch": 0.505174509978353, + "grad_norm": 0.13189512491226196, + "learning_rate": 0.0004997882706826623, + "loss": 2.6973, + "step": 17036 + }, + { + "epoch": 0.5052041633306645, + "grad_norm": 0.1298639327287674, + "learning_rate": 0.0004997412197270732, + "loss": 2.6812, + "step": 17037 + }, + { + "epoch": 0.505233816682976, + "grad_norm": 0.11728077381849289, + "learning_rate": 0.0004996941687737753, + "loss": 2.6615, + "step": 17038 + }, + { + "epoch": 0.5052634700352875, + "grad_norm": 0.13795463740825653, + "learning_rate": 0.000499647117823186, + "loss": 2.6874, + "step": 17039 + }, + { + "epoch": 0.505293123387599, + "grad_norm": 0.13642069697380066, + "learning_rate": 0.0004996000668757213, + "loss": 2.6753, + "step": 17040 + }, + { + "epoch": 0.5053227767399104, + "grad_norm": 0.12452031672000885, + "learning_rate": 0.0004995530159317982, + "loss": 2.6896, + "step": 17041 + }, + { + "epoch": 0.505352430092222, + "grad_norm": 0.11017273366451263, + "learning_rate": 0.0004995059649918332, + "loss": 2.6548, + "step": 17042 + }, + { + "epoch": 0.5053820834445334, + "grad_norm": 0.1177430972456932, + "learning_rate": 0.000499458914056243, + "loss": 2.7039, + "step": 17043 + }, + { + "epoch": 0.5054117367968449, + "grad_norm": 0.11733721196651459, + "learning_rate": 0.0004994118631254441, + "loss": 2.6546, + "step": 17044 + }, + { + "epoch": 0.5054413901491563, + "grad_norm": 0.11941654980182648, + "learning_rate": 0.0004993648121998534, + "loss": 2.6656, + "step": 17045 + }, + { + "epoch": 0.5054710435014679, + "grad_norm": 0.10304431617259979, + "learning_rate": 0.0004993177612798873, + "loss": 2.6691, + "step": 17046 + }, + { + "epoch": 0.5055006968537793, + "grad_norm": 0.1175391748547554, + "learning_rate": 0.0004992707103659626, + "loss": 2.6724, + "step": 17047 + }, + { + "epoch": 0.5055303502060908, + "grad_norm": 0.11727575212717056, + "learning_rate": 0.0004992236594584959, + "loss": 2.6935, + "step": 17048 + }, + { + "epoch": 0.5055600035584022, + "grad_norm": 0.11870357394218445, + "learning_rate": 0.0004991766085579037, + "loss": 2.6775, + "step": 17049 + }, + { + "epoch": 0.5055896569107138, + "grad_norm": 0.11602354049682617, + "learning_rate": 0.0004991295576646028, + "loss": 2.662, + "step": 17050 + }, + { + "epoch": 0.5056193102630252, + "grad_norm": 0.12115534394979477, + "learning_rate": 0.0004990825067790098, + "loss": 2.6958, + "step": 17051 + }, + { + "epoch": 0.5056489636153367, + "grad_norm": 0.1390627771615982, + "learning_rate": 0.0004990354559015413, + "loss": 2.659, + "step": 17052 + }, + { + "epoch": 0.5056786169676482, + "grad_norm": 0.12852200865745544, + "learning_rate": 0.0004989884050326142, + "loss": 2.6538, + "step": 17053 + }, + { + "epoch": 0.5057082703199597, + "grad_norm": 0.10078088939189911, + "learning_rate": 0.0004989413541726448, + "loss": 2.6959, + "step": 17054 + }, + { + "epoch": 0.5057379236722711, + "grad_norm": 0.10809620469808578, + "learning_rate": 0.00049889430332205, + "loss": 2.7038, + "step": 17055 + }, + { + "epoch": 0.5057675770245826, + "grad_norm": 0.11537755280733109, + "learning_rate": 0.0004988472524812464, + "loss": 2.6709, + "step": 17056 + }, + { + "epoch": 0.5057972303768941, + "grad_norm": 0.10943961143493652, + "learning_rate": 0.0004988002016506505, + "loss": 2.6913, + "step": 17057 + }, + { + "epoch": 0.5058268837292056, + "grad_norm": 0.1059495285153389, + "learning_rate": 0.000498753150830679, + "loss": 2.6816, + "step": 17058 + }, + { + "epoch": 0.505856537081517, + "grad_norm": 0.11029770970344543, + "learning_rate": 0.0004987061000217485, + "loss": 2.6761, + "step": 17059 + }, + { + "epoch": 0.5058861904338285, + "grad_norm": 0.12624678015708923, + "learning_rate": 0.0004986590492242758, + "loss": 2.691, + "step": 17060 + }, + { + "epoch": 0.5059158437861401, + "grad_norm": 0.13858100771903992, + "learning_rate": 0.0004986119984386774, + "loss": 2.6712, + "step": 17061 + }, + { + "epoch": 0.5059454971384515, + "grad_norm": 0.15791504085063934, + "learning_rate": 0.00049856494766537, + "loss": 2.6774, + "step": 17062 + }, + { + "epoch": 0.505975150490763, + "grad_norm": 0.14791744947433472, + "learning_rate": 0.0004985178969047704, + "loss": 2.6668, + "step": 17063 + }, + { + "epoch": 0.5060048038430744, + "grad_norm": 0.12028726190328598, + "learning_rate": 0.0004984708461572946, + "loss": 2.677, + "step": 17064 + }, + { + "epoch": 0.506034457195386, + "grad_norm": 0.12490001320838928, + "learning_rate": 0.0004984237954233601, + "loss": 2.6809, + "step": 17065 + }, + { + "epoch": 0.5060641105476974, + "grad_norm": 0.13180460035800934, + "learning_rate": 0.0004983767447033831, + "loss": 2.6808, + "step": 17066 + }, + { + "epoch": 0.5060937639000089, + "grad_norm": 0.1349683552980423, + "learning_rate": 0.0004983296939977804, + "loss": 2.6668, + "step": 17067 + }, + { + "epoch": 0.5061234172523204, + "grad_norm": 0.11995609104633331, + "learning_rate": 0.0004982826433069684, + "loss": 2.6825, + "step": 17068 + }, + { + "epoch": 0.5061530706046319, + "grad_norm": 0.11607814580202103, + "learning_rate": 0.0004982355926313639, + "loss": 2.7098, + "step": 17069 + }, + { + "epoch": 0.5061827239569433, + "grad_norm": 0.12143757939338684, + "learning_rate": 0.0004981885419713836, + "loss": 2.6527, + "step": 17070 + }, + { + "epoch": 0.5062123773092548, + "grad_norm": 0.09577503055334091, + "learning_rate": 0.000498141491327444, + "loss": 2.691, + "step": 17071 + }, + { + "epoch": 0.5062420306615663, + "grad_norm": 0.11180565506219864, + "learning_rate": 0.0004980944406999619, + "loss": 2.6998, + "step": 17072 + }, + { + "epoch": 0.5062716840138778, + "grad_norm": 0.11808092892169952, + "learning_rate": 0.0004980473900893539, + "loss": 2.6635, + "step": 17073 + }, + { + "epoch": 0.5063013373661892, + "grad_norm": 0.12085077911615372, + "learning_rate": 0.0004980003394960365, + "loss": 2.6917, + "step": 17074 + }, + { + "epoch": 0.5063309907185007, + "grad_norm": 0.1263504922389984, + "learning_rate": 0.0004979532889204264, + "loss": 2.687, + "step": 17075 + }, + { + "epoch": 0.5063606440708122, + "grad_norm": 0.13529852032661438, + "learning_rate": 0.0004979062383629404, + "loss": 2.6794, + "step": 17076 + }, + { + "epoch": 0.5063902974231237, + "grad_norm": 0.1134692132472992, + "learning_rate": 0.0004978591878239948, + "loss": 2.6812, + "step": 17077 + }, + { + "epoch": 0.5064199507754351, + "grad_norm": 0.12241180986166, + "learning_rate": 0.0004978121373040065, + "loss": 2.7095, + "step": 17078 + }, + { + "epoch": 0.5064496041277466, + "grad_norm": 0.11672019958496094, + "learning_rate": 0.0004977650868033922, + "loss": 2.6699, + "step": 17079 + }, + { + "epoch": 0.5064792574800582, + "grad_norm": 0.12702059745788574, + "learning_rate": 0.0004977180363225685, + "loss": 2.652, + "step": 17080 + }, + { + "epoch": 0.5065089108323696, + "grad_norm": 0.13570711016654968, + "learning_rate": 0.0004976709858619518, + "loss": 2.6919, + "step": 17081 + }, + { + "epoch": 0.5065385641846811, + "grad_norm": 0.12592880427837372, + "learning_rate": 0.0004976239354219591, + "loss": 2.649, + "step": 17082 + }, + { + "epoch": 0.5065682175369925, + "grad_norm": 0.12659235298633575, + "learning_rate": 0.0004975768850030068, + "loss": 2.6964, + "step": 17083 + }, + { + "epoch": 0.5065978708893041, + "grad_norm": 0.11957506835460663, + "learning_rate": 0.0004975298346055117, + "loss": 2.678, + "step": 17084 + }, + { + "epoch": 0.5066275242416155, + "grad_norm": 0.1233193576335907, + "learning_rate": 0.0004974827842298903, + "loss": 2.6848, + "step": 17085 + }, + { + "epoch": 0.506657177593927, + "grad_norm": 0.13394373655319214, + "learning_rate": 0.0004974357338765591, + "loss": 2.6845, + "step": 17086 + }, + { + "epoch": 0.5066868309462385, + "grad_norm": 0.09999705851078033, + "learning_rate": 0.000497388683545935, + "loss": 2.695, + "step": 17087 + }, + { + "epoch": 0.50671648429855, + "grad_norm": 0.1553381085395813, + "learning_rate": 0.0004973416332384347, + "loss": 2.6557, + "step": 17088 + }, + { + "epoch": 0.5067461376508614, + "grad_norm": 0.14814254641532898, + "learning_rate": 0.0004972945829544745, + "loss": 2.697, + "step": 17089 + }, + { + "epoch": 0.5067757910031729, + "grad_norm": 0.13400623202323914, + "learning_rate": 0.0004972475326944712, + "loss": 2.6415, + "step": 17090 + }, + { + "epoch": 0.5068054443554844, + "grad_norm": 0.11498375236988068, + "learning_rate": 0.0004972004824588416, + "loss": 2.6423, + "step": 17091 + }, + { + "epoch": 0.5068350977077959, + "grad_norm": 0.1216018795967102, + "learning_rate": 0.0004971534322480021, + "loss": 2.7271, + "step": 17092 + }, + { + "epoch": 0.5068647510601073, + "grad_norm": 0.12500005960464478, + "learning_rate": 0.0004971063820623696, + "loss": 2.6558, + "step": 17093 + }, + { + "epoch": 0.5068944044124188, + "grad_norm": 0.11910251528024673, + "learning_rate": 0.0004970593319023606, + "loss": 2.6689, + "step": 17094 + }, + { + "epoch": 0.5069240577647303, + "grad_norm": 0.11654376983642578, + "learning_rate": 0.0004970122817683916, + "loss": 2.6854, + "step": 17095 + }, + { + "epoch": 0.5069537111170418, + "grad_norm": 0.11593478173017502, + "learning_rate": 0.0004969652316608794, + "loss": 2.6822, + "step": 17096 + }, + { + "epoch": 0.5069833644693532, + "grad_norm": 0.11323532462120056, + "learning_rate": 0.0004969181815802406, + "loss": 2.7059, + "step": 17097 + }, + { + "epoch": 0.5070130178216647, + "grad_norm": 0.1159764751791954, + "learning_rate": 0.0004968711315268919, + "loss": 2.6746, + "step": 17098 + }, + { + "epoch": 0.5070426711739762, + "grad_norm": 0.12575393915176392, + "learning_rate": 0.0004968240815012497, + "loss": 2.6839, + "step": 17099 + }, + { + "epoch": 0.5070723245262877, + "grad_norm": 0.12473766505718231, + "learning_rate": 0.0004967770315037308, + "loss": 2.6814, + "step": 17100 + }, + { + "epoch": 0.5071019778785992, + "grad_norm": 0.12007637321949005, + "learning_rate": 0.0004967299815347521, + "loss": 2.6849, + "step": 17101 + }, + { + "epoch": 0.5071316312309107, + "grad_norm": 0.118128202855587, + "learning_rate": 0.0004966829315947299, + "loss": 2.6969, + "step": 17102 + }, + { + "epoch": 0.5071612845832222, + "grad_norm": 0.1256963014602661, + "learning_rate": 0.0004966358816840805, + "loss": 2.6845, + "step": 17103 + }, + { + "epoch": 0.5071909379355336, + "grad_norm": 0.10962335765361786, + "learning_rate": 0.0004965888318032213, + "loss": 2.6695, + "step": 17104 + }, + { + "epoch": 0.5072205912878451, + "grad_norm": 0.10940884053707123, + "learning_rate": 0.0004965417819525686, + "loss": 2.6662, + "step": 17105 + }, + { + "epoch": 0.5072502446401566, + "grad_norm": 0.1267094612121582, + "learning_rate": 0.0004964947321325389, + "loss": 2.6717, + "step": 17106 + }, + { + "epoch": 0.5072798979924681, + "grad_norm": 0.11750704795122147, + "learning_rate": 0.0004964476823435491, + "loss": 2.6508, + "step": 17107 + }, + { + "epoch": 0.5073095513447795, + "grad_norm": 0.12417615205049515, + "learning_rate": 0.0004964006325860155, + "loss": 2.6484, + "step": 17108 + }, + { + "epoch": 0.507339204697091, + "grad_norm": 0.12452653795480728, + "learning_rate": 0.0004963535828603551, + "loss": 2.6499, + "step": 17109 + }, + { + "epoch": 0.5073688580494025, + "grad_norm": 0.13925902545452118, + "learning_rate": 0.0004963065331669842, + "loss": 2.6627, + "step": 17110 + }, + { + "epoch": 0.507398511401714, + "grad_norm": 0.13886192440986633, + "learning_rate": 0.0004962594835063197, + "loss": 2.6619, + "step": 17111 + }, + { + "epoch": 0.5074281647540254, + "grad_norm": 0.1294020116329193, + "learning_rate": 0.0004962124338787781, + "loss": 2.6926, + "step": 17112 + }, + { + "epoch": 0.5074578181063369, + "grad_norm": 0.12930075824260712, + "learning_rate": 0.000496165384284776, + "loss": 2.6458, + "step": 17113 + }, + { + "epoch": 0.5074874714586484, + "grad_norm": 0.133748859167099, + "learning_rate": 0.0004961183347247301, + "loss": 2.6834, + "step": 17114 + }, + { + "epoch": 0.5075171248109599, + "grad_norm": 0.12142649292945862, + "learning_rate": 0.0004960712851990569, + "loss": 2.6628, + "step": 17115 + }, + { + "epoch": 0.5075467781632713, + "grad_norm": 0.1295103281736374, + "learning_rate": 0.0004960242357081732, + "loss": 2.6707, + "step": 17116 + }, + { + "epoch": 0.5075764315155828, + "grad_norm": 0.15016894042491913, + "learning_rate": 0.0004959771862524955, + "loss": 2.697, + "step": 17117 + }, + { + "epoch": 0.5076060848678943, + "grad_norm": 0.1253948211669922, + "learning_rate": 0.0004959301368324407, + "loss": 2.699, + "step": 17118 + }, + { + "epoch": 0.5076357382202058, + "grad_norm": 0.12823109328746796, + "learning_rate": 0.0004958830874484252, + "loss": 2.6733, + "step": 17119 + }, + { + "epoch": 0.5076653915725172, + "grad_norm": 0.12602651119232178, + "learning_rate": 0.0004958360381008655, + "loss": 2.6793, + "step": 17120 + }, + { + "epoch": 0.5076950449248288, + "grad_norm": 0.11061742156744003, + "learning_rate": 0.0004957889887901786, + "loss": 2.6713, + "step": 17121 + }, + { + "epoch": 0.5077246982771403, + "grad_norm": 0.1342620998620987, + "learning_rate": 0.000495741939516781, + "loss": 2.7157, + "step": 17122 + }, + { + "epoch": 0.5077543516294517, + "grad_norm": 0.11821179836988449, + "learning_rate": 0.000495694890281089, + "loss": 2.6813, + "step": 17123 + }, + { + "epoch": 0.5077840049817632, + "grad_norm": 0.11772997677326202, + "learning_rate": 0.0004956478410835196, + "loss": 2.6845, + "step": 17124 + }, + { + "epoch": 0.5078136583340747, + "grad_norm": 0.13501758873462677, + "learning_rate": 0.0004956007919244892, + "loss": 2.6653, + "step": 17125 + }, + { + "epoch": 0.5078433116863862, + "grad_norm": 0.12486700713634491, + "learning_rate": 0.0004955537428044147, + "loss": 2.6697, + "step": 17126 + }, + { + "epoch": 0.5078729650386976, + "grad_norm": 0.11843162775039673, + "learning_rate": 0.0004955066937237125, + "loss": 2.7569, + "step": 17127 + }, + { + "epoch": 0.5079026183910091, + "grad_norm": 0.10956626385450363, + "learning_rate": 0.0004954596446827995, + "loss": 2.685, + "step": 17128 + }, + { + "epoch": 0.5079322717433206, + "grad_norm": 0.10470455884933472, + "learning_rate": 0.0004954125956820916, + "loss": 2.6471, + "step": 17129 + }, + { + "epoch": 0.5079619250956321, + "grad_norm": 0.11633702367544174, + "learning_rate": 0.0004953655467220063, + "loss": 2.6569, + "step": 17130 + }, + { + "epoch": 0.5079915784479435, + "grad_norm": 0.10214725136756897, + "learning_rate": 0.0004953184978029599, + "loss": 2.7107, + "step": 17131 + }, + { + "epoch": 0.508021231800255, + "grad_norm": 0.11877737194299698, + "learning_rate": 0.0004952714489253691, + "loss": 2.6676, + "step": 17132 + }, + { + "epoch": 0.5080508851525665, + "grad_norm": 0.12945716083049774, + "learning_rate": 0.0004952244000896503, + "loss": 2.6539, + "step": 17133 + }, + { + "epoch": 0.508080538504878, + "grad_norm": 0.14958785474300385, + "learning_rate": 0.0004951773512962203, + "loss": 2.6774, + "step": 17134 + }, + { + "epoch": 0.5081101918571894, + "grad_norm": 0.12580829858779907, + "learning_rate": 0.0004951303025454956, + "loss": 2.6945, + "step": 17135 + }, + { + "epoch": 0.508139845209501, + "grad_norm": 0.12943825125694275, + "learning_rate": 0.000495083253837893, + "loss": 2.6761, + "step": 17136 + }, + { + "epoch": 0.5081694985618124, + "grad_norm": 0.13922381401062012, + "learning_rate": 0.0004950362051738289, + "loss": 2.7211, + "step": 17137 + }, + { + "epoch": 0.5081991519141239, + "grad_norm": 0.1410512626171112, + "learning_rate": 0.0004949891565537202, + "loss": 2.6701, + "step": 17138 + }, + { + "epoch": 0.5082288052664353, + "grad_norm": 0.11401069164276123, + "learning_rate": 0.0004949421079779834, + "loss": 2.6538, + "step": 17139 + }, + { + "epoch": 0.5082584586187469, + "grad_norm": 0.12049070000648499, + "learning_rate": 0.000494895059447035, + "loss": 2.6707, + "step": 17140 + }, + { + "epoch": 0.5082881119710583, + "grad_norm": 0.1366395652294159, + "learning_rate": 0.0004948480109612918, + "loss": 2.6722, + "step": 17141 + }, + { + "epoch": 0.5083177653233698, + "grad_norm": 0.12998922169208527, + "learning_rate": 0.00049480096252117, + "loss": 2.6753, + "step": 17142 + }, + { + "epoch": 0.5083474186756813, + "grad_norm": 0.13627047836780548, + "learning_rate": 0.000494753914127087, + "loss": 2.6469, + "step": 17143 + }, + { + "epoch": 0.5083770720279928, + "grad_norm": 0.1287941336631775, + "learning_rate": 0.0004947068657794588, + "loss": 2.6556, + "step": 17144 + }, + { + "epoch": 0.5084067253803043, + "grad_norm": 0.11504153162240982, + "learning_rate": 0.0004946598174787023, + "loss": 2.6695, + "step": 17145 + }, + { + "epoch": 0.5084363787326157, + "grad_norm": 0.10611696541309357, + "learning_rate": 0.000494612769225234, + "loss": 2.6689, + "step": 17146 + }, + { + "epoch": 0.5084660320849272, + "grad_norm": 0.12029198557138443, + "learning_rate": 0.0004945657210194706, + "loss": 2.6749, + "step": 17147 + }, + { + "epoch": 0.5084956854372387, + "grad_norm": 0.11666882783174515, + "learning_rate": 0.0004945186728618285, + "loss": 2.6364, + "step": 17148 + }, + { + "epoch": 0.5085253387895502, + "grad_norm": 0.11685113608837128, + "learning_rate": 0.0004944716247527248, + "loss": 2.6669, + "step": 17149 + }, + { + "epoch": 0.5085549921418616, + "grad_norm": 0.11210615932941437, + "learning_rate": 0.0004944245766925757, + "loss": 2.7095, + "step": 17150 + }, + { + "epoch": 0.5085846454941731, + "grad_norm": 0.1266842782497406, + "learning_rate": 0.0004943775286817977, + "loss": 2.6752, + "step": 17151 + }, + { + "epoch": 0.5086142988464846, + "grad_norm": 0.1635351926088333, + "learning_rate": 0.0004943304807208079, + "loss": 2.7112, + "step": 17152 + }, + { + "epoch": 0.5086439521987961, + "grad_norm": 0.16795580089092255, + "learning_rate": 0.0004942834328100225, + "loss": 2.6767, + "step": 17153 + }, + { + "epoch": 0.5086736055511075, + "grad_norm": 0.13721977174282074, + "learning_rate": 0.0004942363849498585, + "loss": 2.6677, + "step": 17154 + }, + { + "epoch": 0.508703258903419, + "grad_norm": 0.13026051223278046, + "learning_rate": 0.0004941893371407319, + "loss": 2.7006, + "step": 17155 + }, + { + "epoch": 0.5087329122557305, + "grad_norm": 0.10703485459089279, + "learning_rate": 0.00049414228938306, + "loss": 2.6936, + "step": 17156 + }, + { + "epoch": 0.508762565608042, + "grad_norm": 0.1161046177148819, + "learning_rate": 0.0004940952416772591, + "loss": 2.6717, + "step": 17157 + }, + { + "epoch": 0.5087922189603534, + "grad_norm": 0.14200477302074432, + "learning_rate": 0.0004940481940237458, + "loss": 2.6459, + "step": 17158 + }, + { + "epoch": 0.508821872312665, + "grad_norm": 0.1159079372882843, + "learning_rate": 0.0004940011464229369, + "loss": 2.6963, + "step": 17159 + }, + { + "epoch": 0.5088515256649764, + "grad_norm": 0.12006448954343796, + "learning_rate": 0.0004939540988752489, + "loss": 2.6988, + "step": 17160 + }, + { + "epoch": 0.5088811790172879, + "grad_norm": 0.12679308652877808, + "learning_rate": 0.0004939070513810983, + "loss": 2.6618, + "step": 17161 + }, + { + "epoch": 0.5089108323695993, + "grad_norm": 0.1157517060637474, + "learning_rate": 0.0004938600039409018, + "loss": 2.663, + "step": 17162 + }, + { + "epoch": 0.5089404857219109, + "grad_norm": 0.10887973010540009, + "learning_rate": 0.000493812956555076, + "loss": 2.6873, + "step": 17163 + }, + { + "epoch": 0.5089701390742224, + "grad_norm": 0.12220404297113419, + "learning_rate": 0.0004937659092240375, + "loss": 2.6811, + "step": 17164 + }, + { + "epoch": 0.5089997924265338, + "grad_norm": 0.14099635183811188, + "learning_rate": 0.000493718861948203, + "loss": 2.6656, + "step": 17165 + }, + { + "epoch": 0.5090294457788453, + "grad_norm": 0.12269853055477142, + "learning_rate": 0.0004936718147279891, + "loss": 2.6899, + "step": 17166 + }, + { + "epoch": 0.5090590991311568, + "grad_norm": 0.1093297228217125, + "learning_rate": 0.0004936247675638123, + "loss": 2.6512, + "step": 17167 + }, + { + "epoch": 0.5090887524834683, + "grad_norm": 0.11483090370893478, + "learning_rate": 0.0004935777204560891, + "loss": 2.6621, + "step": 17168 + }, + { + "epoch": 0.5091184058357797, + "grad_norm": 0.11306045949459076, + "learning_rate": 0.0004935306734052365, + "loss": 2.6921, + "step": 17169 + }, + { + "epoch": 0.5091480591880913, + "grad_norm": 0.1068299263715744, + "learning_rate": 0.000493483626411671, + "loss": 2.6895, + "step": 17170 + }, + { + "epoch": 0.5091777125404027, + "grad_norm": 0.10960879921913147, + "learning_rate": 0.0004934365794758089, + "loss": 2.6742, + "step": 17171 + }, + { + "epoch": 0.5092073658927142, + "grad_norm": 0.12249751389026642, + "learning_rate": 0.0004933895325980672, + "loss": 2.6801, + "step": 17172 + }, + { + "epoch": 0.5092370192450256, + "grad_norm": 0.12868958711624146, + "learning_rate": 0.0004933424857788621, + "loss": 2.6702, + "step": 17173 + }, + { + "epoch": 0.5092666725973372, + "grad_norm": 0.12141035497188568, + "learning_rate": 0.0004932954390186107, + "loss": 2.6697, + "step": 17174 + }, + { + "epoch": 0.5092963259496486, + "grad_norm": 0.1161445677280426, + "learning_rate": 0.0004932483923177291, + "loss": 2.6935, + "step": 17175 + }, + { + "epoch": 0.5093259793019601, + "grad_norm": 0.10374526679515839, + "learning_rate": 0.0004932013456766342, + "loss": 2.6704, + "step": 17176 + }, + { + "epoch": 0.5093556326542715, + "grad_norm": 0.11257155984640121, + "learning_rate": 0.0004931542990957427, + "loss": 2.7116, + "step": 17177 + }, + { + "epoch": 0.5093852860065831, + "grad_norm": 0.127163827419281, + "learning_rate": 0.000493107252575471, + "loss": 2.6797, + "step": 17178 + }, + { + "epoch": 0.5094149393588945, + "grad_norm": 0.1312231868505478, + "learning_rate": 0.0004930602061162357, + "loss": 2.6763, + "step": 17179 + }, + { + "epoch": 0.509444592711206, + "grad_norm": 0.11042090505361557, + "learning_rate": 0.0004930131597184535, + "loss": 2.676, + "step": 17180 + }, + { + "epoch": 0.5094742460635174, + "grad_norm": 0.10690686851739883, + "learning_rate": 0.0004929661133825407, + "loss": 2.6628, + "step": 17181 + }, + { + "epoch": 0.509503899415829, + "grad_norm": 0.12045764178037643, + "learning_rate": 0.0004929190671089145, + "loss": 2.7051, + "step": 17182 + }, + { + "epoch": 0.5095335527681404, + "grad_norm": 0.13358870148658752, + "learning_rate": 0.000492872020897991, + "loss": 2.6799, + "step": 17183 + }, + { + "epoch": 0.5095632061204519, + "grad_norm": 0.11587005108594894, + "learning_rate": 0.0004928249747501871, + "loss": 2.6393, + "step": 17184 + }, + { + "epoch": 0.5095928594727634, + "grad_norm": 0.10690011829137802, + "learning_rate": 0.0004927779286659192, + "loss": 2.673, + "step": 17185 + }, + { + "epoch": 0.5096225128250749, + "grad_norm": 0.11440369486808777, + "learning_rate": 0.0004927308826456041, + "loss": 2.6876, + "step": 17186 + }, + { + "epoch": 0.5096521661773864, + "grad_norm": 0.12151636183261871, + "learning_rate": 0.0004926838366896584, + "loss": 2.6913, + "step": 17187 + }, + { + "epoch": 0.5096818195296978, + "grad_norm": 0.14342646300792694, + "learning_rate": 0.0004926367907984984, + "loss": 2.6309, + "step": 17188 + }, + { + "epoch": 0.5097114728820094, + "grad_norm": 0.13081727921962738, + "learning_rate": 0.0004925897449725408, + "loss": 2.6443, + "step": 17189 + }, + { + "epoch": 0.5097411262343208, + "grad_norm": 0.11139626801013947, + "learning_rate": 0.0004925426992122024, + "loss": 2.7128, + "step": 17190 + }, + { + "epoch": 0.5097707795866323, + "grad_norm": 0.12833622097969055, + "learning_rate": 0.0004924956535178995, + "loss": 2.6707, + "step": 17191 + }, + { + "epoch": 0.5098004329389437, + "grad_norm": 0.11114165931940079, + "learning_rate": 0.0004924486078900491, + "loss": 2.6571, + "step": 17192 + }, + { + "epoch": 0.5098300862912553, + "grad_norm": 0.11441786587238312, + "learning_rate": 0.0004924015623290675, + "loss": 2.6654, + "step": 17193 + }, + { + "epoch": 0.5098597396435667, + "grad_norm": 0.12967154383659363, + "learning_rate": 0.0004923545168353712, + "loss": 2.6538, + "step": 17194 + }, + { + "epoch": 0.5098893929958782, + "grad_norm": 0.13052579760551453, + "learning_rate": 0.0004923074714093771, + "loss": 2.6588, + "step": 17195 + }, + { + "epoch": 0.5099190463481896, + "grad_norm": 0.14425502717494965, + "learning_rate": 0.0004922604260515017, + "loss": 2.6864, + "step": 17196 + }, + { + "epoch": 0.5099486997005012, + "grad_norm": 0.13024955987930298, + "learning_rate": 0.0004922133807621615, + "loss": 2.6457, + "step": 17197 + }, + { + "epoch": 0.5099783530528126, + "grad_norm": 0.14197476208209991, + "learning_rate": 0.0004921663355417733, + "loss": 2.6631, + "step": 17198 + }, + { + "epoch": 0.5100080064051241, + "grad_norm": 0.1284194439649582, + "learning_rate": 0.0004921192903907534, + "loss": 2.6798, + "step": 17199 + }, + { + "epoch": 0.5100376597574355, + "grad_norm": 0.11582886427640915, + "learning_rate": 0.0004920722453095187, + "loss": 2.7107, + "step": 17200 + }, + { + "epoch": 0.5100673131097471, + "grad_norm": 0.136112242937088, + "learning_rate": 0.0004920252002984854, + "loss": 2.6887, + "step": 17201 + }, + { + "epoch": 0.5100969664620585, + "grad_norm": 0.12484537810087204, + "learning_rate": 0.0004919781553580704, + "loss": 2.6687, + "step": 17202 + }, + { + "epoch": 0.51012661981437, + "grad_norm": 0.12253313511610031, + "learning_rate": 0.0004919311104886902, + "loss": 2.6741, + "step": 17203 + }, + { + "epoch": 0.5101562731666814, + "grad_norm": 0.11607982963323593, + "learning_rate": 0.0004918840656907615, + "loss": 2.6688, + "step": 17204 + }, + { + "epoch": 0.510185926518993, + "grad_norm": 0.1423293650150299, + "learning_rate": 0.0004918370209647007, + "loss": 2.6392, + "step": 17205 + }, + { + "epoch": 0.5102155798713045, + "grad_norm": 0.12617263197898865, + "learning_rate": 0.0004917899763109245, + "loss": 2.6939, + "step": 17206 + }, + { + "epoch": 0.5102452332236159, + "grad_norm": 0.14071552455425262, + "learning_rate": 0.0004917429317298493, + "loss": 2.6549, + "step": 17207 + }, + { + "epoch": 0.5102748865759275, + "grad_norm": 0.1460178643465042, + "learning_rate": 0.0004916958872218921, + "loss": 2.6866, + "step": 17208 + }, + { + "epoch": 0.5103045399282389, + "grad_norm": 0.13966266810894012, + "learning_rate": 0.0004916488427874692, + "loss": 2.6662, + "step": 17209 + }, + { + "epoch": 0.5103341932805504, + "grad_norm": 0.13306264579296112, + "learning_rate": 0.0004916017984269972, + "loss": 2.6919, + "step": 17210 + }, + { + "epoch": 0.5103638466328618, + "grad_norm": 0.12742212414741516, + "learning_rate": 0.0004915547541408927, + "loss": 2.6863, + "step": 17211 + }, + { + "epoch": 0.5103934999851734, + "grad_norm": 0.10461004823446274, + "learning_rate": 0.0004915077099295723, + "loss": 2.6407, + "step": 17212 + }, + { + "epoch": 0.5104231533374848, + "grad_norm": 0.1189194992184639, + "learning_rate": 0.0004914606657934526, + "loss": 2.6804, + "step": 17213 + }, + { + "epoch": 0.5104528066897963, + "grad_norm": 0.10660061240196228, + "learning_rate": 0.0004914136217329502, + "loss": 2.6426, + "step": 17214 + }, + { + "epoch": 0.5104824600421077, + "grad_norm": 0.10706765204668045, + "learning_rate": 0.0004913665777484817, + "loss": 2.6446, + "step": 17215 + }, + { + "epoch": 0.5105121133944193, + "grad_norm": 0.11918798834085464, + "learning_rate": 0.0004913195338404635, + "loss": 2.6931, + "step": 17216 + }, + { + "epoch": 0.5105417667467307, + "grad_norm": 0.10939238965511322, + "learning_rate": 0.0004912724900093125, + "loss": 2.6806, + "step": 17217 + }, + { + "epoch": 0.5105714200990422, + "grad_norm": 0.10667075216770172, + "learning_rate": 0.000491225446255445, + "loss": 2.6916, + "step": 17218 + }, + { + "epoch": 0.5106010734513536, + "grad_norm": 0.11374400556087494, + "learning_rate": 0.0004911784025792775, + "loss": 2.6834, + "step": 17219 + }, + { + "epoch": 0.5106307268036652, + "grad_norm": 0.12587735056877136, + "learning_rate": 0.0004911313589812269, + "loss": 2.6842, + "step": 17220 + }, + { + "epoch": 0.5106603801559766, + "grad_norm": 0.12421497702598572, + "learning_rate": 0.0004910843154617096, + "loss": 2.6469, + "step": 17221 + }, + { + "epoch": 0.5106900335082881, + "grad_norm": 0.12028272449970245, + "learning_rate": 0.0004910372720211423, + "loss": 2.6633, + "step": 17222 + }, + { + "epoch": 0.5107196868605995, + "grad_norm": 0.14355142414569855, + "learning_rate": 0.0004909902286599415, + "loss": 2.6879, + "step": 17223 + }, + { + "epoch": 0.5107493402129111, + "grad_norm": 0.117173932492733, + "learning_rate": 0.0004909431853785237, + "loss": 2.6716, + "step": 17224 + }, + { + "epoch": 0.5107789935652225, + "grad_norm": 0.11416596919298172, + "learning_rate": 0.0004908961421773057, + "loss": 2.6701, + "step": 17225 + }, + { + "epoch": 0.510808646917534, + "grad_norm": 0.10601764917373657, + "learning_rate": 0.0004908490990567038, + "loss": 2.6961, + "step": 17226 + }, + { + "epoch": 0.5108383002698456, + "grad_norm": 0.10913113504648209, + "learning_rate": 0.0004908020560171348, + "loss": 2.7197, + "step": 17227 + }, + { + "epoch": 0.510867953622157, + "grad_norm": 0.11821295320987701, + "learning_rate": 0.0004907550130590151, + "loss": 2.691, + "step": 17228 + }, + { + "epoch": 0.5108976069744685, + "grad_norm": 0.13424760103225708, + "learning_rate": 0.0004907079701827612, + "loss": 2.7105, + "step": 17229 + }, + { + "epoch": 0.5109272603267799, + "grad_norm": 0.13091571629047394, + "learning_rate": 0.00049066092738879, + "loss": 2.6679, + "step": 17230 + }, + { + "epoch": 0.5109569136790915, + "grad_norm": 0.12397341430187225, + "learning_rate": 0.0004906138846775179, + "loss": 2.6675, + "step": 17231 + }, + { + "epoch": 0.5109865670314029, + "grad_norm": 0.11178560554981232, + "learning_rate": 0.0004905668420493612, + "loss": 2.6866, + "step": 17232 + }, + { + "epoch": 0.5110162203837144, + "grad_norm": 0.10734646767377853, + "learning_rate": 0.000490519799504737, + "loss": 2.706, + "step": 17233 + }, + { + "epoch": 0.5110458737360258, + "grad_norm": 0.1068950966000557, + "learning_rate": 0.0004904727570440615, + "loss": 2.65, + "step": 17234 + }, + { + "epoch": 0.5110755270883374, + "grad_norm": 0.11256621778011322, + "learning_rate": 0.0004904257146677514, + "loss": 2.647, + "step": 17235 + }, + { + "epoch": 0.5111051804406488, + "grad_norm": 0.11847265064716339, + "learning_rate": 0.0004903786723762234, + "loss": 2.6829, + "step": 17236 + }, + { + "epoch": 0.5111348337929603, + "grad_norm": 0.10302656143903732, + "learning_rate": 0.0004903316301698937, + "loss": 2.6748, + "step": 17237 + }, + { + "epoch": 0.5111644871452717, + "grad_norm": 0.1081593781709671, + "learning_rate": 0.0004902845880491791, + "loss": 2.6401, + "step": 17238 + }, + { + "epoch": 0.5111941404975833, + "grad_norm": 0.1199624091386795, + "learning_rate": 0.0004902375460144962, + "loss": 2.6605, + "step": 17239 + }, + { + "epoch": 0.5112237938498947, + "grad_norm": 0.126271054148674, + "learning_rate": 0.0004901905040662614, + "loss": 2.6697, + "step": 17240 + }, + { + "epoch": 0.5112534472022062, + "grad_norm": 0.12900516390800476, + "learning_rate": 0.0004901434622048915, + "loss": 2.7103, + "step": 17241 + }, + { + "epoch": 0.5112831005545176, + "grad_norm": 0.1614556908607483, + "learning_rate": 0.0004900964204308029, + "loss": 2.6416, + "step": 17242 + }, + { + "epoch": 0.5113127539068292, + "grad_norm": 0.16962428390979767, + "learning_rate": 0.0004900493787444123, + "loss": 2.635, + "step": 17243 + }, + { + "epoch": 0.5113424072591406, + "grad_norm": 0.1328398585319519, + "learning_rate": 0.000490002337146136, + "loss": 2.6647, + "step": 17244 + }, + { + "epoch": 0.5113720606114521, + "grad_norm": 0.12056674063205719, + "learning_rate": 0.0004899552956363906, + "loss": 2.6853, + "step": 17245 + }, + { + "epoch": 0.5114017139637635, + "grad_norm": 0.11864468455314636, + "learning_rate": 0.000489908254215593, + "loss": 2.667, + "step": 17246 + }, + { + "epoch": 0.5114313673160751, + "grad_norm": 0.12350180745124817, + "learning_rate": 0.0004898612128841595, + "loss": 2.6578, + "step": 17247 + }, + { + "epoch": 0.5114610206683866, + "grad_norm": 0.11367496848106384, + "learning_rate": 0.0004898141716425066, + "loss": 2.6752, + "step": 17248 + }, + { + "epoch": 0.511490674020698, + "grad_norm": 0.12087580561637878, + "learning_rate": 0.0004897671304910511, + "loss": 2.6808, + "step": 17249 + }, + { + "epoch": 0.5115203273730096, + "grad_norm": 0.1159040629863739, + "learning_rate": 0.0004897200894302094, + "loss": 2.6647, + "step": 17250 + }, + { + "epoch": 0.511549980725321, + "grad_norm": 0.11363375186920166, + "learning_rate": 0.000489673048460398, + "loss": 2.6735, + "step": 17251 + }, + { + "epoch": 0.5115796340776325, + "grad_norm": 0.11967358738183975, + "learning_rate": 0.0004896260075820335, + "loss": 2.6623, + "step": 17252 + }, + { + "epoch": 0.5116092874299439, + "grad_norm": 0.12483782321214676, + "learning_rate": 0.0004895789667955327, + "loss": 2.668, + "step": 17253 + }, + { + "epoch": 0.5116389407822555, + "grad_norm": 0.11405782401561737, + "learning_rate": 0.0004895319261013117, + "loss": 2.6556, + "step": 17254 + }, + { + "epoch": 0.5116685941345669, + "grad_norm": 0.10401473194360733, + "learning_rate": 0.0004894848854997874, + "loss": 2.6715, + "step": 17255 + }, + { + "epoch": 0.5116982474868784, + "grad_norm": 0.12304513901472092, + "learning_rate": 0.0004894378449913763, + "loss": 2.6719, + "step": 17256 + }, + { + "epoch": 0.5117279008391898, + "grad_norm": 0.12281614542007446, + "learning_rate": 0.0004893908045764948, + "loss": 2.663, + "step": 17257 + }, + { + "epoch": 0.5117575541915014, + "grad_norm": 0.10862434655427933, + "learning_rate": 0.0004893437642555594, + "loss": 2.6772, + "step": 17258 + }, + { + "epoch": 0.5117872075438128, + "grad_norm": 0.1102190762758255, + "learning_rate": 0.0004892967240289869, + "loss": 2.6595, + "step": 17259 + }, + { + "epoch": 0.5118168608961243, + "grad_norm": 0.12273377925157547, + "learning_rate": 0.0004892496838971938, + "loss": 2.6826, + "step": 17260 + }, + { + "epoch": 0.5118465142484357, + "grad_norm": 0.12353022396564484, + "learning_rate": 0.0004892026438605966, + "loss": 2.6653, + "step": 17261 + }, + { + "epoch": 0.5118761676007473, + "grad_norm": 0.12258172035217285, + "learning_rate": 0.0004891556039196118, + "loss": 2.6555, + "step": 17262 + }, + { + "epoch": 0.5119058209530587, + "grad_norm": 0.10030559450387955, + "learning_rate": 0.0004891085640746562, + "loss": 2.6879, + "step": 17263 + }, + { + "epoch": 0.5119354743053702, + "grad_norm": 0.14591078460216522, + "learning_rate": 0.0004890615243261459, + "loss": 2.6635, + "step": 17264 + }, + { + "epoch": 0.5119651276576817, + "grad_norm": 0.15133781731128693, + "learning_rate": 0.0004890144846744978, + "loss": 2.6958, + "step": 17265 + }, + { + "epoch": 0.5119947810099932, + "grad_norm": 0.13141119480133057, + "learning_rate": 0.0004889674451201282, + "loss": 2.6962, + "step": 17266 + }, + { + "epoch": 0.5120244343623046, + "grad_norm": 0.13004153966903687, + "learning_rate": 0.0004889204056634539, + "loss": 2.6771, + "step": 17267 + }, + { + "epoch": 0.5120540877146161, + "grad_norm": 0.1191377341747284, + "learning_rate": 0.0004888733663048912, + "loss": 2.648, + "step": 17268 + }, + { + "epoch": 0.5120837410669277, + "grad_norm": 0.12338826060295105, + "learning_rate": 0.0004888263270448567, + "loss": 2.6747, + "step": 17269 + }, + { + "epoch": 0.5121133944192391, + "grad_norm": 0.14200764894485474, + "learning_rate": 0.0004887792878837672, + "loss": 2.6937, + "step": 17270 + }, + { + "epoch": 0.5121430477715506, + "grad_norm": 0.11718890070915222, + "learning_rate": 0.0004887322488220388, + "loss": 2.6722, + "step": 17271 + }, + { + "epoch": 0.512172701123862, + "grad_norm": 0.12659616768360138, + "learning_rate": 0.0004886852098600883, + "loss": 2.6891, + "step": 17272 + }, + { + "epoch": 0.5122023544761736, + "grad_norm": 0.10998763889074326, + "learning_rate": 0.0004886381709983323, + "loss": 2.6489, + "step": 17273 + }, + { + "epoch": 0.512232007828485, + "grad_norm": 0.12898202240467072, + "learning_rate": 0.0004885911322371874, + "loss": 2.6435, + "step": 17274 + }, + { + "epoch": 0.5122616611807965, + "grad_norm": 0.11000638455152512, + "learning_rate": 0.0004885440935770697, + "loss": 2.6729, + "step": 17275 + }, + { + "epoch": 0.5122913145331079, + "grad_norm": 0.1388736367225647, + "learning_rate": 0.0004884970550183962, + "loss": 2.6882, + "step": 17276 + }, + { + "epoch": 0.5123209678854195, + "grad_norm": 0.13561271131038666, + "learning_rate": 0.0004884500165615831, + "loss": 2.6656, + "step": 17277 + }, + { + "epoch": 0.5123506212377309, + "grad_norm": 0.15420806407928467, + "learning_rate": 0.0004884029782070472, + "loss": 2.7337, + "step": 17278 + }, + { + "epoch": 0.5123802745900424, + "grad_norm": 0.13422080874443054, + "learning_rate": 0.0004883559399552049, + "loss": 2.6357, + "step": 17279 + }, + { + "epoch": 0.5124099279423538, + "grad_norm": 0.1297522485256195, + "learning_rate": 0.0004883089018064728, + "loss": 2.6948, + "step": 17280 + }, + { + "epoch": 0.5124395812946654, + "grad_norm": 0.13382689654827118, + "learning_rate": 0.0004882618637612674, + "loss": 2.682, + "step": 17281 + }, + { + "epoch": 0.5124692346469768, + "grad_norm": 0.11662369966506958, + "learning_rate": 0.00048821482582000515, + "loss": 2.655, + "step": 17282 + }, + { + "epoch": 0.5124988879992883, + "grad_norm": 0.12205652892589569, + "learning_rate": 0.0004881677879831026, + "loss": 2.702, + "step": 17283 + }, + { + "epoch": 0.5125285413515998, + "grad_norm": 0.11208375543355942, + "learning_rate": 0.0004881207502509763, + "loss": 2.6847, + "step": 17284 + }, + { + "epoch": 0.5125581947039113, + "grad_norm": 0.11838898062705994, + "learning_rate": 0.00048807371262404284, + "loss": 2.689, + "step": 17285 + }, + { + "epoch": 0.5125878480562227, + "grad_norm": 0.10940656810998917, + "learning_rate": 0.0004880266751027187, + "loss": 2.6612, + "step": 17286 + }, + { + "epoch": 0.5126175014085342, + "grad_norm": 0.11071459203958511, + "learning_rate": 0.0004879796376874204, + "loss": 2.6884, + "step": 17287 + }, + { + "epoch": 0.5126471547608458, + "grad_norm": 0.11021191626787186, + "learning_rate": 0.00048793260037856447, + "loss": 2.6923, + "step": 17288 + }, + { + "epoch": 0.5126768081131572, + "grad_norm": 0.104852095246315, + "learning_rate": 0.00048788556317656747, + "loss": 2.6872, + "step": 17289 + }, + { + "epoch": 0.5127064614654687, + "grad_norm": 0.0922568291425705, + "learning_rate": 0.0004878385260818458, + "loss": 2.6573, + "step": 17290 + }, + { + "epoch": 0.5127361148177801, + "grad_norm": 0.10927412658929825, + "learning_rate": 0.0004877914890948163, + "loss": 2.6622, + "step": 17291 + }, + { + "epoch": 0.5127657681700917, + "grad_norm": 0.11412017792463303, + "learning_rate": 0.00048774445221589507, + "loss": 2.6873, + "step": 17292 + }, + { + "epoch": 0.5127954215224031, + "grad_norm": 0.10687123239040375, + "learning_rate": 0.00048769741544549885, + "loss": 2.6525, + "step": 17293 + }, + { + "epoch": 0.5128250748747146, + "grad_norm": 0.1102815568447113, + "learning_rate": 0.0004876503787840441, + "loss": 2.6665, + "step": 17294 + }, + { + "epoch": 0.512854728227026, + "grad_norm": 0.1135658398270607, + "learning_rate": 0.0004876033422319474, + "loss": 2.6662, + "step": 17295 + }, + { + "epoch": 0.5128843815793376, + "grad_norm": 0.11450029164552689, + "learning_rate": 0.00048755630578962517, + "loss": 2.6321, + "step": 17296 + }, + { + "epoch": 0.512914034931649, + "grad_norm": 0.12307699769735336, + "learning_rate": 0.000487509269457494, + "loss": 2.6796, + "step": 17297 + }, + { + "epoch": 0.5129436882839605, + "grad_norm": 0.13467688858509064, + "learning_rate": 0.0004874622332359704, + "loss": 2.6928, + "step": 17298 + }, + { + "epoch": 0.512973341636272, + "grad_norm": 0.11874333769083023, + "learning_rate": 0.0004874151971254709, + "loss": 2.6928, + "step": 17299 + }, + { + "epoch": 0.5130029949885835, + "grad_norm": 0.11804249882698059, + "learning_rate": 0.000487368161126412, + "loss": 2.6764, + "step": 17300 + }, + { + "epoch": 0.5130326483408949, + "grad_norm": 0.1309477835893631, + "learning_rate": 0.0004873211252392103, + "loss": 2.6976, + "step": 17301 + }, + { + "epoch": 0.5130623016932064, + "grad_norm": 0.120817631483078, + "learning_rate": 0.00048727408946428206, + "loss": 2.6682, + "step": 17302 + }, + { + "epoch": 0.5130919550455179, + "grad_norm": 0.12221866846084595, + "learning_rate": 0.000487227053802044, + "loss": 2.653, + "step": 17303 + }, + { + "epoch": 0.5131216083978294, + "grad_norm": 0.10817071050405502, + "learning_rate": 0.00048718001825291256, + "loss": 2.6527, + "step": 17304 + }, + { + "epoch": 0.5131512617501408, + "grad_norm": 0.1274014413356781, + "learning_rate": 0.0004871329828173043, + "loss": 2.6737, + "step": 17305 + }, + { + "epoch": 0.5131809151024523, + "grad_norm": 0.12298610806465149, + "learning_rate": 0.0004870859474956356, + "loss": 2.6568, + "step": 17306 + }, + { + "epoch": 0.5132105684547638, + "grad_norm": 0.11750493198633194, + "learning_rate": 0.00048703891228832314, + "loss": 2.6739, + "step": 17307 + }, + { + "epoch": 0.5132402218070753, + "grad_norm": 0.10863789170980453, + "learning_rate": 0.0004869918771957834, + "loss": 2.6502, + "step": 17308 + }, + { + "epoch": 0.5132698751593868, + "grad_norm": 0.1171891838312149, + "learning_rate": 0.0004869448422184327, + "loss": 2.7101, + "step": 17309 + }, + { + "epoch": 0.5132995285116982, + "grad_norm": 0.11680357903242111, + "learning_rate": 0.0004868978073566875, + "loss": 2.6659, + "step": 17310 + }, + { + "epoch": 0.5133291818640098, + "grad_norm": 0.1340918093919754, + "learning_rate": 0.0004868507726109646, + "loss": 2.6417, + "step": 17311 + }, + { + "epoch": 0.5133588352163212, + "grad_norm": 0.1216411367058754, + "learning_rate": 0.00048680373798168055, + "loss": 2.6452, + "step": 17312 + }, + { + "epoch": 0.5133884885686327, + "grad_norm": 0.1197705939412117, + "learning_rate": 0.0004867567034692515, + "loss": 2.6756, + "step": 17313 + }, + { + "epoch": 0.5134181419209441, + "grad_norm": 0.2245120108127594, + "learning_rate": 0.00048670966907409413, + "loss": 2.6543, + "step": 17314 + }, + { + "epoch": 0.5134477952732557, + "grad_norm": 0.19279064238071442, + "learning_rate": 0.00048666263479662494, + "loss": 2.6791, + "step": 17315 + }, + { + "epoch": 0.5134774486255671, + "grad_norm": 0.16809721291065216, + "learning_rate": 0.0004866156006372604, + "loss": 2.6851, + "step": 17316 + }, + { + "epoch": 0.5135071019778786, + "grad_norm": 0.1297571212053299, + "learning_rate": 0.000486568566596417, + "loss": 2.7031, + "step": 17317 + }, + { + "epoch": 0.51353675533019, + "grad_norm": 0.15111511945724487, + "learning_rate": 0.0004865215326745113, + "loss": 2.7075, + "step": 17318 + }, + { + "epoch": 0.5135664086825016, + "grad_norm": 0.12800006568431854, + "learning_rate": 0.0004864744988719598, + "loss": 2.6332, + "step": 17319 + }, + { + "epoch": 0.513596062034813, + "grad_norm": 0.12529011070728302, + "learning_rate": 0.0004864274651891788, + "loss": 2.645, + "step": 17320 + }, + { + "epoch": 0.5136257153871245, + "grad_norm": 0.1259434074163437, + "learning_rate": 0.00048638043162658497, + "loss": 2.6759, + "step": 17321 + }, + { + "epoch": 0.513655368739436, + "grad_norm": 0.12617403268814087, + "learning_rate": 0.0004863333981845947, + "loss": 2.641, + "step": 17322 + }, + { + "epoch": 0.5136850220917475, + "grad_norm": 0.1217566505074501, + "learning_rate": 0.0004862863648636245, + "loss": 2.6722, + "step": 17323 + }, + { + "epoch": 0.5137146754440589, + "grad_norm": 0.12154357135295868, + "learning_rate": 0.00048623933166409096, + "loss": 2.6611, + "step": 17324 + }, + { + "epoch": 0.5137443287963704, + "grad_norm": 0.12396164238452911, + "learning_rate": 0.0004861922985864105, + "loss": 2.6659, + "step": 17325 + }, + { + "epoch": 0.5137739821486819, + "grad_norm": 0.1018010675907135, + "learning_rate": 0.0004861452656309996, + "loss": 2.6727, + "step": 17326 + }, + { + "epoch": 0.5138036355009934, + "grad_norm": 0.1135086864233017, + "learning_rate": 0.0004860982327982748, + "loss": 2.6937, + "step": 17327 + }, + { + "epoch": 0.5138332888533048, + "grad_norm": 0.11350701749324799, + "learning_rate": 0.0004860512000886525, + "loss": 2.6757, + "step": 17328 + }, + { + "epoch": 0.5138629422056163, + "grad_norm": 0.13206519186496735, + "learning_rate": 0.0004860041675025493, + "loss": 2.65, + "step": 17329 + }, + { + "epoch": 0.5138925955579279, + "grad_norm": 0.11236264556646347, + "learning_rate": 0.00048595713504038157, + "loss": 2.7066, + "step": 17330 + }, + { + "epoch": 0.5139222489102393, + "grad_norm": 0.1142556443810463, + "learning_rate": 0.00048591010270256573, + "loss": 2.6613, + "step": 17331 + }, + { + "epoch": 0.5139519022625508, + "grad_norm": 0.11265423893928528, + "learning_rate": 0.0004858630704895184, + "loss": 2.6697, + "step": 17332 + }, + { + "epoch": 0.5139815556148623, + "grad_norm": 0.11668474227190018, + "learning_rate": 0.000485816038401656, + "loss": 2.6707, + "step": 17333 + }, + { + "epoch": 0.5140112089671738, + "grad_norm": 0.11290743947029114, + "learning_rate": 0.000485769006439395, + "loss": 2.643, + "step": 17334 + }, + { + "epoch": 0.5140408623194852, + "grad_norm": 0.10539907962083817, + "learning_rate": 0.0004857219746031519, + "loss": 2.6604, + "step": 17335 + }, + { + "epoch": 0.5140705156717967, + "grad_norm": 0.12324701249599457, + "learning_rate": 0.0004856749428933431, + "loss": 2.6861, + "step": 17336 + }, + { + "epoch": 0.5141001690241082, + "grad_norm": 0.11486196517944336, + "learning_rate": 0.0004856279113103852, + "loss": 2.6997, + "step": 17337 + }, + { + "epoch": 0.5141298223764197, + "grad_norm": 0.11310833692550659, + "learning_rate": 0.00048558087985469463, + "loss": 2.6612, + "step": 17338 + }, + { + "epoch": 0.5141594757287311, + "grad_norm": 0.12083370238542557, + "learning_rate": 0.00048553384852668784, + "loss": 2.6371, + "step": 17339 + }, + { + "epoch": 0.5141891290810426, + "grad_norm": 0.13085296750068665, + "learning_rate": 0.00048548681732678143, + "loss": 2.6685, + "step": 17340 + }, + { + "epoch": 0.5142187824333541, + "grad_norm": 0.1225954219698906, + "learning_rate": 0.0004854397862553916, + "loss": 2.6791, + "step": 17341 + }, + { + "epoch": 0.5142484357856656, + "grad_norm": 0.12121260911226273, + "learning_rate": 0.000485392755312935, + "loss": 2.6814, + "step": 17342 + }, + { + "epoch": 0.514278089137977, + "grad_norm": 0.11874563992023468, + "learning_rate": 0.000485345724499828, + "loss": 2.6861, + "step": 17343 + }, + { + "epoch": 0.5143077424902885, + "grad_norm": 0.1096959114074707, + "learning_rate": 0.0004852986938164872, + "loss": 2.6663, + "step": 17344 + }, + { + "epoch": 0.5143373958426, + "grad_norm": 0.11642996966838837, + "learning_rate": 0.00048525166326332894, + "loss": 2.6573, + "step": 17345 + }, + { + "epoch": 0.5143670491949115, + "grad_norm": 0.12619882822036743, + "learning_rate": 0.00048520463284076984, + "loss": 2.6692, + "step": 17346 + }, + { + "epoch": 0.5143967025472229, + "grad_norm": 0.11581992357969284, + "learning_rate": 0.0004851576025492261, + "loss": 2.7111, + "step": 17347 + }, + { + "epoch": 0.5144263558995344, + "grad_norm": 0.11091802269220352, + "learning_rate": 0.00048511057238911443, + "loss": 2.6674, + "step": 17348 + }, + { + "epoch": 0.5144560092518459, + "grad_norm": 0.10766486823558807, + "learning_rate": 0.00048506354236085093, + "loss": 2.6765, + "step": 17349 + }, + { + "epoch": 0.5144856626041574, + "grad_norm": 0.12704816460609436, + "learning_rate": 0.0004850165124648527, + "loss": 2.6589, + "step": 17350 + }, + { + "epoch": 0.5145153159564689, + "grad_norm": 0.11174260079860687, + "learning_rate": 0.00048496948270153567, + "loss": 2.6823, + "step": 17351 + }, + { + "epoch": 0.5145449693087804, + "grad_norm": 0.10400436073541641, + "learning_rate": 0.00048492245307131646, + "loss": 2.6858, + "step": 17352 + }, + { + "epoch": 0.5145746226610919, + "grad_norm": 0.11078667640686035, + "learning_rate": 0.0004848754235746115, + "loss": 2.6707, + "step": 17353 + }, + { + "epoch": 0.5146042760134033, + "grad_norm": 0.09860476851463318, + "learning_rate": 0.0004848283942118373, + "loss": 2.7018, + "step": 17354 + }, + { + "epoch": 0.5146339293657148, + "grad_norm": 0.1178043782711029, + "learning_rate": 0.00048478136498341024, + "loss": 2.647, + "step": 17355 + }, + { + "epoch": 0.5146635827180263, + "grad_norm": 0.1237083226442337, + "learning_rate": 0.0004847343358897468, + "loss": 2.6759, + "step": 17356 + }, + { + "epoch": 0.5146932360703378, + "grad_norm": 0.10127393156290054, + "learning_rate": 0.0004846873069312636, + "loss": 2.689, + "step": 17357 + }, + { + "epoch": 0.5147228894226492, + "grad_norm": 0.10604778677225113, + "learning_rate": 0.00048464027810837676, + "loss": 2.6795, + "step": 17358 + }, + { + "epoch": 0.5147525427749607, + "grad_norm": 0.11098974943161011, + "learning_rate": 0.0004845932494215029, + "loss": 2.6392, + "step": 17359 + }, + { + "epoch": 0.5147821961272722, + "grad_norm": 0.10264015197753906, + "learning_rate": 0.0004845462208710585, + "loss": 2.6622, + "step": 17360 + }, + { + "epoch": 0.5148118494795837, + "grad_norm": 0.12079581618309021, + "learning_rate": 0.00048449919245745996, + "loss": 2.6684, + "step": 17361 + }, + { + "epoch": 0.5148415028318951, + "grad_norm": 0.1360418051481247, + "learning_rate": 0.0004844521641811236, + "loss": 2.6705, + "step": 17362 + }, + { + "epoch": 0.5148711561842066, + "grad_norm": 0.13492882251739502, + "learning_rate": 0.00048440513604246606, + "loss": 2.6907, + "step": 17363 + }, + { + "epoch": 0.5149008095365181, + "grad_norm": 0.14311622083187103, + "learning_rate": 0.00048435810804190377, + "loss": 2.6949, + "step": 17364 + }, + { + "epoch": 0.5149304628888296, + "grad_norm": 0.15689082443714142, + "learning_rate": 0.0004843110801798531, + "loss": 2.6991, + "step": 17365 + }, + { + "epoch": 0.514960116241141, + "grad_norm": 0.11807962507009506, + "learning_rate": 0.00048426405245673057, + "loss": 2.6322, + "step": 17366 + }, + { + "epoch": 0.5149897695934526, + "grad_norm": 0.10359156876802444, + "learning_rate": 0.0004842170248729526, + "loss": 2.6845, + "step": 17367 + }, + { + "epoch": 0.515019422945764, + "grad_norm": 0.12379486858844757, + "learning_rate": 0.00048416999742893546, + "loss": 2.6981, + "step": 17368 + }, + { + "epoch": 0.5150490762980755, + "grad_norm": 0.11878684908151627, + "learning_rate": 0.0004841229701250958, + "loss": 2.6845, + "step": 17369 + }, + { + "epoch": 0.5150787296503869, + "grad_norm": 0.10789782553911209, + "learning_rate": 0.00048407594296184987, + "loss": 2.6521, + "step": 17370 + }, + { + "epoch": 0.5151083830026985, + "grad_norm": 0.09133083373308182, + "learning_rate": 0.00048402891593961426, + "loss": 2.6683, + "step": 17371 + }, + { + "epoch": 0.51513803635501, + "grad_norm": 0.10210252553224564, + "learning_rate": 0.0004839818890588053, + "loss": 2.6673, + "step": 17372 + }, + { + "epoch": 0.5151676897073214, + "grad_norm": 0.11871650815010071, + "learning_rate": 0.00048393486231983944, + "loss": 2.6488, + "step": 17373 + }, + { + "epoch": 0.5151973430596329, + "grad_norm": 0.12276877462863922, + "learning_rate": 0.0004838878357231333, + "loss": 2.6729, + "step": 17374 + }, + { + "epoch": 0.5152269964119444, + "grad_norm": 0.11169490963220596, + "learning_rate": 0.00048384080926910277, + "loss": 2.6926, + "step": 17375 + }, + { + "epoch": 0.5152566497642559, + "grad_norm": 0.10974248498678207, + "learning_rate": 0.0004837937829581649, + "loss": 2.6765, + "step": 17376 + }, + { + "epoch": 0.5152863031165673, + "grad_norm": 0.1379099190235138, + "learning_rate": 0.00048374675679073583, + "loss": 2.6114, + "step": 17377 + }, + { + "epoch": 0.5153159564688788, + "grad_norm": 0.14058466255664825, + "learning_rate": 0.0004836997307672322, + "loss": 2.6868, + "step": 17378 + }, + { + "epoch": 0.5153456098211903, + "grad_norm": 0.1050938293337822, + "learning_rate": 0.00048365270488807006, + "loss": 2.6964, + "step": 17379 + }, + { + "epoch": 0.5153752631735018, + "grad_norm": 0.12133537977933884, + "learning_rate": 0.00048360567915366605, + "loss": 2.6368, + "step": 17380 + }, + { + "epoch": 0.5154049165258132, + "grad_norm": 0.12345407903194427, + "learning_rate": 0.00048355865356443655, + "loss": 2.6986, + "step": 17381 + }, + { + "epoch": 0.5154345698781247, + "grad_norm": 0.13470661640167236, + "learning_rate": 0.000483511628120798, + "loss": 2.6642, + "step": 17382 + }, + { + "epoch": 0.5154642232304362, + "grad_norm": 0.1323542296886444, + "learning_rate": 0.00048346460282316684, + "loss": 2.6716, + "step": 17383 + }, + { + "epoch": 0.5154938765827477, + "grad_norm": 0.1142495647072792, + "learning_rate": 0.0004834175776719596, + "loss": 2.6754, + "step": 17384 + }, + { + "epoch": 0.5155235299350591, + "grad_norm": 0.11390183866024017, + "learning_rate": 0.0004833705526675924, + "loss": 2.6815, + "step": 17385 + }, + { + "epoch": 0.5155531832873707, + "grad_norm": 0.11469651758670807, + "learning_rate": 0.00048332352781048176, + "loss": 2.6801, + "step": 17386 + }, + { + "epoch": 0.5155828366396821, + "grad_norm": 0.11614029109477997, + "learning_rate": 0.0004832765031010442, + "loss": 2.6463, + "step": 17387 + }, + { + "epoch": 0.5156124899919936, + "grad_norm": 0.11437078565359116, + "learning_rate": 0.000483229478539696, + "loss": 2.6773, + "step": 17388 + }, + { + "epoch": 0.515642143344305, + "grad_norm": 0.11554557085037231, + "learning_rate": 0.0004831824541268537, + "loss": 2.701, + "step": 17389 + }, + { + "epoch": 0.5156717966966166, + "grad_norm": 0.11332958191633224, + "learning_rate": 0.0004831354298629337, + "loss": 2.7107, + "step": 17390 + }, + { + "epoch": 0.515701450048928, + "grad_norm": 0.10755226016044617, + "learning_rate": 0.00048308840574835235, + "loss": 2.6549, + "step": 17391 + }, + { + "epoch": 0.5157311034012395, + "grad_norm": 0.09034405648708344, + "learning_rate": 0.00048304138178352604, + "loss": 2.6545, + "step": 17392 + }, + { + "epoch": 0.515760756753551, + "grad_norm": 0.1084025427699089, + "learning_rate": 0.0004829943579688712, + "loss": 2.659, + "step": 17393 + }, + { + "epoch": 0.5157904101058625, + "grad_norm": 0.10427479445934296, + "learning_rate": 0.00048294733430480433, + "loss": 2.6564, + "step": 17394 + }, + { + "epoch": 0.515820063458174, + "grad_norm": 0.11610306054353714, + "learning_rate": 0.00048290031079174176, + "loss": 2.658, + "step": 17395 + }, + { + "epoch": 0.5158497168104854, + "grad_norm": 0.1240653395652771, + "learning_rate": 0.0004828532874300998, + "loss": 2.6619, + "step": 17396 + }, + { + "epoch": 0.515879370162797, + "grad_norm": 0.13587848842144012, + "learning_rate": 0.0004828062642202949, + "loss": 2.6497, + "step": 17397 + }, + { + "epoch": 0.5159090235151084, + "grad_norm": 0.14678144454956055, + "learning_rate": 0.00048275924116274345, + "loss": 2.6681, + "step": 17398 + }, + { + "epoch": 0.5159386768674199, + "grad_norm": 0.13409744203090668, + "learning_rate": 0.0004827122182578619, + "loss": 2.6451, + "step": 17399 + }, + { + "epoch": 0.5159683302197313, + "grad_norm": 0.15596389770507812, + "learning_rate": 0.0004826651955060667, + "loss": 2.6825, + "step": 17400 + }, + { + "epoch": 0.5159979835720429, + "grad_norm": 0.13679824769496918, + "learning_rate": 0.000482618172907774, + "loss": 2.6963, + "step": 17401 + }, + { + "epoch": 0.5160276369243543, + "grad_norm": 0.11987988650798798, + "learning_rate": 0.0004825711504634004, + "loss": 2.6702, + "step": 17402 + }, + { + "epoch": 0.5160572902766658, + "grad_norm": 0.12800821661949158, + "learning_rate": 0.0004825241281733624, + "loss": 2.6609, + "step": 17403 + }, + { + "epoch": 0.5160869436289772, + "grad_norm": 0.1327155977487564, + "learning_rate": 0.00048247710603807614, + "loss": 2.6568, + "step": 17404 + }, + { + "epoch": 0.5161165969812888, + "grad_norm": 0.10815883427858353, + "learning_rate": 0.0004824300840579583, + "loss": 2.6755, + "step": 17405 + }, + { + "epoch": 0.5161462503336002, + "grad_norm": 0.11435765773057938, + "learning_rate": 0.0004823830622334249, + "loss": 2.6575, + "step": 17406 + }, + { + "epoch": 0.5161759036859117, + "grad_norm": 0.11681120842695236, + "learning_rate": 0.0004823360405648926, + "loss": 2.6445, + "step": 17407 + }, + { + "epoch": 0.5162055570382231, + "grad_norm": 0.13345403969287872, + "learning_rate": 0.0004822890190527776, + "loss": 2.6787, + "step": 17408 + }, + { + "epoch": 0.5162352103905347, + "grad_norm": 0.18469838798046112, + "learning_rate": 0.00048224199769749646, + "loss": 2.6305, + "step": 17409 + }, + { + "epoch": 0.5162648637428461, + "grad_norm": 0.16242288053035736, + "learning_rate": 0.00048219497649946535, + "loss": 2.6346, + "step": 17410 + }, + { + "epoch": 0.5162945170951576, + "grad_norm": 0.1353989690542221, + "learning_rate": 0.0004821479554591009, + "loss": 2.693, + "step": 17411 + }, + { + "epoch": 0.516324170447469, + "grad_norm": 0.1444666087627411, + "learning_rate": 0.0004821009345768194, + "loss": 2.6605, + "step": 17412 + }, + { + "epoch": 0.5163538237997806, + "grad_norm": 0.117460697889328, + "learning_rate": 0.00048205391385303695, + "loss": 2.6716, + "step": 17413 + }, + { + "epoch": 0.5163834771520921, + "grad_norm": 0.1393747627735138, + "learning_rate": 0.0004820068932881703, + "loss": 2.6778, + "step": 17414 + }, + { + "epoch": 0.5164131305044035, + "grad_norm": 0.14950750768184662, + "learning_rate": 0.00048195987288263576, + "loss": 2.6458, + "step": 17415 + }, + { + "epoch": 0.516442783856715, + "grad_norm": 0.12554527819156647, + "learning_rate": 0.00048191285263684977, + "loss": 2.7046, + "step": 17416 + }, + { + "epoch": 0.5164724372090265, + "grad_norm": 0.1320626586675644, + "learning_rate": 0.0004818658325512284, + "loss": 2.6521, + "step": 17417 + }, + { + "epoch": 0.516502090561338, + "grad_norm": 0.12364332377910614, + "learning_rate": 0.0004818188126261882, + "loss": 2.6518, + "step": 17418 + }, + { + "epoch": 0.5165317439136494, + "grad_norm": 0.13854604959487915, + "learning_rate": 0.00048177179286214555, + "loss": 2.6522, + "step": 17419 + }, + { + "epoch": 0.516561397265961, + "grad_norm": 0.12213374674320221, + "learning_rate": 0.00048172477325951685, + "loss": 2.6973, + "step": 17420 + }, + { + "epoch": 0.5165910506182724, + "grad_norm": 0.1149369478225708, + "learning_rate": 0.00048167775381871835, + "loss": 2.6559, + "step": 17421 + }, + { + "epoch": 0.5166207039705839, + "grad_norm": 0.10259722173213959, + "learning_rate": 0.00048163073454016663, + "loss": 2.6736, + "step": 17422 + }, + { + "epoch": 0.5166503573228953, + "grad_norm": 0.11217943578958511, + "learning_rate": 0.0004815837154242778, + "loss": 2.672, + "step": 17423 + }, + { + "epoch": 0.5166800106752069, + "grad_norm": 0.10539967566728592, + "learning_rate": 0.00048153669647146835, + "loss": 2.6206, + "step": 17424 + }, + { + "epoch": 0.5167096640275183, + "grad_norm": 0.11201929301023483, + "learning_rate": 0.0004814896776821546, + "loss": 2.6891, + "step": 17425 + }, + { + "epoch": 0.5167393173798298, + "grad_norm": 0.09758095443248749, + "learning_rate": 0.0004814426590567528, + "loss": 2.6851, + "step": 17426 + }, + { + "epoch": 0.5167689707321412, + "grad_norm": 0.10776875913143158, + "learning_rate": 0.0004813956405956796, + "loss": 2.6856, + "step": 17427 + }, + { + "epoch": 0.5167986240844528, + "grad_norm": 0.10184009373188019, + "learning_rate": 0.00048134862229935114, + "loss": 2.6493, + "step": 17428 + }, + { + "epoch": 0.5168282774367642, + "grad_norm": 0.11629663407802582, + "learning_rate": 0.00048130160416818386, + "loss": 2.6683, + "step": 17429 + }, + { + "epoch": 0.5168579307890757, + "grad_norm": 0.11443561315536499, + "learning_rate": 0.0004812545862025941, + "loss": 2.6985, + "step": 17430 + }, + { + "epoch": 0.5168875841413871, + "grad_norm": 0.11295150965452194, + "learning_rate": 0.00048120756840299816, + "loss": 2.6877, + "step": 17431 + }, + { + "epoch": 0.5169172374936987, + "grad_norm": 0.13933177292346954, + "learning_rate": 0.00048116055076981247, + "loss": 2.6683, + "step": 17432 + }, + { + "epoch": 0.5169468908460101, + "grad_norm": 0.12298592925071716, + "learning_rate": 0.0004811135333034534, + "loss": 2.6424, + "step": 17433 + }, + { + "epoch": 0.5169765441983216, + "grad_norm": 0.12502425909042358, + "learning_rate": 0.0004810665160043372, + "loss": 2.6898, + "step": 17434 + }, + { + "epoch": 0.5170061975506332, + "grad_norm": 0.1351863592863083, + "learning_rate": 0.00048101949887288014, + "loss": 2.6794, + "step": 17435 + }, + { + "epoch": 0.5170358509029446, + "grad_norm": 0.12890875339508057, + "learning_rate": 0.00048097248190949877, + "loss": 2.641, + "step": 17436 + }, + { + "epoch": 0.5170655042552561, + "grad_norm": 0.12195973843336105, + "learning_rate": 0.00048092546511460926, + "loss": 2.6733, + "step": 17437 + }, + { + "epoch": 0.5170951576075675, + "grad_norm": 0.1268068104982376, + "learning_rate": 0.0004808784484886281, + "loss": 2.6879, + "step": 17438 + }, + { + "epoch": 0.517124810959879, + "grad_norm": 0.12878131866455078, + "learning_rate": 0.00048083143203197143, + "loss": 2.6815, + "step": 17439 + }, + { + "epoch": 0.5171544643121905, + "grad_norm": 0.11822555959224701, + "learning_rate": 0.00048078441574505584, + "loss": 2.6469, + "step": 17440 + }, + { + "epoch": 0.517184117664502, + "grad_norm": 0.11500627547502518, + "learning_rate": 0.0004807373996282975, + "loss": 2.679, + "step": 17441 + }, + { + "epoch": 0.5172137710168134, + "grad_norm": 0.12477891147136688, + "learning_rate": 0.0004806903836821128, + "loss": 2.6678, + "step": 17442 + }, + { + "epoch": 0.517243424369125, + "grad_norm": 0.1396087110042572, + "learning_rate": 0.0004806433679069182, + "loss": 2.6988, + "step": 17443 + }, + { + "epoch": 0.5172730777214364, + "grad_norm": 0.13190145790576935, + "learning_rate": 0.00048059635230312983, + "loss": 2.6893, + "step": 17444 + }, + { + "epoch": 0.5173027310737479, + "grad_norm": 0.1199718713760376, + "learning_rate": 0.00048054933687116403, + "loss": 2.6971, + "step": 17445 + }, + { + "epoch": 0.5173323844260593, + "grad_norm": 0.11884406208992004, + "learning_rate": 0.0004805023216114372, + "loss": 2.6706, + "step": 17446 + }, + { + "epoch": 0.5173620377783709, + "grad_norm": 0.11921936273574829, + "learning_rate": 0.0004804553065243657, + "loss": 2.6401, + "step": 17447 + }, + { + "epoch": 0.5173916911306823, + "grad_norm": 0.12473124265670776, + "learning_rate": 0.0004804082916103658, + "loss": 2.6642, + "step": 17448 + }, + { + "epoch": 0.5174213444829938, + "grad_norm": 0.13680613040924072, + "learning_rate": 0.0004803612768698538, + "loss": 2.6721, + "step": 17449 + }, + { + "epoch": 0.5174509978353052, + "grad_norm": 0.14027075469493866, + "learning_rate": 0.0004803142623032462, + "loss": 2.6752, + "step": 17450 + }, + { + "epoch": 0.5174806511876168, + "grad_norm": 0.09590121358633041, + "learning_rate": 0.0004802672479109591, + "loss": 2.6893, + "step": 17451 + }, + { + "epoch": 0.5175103045399282, + "grad_norm": 0.12053640931844711, + "learning_rate": 0.0004802202336934086, + "loss": 2.6642, + "step": 17452 + }, + { + "epoch": 0.5175399578922397, + "grad_norm": 0.10790103673934937, + "learning_rate": 0.00048017321965101165, + "loss": 2.6494, + "step": 17453 + }, + { + "epoch": 0.5175696112445511, + "grad_norm": 0.12685523927211761, + "learning_rate": 0.00048012620578418433, + "loss": 2.6338, + "step": 17454 + }, + { + "epoch": 0.5175992645968627, + "grad_norm": 0.11962519586086273, + "learning_rate": 0.0004800791920933427, + "loss": 2.672, + "step": 17455 + }, + { + "epoch": 0.5176289179491742, + "grad_norm": 0.1382184773683548, + "learning_rate": 0.0004800321785789033, + "loss": 2.6498, + "step": 17456 + }, + { + "epoch": 0.5176585713014856, + "grad_norm": 0.15065717697143555, + "learning_rate": 0.0004799851652412824, + "loss": 2.6528, + "step": 17457 + }, + { + "epoch": 0.5176882246537972, + "grad_norm": 0.12757998704910278, + "learning_rate": 0.00047993815208089623, + "loss": 2.6444, + "step": 17458 + }, + { + "epoch": 0.5177178780061086, + "grad_norm": 0.12351731210947037, + "learning_rate": 0.00047989113909816124, + "loss": 2.6893, + "step": 17459 + }, + { + "epoch": 0.5177475313584201, + "grad_norm": 0.1311495155096054, + "learning_rate": 0.0004798441262934937, + "loss": 2.677, + "step": 17460 + }, + { + "epoch": 0.5177771847107315, + "grad_norm": 0.11608006060123444, + "learning_rate": 0.00047979711366730985, + "loss": 2.6749, + "step": 17461 + }, + { + "epoch": 0.5178068380630431, + "grad_norm": 0.10272806882858276, + "learning_rate": 0.000479750101220026, + "loss": 2.6832, + "step": 17462 + }, + { + "epoch": 0.5178364914153545, + "grad_norm": 0.11987990885972977, + "learning_rate": 0.00047970308895205844, + "loss": 2.673, + "step": 17463 + }, + { + "epoch": 0.517866144767666, + "grad_norm": 0.14843696355819702, + "learning_rate": 0.0004796560768638236, + "loss": 2.7212, + "step": 17464 + }, + { + "epoch": 0.5178957981199774, + "grad_norm": 0.12699732184410095, + "learning_rate": 0.0004796090649557375, + "loss": 2.679, + "step": 17465 + }, + { + "epoch": 0.517925451472289, + "grad_norm": 0.1082950234413147, + "learning_rate": 0.0004795620532282167, + "loss": 2.683, + "step": 17466 + }, + { + "epoch": 0.5179551048246004, + "grad_norm": 0.141789972782135, + "learning_rate": 0.00047951504168167755, + "loss": 2.6871, + "step": 17467 + }, + { + "epoch": 0.5179847581769119, + "grad_norm": 0.1357172727584839, + "learning_rate": 0.0004794680303165361, + "loss": 2.6825, + "step": 17468 + }, + { + "epoch": 0.5180144115292233, + "grad_norm": 0.11635913699865341, + "learning_rate": 0.00047942101913320885, + "loss": 2.67, + "step": 17469 + }, + { + "epoch": 0.5180440648815349, + "grad_norm": 0.1353379786014557, + "learning_rate": 0.000479374008132112, + "loss": 2.6826, + "step": 17470 + }, + { + "epoch": 0.5180737182338463, + "grad_norm": 0.1206074208021164, + "learning_rate": 0.0004793269973136619, + "loss": 2.6999, + "step": 17471 + }, + { + "epoch": 0.5181033715861578, + "grad_norm": 0.13091081380844116, + "learning_rate": 0.0004792799866782747, + "loss": 2.6919, + "step": 17472 + }, + { + "epoch": 0.5181330249384692, + "grad_norm": 0.14229899644851685, + "learning_rate": 0.00047923297622636684, + "loss": 2.6538, + "step": 17473 + }, + { + "epoch": 0.5181626782907808, + "grad_norm": 0.12106490880250931, + "learning_rate": 0.0004791859659583545, + "loss": 2.6703, + "step": 17474 + }, + { + "epoch": 0.5181923316430922, + "grad_norm": 0.11346156150102615, + "learning_rate": 0.000479138955874654, + "loss": 2.691, + "step": 17475 + }, + { + "epoch": 0.5182219849954037, + "grad_norm": 0.11085107177495956, + "learning_rate": 0.00047909194597568164, + "loss": 2.6924, + "step": 17476 + }, + { + "epoch": 0.5182516383477153, + "grad_norm": 0.12193947285413742, + "learning_rate": 0.0004790449362618538, + "loss": 2.6628, + "step": 17477 + }, + { + "epoch": 0.5182812917000267, + "grad_norm": 0.12459050118923187, + "learning_rate": 0.00047899792673358624, + "loss": 2.7001, + "step": 17478 + }, + { + "epoch": 0.5183109450523382, + "grad_norm": 0.12040792405605316, + "learning_rate": 0.0004789509173912959, + "loss": 2.6659, + "step": 17479 + }, + { + "epoch": 0.5183405984046496, + "grad_norm": 0.11104294657707214, + "learning_rate": 0.00047890390823539887, + "loss": 2.6829, + "step": 17480 + }, + { + "epoch": 0.5183702517569612, + "grad_norm": 0.11766041815280914, + "learning_rate": 0.00047885689926631137, + "loss": 2.6856, + "step": 17481 + }, + { + "epoch": 0.5183999051092726, + "grad_norm": 0.12983547151088715, + "learning_rate": 0.0004788098904844496, + "loss": 2.6952, + "step": 17482 + }, + { + "epoch": 0.5184295584615841, + "grad_norm": 0.11472775042057037, + "learning_rate": 0.0004787628818902299, + "loss": 2.6714, + "step": 17483 + }, + { + "epoch": 0.5184592118138955, + "grad_norm": 0.1178833544254303, + "learning_rate": 0.0004787158734840685, + "loss": 2.6817, + "step": 17484 + }, + { + "epoch": 0.5184888651662071, + "grad_norm": 0.12835825979709625, + "learning_rate": 0.00047866886526638173, + "loss": 2.6944, + "step": 17485 + }, + { + "epoch": 0.5185185185185185, + "grad_norm": 0.11651841551065445, + "learning_rate": 0.0004786218572375857, + "loss": 2.6741, + "step": 17486 + }, + { + "epoch": 0.51854817187083, + "grad_norm": 0.12235002964735031, + "learning_rate": 0.00047857484939809695, + "loss": 2.689, + "step": 17487 + }, + { + "epoch": 0.5185778252231414, + "grad_norm": 0.10909949988126755, + "learning_rate": 0.0004785278417483316, + "loss": 2.6223, + "step": 17488 + }, + { + "epoch": 0.518607478575453, + "grad_norm": 0.12280576676130295, + "learning_rate": 0.0004784808342887058, + "loss": 2.68, + "step": 17489 + }, + { + "epoch": 0.5186371319277644, + "grad_norm": 0.13909544050693512, + "learning_rate": 0.00047843382701963594, + "loss": 2.6861, + "step": 17490 + }, + { + "epoch": 0.5186667852800759, + "grad_norm": 0.12198450416326523, + "learning_rate": 0.000478386819941538, + "loss": 2.6799, + "step": 17491 + }, + { + "epoch": 0.5186964386323873, + "grad_norm": 0.12138528376817703, + "learning_rate": 0.0004783398130548288, + "loss": 2.6648, + "step": 17492 + }, + { + "epoch": 0.5187260919846989, + "grad_norm": 0.11818550527095795, + "learning_rate": 0.00047829280635992415, + "loss": 2.6634, + "step": 17493 + }, + { + "epoch": 0.5187557453370103, + "grad_norm": 0.11216837912797928, + "learning_rate": 0.00047824579985724047, + "loss": 2.6448, + "step": 17494 + }, + { + "epoch": 0.5187853986893218, + "grad_norm": 0.1062096580862999, + "learning_rate": 0.000478198793547194, + "loss": 2.6622, + "step": 17495 + }, + { + "epoch": 0.5188150520416333, + "grad_norm": 0.13149890303611755, + "learning_rate": 0.00047815178743020086, + "loss": 2.679, + "step": 17496 + }, + { + "epoch": 0.5188447053939448, + "grad_norm": 0.1295517385005951, + "learning_rate": 0.00047810478150667743, + "loss": 2.6822, + "step": 17497 + }, + { + "epoch": 0.5188743587462563, + "grad_norm": 0.12220613658428192, + "learning_rate": 0.00047805777577704004, + "loss": 2.6881, + "step": 17498 + }, + { + "epoch": 0.5189040120985677, + "grad_norm": 0.13454964756965637, + "learning_rate": 0.0004780107702417047, + "loss": 2.7152, + "step": 17499 + }, + { + "epoch": 0.5189336654508793, + "grad_norm": 0.10651422291994095, + "learning_rate": 0.0004779637649010877, + "loss": 2.6613, + "step": 17500 + }, + { + "epoch": 0.5189633188031907, + "grad_norm": 0.11447728425264359, + "learning_rate": 0.00047791675975560543, + "loss": 2.6517, + "step": 17501 + }, + { + "epoch": 0.5189929721555022, + "grad_norm": 0.12224675714969635, + "learning_rate": 0.00047786975480567403, + "loss": 2.6694, + "step": 17502 + }, + { + "epoch": 0.5190226255078136, + "grad_norm": 0.14515087008476257, + "learning_rate": 0.00047782275005170974, + "loss": 2.6814, + "step": 17503 + }, + { + "epoch": 0.5190522788601252, + "grad_norm": 0.1757650226354599, + "learning_rate": 0.0004777757454941287, + "loss": 2.6762, + "step": 17504 + }, + { + "epoch": 0.5190819322124366, + "grad_norm": 0.15095284581184387, + "learning_rate": 0.00047772874113334736, + "loss": 2.6585, + "step": 17505 + }, + { + "epoch": 0.5191115855647481, + "grad_norm": 0.10536696016788483, + "learning_rate": 0.0004776817369697818, + "loss": 2.626, + "step": 17506 + }, + { + "epoch": 0.5191412389170595, + "grad_norm": 0.12982891499996185, + "learning_rate": 0.0004776347330038484, + "loss": 2.6674, + "step": 17507 + }, + { + "epoch": 0.5191708922693711, + "grad_norm": 0.14968091249465942, + "learning_rate": 0.0004775877292359631, + "loss": 2.7011, + "step": 17508 + }, + { + "epoch": 0.5192005456216825, + "grad_norm": 0.1286589652299881, + "learning_rate": 0.0004775407256665426, + "loss": 2.6821, + "step": 17509 + }, + { + "epoch": 0.519230198973994, + "grad_norm": 0.11958945542573929, + "learning_rate": 0.0004774937222960026, + "loss": 2.6345, + "step": 17510 + }, + { + "epoch": 0.5192598523263054, + "grad_norm": 0.15376773476600647, + "learning_rate": 0.00047744671912475955, + "loss": 2.6784, + "step": 17511 + }, + { + "epoch": 0.519289505678617, + "grad_norm": 0.12926694750785828, + "learning_rate": 0.0004773997161532297, + "loss": 2.6958, + "step": 17512 + }, + { + "epoch": 0.5193191590309284, + "grad_norm": 0.13451778888702393, + "learning_rate": 0.0004773527133818293, + "loss": 2.715, + "step": 17513 + }, + { + "epoch": 0.5193488123832399, + "grad_norm": 0.1379602998495102, + "learning_rate": 0.0004773057108109744, + "loss": 2.6719, + "step": 17514 + }, + { + "epoch": 0.5193784657355514, + "grad_norm": 0.14777882397174835, + "learning_rate": 0.00047725870844108156, + "loss": 2.6808, + "step": 17515 + }, + { + "epoch": 0.5194081190878629, + "grad_norm": 0.14124158024787903, + "learning_rate": 0.00047721170627256654, + "loss": 2.6934, + "step": 17516 + }, + { + "epoch": 0.5194377724401744, + "grad_norm": 0.12230703979730606, + "learning_rate": 0.0004771647043058456, + "loss": 2.6819, + "step": 17517 + }, + { + "epoch": 0.5194674257924858, + "grad_norm": 0.1156139224767685, + "learning_rate": 0.0004771177025413354, + "loss": 2.6569, + "step": 17518 + }, + { + "epoch": 0.5194970791447974, + "grad_norm": 0.11859090626239777, + "learning_rate": 0.00047707070097945185, + "loss": 2.6668, + "step": 17519 + }, + { + "epoch": 0.5195267324971088, + "grad_norm": 0.1289599984884262, + "learning_rate": 0.00047702369962061115, + "loss": 2.6613, + "step": 17520 + }, + { + "epoch": 0.5195563858494203, + "grad_norm": 0.11101081222295761, + "learning_rate": 0.00047697669846522955, + "loss": 2.6692, + "step": 17521 + }, + { + "epoch": 0.5195860392017317, + "grad_norm": 0.10541226714849472, + "learning_rate": 0.0004769296975137232, + "loss": 2.664, + "step": 17522 + }, + { + "epoch": 0.5196156925540433, + "grad_norm": 0.12721645832061768, + "learning_rate": 0.00047688269676650835, + "loss": 2.6354, + "step": 17523 + }, + { + "epoch": 0.5196453459063547, + "grad_norm": 0.129063218832016, + "learning_rate": 0.0004768356962240012, + "loss": 2.7075, + "step": 17524 + }, + { + "epoch": 0.5196749992586662, + "grad_norm": 0.11299651116132736, + "learning_rate": 0.00047678869588661793, + "loss": 2.6817, + "step": 17525 + }, + { + "epoch": 0.5197046526109776, + "grad_norm": 0.1372227668762207, + "learning_rate": 0.00047674169575477485, + "loss": 2.6865, + "step": 17526 + }, + { + "epoch": 0.5197343059632892, + "grad_norm": 0.13174064457416534, + "learning_rate": 0.0004766946958288879, + "loss": 2.6454, + "step": 17527 + }, + { + "epoch": 0.5197639593156006, + "grad_norm": 0.13737724721431732, + "learning_rate": 0.00047664769610937345, + "loss": 2.6893, + "step": 17528 + }, + { + "epoch": 0.5197936126679121, + "grad_norm": 0.1357373297214508, + "learning_rate": 0.0004766006965966477, + "loss": 2.6392, + "step": 17529 + }, + { + "epoch": 0.5198232660202236, + "grad_norm": 0.14624176919460297, + "learning_rate": 0.00047655369729112664, + "loss": 2.6575, + "step": 17530 + }, + { + "epoch": 0.5198529193725351, + "grad_norm": 0.10953306406736374, + "learning_rate": 0.0004765066981932267, + "loss": 2.6628, + "step": 17531 + }, + { + "epoch": 0.5198825727248465, + "grad_norm": 0.13046643137931824, + "learning_rate": 0.000476459699303364, + "loss": 2.6645, + "step": 17532 + }, + { + "epoch": 0.519912226077158, + "grad_norm": 0.11209375411272049, + "learning_rate": 0.0004764127006219547, + "loss": 2.6821, + "step": 17533 + }, + { + "epoch": 0.5199418794294695, + "grad_norm": 0.1161121353507042, + "learning_rate": 0.000476365702149415, + "loss": 2.6678, + "step": 17534 + }, + { + "epoch": 0.519971532781781, + "grad_norm": 0.11814618110656738, + "learning_rate": 0.00047631870388616115, + "loss": 2.6386, + "step": 17535 + }, + { + "epoch": 0.5200011861340924, + "grad_norm": 0.11089634150266647, + "learning_rate": 0.0004762717058326093, + "loss": 2.6631, + "step": 17536 + }, + { + "epoch": 0.5200308394864039, + "grad_norm": 0.11770179867744446, + "learning_rate": 0.0004762247079891754, + "loss": 2.6751, + "step": 17537 + }, + { + "epoch": 0.5200604928387155, + "grad_norm": 0.11445946246385574, + "learning_rate": 0.00047617771035627585, + "loss": 2.6998, + "step": 17538 + }, + { + "epoch": 0.5200901461910269, + "grad_norm": 0.11843914538621902, + "learning_rate": 0.0004761307129343267, + "loss": 2.6361, + "step": 17539 + }, + { + "epoch": 0.5201197995433384, + "grad_norm": 0.11400540173053741, + "learning_rate": 0.0004760837157237442, + "loss": 2.6521, + "step": 17540 + }, + { + "epoch": 0.5201494528956498, + "grad_norm": 0.12675613164901733, + "learning_rate": 0.0004760367187249445, + "loss": 2.7035, + "step": 17541 + }, + { + "epoch": 0.5201791062479614, + "grad_norm": 0.12302961945533752, + "learning_rate": 0.0004759897219383438, + "loss": 2.6807, + "step": 17542 + }, + { + "epoch": 0.5202087596002728, + "grad_norm": 0.12317270785570145, + "learning_rate": 0.00047594272536435813, + "loss": 2.6975, + "step": 17543 + }, + { + "epoch": 0.5202384129525843, + "grad_norm": 0.1251317262649536, + "learning_rate": 0.0004758957290034038, + "loss": 2.7207, + "step": 17544 + }, + { + "epoch": 0.5202680663048957, + "grad_norm": 0.09873883426189423, + "learning_rate": 0.000475848732855897, + "loss": 2.6673, + "step": 17545 + }, + { + "epoch": 0.5202977196572073, + "grad_norm": 0.10865506529808044, + "learning_rate": 0.00047580173692225383, + "loss": 2.6885, + "step": 17546 + }, + { + "epoch": 0.5203273730095187, + "grad_norm": 0.1280016005039215, + "learning_rate": 0.00047575474120289046, + "loss": 2.6945, + "step": 17547 + }, + { + "epoch": 0.5203570263618302, + "grad_norm": 0.10990066081285477, + "learning_rate": 0.00047570774569822297, + "loss": 2.6652, + "step": 17548 + }, + { + "epoch": 0.5203866797141417, + "grad_norm": 0.12154068052768707, + "learning_rate": 0.0004756607504086676, + "loss": 2.6757, + "step": 17549 + }, + { + "epoch": 0.5204163330664532, + "grad_norm": 0.11278050392866135, + "learning_rate": 0.0004756137553346404, + "loss": 2.6613, + "step": 17550 + }, + { + "epoch": 0.5204459864187646, + "grad_norm": 0.12130555510520935, + "learning_rate": 0.00047556676047655756, + "loss": 2.6393, + "step": 17551 + }, + { + "epoch": 0.5204756397710761, + "grad_norm": 0.12528079748153687, + "learning_rate": 0.00047551976583483526, + "loss": 2.674, + "step": 17552 + }, + { + "epoch": 0.5205052931233876, + "grad_norm": 0.10754604637622833, + "learning_rate": 0.00047547277140988975, + "loss": 2.6751, + "step": 17553 + }, + { + "epoch": 0.5205349464756991, + "grad_norm": 0.11632528901100159, + "learning_rate": 0.000475425777202137, + "loss": 2.6686, + "step": 17554 + }, + { + "epoch": 0.5205645998280105, + "grad_norm": 0.15981778502464294, + "learning_rate": 0.0004753787832119932, + "loss": 2.6588, + "step": 17555 + }, + { + "epoch": 0.520594253180322, + "grad_norm": 0.15406720340251923, + "learning_rate": 0.0004753317894398742, + "loss": 2.6585, + "step": 17556 + }, + { + "epoch": 0.5206239065326335, + "grad_norm": 0.11663626879453659, + "learning_rate": 0.0004752847958861968, + "loss": 2.6627, + "step": 17557 + }, + { + "epoch": 0.520653559884945, + "grad_norm": 0.10762781649827957, + "learning_rate": 0.00047523780255137675, + "loss": 2.6829, + "step": 17558 + }, + { + "epoch": 0.5206832132372565, + "grad_norm": 0.13983382284641266, + "learning_rate": 0.00047519080943583017, + "loss": 2.6843, + "step": 17559 + }, + { + "epoch": 0.520712866589568, + "grad_norm": 0.13489209115505219, + "learning_rate": 0.0004751438165399732, + "loss": 2.6713, + "step": 17560 + }, + { + "epoch": 0.5207425199418795, + "grad_norm": 0.11608599871397018, + "learning_rate": 0.000475096823864222, + "loss": 2.6674, + "step": 17561 + }, + { + "epoch": 0.5207721732941909, + "grad_norm": 0.12685677409172058, + "learning_rate": 0.0004750498314089927, + "loss": 2.6664, + "step": 17562 + }, + { + "epoch": 0.5208018266465024, + "grad_norm": 0.12973368167877197, + "learning_rate": 0.00047500283917470144, + "loss": 2.7098, + "step": 17563 + }, + { + "epoch": 0.5208314799988139, + "grad_norm": 0.12179741263389587, + "learning_rate": 0.00047495584716176445, + "loss": 2.6273, + "step": 17564 + }, + { + "epoch": 0.5208611333511254, + "grad_norm": 0.11904395371675491, + "learning_rate": 0.00047490885537059755, + "loss": 2.6654, + "step": 17565 + }, + { + "epoch": 0.5208907867034368, + "grad_norm": 0.1249830573797226, + "learning_rate": 0.00047486186380161713, + "loss": 2.6757, + "step": 17566 + }, + { + "epoch": 0.5209204400557483, + "grad_norm": 0.13782785832881927, + "learning_rate": 0.0004748148724552392, + "loss": 2.6545, + "step": 17567 + }, + { + "epoch": 0.5209500934080598, + "grad_norm": 0.1324070543050766, + "learning_rate": 0.0004747678813318799, + "loss": 2.6671, + "step": 17568 + }, + { + "epoch": 0.5209797467603713, + "grad_norm": 0.1112484559416771, + "learning_rate": 0.0004747208904319552, + "loss": 2.6436, + "step": 17569 + }, + { + "epoch": 0.5210094001126827, + "grad_norm": 0.11507512629032135, + "learning_rate": 0.0004746738997558815, + "loss": 2.6222, + "step": 17570 + }, + { + "epoch": 0.5210390534649942, + "grad_norm": 0.11692369729280472, + "learning_rate": 0.0004746269093040747, + "loss": 2.6462, + "step": 17571 + }, + { + "epoch": 0.5210687068173057, + "grad_norm": 0.1218399852514267, + "learning_rate": 0.00047457991907695103, + "loss": 2.6633, + "step": 17572 + }, + { + "epoch": 0.5210983601696172, + "grad_norm": 0.11993599683046341, + "learning_rate": 0.0004745329290749266, + "loss": 2.6438, + "step": 17573 + }, + { + "epoch": 0.5211280135219286, + "grad_norm": 0.10296131670475006, + "learning_rate": 0.00047448593929841744, + "loss": 2.6165, + "step": 17574 + }, + { + "epoch": 0.5211576668742401, + "grad_norm": 0.1222609281539917, + "learning_rate": 0.00047443894974783966, + "loss": 2.6466, + "step": 17575 + }, + { + "epoch": 0.5211873202265516, + "grad_norm": 0.12363164871931076, + "learning_rate": 0.00047439196042360925, + "loss": 2.6709, + "step": 17576 + }, + { + "epoch": 0.5212169735788631, + "grad_norm": 0.11910886317491531, + "learning_rate": 0.00047434497132614254, + "loss": 2.6606, + "step": 17577 + }, + { + "epoch": 0.5212466269311745, + "grad_norm": 0.11719360947608948, + "learning_rate": 0.0004742979824558555, + "loss": 2.6801, + "step": 17578 + }, + { + "epoch": 0.521276280283486, + "grad_norm": 0.10734859108924866, + "learning_rate": 0.00047425099381316415, + "loss": 2.6545, + "step": 17579 + }, + { + "epoch": 0.5213059336357976, + "grad_norm": 0.121754951775074, + "learning_rate": 0.00047420400539848476, + "loss": 2.6544, + "step": 17580 + }, + { + "epoch": 0.521335586988109, + "grad_norm": 0.11147655546665192, + "learning_rate": 0.0004741570172122334, + "loss": 2.6752, + "step": 17581 + }, + { + "epoch": 0.5213652403404205, + "grad_norm": 0.10779578983783722, + "learning_rate": 0.00047411002925482575, + "loss": 2.6633, + "step": 17582 + }, + { + "epoch": 0.521394893692732, + "grad_norm": 0.10168173164129257, + "learning_rate": 0.00047406304152667846, + "loss": 2.6827, + "step": 17583 + }, + { + "epoch": 0.5214245470450435, + "grad_norm": 0.11769115179777145, + "learning_rate": 0.00047401605402820743, + "loss": 2.6615, + "step": 17584 + }, + { + "epoch": 0.5214542003973549, + "grad_norm": 0.109254390001297, + "learning_rate": 0.00047396906675982876, + "loss": 2.6758, + "step": 17585 + }, + { + "epoch": 0.5214838537496664, + "grad_norm": 0.1264169067144394, + "learning_rate": 0.0004739220797219584, + "loss": 2.6779, + "step": 17586 + }, + { + "epoch": 0.5215135071019779, + "grad_norm": 0.13097210228443146, + "learning_rate": 0.00047387509291501245, + "loss": 2.672, + "step": 17587 + }, + { + "epoch": 0.5215431604542894, + "grad_norm": 0.12521658837795258, + "learning_rate": 0.0004738281063394071, + "loss": 2.7155, + "step": 17588 + }, + { + "epoch": 0.5215728138066008, + "grad_norm": 0.12175101041793823, + "learning_rate": 0.00047378111999555835, + "loss": 2.6759, + "step": 17589 + }, + { + "epoch": 0.5216024671589123, + "grad_norm": 0.13132885098457336, + "learning_rate": 0.0004737341338838822, + "loss": 2.6604, + "step": 17590 + }, + { + "epoch": 0.5216321205112238, + "grad_norm": 0.12272677570581436, + "learning_rate": 0.00047368714800479503, + "loss": 2.6834, + "step": 17591 + }, + { + "epoch": 0.5216617738635353, + "grad_norm": 0.11816048622131348, + "learning_rate": 0.00047364016235871246, + "loss": 2.6723, + "step": 17592 + }, + { + "epoch": 0.5216914272158467, + "grad_norm": 0.12663577497005463, + "learning_rate": 0.0004735931769460509, + "loss": 2.6429, + "step": 17593 + }, + { + "epoch": 0.5217210805681582, + "grad_norm": 0.12406199425458908, + "learning_rate": 0.00047354619176722594, + "loss": 2.6759, + "step": 17594 + }, + { + "epoch": 0.5217507339204697, + "grad_norm": 0.11852975189685822, + "learning_rate": 0.0004734992068226544, + "loss": 2.6661, + "step": 17595 + }, + { + "epoch": 0.5217803872727812, + "grad_norm": 0.1112992912530899, + "learning_rate": 0.0004734522221127519, + "loss": 2.6637, + "step": 17596 + }, + { + "epoch": 0.5218100406250926, + "grad_norm": 0.11649339646100998, + "learning_rate": 0.0004734052376379344, + "loss": 2.6364, + "step": 17597 + }, + { + "epoch": 0.5218396939774042, + "grad_norm": 0.13409148156642914, + "learning_rate": 0.0004733582533986181, + "loss": 2.6405, + "step": 17598 + }, + { + "epoch": 0.5218693473297156, + "grad_norm": 0.11924717575311661, + "learning_rate": 0.00047331126939521905, + "loss": 2.6845, + "step": 17599 + }, + { + "epoch": 0.5218990006820271, + "grad_norm": 0.10713613033294678, + "learning_rate": 0.00047326428562815333, + "loss": 2.6758, + "step": 17600 + }, + { + "epoch": 0.5219286540343386, + "grad_norm": 0.11504276096820831, + "learning_rate": 0.00047321730209783693, + "loss": 2.695, + "step": 17601 + }, + { + "epoch": 0.5219583073866501, + "grad_norm": 0.0990513265132904, + "learning_rate": 0.00047317031880468596, + "loss": 2.6701, + "step": 17602 + }, + { + "epoch": 0.5219879607389616, + "grad_norm": 0.10278863459825516, + "learning_rate": 0.00047312333574911643, + "loss": 2.6605, + "step": 17603 + }, + { + "epoch": 0.522017614091273, + "grad_norm": 0.11126850545406342, + "learning_rate": 0.0004730763529315443, + "loss": 2.6724, + "step": 17604 + }, + { + "epoch": 0.5220472674435845, + "grad_norm": 0.11514707654714584, + "learning_rate": 0.00047302937035238567, + "loss": 2.6826, + "step": 17605 + }, + { + "epoch": 0.522076920795896, + "grad_norm": 0.10305061936378479, + "learning_rate": 0.00047298238801205667, + "loss": 2.6752, + "step": 17606 + }, + { + "epoch": 0.5221065741482075, + "grad_norm": 0.10895249992609024, + "learning_rate": 0.00047293540591097314, + "loss": 2.661, + "step": 17607 + }, + { + "epoch": 0.5221362275005189, + "grad_norm": 0.11173646152019501, + "learning_rate": 0.0004728884240495513, + "loss": 2.6442, + "step": 17608 + }, + { + "epoch": 0.5221658808528304, + "grad_norm": 0.11507444828748703, + "learning_rate": 0.00047284144242820713, + "loss": 2.6801, + "step": 17609 + }, + { + "epoch": 0.5221955342051419, + "grad_norm": 0.1280406266450882, + "learning_rate": 0.0004727944610473566, + "loss": 2.7108, + "step": 17610 + }, + { + "epoch": 0.5222251875574534, + "grad_norm": 0.12088989466428757, + "learning_rate": 0.0004727474799074159, + "loss": 2.6612, + "step": 17611 + }, + { + "epoch": 0.5222548409097648, + "grad_norm": 0.12109826505184174, + "learning_rate": 0.00047270049900880097, + "loss": 2.6623, + "step": 17612 + }, + { + "epoch": 0.5222844942620763, + "grad_norm": 0.11189050227403641, + "learning_rate": 0.00047265351835192775, + "loss": 2.6504, + "step": 17613 + }, + { + "epoch": 0.5223141476143878, + "grad_norm": 0.11655260622501373, + "learning_rate": 0.00047260653793721233, + "loss": 2.6766, + "step": 17614 + }, + { + "epoch": 0.5223438009666993, + "grad_norm": 0.1222754642367363, + "learning_rate": 0.0004725595577650706, + "loss": 2.6662, + "step": 17615 + }, + { + "epoch": 0.5223734543190107, + "grad_norm": 0.10580114275217056, + "learning_rate": 0.00047251257783591884, + "loss": 2.6523, + "step": 17616 + }, + { + "epoch": 0.5224031076713223, + "grad_norm": 0.12309417128562927, + "learning_rate": 0.0004724655981501728, + "loss": 2.6769, + "step": 17617 + }, + { + "epoch": 0.5224327610236337, + "grad_norm": 0.11181621253490448, + "learning_rate": 0.0004724186187082487, + "loss": 2.6481, + "step": 17618 + }, + { + "epoch": 0.5224624143759452, + "grad_norm": 0.10599872469902039, + "learning_rate": 0.0004723716395105626, + "loss": 2.6616, + "step": 17619 + }, + { + "epoch": 0.5224920677282566, + "grad_norm": 0.10610030591487885, + "learning_rate": 0.00047232466055752994, + "loss": 2.6676, + "step": 17620 + }, + { + "epoch": 0.5225217210805682, + "grad_norm": 0.1166049912571907, + "learning_rate": 0.0004722776818495674, + "loss": 2.6423, + "step": 17621 + }, + { + "epoch": 0.5225513744328797, + "grad_norm": 0.11187422275543213, + "learning_rate": 0.00047223070338709084, + "loss": 2.6054, + "step": 17622 + }, + { + "epoch": 0.5225810277851911, + "grad_norm": 0.11432366073131561, + "learning_rate": 0.00047218372517051624, + "loss": 2.7017, + "step": 17623 + }, + { + "epoch": 0.5226106811375026, + "grad_norm": 0.1091223806142807, + "learning_rate": 0.00047213674720025936, + "loss": 2.6637, + "step": 17624 + }, + { + "epoch": 0.5226403344898141, + "grad_norm": 0.10643360018730164, + "learning_rate": 0.0004720897694767364, + "loss": 2.6383, + "step": 17625 + }, + { + "epoch": 0.5226699878421256, + "grad_norm": 0.1058436930179596, + "learning_rate": 0.00047204279200036334, + "loss": 2.6622, + "step": 17626 + }, + { + "epoch": 0.522699641194437, + "grad_norm": 0.12230613827705383, + "learning_rate": 0.0004719958147715561, + "loss": 2.6977, + "step": 17627 + }, + { + "epoch": 0.5227292945467485, + "grad_norm": 0.1316480189561844, + "learning_rate": 0.00047194883779073075, + "loss": 2.6787, + "step": 17628 + }, + { + "epoch": 0.52275894789906, + "grad_norm": 0.14167384803295135, + "learning_rate": 0.00047190186105830343, + "loss": 2.6664, + "step": 17629 + }, + { + "epoch": 0.5227886012513715, + "grad_norm": 0.14323633909225464, + "learning_rate": 0.00047185488457468977, + "loss": 2.6902, + "step": 17630 + }, + { + "epoch": 0.5228182546036829, + "grad_norm": 0.10929024964570999, + "learning_rate": 0.000471807908340306, + "loss": 2.67, + "step": 17631 + }, + { + "epoch": 0.5228479079559945, + "grad_norm": 0.13385173678398132, + "learning_rate": 0.000471760932355568, + "loss": 2.6597, + "step": 17632 + }, + { + "epoch": 0.5228775613083059, + "grad_norm": 0.15846580266952515, + "learning_rate": 0.0004717139566208917, + "loss": 2.6765, + "step": 17633 + }, + { + "epoch": 0.5229072146606174, + "grad_norm": 0.15884454548358917, + "learning_rate": 0.00047166698113669325, + "loss": 2.7249, + "step": 17634 + }, + { + "epoch": 0.5229368680129288, + "grad_norm": 0.13989706337451935, + "learning_rate": 0.0004716200059033885, + "loss": 2.6719, + "step": 17635 + }, + { + "epoch": 0.5229665213652404, + "grad_norm": 0.12942352890968323, + "learning_rate": 0.00047157303092139357, + "loss": 2.689, + "step": 17636 + }, + { + "epoch": 0.5229961747175518, + "grad_norm": 0.13554494082927704, + "learning_rate": 0.00047152605619112434, + "loss": 2.6587, + "step": 17637 + }, + { + "epoch": 0.5230258280698633, + "grad_norm": 0.13194045424461365, + "learning_rate": 0.0004714790817129967, + "loss": 2.6518, + "step": 17638 + }, + { + "epoch": 0.5230554814221747, + "grad_norm": 0.12201114743947983, + "learning_rate": 0.0004714321074874267, + "loss": 2.6601, + "step": 17639 + }, + { + "epoch": 0.5230851347744863, + "grad_norm": 0.14082622528076172, + "learning_rate": 0.00047138513351483043, + "loss": 2.6586, + "step": 17640 + }, + { + "epoch": 0.5231147881267977, + "grad_norm": 0.1266058385372162, + "learning_rate": 0.00047133815979562353, + "loss": 2.6494, + "step": 17641 + }, + { + "epoch": 0.5231444414791092, + "grad_norm": 0.113047294318676, + "learning_rate": 0.00047129118633022223, + "loss": 2.6226, + "step": 17642 + }, + { + "epoch": 0.5231740948314207, + "grad_norm": 0.13496890664100647, + "learning_rate": 0.0004712442131190424, + "loss": 2.6627, + "step": 17643 + }, + { + "epoch": 0.5232037481837322, + "grad_norm": 0.11368390172719955, + "learning_rate": 0.00047119724016249994, + "loss": 2.6483, + "step": 17644 + }, + { + "epoch": 0.5232334015360437, + "grad_norm": 0.1190737783908844, + "learning_rate": 0.0004711502674610109, + "loss": 2.6434, + "step": 17645 + }, + { + "epoch": 0.5232630548883551, + "grad_norm": 0.1188417449593544, + "learning_rate": 0.0004711032950149911, + "loss": 2.692, + "step": 17646 + }, + { + "epoch": 0.5232927082406666, + "grad_norm": 0.11618672311306, + "learning_rate": 0.00047105632282485666, + "loss": 2.6705, + "step": 17647 + }, + { + "epoch": 0.5233223615929781, + "grad_norm": 0.11961248517036438, + "learning_rate": 0.0004710093508910235, + "loss": 2.6496, + "step": 17648 + }, + { + "epoch": 0.5233520149452896, + "grad_norm": 0.11436427384614944, + "learning_rate": 0.00047096237921390746, + "loss": 2.6455, + "step": 17649 + }, + { + "epoch": 0.523381668297601, + "grad_norm": 0.12728165090084076, + "learning_rate": 0.00047091540779392463, + "loss": 2.6583, + "step": 17650 + }, + { + "epoch": 0.5234113216499126, + "grad_norm": 0.11593876034021378, + "learning_rate": 0.0004708684366314908, + "loss": 2.674, + "step": 17651 + }, + { + "epoch": 0.523440975002224, + "grad_norm": 0.10899651795625687, + "learning_rate": 0.0004708214657270219, + "loss": 2.6517, + "step": 17652 + }, + { + "epoch": 0.5234706283545355, + "grad_norm": 0.10322943329811096, + "learning_rate": 0.00047077449508093396, + "loss": 2.6742, + "step": 17653 + }, + { + "epoch": 0.5235002817068469, + "grad_norm": 0.11166661232709885, + "learning_rate": 0.00047072752469364286, + "loss": 2.6623, + "step": 17654 + }, + { + "epoch": 0.5235299350591585, + "grad_norm": 0.1080566942691803, + "learning_rate": 0.00047068055456556465, + "loss": 2.6473, + "step": 17655 + }, + { + "epoch": 0.5235595884114699, + "grad_norm": 0.12120907753705978, + "learning_rate": 0.0004706335846971151, + "loss": 2.6813, + "step": 17656 + }, + { + "epoch": 0.5235892417637814, + "grad_norm": 0.12070180475711823, + "learning_rate": 0.0004705866150887103, + "loss": 2.6776, + "step": 17657 + }, + { + "epoch": 0.5236188951160928, + "grad_norm": 0.10713627189397812, + "learning_rate": 0.00047053964574076594, + "loss": 2.7108, + "step": 17658 + }, + { + "epoch": 0.5236485484684044, + "grad_norm": 0.11141146719455719, + "learning_rate": 0.0004704926766536979, + "loss": 2.7112, + "step": 17659 + }, + { + "epoch": 0.5236782018207158, + "grad_norm": 0.10804516822099686, + "learning_rate": 0.00047044570782792247, + "loss": 2.6914, + "step": 17660 + }, + { + "epoch": 0.5237078551730273, + "grad_norm": 0.10269949585199356, + "learning_rate": 0.00047039873926385545, + "loss": 2.6536, + "step": 17661 + }, + { + "epoch": 0.5237375085253387, + "grad_norm": 0.11373165994882584, + "learning_rate": 0.0004703517709619126, + "loss": 2.6751, + "step": 17662 + }, + { + "epoch": 0.5237671618776503, + "grad_norm": 0.11345339566469193, + "learning_rate": 0.00047030480292250995, + "loss": 2.6649, + "step": 17663 + }, + { + "epoch": 0.5237968152299618, + "grad_norm": 0.13817963004112244, + "learning_rate": 0.0004702578351460633, + "loss": 2.6815, + "step": 17664 + }, + { + "epoch": 0.5238264685822732, + "grad_norm": 0.15437205135822296, + "learning_rate": 0.00047021086763298866, + "loss": 2.6792, + "step": 17665 + }, + { + "epoch": 0.5238561219345848, + "grad_norm": 0.15405499935150146, + "learning_rate": 0.0004701639003837019, + "loss": 2.6517, + "step": 17666 + }, + { + "epoch": 0.5238857752868962, + "grad_norm": 0.13000796735286713, + "learning_rate": 0.000470116933398619, + "loss": 2.6831, + "step": 17667 + }, + { + "epoch": 0.5239154286392077, + "grad_norm": 0.12921515107154846, + "learning_rate": 0.0004700699666781557, + "loss": 2.7109, + "step": 17668 + }, + { + "epoch": 0.5239450819915191, + "grad_norm": 0.14329911768436432, + "learning_rate": 0.000470023000222728, + "loss": 2.6886, + "step": 17669 + }, + { + "epoch": 0.5239747353438307, + "grad_norm": 0.14523065090179443, + "learning_rate": 0.00046997603403275176, + "loss": 2.6564, + "step": 17670 + }, + { + "epoch": 0.5240043886961421, + "grad_norm": 0.13649120926856995, + "learning_rate": 0.00046992906810864286, + "loss": 2.6802, + "step": 17671 + }, + { + "epoch": 0.5240340420484536, + "grad_norm": 0.1670009195804596, + "learning_rate": 0.0004698821024508171, + "loss": 2.7157, + "step": 17672 + }, + { + "epoch": 0.524063695400765, + "grad_norm": 0.14564888179302216, + "learning_rate": 0.00046983513705969074, + "loss": 2.6504, + "step": 17673 + }, + { + "epoch": 0.5240933487530766, + "grad_norm": 0.1376153826713562, + "learning_rate": 0.0004697881719356793, + "loss": 2.6843, + "step": 17674 + }, + { + "epoch": 0.524123002105388, + "grad_norm": 0.14216850697994232, + "learning_rate": 0.0004697412070791988, + "loss": 2.6903, + "step": 17675 + }, + { + "epoch": 0.5241526554576995, + "grad_norm": 0.13649414479732513, + "learning_rate": 0.0004696942424906651, + "loss": 2.6628, + "step": 17676 + }, + { + "epoch": 0.5241823088100109, + "grad_norm": 0.1446732133626938, + "learning_rate": 0.00046964727817049414, + "loss": 2.6594, + "step": 17677 + }, + { + "epoch": 0.5242119621623225, + "grad_norm": 0.14722603559494019, + "learning_rate": 0.00046960031411910175, + "loss": 2.6962, + "step": 17678 + }, + { + "epoch": 0.5242416155146339, + "grad_norm": 0.12408516556024551, + "learning_rate": 0.0004695533503369038, + "loss": 2.6585, + "step": 17679 + }, + { + "epoch": 0.5242712688669454, + "grad_norm": 0.1406940221786499, + "learning_rate": 0.0004695063868243161, + "loss": 2.6776, + "step": 17680 + }, + { + "epoch": 0.5243009222192568, + "grad_norm": 0.13588190078735352, + "learning_rate": 0.0004694594235817546, + "loss": 2.6689, + "step": 17681 + }, + { + "epoch": 0.5243305755715684, + "grad_norm": 0.12121287733316422, + "learning_rate": 0.0004694124606096351, + "loss": 2.6614, + "step": 17682 + }, + { + "epoch": 0.5243602289238798, + "grad_norm": 0.12800797820091248, + "learning_rate": 0.0004693654979083735, + "loss": 2.6492, + "step": 17683 + }, + { + "epoch": 0.5243898822761913, + "grad_norm": 0.10325378179550171, + "learning_rate": 0.0004693185354783858, + "loss": 2.6612, + "step": 17684 + }, + { + "epoch": 0.5244195356285029, + "grad_norm": 0.11462979763746262, + "learning_rate": 0.00046927157332008753, + "loss": 2.6749, + "step": 17685 + }, + { + "epoch": 0.5244491889808143, + "grad_norm": 0.12491177022457123, + "learning_rate": 0.00046922461143389485, + "loss": 2.6901, + "step": 17686 + }, + { + "epoch": 0.5244788423331258, + "grad_norm": 0.1128234788775444, + "learning_rate": 0.00046917764982022355, + "loss": 2.6635, + "step": 17687 + }, + { + "epoch": 0.5245084956854372, + "grad_norm": 0.09824247658252716, + "learning_rate": 0.0004691306884794895, + "loss": 2.649, + "step": 17688 + }, + { + "epoch": 0.5245381490377488, + "grad_norm": 0.11660950630903244, + "learning_rate": 0.0004690837274121085, + "loss": 2.6877, + "step": 17689 + }, + { + "epoch": 0.5245678023900602, + "grad_norm": 0.11995511502027512, + "learning_rate": 0.0004690367666184963, + "loss": 2.6484, + "step": 17690 + }, + { + "epoch": 0.5245974557423717, + "grad_norm": 0.10242336988449097, + "learning_rate": 0.00046898980609906896, + "loss": 2.6899, + "step": 17691 + }, + { + "epoch": 0.5246271090946831, + "grad_norm": 0.1258486658334732, + "learning_rate": 0.0004689428458542421, + "loss": 2.7121, + "step": 17692 + }, + { + "epoch": 0.5246567624469947, + "grad_norm": 0.12381602078676224, + "learning_rate": 0.0004688958858844317, + "loss": 2.6553, + "step": 17693 + }, + { + "epoch": 0.5246864157993061, + "grad_norm": 0.11621217429637909, + "learning_rate": 0.0004688489261900536, + "loss": 2.6916, + "step": 17694 + }, + { + "epoch": 0.5247160691516176, + "grad_norm": 0.12166246771812439, + "learning_rate": 0.0004688019667715237, + "loss": 2.6758, + "step": 17695 + }, + { + "epoch": 0.524745722503929, + "grad_norm": 0.1197984367609024, + "learning_rate": 0.0004687550076292576, + "loss": 2.6821, + "step": 17696 + }, + { + "epoch": 0.5247753758562406, + "grad_norm": 0.12803207337856293, + "learning_rate": 0.0004687080487636713, + "loss": 2.6675, + "step": 17697 + }, + { + "epoch": 0.524805029208552, + "grad_norm": 0.13696934282779694, + "learning_rate": 0.00046866109017518036, + "loss": 2.6715, + "step": 17698 + }, + { + "epoch": 0.5248346825608635, + "grad_norm": 0.11546593904495239, + "learning_rate": 0.0004686141318642012, + "loss": 2.6318, + "step": 17699 + }, + { + "epoch": 0.5248643359131749, + "grad_norm": 0.13275736570358276, + "learning_rate": 0.0004685671738311492, + "loss": 2.6708, + "step": 17700 + }, + { + "epoch": 0.5248939892654865, + "grad_norm": 0.1354435235261917, + "learning_rate": 0.00046852021607644026, + "loss": 2.6744, + "step": 17701 + }, + { + "epoch": 0.5249236426177979, + "grad_norm": 0.11859071999788284, + "learning_rate": 0.00046847325860049016, + "loss": 2.6717, + "step": 17702 + }, + { + "epoch": 0.5249532959701094, + "grad_norm": 0.1479264348745346, + "learning_rate": 0.0004684263014037148, + "loss": 2.7085, + "step": 17703 + }, + { + "epoch": 0.5249829493224208, + "grad_norm": 0.16925261914730072, + "learning_rate": 0.00046837934448652996, + "loss": 2.7004, + "step": 17704 + }, + { + "epoch": 0.5250126026747324, + "grad_norm": 0.16329675912857056, + "learning_rate": 0.00046833238784935155, + "loss": 2.6685, + "step": 17705 + }, + { + "epoch": 0.5250422560270439, + "grad_norm": 0.13768216967582703, + "learning_rate": 0.0004682854314925952, + "loss": 2.6894, + "step": 17706 + }, + { + "epoch": 0.5250719093793553, + "grad_norm": 0.13405244052410126, + "learning_rate": 0.0004682384754166768, + "loss": 2.6671, + "step": 17707 + }, + { + "epoch": 0.5251015627316669, + "grad_norm": 0.1329805850982666, + "learning_rate": 0.0004681915196220121, + "loss": 2.6754, + "step": 17708 + }, + { + "epoch": 0.5251312160839783, + "grad_norm": 0.1397804468870163, + "learning_rate": 0.000468144564109017, + "loss": 2.6866, + "step": 17709 + }, + { + "epoch": 0.5251608694362898, + "grad_norm": 0.14922599494457245, + "learning_rate": 0.0004680976088781073, + "loss": 2.6992, + "step": 17710 + }, + { + "epoch": 0.5251905227886012, + "grad_norm": 0.1370038241147995, + "learning_rate": 0.00046805065392969855, + "loss": 2.6937, + "step": 17711 + }, + { + "epoch": 0.5252201761409128, + "grad_norm": 0.14703883230686188, + "learning_rate": 0.0004680036992642069, + "loss": 2.6768, + "step": 17712 + }, + { + "epoch": 0.5252498294932242, + "grad_norm": 0.12777426838874817, + "learning_rate": 0.0004679567448820479, + "loss": 2.6564, + "step": 17713 + }, + { + "epoch": 0.5252794828455357, + "grad_norm": 0.10660701990127563, + "learning_rate": 0.00046790979078363754, + "loss": 2.6504, + "step": 17714 + }, + { + "epoch": 0.5253091361978471, + "grad_norm": 0.11834985762834549, + "learning_rate": 0.0004678628369693914, + "loss": 2.6903, + "step": 17715 + }, + { + "epoch": 0.5253387895501587, + "grad_norm": 0.11662662774324417, + "learning_rate": 0.0004678158834397255, + "loss": 2.6573, + "step": 17716 + }, + { + "epoch": 0.5253684429024701, + "grad_norm": 0.1175040751695633, + "learning_rate": 0.0004677689301950554, + "loss": 2.6663, + "step": 17717 + }, + { + "epoch": 0.5253980962547816, + "grad_norm": 0.11237242817878723, + "learning_rate": 0.00046772197723579694, + "loss": 2.6608, + "step": 17718 + }, + { + "epoch": 0.525427749607093, + "grad_norm": 0.11528056114912033, + "learning_rate": 0.00046767502456236595, + "loss": 2.6772, + "step": 17719 + }, + { + "epoch": 0.5254574029594046, + "grad_norm": 0.11908788979053497, + "learning_rate": 0.0004676280721751781, + "loss": 2.642, + "step": 17720 + }, + { + "epoch": 0.525487056311716, + "grad_norm": 0.11365223675966263, + "learning_rate": 0.00046758112007464923, + "loss": 2.667, + "step": 17721 + }, + { + "epoch": 0.5255167096640275, + "grad_norm": 0.11859223991632462, + "learning_rate": 0.0004675341682611951, + "loss": 2.6392, + "step": 17722 + }, + { + "epoch": 0.525546363016339, + "grad_norm": 0.1138167530298233, + "learning_rate": 0.00046748721673523164, + "loss": 2.6359, + "step": 17723 + }, + { + "epoch": 0.5255760163686505, + "grad_norm": 0.10087977349758148, + "learning_rate": 0.0004674402654971741, + "loss": 2.6466, + "step": 17724 + }, + { + "epoch": 0.525605669720962, + "grad_norm": 0.10037761926651001, + "learning_rate": 0.00046739331454743885, + "loss": 2.6492, + "step": 17725 + }, + { + "epoch": 0.5256353230732734, + "grad_norm": 0.10929262638092041, + "learning_rate": 0.0004673463638864415, + "loss": 2.6598, + "step": 17726 + }, + { + "epoch": 0.525664976425585, + "grad_norm": 0.10084589570760727, + "learning_rate": 0.00046729941351459747, + "loss": 2.6711, + "step": 17727 + }, + { + "epoch": 0.5256946297778964, + "grad_norm": 0.10992180556058884, + "learning_rate": 0.0004672524634323229, + "loss": 2.6451, + "step": 17728 + }, + { + "epoch": 0.5257242831302079, + "grad_norm": 0.1278790831565857, + "learning_rate": 0.00046720551364003333, + "loss": 2.6935, + "step": 17729 + }, + { + "epoch": 0.5257539364825193, + "grad_norm": 0.11340252310037613, + "learning_rate": 0.0004671585641381446, + "loss": 2.6462, + "step": 17730 + }, + { + "epoch": 0.5257835898348309, + "grad_norm": 0.09996820241212845, + "learning_rate": 0.00046711161492707235, + "loss": 2.6696, + "step": 17731 + }, + { + "epoch": 0.5258132431871423, + "grad_norm": 0.11328815668821335, + "learning_rate": 0.0004670646660072324, + "loss": 2.6613, + "step": 17732 + }, + { + "epoch": 0.5258428965394538, + "grad_norm": 0.11842554807662964, + "learning_rate": 0.00046701771737904063, + "loss": 2.7082, + "step": 17733 + }, + { + "epoch": 0.5258725498917652, + "grad_norm": 0.14175744354724884, + "learning_rate": 0.00046697076904291256, + "loss": 2.6675, + "step": 17734 + }, + { + "epoch": 0.5259022032440768, + "grad_norm": 0.15313485264778137, + "learning_rate": 0.00046692382099926396, + "loss": 2.6734, + "step": 17735 + }, + { + "epoch": 0.5259318565963882, + "grad_norm": 0.1261242926120758, + "learning_rate": 0.0004668768732485106, + "loss": 2.6729, + "step": 17736 + }, + { + "epoch": 0.5259615099486997, + "grad_norm": 0.10656803101301193, + "learning_rate": 0.0004668299257910681, + "loss": 2.6534, + "step": 17737 + }, + { + "epoch": 0.5259911633010111, + "grad_norm": 0.12274094671010971, + "learning_rate": 0.0004667829786273524, + "loss": 2.659, + "step": 17738 + }, + { + "epoch": 0.5260208166533227, + "grad_norm": 0.11171122640371323, + "learning_rate": 0.00046673603175777917, + "loss": 2.6584, + "step": 17739 + }, + { + "epoch": 0.5260504700056341, + "grad_norm": 0.11230950057506561, + "learning_rate": 0.0004666890851827641, + "loss": 2.6661, + "step": 17740 + }, + { + "epoch": 0.5260801233579456, + "grad_norm": 0.1288728266954422, + "learning_rate": 0.00046664213890272284, + "loss": 2.6371, + "step": 17741 + }, + { + "epoch": 0.526109776710257, + "grad_norm": 0.13786154985427856, + "learning_rate": 0.00046659519291807115, + "loss": 2.6928, + "step": 17742 + }, + { + "epoch": 0.5261394300625686, + "grad_norm": 0.15166978538036346, + "learning_rate": 0.00046654824722922495, + "loss": 2.687, + "step": 17743 + }, + { + "epoch": 0.52616908341488, + "grad_norm": 0.11888301372528076, + "learning_rate": 0.00046650130183659963, + "loss": 2.6664, + "step": 17744 + }, + { + "epoch": 0.5261987367671915, + "grad_norm": 0.11512306332588196, + "learning_rate": 0.000466454356740611, + "loss": 2.6491, + "step": 17745 + }, + { + "epoch": 0.5262283901195031, + "grad_norm": 0.1259700357913971, + "learning_rate": 0.0004664074119416748, + "loss": 2.6523, + "step": 17746 + }, + { + "epoch": 0.5262580434718145, + "grad_norm": 0.1201479583978653, + "learning_rate": 0.0004663604674402068, + "loss": 2.6818, + "step": 17747 + }, + { + "epoch": 0.526287696824126, + "grad_norm": 0.12450975924730301, + "learning_rate": 0.0004663135232366226, + "loss": 2.6286, + "step": 17748 + }, + { + "epoch": 0.5263173501764374, + "grad_norm": 0.1208452433347702, + "learning_rate": 0.00046626657933133785, + "loss": 2.6425, + "step": 17749 + }, + { + "epoch": 0.526347003528749, + "grad_norm": 0.11676327884197235, + "learning_rate": 0.00046621963572476827, + "loss": 2.7025, + "step": 17750 + }, + { + "epoch": 0.5263766568810604, + "grad_norm": 0.10701306164264679, + "learning_rate": 0.00046617269241732974, + "loss": 2.6891, + "step": 17751 + }, + { + "epoch": 0.5264063102333719, + "grad_norm": 0.1278705894947052, + "learning_rate": 0.00046612574940943784, + "loss": 2.6836, + "step": 17752 + }, + { + "epoch": 0.5264359635856833, + "grad_norm": 0.11339589953422546, + "learning_rate": 0.00046607880670150825, + "loss": 2.6793, + "step": 17753 + }, + { + "epoch": 0.5264656169379949, + "grad_norm": 0.10631925612688065, + "learning_rate": 0.0004660318642939567, + "loss": 2.643, + "step": 17754 + }, + { + "epoch": 0.5264952702903063, + "grad_norm": 0.10885011404752731, + "learning_rate": 0.0004659849221871988, + "loss": 2.6873, + "step": 17755 + }, + { + "epoch": 0.5265249236426178, + "grad_norm": 0.10321082919836044, + "learning_rate": 0.0004659379803816502, + "loss": 2.6564, + "step": 17756 + }, + { + "epoch": 0.5265545769949292, + "grad_norm": 0.10664662718772888, + "learning_rate": 0.00046589103887772663, + "loss": 2.6816, + "step": 17757 + }, + { + "epoch": 0.5265842303472408, + "grad_norm": 0.10315658152103424, + "learning_rate": 0.00046584409767584374, + "loss": 2.66, + "step": 17758 + }, + { + "epoch": 0.5266138836995522, + "grad_norm": 0.11610029637813568, + "learning_rate": 0.00046579715677641726, + "loss": 2.6698, + "step": 17759 + }, + { + "epoch": 0.5266435370518637, + "grad_norm": 0.11996783316135406, + "learning_rate": 0.0004657502161798629, + "loss": 2.6418, + "step": 17760 + }, + { + "epoch": 0.5266731904041752, + "grad_norm": 0.12875252962112427, + "learning_rate": 0.0004657032758865962, + "loss": 2.6643, + "step": 17761 + }, + { + "epoch": 0.5267028437564867, + "grad_norm": 0.12073959410190582, + "learning_rate": 0.0004656563358970329, + "loss": 2.6848, + "step": 17762 + }, + { + "epoch": 0.5267324971087981, + "grad_norm": 0.11929279565811157, + "learning_rate": 0.00046560939621158835, + "loss": 2.6887, + "step": 17763 + }, + { + "epoch": 0.5267621504611096, + "grad_norm": 0.12826891243457794, + "learning_rate": 0.0004655624568306789, + "loss": 2.6536, + "step": 17764 + }, + { + "epoch": 0.5267918038134211, + "grad_norm": 0.1386723518371582, + "learning_rate": 0.00046551551775471964, + "loss": 2.6589, + "step": 17765 + }, + { + "epoch": 0.5268214571657326, + "grad_norm": 0.15875650942325592, + "learning_rate": 0.00046546857898412635, + "loss": 2.6775, + "step": 17766 + }, + { + "epoch": 0.5268511105180441, + "grad_norm": 0.1356651335954666, + "learning_rate": 0.00046542164051931477, + "loss": 2.6725, + "step": 17767 + }, + { + "epoch": 0.5268807638703555, + "grad_norm": 0.12737871706485748, + "learning_rate": 0.0004653747023607006, + "loss": 2.6602, + "step": 17768 + }, + { + "epoch": 0.5269104172226671, + "grad_norm": 0.13023139536380768, + "learning_rate": 0.0004653277645086992, + "loss": 2.6737, + "step": 17769 + }, + { + "epoch": 0.5269400705749785, + "grad_norm": 0.11728654056787491, + "learning_rate": 0.00046528082696372655, + "loss": 2.6541, + "step": 17770 + }, + { + "epoch": 0.52696972392729, + "grad_norm": 0.12024985253810883, + "learning_rate": 0.0004652338897261981, + "loss": 2.6786, + "step": 17771 + }, + { + "epoch": 0.5269993772796014, + "grad_norm": 0.11840655654668808, + "learning_rate": 0.00046518695279652953, + "loss": 2.667, + "step": 17772 + }, + { + "epoch": 0.527029030631913, + "grad_norm": 0.10697996616363525, + "learning_rate": 0.0004651400161751364, + "loss": 2.6847, + "step": 17773 + }, + { + "epoch": 0.5270586839842244, + "grad_norm": 0.12521910667419434, + "learning_rate": 0.0004650930798624345, + "loss": 2.6981, + "step": 17774 + }, + { + "epoch": 0.5270883373365359, + "grad_norm": 0.12666964530944824, + "learning_rate": 0.00046504614385883917, + "loss": 2.6648, + "step": 17775 + }, + { + "epoch": 0.5271179906888473, + "grad_norm": 0.11650244891643524, + "learning_rate": 0.00046499920816476636, + "loss": 2.6583, + "step": 17776 + }, + { + "epoch": 0.5271476440411589, + "grad_norm": 0.12061044573783875, + "learning_rate": 0.0004649522727806316, + "loss": 2.7127, + "step": 17777 + }, + { + "epoch": 0.5271772973934703, + "grad_norm": 0.12938354909420013, + "learning_rate": 0.00046490533770685043, + "loss": 2.6726, + "step": 17778 + }, + { + "epoch": 0.5272069507457818, + "grad_norm": 0.11864220350980759, + "learning_rate": 0.0004648584029438386, + "loss": 2.6864, + "step": 17779 + }, + { + "epoch": 0.5272366040980933, + "grad_norm": 0.1269574612379074, + "learning_rate": 0.0004648114684920116, + "loss": 2.6425, + "step": 17780 + }, + { + "epoch": 0.5272662574504048, + "grad_norm": 0.10843141376972198, + "learning_rate": 0.00046476453435178504, + "loss": 2.6543, + "step": 17781 + }, + { + "epoch": 0.5272959108027162, + "grad_norm": 0.1197800263762474, + "learning_rate": 0.00046471760052357473, + "loss": 2.691, + "step": 17782 + }, + { + "epoch": 0.5273255641550277, + "grad_norm": 0.13342252373695374, + "learning_rate": 0.0004646706670077961, + "loss": 2.6589, + "step": 17783 + }, + { + "epoch": 0.5273552175073392, + "grad_norm": 0.11733433604240417, + "learning_rate": 0.0004646237338048647, + "loss": 2.6128, + "step": 17784 + }, + { + "epoch": 0.5273848708596507, + "grad_norm": 0.10672406107187271, + "learning_rate": 0.00046457680091519616, + "loss": 2.672, + "step": 17785 + }, + { + "epoch": 0.5274145242119621, + "grad_norm": 0.11806036531925201, + "learning_rate": 0.0004645298683392062, + "loss": 2.6543, + "step": 17786 + }, + { + "epoch": 0.5274441775642736, + "grad_norm": 0.12009638547897339, + "learning_rate": 0.0004644829360773103, + "loss": 2.6768, + "step": 17787 + }, + { + "epoch": 0.5274738309165852, + "grad_norm": 0.10716407001018524, + "learning_rate": 0.000464436004129924, + "loss": 2.6709, + "step": 17788 + }, + { + "epoch": 0.5275034842688966, + "grad_norm": 0.1246771514415741, + "learning_rate": 0.0004643890724974631, + "loss": 2.6447, + "step": 17789 + }, + { + "epoch": 0.5275331376212081, + "grad_norm": 0.10836715996265411, + "learning_rate": 0.00046434214118034304, + "loss": 2.6774, + "step": 17790 + }, + { + "epoch": 0.5275627909735195, + "grad_norm": 0.12407892197370529, + "learning_rate": 0.0004642952101789795, + "loss": 2.66, + "step": 17791 + }, + { + "epoch": 0.5275924443258311, + "grad_norm": 0.11830874532461166, + "learning_rate": 0.00046424827949378814, + "loss": 2.694, + "step": 17792 + }, + { + "epoch": 0.5276220976781425, + "grad_norm": 0.1332939863204956, + "learning_rate": 0.0004642013491251842, + "loss": 2.6876, + "step": 17793 + }, + { + "epoch": 0.527651751030454, + "grad_norm": 0.12070825695991516, + "learning_rate": 0.00046415441907358355, + "loss": 2.6815, + "step": 17794 + }, + { + "epoch": 0.5276814043827655, + "grad_norm": 0.10446736216545105, + "learning_rate": 0.0004641074893394016, + "loss": 2.6725, + "step": 17795 + }, + { + "epoch": 0.527711057735077, + "grad_norm": 0.10670534521341324, + "learning_rate": 0.00046406055992305397, + "loss": 2.6568, + "step": 17796 + }, + { + "epoch": 0.5277407110873884, + "grad_norm": 0.13606905937194824, + "learning_rate": 0.0004640136308249563, + "loss": 2.6273, + "step": 17797 + }, + { + "epoch": 0.5277703644396999, + "grad_norm": 0.1663770228624344, + "learning_rate": 0.0004639667020455241, + "loss": 2.6583, + "step": 17798 + }, + { + "epoch": 0.5278000177920114, + "grad_norm": 0.1527465432882309, + "learning_rate": 0.00046391977358517305, + "loss": 2.6669, + "step": 17799 + }, + { + "epoch": 0.5278296711443229, + "grad_norm": 0.15192323923110962, + "learning_rate": 0.0004638728454443185, + "loss": 2.6561, + "step": 17800 + }, + { + "epoch": 0.5278593244966343, + "grad_norm": 0.12433776259422302, + "learning_rate": 0.0004638259176233759, + "loss": 2.647, + "step": 17801 + }, + { + "epoch": 0.5278889778489458, + "grad_norm": 0.14794345200061798, + "learning_rate": 0.0004637789901227613, + "loss": 2.7031, + "step": 17802 + }, + { + "epoch": 0.5279186312012573, + "grad_norm": 0.1458200067281723, + "learning_rate": 0.00046373206294288984, + "loss": 2.6547, + "step": 17803 + }, + { + "epoch": 0.5279482845535688, + "grad_norm": 0.12360648810863495, + "learning_rate": 0.0004636851360841772, + "loss": 2.7031, + "step": 17804 + }, + { + "epoch": 0.5279779379058802, + "grad_norm": 0.12884670495986938, + "learning_rate": 0.00046363820954703895, + "loss": 2.6859, + "step": 17805 + }, + { + "epoch": 0.5280075912581917, + "grad_norm": 0.12665237486362457, + "learning_rate": 0.00046359128333189057, + "loss": 2.6557, + "step": 17806 + }, + { + "epoch": 0.5280372446105032, + "grad_norm": 0.11075236648321152, + "learning_rate": 0.00046354435743914765, + "loss": 2.6667, + "step": 17807 + }, + { + "epoch": 0.5280668979628147, + "grad_norm": 0.11234703660011292, + "learning_rate": 0.00046349743186922565, + "loss": 2.6739, + "step": 17808 + }, + { + "epoch": 0.5280965513151262, + "grad_norm": 0.10751611739397049, + "learning_rate": 0.00046345050662254027, + "loss": 2.6559, + "step": 17809 + }, + { + "epoch": 0.5281262046674376, + "grad_norm": 0.1156206876039505, + "learning_rate": 0.00046340358169950685, + "loss": 2.692, + "step": 17810 + }, + { + "epoch": 0.5281558580197492, + "grad_norm": 0.11405283957719803, + "learning_rate": 0.00046335665710054097, + "loss": 2.61, + "step": 17811 + }, + { + "epoch": 0.5281855113720606, + "grad_norm": 0.11928991228342056, + "learning_rate": 0.0004633097328260582, + "loss": 2.6472, + "step": 17812 + }, + { + "epoch": 0.5282151647243721, + "grad_norm": 0.11362966895103455, + "learning_rate": 0.00046326280887647403, + "loss": 2.66, + "step": 17813 + }, + { + "epoch": 0.5282448180766836, + "grad_norm": 0.10484948754310608, + "learning_rate": 0.0004632158852522039, + "loss": 2.6802, + "step": 17814 + }, + { + "epoch": 0.5282744714289951, + "grad_norm": 0.11827459931373596, + "learning_rate": 0.00046316896195366356, + "loss": 2.6572, + "step": 17815 + }, + { + "epoch": 0.5283041247813065, + "grad_norm": 0.12161106616258621, + "learning_rate": 0.0004631220389812683, + "loss": 2.6161, + "step": 17816 + }, + { + "epoch": 0.528333778133618, + "grad_norm": 0.12418568134307861, + "learning_rate": 0.0004630751163354338, + "loss": 2.6428, + "step": 17817 + }, + { + "epoch": 0.5283634314859295, + "grad_norm": 0.12710830569267273, + "learning_rate": 0.0004630281940165754, + "loss": 2.6556, + "step": 17818 + }, + { + "epoch": 0.528393084838241, + "grad_norm": 0.1270752251148224, + "learning_rate": 0.00046298127202510877, + "loss": 2.6509, + "step": 17819 + }, + { + "epoch": 0.5284227381905524, + "grad_norm": 0.11810185760259628, + "learning_rate": 0.0004629343503614494, + "loss": 2.6634, + "step": 17820 + }, + { + "epoch": 0.5284523915428639, + "grad_norm": 0.10874643176794052, + "learning_rate": 0.0004628874290260126, + "loss": 2.6772, + "step": 17821 + }, + { + "epoch": 0.5284820448951754, + "grad_norm": 0.10783425718545914, + "learning_rate": 0.000462840508019214, + "loss": 2.6978, + "step": 17822 + }, + { + "epoch": 0.5285116982474869, + "grad_norm": 0.11227177083492279, + "learning_rate": 0.0004627935873414691, + "loss": 2.6442, + "step": 17823 + }, + { + "epoch": 0.5285413515997983, + "grad_norm": 0.11595091223716736, + "learning_rate": 0.00046274666699319336, + "loss": 2.6446, + "step": 17824 + }, + { + "epoch": 0.5285710049521098, + "grad_norm": 0.1209307312965393, + "learning_rate": 0.0004626997469748023, + "loss": 2.6635, + "step": 17825 + }, + { + "epoch": 0.5286006583044213, + "grad_norm": 0.11790744215250015, + "learning_rate": 0.00046265282728671144, + "loss": 2.6744, + "step": 17826 + }, + { + "epoch": 0.5286303116567328, + "grad_norm": 0.11402607709169388, + "learning_rate": 0.0004626059079293359, + "loss": 2.6447, + "step": 17827 + }, + { + "epoch": 0.5286599650090442, + "grad_norm": 0.131493479013443, + "learning_rate": 0.0004625589889030917, + "loss": 2.6438, + "step": 17828 + }, + { + "epoch": 0.5286896183613558, + "grad_norm": 0.11730767041444778, + "learning_rate": 0.00046251207020839405, + "loss": 2.6752, + "step": 17829 + }, + { + "epoch": 0.5287192717136673, + "grad_norm": 0.122032031416893, + "learning_rate": 0.0004624651518456585, + "loss": 2.6957, + "step": 17830 + }, + { + "epoch": 0.5287489250659787, + "grad_norm": 0.12772829830646515, + "learning_rate": 0.0004624182338153005, + "loss": 2.6632, + "step": 17831 + }, + { + "epoch": 0.5287785784182902, + "grad_norm": 0.11715547740459442, + "learning_rate": 0.00046237131611773544, + "loss": 2.6413, + "step": 17832 + }, + { + "epoch": 0.5288082317706017, + "grad_norm": 0.12310421466827393, + "learning_rate": 0.0004623243987533788, + "loss": 2.6644, + "step": 17833 + }, + { + "epoch": 0.5288378851229132, + "grad_norm": 0.11374501138925552, + "learning_rate": 0.0004622774817226461, + "loss": 2.6354, + "step": 17834 + }, + { + "epoch": 0.5288675384752246, + "grad_norm": 0.11767742782831192, + "learning_rate": 0.0004622305650259527, + "loss": 2.6615, + "step": 17835 + }, + { + "epoch": 0.5288971918275361, + "grad_norm": 0.13449671864509583, + "learning_rate": 0.0004621836486637143, + "loss": 2.6751, + "step": 17836 + }, + { + "epoch": 0.5289268451798476, + "grad_norm": 0.13857701420783997, + "learning_rate": 0.00046213673263634616, + "loss": 2.6794, + "step": 17837 + }, + { + "epoch": 0.5289564985321591, + "grad_norm": 0.11362489312887192, + "learning_rate": 0.00046208981694426365, + "loss": 2.6824, + "step": 17838 + }, + { + "epoch": 0.5289861518844705, + "grad_norm": 0.11167523264884949, + "learning_rate": 0.0004620429015878824, + "loss": 2.6698, + "step": 17839 + }, + { + "epoch": 0.529015805236782, + "grad_norm": 0.1414116472005844, + "learning_rate": 0.00046199598656761757, + "loss": 2.6371, + "step": 17840 + }, + { + "epoch": 0.5290454585890935, + "grad_norm": 0.1310434639453888, + "learning_rate": 0.000461949071883885, + "loss": 2.6677, + "step": 17841 + }, + { + "epoch": 0.529075111941405, + "grad_norm": 0.12098034471273422, + "learning_rate": 0.00046190215753709983, + "loss": 2.6627, + "step": 17842 + }, + { + "epoch": 0.5291047652937164, + "grad_norm": 0.14948251843452454, + "learning_rate": 0.0004618552435276777, + "loss": 2.7095, + "step": 17843 + }, + { + "epoch": 0.529134418646028, + "grad_norm": 0.14826689660549164, + "learning_rate": 0.00046180832985603384, + "loss": 2.6639, + "step": 17844 + }, + { + "epoch": 0.5291640719983394, + "grad_norm": 0.12095773220062256, + "learning_rate": 0.0004617614165225838, + "loss": 2.6519, + "step": 17845 + }, + { + "epoch": 0.5291937253506509, + "grad_norm": 0.13019457459449768, + "learning_rate": 0.00046171450352774294, + "loss": 2.6335, + "step": 17846 + }, + { + "epoch": 0.5292233787029623, + "grad_norm": 0.12186352908611298, + "learning_rate": 0.00046166759087192693, + "loss": 2.6541, + "step": 17847 + }, + { + "epoch": 0.5292530320552739, + "grad_norm": 0.11757121235132217, + "learning_rate": 0.0004616206785555508, + "loss": 2.7015, + "step": 17848 + }, + { + "epoch": 0.5292826854075853, + "grad_norm": 0.10355823487043381, + "learning_rate": 0.00046157376657903024, + "loss": 2.6434, + "step": 17849 + }, + { + "epoch": 0.5293123387598968, + "grad_norm": 0.10551992803812027, + "learning_rate": 0.00046152685494278044, + "loss": 2.6582, + "step": 17850 + }, + { + "epoch": 0.5293419921122083, + "grad_norm": 0.1173524335026741, + "learning_rate": 0.000461479943647217, + "loss": 2.6784, + "step": 17851 + }, + { + "epoch": 0.5293716454645198, + "grad_norm": 0.1044919565320015, + "learning_rate": 0.0004614330326927553, + "loss": 2.6621, + "step": 17852 + }, + { + "epoch": 0.5294012988168313, + "grad_norm": 0.10670233517885208, + "learning_rate": 0.0004613861220798106, + "loss": 2.6409, + "step": 17853 + }, + { + "epoch": 0.5294309521691427, + "grad_norm": 0.10728045552968979, + "learning_rate": 0.0004613392118087986, + "loss": 2.6763, + "step": 17854 + }, + { + "epoch": 0.5294606055214542, + "grad_norm": 0.12828172743320465, + "learning_rate": 0.00046129230188013436, + "loss": 2.6829, + "step": 17855 + }, + { + "epoch": 0.5294902588737657, + "grad_norm": 0.13799868524074554, + "learning_rate": 0.0004612453922942335, + "loss": 2.6906, + "step": 17856 + }, + { + "epoch": 0.5295199122260772, + "grad_norm": 0.14278973639011383, + "learning_rate": 0.00046119848305151135, + "loss": 2.6542, + "step": 17857 + }, + { + "epoch": 0.5295495655783886, + "grad_norm": 0.12311345338821411, + "learning_rate": 0.0004611515741523834, + "loss": 2.7008, + "step": 17858 + }, + { + "epoch": 0.5295792189307001, + "grad_norm": 0.10220944881439209, + "learning_rate": 0.00046110466559726485, + "loss": 2.6625, + "step": 17859 + }, + { + "epoch": 0.5296088722830116, + "grad_norm": 0.10953788459300995, + "learning_rate": 0.00046105775738657106, + "loss": 2.6387, + "step": 17860 + }, + { + "epoch": 0.5296385256353231, + "grad_norm": 0.1242602989077568, + "learning_rate": 0.00046101084952071764, + "loss": 2.6339, + "step": 17861 + }, + { + "epoch": 0.5296681789876345, + "grad_norm": 0.12609373033046722, + "learning_rate": 0.0004609639420001198, + "loss": 2.7044, + "step": 17862 + }, + { + "epoch": 0.529697832339946, + "grad_norm": 0.10713879764080048, + "learning_rate": 0.00046091703482519295, + "loss": 2.6761, + "step": 17863 + }, + { + "epoch": 0.5297274856922575, + "grad_norm": 0.11772864311933517, + "learning_rate": 0.00046087012799635255, + "loss": 2.6392, + "step": 17864 + }, + { + "epoch": 0.529757139044569, + "grad_norm": 0.1184304803609848, + "learning_rate": 0.00046082322151401375, + "loss": 2.6792, + "step": 17865 + }, + { + "epoch": 0.5297867923968804, + "grad_norm": 0.1060141995549202, + "learning_rate": 0.0004607763153785919, + "loss": 2.67, + "step": 17866 + }, + { + "epoch": 0.529816445749192, + "grad_norm": 0.12602902948856354, + "learning_rate": 0.0004607294095905027, + "loss": 2.667, + "step": 17867 + }, + { + "epoch": 0.5298460991015034, + "grad_norm": 0.14650055766105652, + "learning_rate": 0.00046068250415016136, + "loss": 2.6519, + "step": 17868 + }, + { + "epoch": 0.5298757524538149, + "grad_norm": 0.12159255146980286, + "learning_rate": 0.0004606355990579832, + "loss": 2.6892, + "step": 17869 + }, + { + "epoch": 0.5299054058061263, + "grad_norm": 0.11759006232023239, + "learning_rate": 0.0004605886943143835, + "loss": 2.6488, + "step": 17870 + }, + { + "epoch": 0.5299350591584379, + "grad_norm": 0.12030765414237976, + "learning_rate": 0.00046054178991977767, + "loss": 2.6519, + "step": 17871 + }, + { + "epoch": 0.5299647125107494, + "grad_norm": 0.11440229415893555, + "learning_rate": 0.0004604948858745811, + "loss": 2.6522, + "step": 17872 + }, + { + "epoch": 0.5299943658630608, + "grad_norm": 0.12007852643728256, + "learning_rate": 0.00046044798217920906, + "loss": 2.6943, + "step": 17873 + }, + { + "epoch": 0.5300240192153723, + "grad_norm": 0.11502911150455475, + "learning_rate": 0.00046040107883407695, + "loss": 2.6703, + "step": 17874 + }, + { + "epoch": 0.5300536725676838, + "grad_norm": 0.10378915071487427, + "learning_rate": 0.0004603541758396002, + "loss": 2.635, + "step": 17875 + }, + { + "epoch": 0.5300833259199953, + "grad_norm": 0.11499658972024918, + "learning_rate": 0.00046030727319619393, + "loss": 2.6488, + "step": 17876 + }, + { + "epoch": 0.5301129792723067, + "grad_norm": 0.10962611436843872, + "learning_rate": 0.00046026037090427354, + "loss": 2.6777, + "step": 17877 + }, + { + "epoch": 0.5301426326246182, + "grad_norm": 0.13270702958106995, + "learning_rate": 0.00046021346896425437, + "loss": 2.6603, + "step": 17878 + }, + { + "epoch": 0.5301722859769297, + "grad_norm": 0.11174136400222778, + "learning_rate": 0.0004601665673765517, + "loss": 2.6693, + "step": 17879 + }, + { + "epoch": 0.5302019393292412, + "grad_norm": 0.1279059797525406, + "learning_rate": 0.000460119666141581, + "loss": 2.6882, + "step": 17880 + }, + { + "epoch": 0.5302315926815526, + "grad_norm": 0.12343262881040573, + "learning_rate": 0.0004600727652597575, + "loss": 2.6998, + "step": 17881 + }, + { + "epoch": 0.5302612460338642, + "grad_norm": 0.1114581972360611, + "learning_rate": 0.00046002586473149656, + "loss": 2.6332, + "step": 17882 + }, + { + "epoch": 0.5302908993861756, + "grad_norm": 0.1127040907740593, + "learning_rate": 0.0004599789645572134, + "loss": 2.6364, + "step": 17883 + }, + { + "epoch": 0.5303205527384871, + "grad_norm": 0.10231390595436096, + "learning_rate": 0.00045993206473732333, + "loss": 2.6638, + "step": 17884 + }, + { + "epoch": 0.5303502060907985, + "grad_norm": 0.11151310056447983, + "learning_rate": 0.0004598851652722419, + "loss": 2.6709, + "step": 17885 + }, + { + "epoch": 0.5303798594431101, + "grad_norm": 0.11166248470544815, + "learning_rate": 0.0004598382661623841, + "loss": 2.6737, + "step": 17886 + }, + { + "epoch": 0.5304095127954215, + "grad_norm": 0.1310332864522934, + "learning_rate": 0.0004597913674081653, + "loss": 2.6541, + "step": 17887 + }, + { + "epoch": 0.530439166147733, + "grad_norm": 0.11186152696609497, + "learning_rate": 0.00045974446901000086, + "loss": 2.6715, + "step": 17888 + }, + { + "epoch": 0.5304688195000444, + "grad_norm": 0.11961975693702698, + "learning_rate": 0.00045969757096830607, + "loss": 2.6476, + "step": 17889 + }, + { + "epoch": 0.530498472852356, + "grad_norm": 0.11238863319158554, + "learning_rate": 0.0004596506732834962, + "loss": 2.6679, + "step": 17890 + }, + { + "epoch": 0.5305281262046674, + "grad_norm": 0.12268054485321045, + "learning_rate": 0.0004596037759559866, + "loss": 2.6446, + "step": 17891 + }, + { + "epoch": 0.5305577795569789, + "grad_norm": 0.12482438236474991, + "learning_rate": 0.0004595568789861922, + "loss": 2.6693, + "step": 17892 + }, + { + "epoch": 0.5305874329092904, + "grad_norm": 0.1325502246618271, + "learning_rate": 0.0004595099823745289, + "loss": 2.6787, + "step": 17893 + }, + { + "epoch": 0.5306170862616019, + "grad_norm": 0.11942923814058304, + "learning_rate": 0.00045946308612141156, + "loss": 2.6516, + "step": 17894 + }, + { + "epoch": 0.5306467396139134, + "grad_norm": 0.13724137842655182, + "learning_rate": 0.0004594161902272556, + "loss": 2.6726, + "step": 17895 + }, + { + "epoch": 0.5306763929662248, + "grad_norm": 0.13850344717502594, + "learning_rate": 0.0004593692946924763, + "loss": 2.6874, + "step": 17896 + }, + { + "epoch": 0.5307060463185364, + "grad_norm": 0.1469467282295227, + "learning_rate": 0.00045932239951748877, + "loss": 2.6459, + "step": 17897 + }, + { + "epoch": 0.5307356996708478, + "grad_norm": 0.12568166851997375, + "learning_rate": 0.00045927550470270843, + "loss": 2.6324, + "step": 17898 + }, + { + "epoch": 0.5307653530231593, + "grad_norm": 0.11508530378341675, + "learning_rate": 0.00045922861024855037, + "loss": 2.6672, + "step": 17899 + }, + { + "epoch": 0.5307950063754707, + "grad_norm": 0.1327276974916458, + "learning_rate": 0.00045918171615543006, + "loss": 2.6586, + "step": 17900 + }, + { + "epoch": 0.5308246597277823, + "grad_norm": 0.1395917385816574, + "learning_rate": 0.00045913482242376265, + "loss": 2.6381, + "step": 17901 + }, + { + "epoch": 0.5308543130800937, + "grad_norm": 0.11448444426059723, + "learning_rate": 0.00045908792905396354, + "loss": 2.6487, + "step": 17902 + }, + { + "epoch": 0.5308839664324052, + "grad_norm": 0.13068681955337524, + "learning_rate": 0.00045904103604644766, + "loss": 2.6977, + "step": 17903 + }, + { + "epoch": 0.5309136197847166, + "grad_norm": 0.1337614804506302, + "learning_rate": 0.0004589941434016304, + "loss": 2.6827, + "step": 17904 + }, + { + "epoch": 0.5309432731370282, + "grad_norm": 0.10516772419214249, + "learning_rate": 0.0004589472511199269, + "loss": 2.685, + "step": 17905 + }, + { + "epoch": 0.5309729264893396, + "grad_norm": 0.12722733616828918, + "learning_rate": 0.00045890035920175286, + "loss": 2.698, + "step": 17906 + }, + { + "epoch": 0.5310025798416511, + "grad_norm": 0.14509257674217224, + "learning_rate": 0.0004588534676475231, + "loss": 2.6703, + "step": 17907 + }, + { + "epoch": 0.5310322331939625, + "grad_norm": 0.16380833089351654, + "learning_rate": 0.0004588065764576529, + "loss": 2.6641, + "step": 17908 + }, + { + "epoch": 0.5310618865462741, + "grad_norm": 0.1513233333826065, + "learning_rate": 0.0004587596856325576, + "loss": 2.6555, + "step": 17909 + }, + { + "epoch": 0.5310915398985855, + "grad_norm": 0.11308211833238602, + "learning_rate": 0.0004587127951726523, + "loss": 2.7085, + "step": 17910 + }, + { + "epoch": 0.531121193250897, + "grad_norm": 0.12167541682720184, + "learning_rate": 0.0004586659050783523, + "loss": 2.6795, + "step": 17911 + }, + { + "epoch": 0.5311508466032084, + "grad_norm": 0.1362735480070114, + "learning_rate": 0.00045861901535007284, + "loss": 2.6946, + "step": 17912 + }, + { + "epoch": 0.53118049995552, + "grad_norm": 0.13719508051872253, + "learning_rate": 0.00045857212598822915, + "loss": 2.6375, + "step": 17913 + }, + { + "epoch": 0.5312101533078315, + "grad_norm": 0.12488355487585068, + "learning_rate": 0.00045852523699323633, + "loss": 2.6643, + "step": 17914 + }, + { + "epoch": 0.5312398066601429, + "grad_norm": 0.13797332346439362, + "learning_rate": 0.0004584783483655096, + "loss": 2.6634, + "step": 17915 + }, + { + "epoch": 0.5312694600124545, + "grad_norm": 0.10898920893669128, + "learning_rate": 0.00045843146010546434, + "loss": 2.6803, + "step": 17916 + }, + { + "epoch": 0.5312991133647659, + "grad_norm": 0.11971442401409149, + "learning_rate": 0.00045838457221351555, + "loss": 2.6678, + "step": 17917 + }, + { + "epoch": 0.5313287667170774, + "grad_norm": 0.1111731082201004, + "learning_rate": 0.0004583376846900784, + "loss": 2.6803, + "step": 17918 + }, + { + "epoch": 0.5313584200693888, + "grad_norm": 0.11286617070436478, + "learning_rate": 0.0004582907975355683, + "loss": 2.6296, + "step": 17919 + }, + { + "epoch": 0.5313880734217004, + "grad_norm": 0.09791930019855499, + "learning_rate": 0.0004582439107504004, + "loss": 2.6407, + "step": 17920 + }, + { + "epoch": 0.5314177267740118, + "grad_norm": 0.10006880760192871, + "learning_rate": 0.00045819702433498984, + "loss": 2.6442, + "step": 17921 + }, + { + "epoch": 0.5314473801263233, + "grad_norm": 0.10633155703544617, + "learning_rate": 0.00045815013828975177, + "loss": 2.6843, + "step": 17922 + }, + { + "epoch": 0.5314770334786347, + "grad_norm": 0.11478718370199203, + "learning_rate": 0.00045810325261510154, + "loss": 2.676, + "step": 17923 + }, + { + "epoch": 0.5315066868309463, + "grad_norm": 0.12318319827318192, + "learning_rate": 0.0004580563673114541, + "loss": 2.7217, + "step": 17924 + }, + { + "epoch": 0.5315363401832577, + "grad_norm": 0.14836813509464264, + "learning_rate": 0.00045800948237922467, + "loss": 2.6836, + "step": 17925 + }, + { + "epoch": 0.5315659935355692, + "grad_norm": 0.14646120369434357, + "learning_rate": 0.00045796259781882853, + "loss": 2.6855, + "step": 17926 + }, + { + "epoch": 0.5315956468878806, + "grad_norm": 0.10980454832315445, + "learning_rate": 0.0004579157136306808, + "loss": 2.6755, + "step": 17927 + }, + { + "epoch": 0.5316253002401922, + "grad_norm": 0.12295016646385193, + "learning_rate": 0.0004578688298151966, + "loss": 2.6665, + "step": 17928 + }, + { + "epoch": 0.5316549535925036, + "grad_norm": 0.14240087568759918, + "learning_rate": 0.0004578219463727912, + "loss": 2.6811, + "step": 17929 + }, + { + "epoch": 0.5316846069448151, + "grad_norm": 0.12425778806209564, + "learning_rate": 0.0004577750633038798, + "loss": 2.6302, + "step": 17930 + }, + { + "epoch": 0.5317142602971265, + "grad_norm": 0.13349443674087524, + "learning_rate": 0.0004577281806088771, + "loss": 2.664, + "step": 17931 + }, + { + "epoch": 0.5317439136494381, + "grad_norm": 0.13237369060516357, + "learning_rate": 0.00045768129828819887, + "loss": 2.631, + "step": 17932 + }, + { + "epoch": 0.5317735670017496, + "grad_norm": 0.12148378789424896, + "learning_rate": 0.00045763441634226, + "loss": 2.6951, + "step": 17933 + }, + { + "epoch": 0.531803220354061, + "grad_norm": 0.11868329346179962, + "learning_rate": 0.0004575875347714758, + "loss": 2.6868, + "step": 17934 + }, + { + "epoch": 0.5318328737063726, + "grad_norm": 0.12852613627910614, + "learning_rate": 0.0004575406535762611, + "loss": 2.6707, + "step": 17935 + }, + { + "epoch": 0.531862527058684, + "grad_norm": 0.12479373067617416, + "learning_rate": 0.00045749377275703117, + "loss": 2.6791, + "step": 17936 + }, + { + "epoch": 0.5318921804109955, + "grad_norm": 0.1150493174791336, + "learning_rate": 0.00045744689231420123, + "loss": 2.6416, + "step": 17937 + }, + { + "epoch": 0.5319218337633069, + "grad_norm": 0.12405747920274734, + "learning_rate": 0.0004574000122481864, + "loss": 2.6651, + "step": 17938 + }, + { + "epoch": 0.5319514871156185, + "grad_norm": 0.12564539909362793, + "learning_rate": 0.0004573531325594017, + "loss": 2.6354, + "step": 17939 + }, + { + "epoch": 0.5319811404679299, + "grad_norm": 0.13338661193847656, + "learning_rate": 0.00045730625324826246, + "loss": 2.6765, + "step": 17940 + }, + { + "epoch": 0.5320107938202414, + "grad_norm": 0.1128157526254654, + "learning_rate": 0.00045725937431518357, + "loss": 2.6477, + "step": 17941 + }, + { + "epoch": 0.5320404471725528, + "grad_norm": 0.122764952480793, + "learning_rate": 0.0004572124957605803, + "loss": 2.6659, + "step": 17942 + }, + { + "epoch": 0.5320701005248644, + "grad_norm": 0.12479646503925323, + "learning_rate": 0.0004571656175848676, + "loss": 2.6881, + "step": 17943 + }, + { + "epoch": 0.5320997538771758, + "grad_norm": 0.11817729473114014, + "learning_rate": 0.00045711873978846075, + "loss": 2.6762, + "step": 17944 + }, + { + "epoch": 0.5321294072294873, + "grad_norm": 0.11110732704401016, + "learning_rate": 0.0004570718623717748, + "loss": 2.674, + "step": 17945 + }, + { + "epoch": 0.5321590605817987, + "grad_norm": 0.11574618518352509, + "learning_rate": 0.00045702498533522497, + "loss": 2.6663, + "step": 17946 + }, + { + "epoch": 0.5321887139341103, + "grad_norm": 0.1427784264087677, + "learning_rate": 0.00045697810867922624, + "loss": 2.6707, + "step": 17947 + }, + { + "epoch": 0.5322183672864217, + "grad_norm": 0.12075798958539963, + "learning_rate": 0.00045693123240419376, + "loss": 2.712, + "step": 17948 + }, + { + "epoch": 0.5322480206387332, + "grad_norm": 0.13922156393527985, + "learning_rate": 0.00045688435651054256, + "loss": 2.6521, + "step": 17949 + }, + { + "epoch": 0.5322776739910446, + "grad_norm": 0.15521886944770813, + "learning_rate": 0.00045683748099868785, + "loss": 2.6722, + "step": 17950 + }, + { + "epoch": 0.5323073273433562, + "grad_norm": 0.15168671309947968, + "learning_rate": 0.0004567906058690447, + "loss": 2.704, + "step": 17951 + }, + { + "epoch": 0.5323369806956676, + "grad_norm": 0.10356125980615616, + "learning_rate": 0.0004567437311220281, + "loss": 2.6832, + "step": 17952 + }, + { + "epoch": 0.5323666340479791, + "grad_norm": 0.13505873084068298, + "learning_rate": 0.00045669685675805315, + "loss": 2.6565, + "step": 17953 + }, + { + "epoch": 0.5323962874002907, + "grad_norm": 0.149223193526268, + "learning_rate": 0.00045664998277753497, + "loss": 2.6832, + "step": 17954 + }, + { + "epoch": 0.5324259407526021, + "grad_norm": 0.12511898577213287, + "learning_rate": 0.00045660310918088865, + "loss": 2.6554, + "step": 17955 + }, + { + "epoch": 0.5324555941049136, + "grad_norm": 0.12857067584991455, + "learning_rate": 0.0004565562359685291, + "loss": 2.674, + "step": 17956 + }, + { + "epoch": 0.532485247457225, + "grad_norm": 0.13871443271636963, + "learning_rate": 0.00045650936314087166, + "loss": 2.6494, + "step": 17957 + }, + { + "epoch": 0.5325149008095366, + "grad_norm": 0.1358727514743805, + "learning_rate": 0.0004564624906983313, + "loss": 2.6704, + "step": 17958 + }, + { + "epoch": 0.532544554161848, + "grad_norm": 0.11655464768409729, + "learning_rate": 0.000456415618641323, + "loss": 2.6268, + "step": 17959 + }, + { + "epoch": 0.5325742075141595, + "grad_norm": 0.12805277109146118, + "learning_rate": 0.000456368746970262, + "loss": 2.6361, + "step": 17960 + }, + { + "epoch": 0.5326038608664709, + "grad_norm": 0.1224212646484375, + "learning_rate": 0.0004563218756855632, + "loss": 2.6786, + "step": 17961 + }, + { + "epoch": 0.5326335142187825, + "grad_norm": 0.12124796956777573, + "learning_rate": 0.00045627500478764166, + "loss": 2.6464, + "step": 17962 + }, + { + "epoch": 0.5326631675710939, + "grad_norm": 0.11139789968729019, + "learning_rate": 0.00045622813427691243, + "loss": 2.6821, + "step": 17963 + }, + { + "epoch": 0.5326928209234054, + "grad_norm": 0.12237544357776642, + "learning_rate": 0.00045618126415379064, + "loss": 2.6911, + "step": 17964 + }, + { + "epoch": 0.5327224742757168, + "grad_norm": 0.11848289519548416, + "learning_rate": 0.0004561343944186912, + "loss": 2.6608, + "step": 17965 + }, + { + "epoch": 0.5327521276280284, + "grad_norm": 0.11940531432628632, + "learning_rate": 0.0004560875250720293, + "loss": 2.6529, + "step": 17966 + }, + { + "epoch": 0.5327817809803398, + "grad_norm": 0.1201644241809845, + "learning_rate": 0.00045604065611421987, + "loss": 2.6622, + "step": 17967 + }, + { + "epoch": 0.5328114343326513, + "grad_norm": 0.12560266256332397, + "learning_rate": 0.00045599378754567805, + "loss": 2.6771, + "step": 17968 + }, + { + "epoch": 0.5328410876849627, + "grad_norm": 0.11973969638347626, + "learning_rate": 0.00045594691936681856, + "loss": 2.6451, + "step": 17969 + }, + { + "epoch": 0.5328707410372743, + "grad_norm": 0.1359984427690506, + "learning_rate": 0.00045590005157805674, + "loss": 2.6443, + "step": 17970 + }, + { + "epoch": 0.5329003943895857, + "grad_norm": 0.1220938190817833, + "learning_rate": 0.0004558531841798076, + "loss": 2.6651, + "step": 17971 + }, + { + "epoch": 0.5329300477418972, + "grad_norm": 0.10304701328277588, + "learning_rate": 0.0004558063171724862, + "loss": 2.6946, + "step": 17972 + }, + { + "epoch": 0.5329597010942086, + "grad_norm": 0.1288825273513794, + "learning_rate": 0.00045575945055650744, + "loss": 2.665, + "step": 17973 + }, + { + "epoch": 0.5329893544465202, + "grad_norm": 0.1264129877090454, + "learning_rate": 0.00045571258433228616, + "loss": 2.6763, + "step": 17974 + }, + { + "epoch": 0.5330190077988317, + "grad_norm": 0.12925702333450317, + "learning_rate": 0.00045566571850023767, + "loss": 2.6588, + "step": 17975 + }, + { + "epoch": 0.5330486611511431, + "grad_norm": 0.12732286751270294, + "learning_rate": 0.00045561885306077683, + "loss": 2.6386, + "step": 17976 + }, + { + "epoch": 0.5330783145034547, + "grad_norm": 0.11116798222064972, + "learning_rate": 0.0004555719880143186, + "loss": 2.66, + "step": 17977 + }, + { + "epoch": 0.5331079678557661, + "grad_norm": 0.12555819749832153, + "learning_rate": 0.00045552512336127825, + "loss": 2.655, + "step": 17978 + }, + { + "epoch": 0.5331376212080776, + "grad_norm": 0.13176940381526947, + "learning_rate": 0.0004554782591020704, + "loss": 2.639, + "step": 17979 + }, + { + "epoch": 0.533167274560389, + "grad_norm": 0.14884717762470245, + "learning_rate": 0.00045543139523711025, + "loss": 2.6713, + "step": 17980 + }, + { + "epoch": 0.5331969279127006, + "grad_norm": 0.12464026361703873, + "learning_rate": 0.00045538453176681274, + "loss": 2.6846, + "step": 17981 + }, + { + "epoch": 0.533226581265012, + "grad_norm": 0.11148705333471298, + "learning_rate": 0.00045533766869159265, + "loss": 2.6615, + "step": 17982 + }, + { + "epoch": 0.5332562346173235, + "grad_norm": 0.1303054243326187, + "learning_rate": 0.00045529080601186534, + "loss": 2.6354, + "step": 17983 + }, + { + "epoch": 0.5332858879696349, + "grad_norm": 0.12744420766830444, + "learning_rate": 0.00045524394372804567, + "loss": 2.6654, + "step": 17984 + }, + { + "epoch": 0.5333155413219465, + "grad_norm": 0.10519880801439285, + "learning_rate": 0.0004551970818405485, + "loss": 2.6578, + "step": 17985 + }, + { + "epoch": 0.5333451946742579, + "grad_norm": 0.12836198508739471, + "learning_rate": 0.00045515022034978885, + "loss": 2.6771, + "step": 17986 + }, + { + "epoch": 0.5333748480265694, + "grad_norm": 0.11376550048589706, + "learning_rate": 0.0004551033592561817, + "loss": 2.6609, + "step": 17987 + }, + { + "epoch": 0.5334045013788808, + "grad_norm": 0.11487483978271484, + "learning_rate": 0.0004550564985601421, + "loss": 2.673, + "step": 17988 + }, + { + "epoch": 0.5334341547311924, + "grad_norm": 0.10982242971658707, + "learning_rate": 0.000455009638262085, + "loss": 2.6851, + "step": 17989 + }, + { + "epoch": 0.5334638080835038, + "grad_norm": 0.10361296683549881, + "learning_rate": 0.00045496277836242513, + "loss": 2.6785, + "step": 17990 + }, + { + "epoch": 0.5334934614358153, + "grad_norm": 0.11802773177623749, + "learning_rate": 0.00045491591886157756, + "loss": 2.6141, + "step": 17991 + }, + { + "epoch": 0.5335231147881268, + "grad_norm": 0.11171183735132217, + "learning_rate": 0.0004548690597599573, + "loss": 2.6643, + "step": 17992 + }, + { + "epoch": 0.5335527681404383, + "grad_norm": 0.11842915415763855, + "learning_rate": 0.00045482220105797926, + "loss": 2.6683, + "step": 17993 + }, + { + "epoch": 0.5335824214927497, + "grad_norm": 0.11249187588691711, + "learning_rate": 0.0004547753427560584, + "loss": 2.6777, + "step": 17994 + }, + { + "epoch": 0.5336120748450612, + "grad_norm": 0.12626399099826813, + "learning_rate": 0.0004547284848546095, + "loss": 2.7059, + "step": 17995 + }, + { + "epoch": 0.5336417281973728, + "grad_norm": 0.12314652651548386, + "learning_rate": 0.0004546816273540478, + "loss": 2.6614, + "step": 17996 + }, + { + "epoch": 0.5336713815496842, + "grad_norm": 0.1369035840034485, + "learning_rate": 0.000454634770254788, + "loss": 2.6532, + "step": 17997 + }, + { + "epoch": 0.5337010349019957, + "grad_norm": 0.11519569158554077, + "learning_rate": 0.00045458791355724516, + "loss": 2.683, + "step": 17998 + }, + { + "epoch": 0.5337306882543071, + "grad_norm": 0.1130187064409256, + "learning_rate": 0.0004545410572618342, + "loss": 2.629, + "step": 17999 + }, + { + "epoch": 0.5337603416066187, + "grad_norm": 0.11491178721189499, + "learning_rate": 0.0004544942013689699, + "loss": 2.6351, + "step": 18000 + }, + { + "epoch": 0.5337899949589301, + "grad_norm": 0.11251475661993027, + "learning_rate": 0.0004544473458790672, + "loss": 2.6943, + "step": 18001 + }, + { + "epoch": 0.5338196483112416, + "grad_norm": 0.10954834520816803, + "learning_rate": 0.00045440049079254114, + "loss": 2.6495, + "step": 18002 + }, + { + "epoch": 0.533849301663553, + "grad_norm": 0.12274465709924698, + "learning_rate": 0.00045435363610980654, + "loss": 2.6387, + "step": 18003 + }, + { + "epoch": 0.5338789550158646, + "grad_norm": 0.13267719745635986, + "learning_rate": 0.00045430678183127834, + "loss": 2.6346, + "step": 18004 + }, + { + "epoch": 0.533908608368176, + "grad_norm": 0.12077748775482178, + "learning_rate": 0.0004542599279573714, + "loss": 2.6404, + "step": 18005 + }, + { + "epoch": 0.5339382617204875, + "grad_norm": 0.12542904913425446, + "learning_rate": 0.0004542130744885008, + "loss": 2.6448, + "step": 18006 + }, + { + "epoch": 0.533967915072799, + "grad_norm": 0.12726308405399323, + "learning_rate": 0.0004541662214250811, + "loss": 2.6385, + "step": 18007 + }, + { + "epoch": 0.5339975684251105, + "grad_norm": 0.11411605030298233, + "learning_rate": 0.00045411936876752726, + "loss": 2.6334, + "step": 18008 + }, + { + "epoch": 0.5340272217774219, + "grad_norm": 0.11819858849048615, + "learning_rate": 0.0004540725165162545, + "loss": 2.635, + "step": 18009 + }, + { + "epoch": 0.5340568751297334, + "grad_norm": 0.12585414946079254, + "learning_rate": 0.0004540256646716775, + "loss": 2.6842, + "step": 18010 + }, + { + "epoch": 0.5340865284820449, + "grad_norm": 0.11785633862018585, + "learning_rate": 0.0004539788132342111, + "loss": 2.6487, + "step": 18011 + }, + { + "epoch": 0.5341161818343564, + "grad_norm": 0.10714592039585114, + "learning_rate": 0.0004539319622042701, + "loss": 2.6479, + "step": 18012 + }, + { + "epoch": 0.5341458351866678, + "grad_norm": 0.11105950176715851, + "learning_rate": 0.00045388511158226964, + "loss": 2.6977, + "step": 18013 + }, + { + "epoch": 0.5341754885389793, + "grad_norm": 0.09732481837272644, + "learning_rate": 0.0004538382613686243, + "loss": 2.6402, + "step": 18014 + }, + { + "epoch": 0.5342051418912908, + "grad_norm": 0.11320409923791885, + "learning_rate": 0.0004537914115637492, + "loss": 2.6719, + "step": 18015 + }, + { + "epoch": 0.5342347952436023, + "grad_norm": 0.12167928367853165, + "learning_rate": 0.0004537445621680591, + "loss": 2.6696, + "step": 18016 + }, + { + "epoch": 0.5342644485959138, + "grad_norm": 0.11911435425281525, + "learning_rate": 0.0004536977131819688, + "loss": 2.6596, + "step": 18017 + }, + { + "epoch": 0.5342941019482252, + "grad_norm": 0.131103977560997, + "learning_rate": 0.0004536508646058931, + "loss": 2.6305, + "step": 18018 + }, + { + "epoch": 0.5343237553005368, + "grad_norm": 0.10538480430841446, + "learning_rate": 0.00045360401644024703, + "loss": 2.6665, + "step": 18019 + }, + { + "epoch": 0.5343534086528482, + "grad_norm": 0.09666401147842407, + "learning_rate": 0.0004535571686854453, + "loss": 2.6788, + "step": 18020 + }, + { + "epoch": 0.5343830620051597, + "grad_norm": 0.10950055718421936, + "learning_rate": 0.0004535103213419028, + "loss": 2.6559, + "step": 18021 + }, + { + "epoch": 0.5344127153574711, + "grad_norm": 0.10360384732484818, + "learning_rate": 0.0004534634744100344, + "loss": 2.6413, + "step": 18022 + }, + { + "epoch": 0.5344423687097827, + "grad_norm": 0.11182662844657898, + "learning_rate": 0.000453416627890255, + "loss": 2.6723, + "step": 18023 + }, + { + "epoch": 0.5344720220620941, + "grad_norm": 0.13264261186122894, + "learning_rate": 0.0004533697817829793, + "loss": 2.6646, + "step": 18024 + }, + { + "epoch": 0.5345016754144056, + "grad_norm": 0.16815702617168427, + "learning_rate": 0.0004533229360886222, + "loss": 2.6375, + "step": 18025 + }, + { + "epoch": 0.534531328766717, + "grad_norm": 0.17739906907081604, + "learning_rate": 0.0004532760908075985, + "loss": 2.6507, + "step": 18026 + }, + { + "epoch": 0.5345609821190286, + "grad_norm": 0.17797748744487762, + "learning_rate": 0.0004532292459403231, + "loss": 2.6575, + "step": 18027 + }, + { + "epoch": 0.53459063547134, + "grad_norm": 0.1470666229724884, + "learning_rate": 0.0004531824014872107, + "loss": 2.6922, + "step": 18028 + }, + { + "epoch": 0.5346202888236515, + "grad_norm": 0.14342719316482544, + "learning_rate": 0.00045313555744867616, + "loss": 2.6688, + "step": 18029 + }, + { + "epoch": 0.534649942175963, + "grad_norm": 0.1599096804857254, + "learning_rate": 0.0004530887138251344, + "loss": 2.6467, + "step": 18030 + }, + { + "epoch": 0.5346795955282745, + "grad_norm": 0.1153336614370346, + "learning_rate": 0.00045304187061700004, + "loss": 2.6886, + "step": 18031 + }, + { + "epoch": 0.5347092488805859, + "grad_norm": 0.14631839096546173, + "learning_rate": 0.00045299502782468796, + "loss": 2.6971, + "step": 18032 + }, + { + "epoch": 0.5347389022328974, + "grad_norm": 0.14687243103981018, + "learning_rate": 0.0004529481854486131, + "loss": 2.6982, + "step": 18033 + }, + { + "epoch": 0.5347685555852089, + "grad_norm": 0.12305303663015366, + "learning_rate": 0.0004529013434891898, + "loss": 2.6671, + "step": 18034 + }, + { + "epoch": 0.5347982089375204, + "grad_norm": 0.11595510691404343, + "learning_rate": 0.0004528545019468334, + "loss": 2.6626, + "step": 18035 + }, + { + "epoch": 0.5348278622898318, + "grad_norm": 0.12396835535764694, + "learning_rate": 0.0004528076608219585, + "loss": 2.6745, + "step": 18036 + }, + { + "epoch": 0.5348575156421433, + "grad_norm": 0.11274547129869461, + "learning_rate": 0.00045276082011497996, + "loss": 2.6616, + "step": 18037 + }, + { + "epoch": 0.5348871689944549, + "grad_norm": 0.10859549790620804, + "learning_rate": 0.0004527139798263124, + "loss": 2.6681, + "step": 18038 + }, + { + "epoch": 0.5349168223467663, + "grad_norm": 0.1133730560541153, + "learning_rate": 0.0004526671399563707, + "loss": 2.6588, + "step": 18039 + }, + { + "epoch": 0.5349464756990778, + "grad_norm": 0.11143665760755539, + "learning_rate": 0.00045262030050556947, + "loss": 2.6339, + "step": 18040 + }, + { + "epoch": 0.5349761290513892, + "grad_norm": 0.11026162654161453, + "learning_rate": 0.0004525734614743237, + "loss": 2.6976, + "step": 18041 + }, + { + "epoch": 0.5350057824037008, + "grad_norm": 0.10030274838209152, + "learning_rate": 0.000452526622863048, + "loss": 2.6364, + "step": 18042 + }, + { + "epoch": 0.5350354357560122, + "grad_norm": 0.11489439755678177, + "learning_rate": 0.00045247978467215726, + "loss": 2.628, + "step": 18043 + }, + { + "epoch": 0.5350650891083237, + "grad_norm": 0.10824180394411087, + "learning_rate": 0.0004524329469020663, + "loss": 2.6861, + "step": 18044 + }, + { + "epoch": 0.5350947424606352, + "grad_norm": 0.11218063533306122, + "learning_rate": 0.00045238610955318964, + "loss": 2.6783, + "step": 18045 + }, + { + "epoch": 0.5351243958129467, + "grad_norm": 0.1075064018368721, + "learning_rate": 0.0004523392726259422, + "loss": 2.6538, + "step": 18046 + }, + { + "epoch": 0.5351540491652581, + "grad_norm": 0.1202070340514183, + "learning_rate": 0.00045229243612073834, + "loss": 2.6317, + "step": 18047 + }, + { + "epoch": 0.5351837025175696, + "grad_norm": 0.11343945562839508, + "learning_rate": 0.0004522456000379935, + "loss": 2.6471, + "step": 18048 + }, + { + "epoch": 0.5352133558698811, + "grad_norm": 0.1137973740696907, + "learning_rate": 0.00045219876437812206, + "loss": 2.6782, + "step": 18049 + }, + { + "epoch": 0.5352430092221926, + "grad_norm": 0.11238940060138702, + "learning_rate": 0.0004521519291415387, + "loss": 2.6835, + "step": 18050 + }, + { + "epoch": 0.535272662574504, + "grad_norm": 0.11274579167366028, + "learning_rate": 0.00045210509432865823, + "loss": 2.6414, + "step": 18051 + }, + { + "epoch": 0.5353023159268155, + "grad_norm": 0.11227623373270035, + "learning_rate": 0.0004520582599398954, + "loss": 2.6818, + "step": 18052 + }, + { + "epoch": 0.535331969279127, + "grad_norm": 0.11550993472337723, + "learning_rate": 0.00045201142597566484, + "loss": 2.6835, + "step": 18053 + }, + { + "epoch": 0.5353616226314385, + "grad_norm": 0.11941423267126083, + "learning_rate": 0.0004519645924363815, + "loss": 2.686, + "step": 18054 + }, + { + "epoch": 0.5353912759837499, + "grad_norm": 0.10367004573345184, + "learning_rate": 0.0004519177593224598, + "loss": 2.6968, + "step": 18055 + }, + { + "epoch": 0.5354209293360614, + "grad_norm": 0.12061819434165955, + "learning_rate": 0.00045187092663431463, + "loss": 2.667, + "step": 18056 + }, + { + "epoch": 0.5354505826883729, + "grad_norm": 0.11804376542568207, + "learning_rate": 0.00045182409437236065, + "loss": 2.6462, + "step": 18057 + }, + { + "epoch": 0.5354802360406844, + "grad_norm": 0.11489323526620865, + "learning_rate": 0.0004517772625370126, + "loss": 2.6943, + "step": 18058 + }, + { + "epoch": 0.5355098893929959, + "grad_norm": 0.1127292662858963, + "learning_rate": 0.0004517304311286851, + "loss": 2.6215, + "step": 18059 + }, + { + "epoch": 0.5355395427453074, + "grad_norm": 0.12472719699144363, + "learning_rate": 0.0004516836001477929, + "loss": 2.7143, + "step": 18060 + }, + { + "epoch": 0.5355691960976189, + "grad_norm": 0.1396668404340744, + "learning_rate": 0.00045163676959475077, + "loss": 2.6483, + "step": 18061 + }, + { + "epoch": 0.5355988494499303, + "grad_norm": 0.12570258975028992, + "learning_rate": 0.00045158993946997335, + "loss": 2.6331, + "step": 18062 + }, + { + "epoch": 0.5356285028022418, + "grad_norm": 0.14101937413215637, + "learning_rate": 0.0004515431097738753, + "loss": 2.7059, + "step": 18063 + }, + { + "epoch": 0.5356581561545533, + "grad_norm": 0.13161389529705048, + "learning_rate": 0.0004514962805068714, + "loss": 2.6793, + "step": 18064 + }, + { + "epoch": 0.5356878095068648, + "grad_norm": 0.1321037858724594, + "learning_rate": 0.00045144945166937636, + "loss": 2.6369, + "step": 18065 + }, + { + "epoch": 0.5357174628591762, + "grad_norm": 0.1468936800956726, + "learning_rate": 0.0004514026232618046, + "loss": 2.6499, + "step": 18066 + }, + { + "epoch": 0.5357471162114877, + "grad_norm": 0.14150528609752655, + "learning_rate": 0.0004513557952845709, + "loss": 2.6563, + "step": 18067 + }, + { + "epoch": 0.5357767695637992, + "grad_norm": 0.15943554043769836, + "learning_rate": 0.0004513089677380901, + "loss": 2.6624, + "step": 18068 + }, + { + "epoch": 0.5358064229161107, + "grad_norm": 0.12283127754926682, + "learning_rate": 0.00045126214062277673, + "loss": 2.6629, + "step": 18069 + }, + { + "epoch": 0.5358360762684221, + "grad_norm": 0.13324563205242157, + "learning_rate": 0.0004512153139390454, + "loss": 2.7091, + "step": 18070 + }, + { + "epoch": 0.5358657296207336, + "grad_norm": 0.1509389728307724, + "learning_rate": 0.000451168487687311, + "loss": 2.72, + "step": 18071 + }, + { + "epoch": 0.5358953829730451, + "grad_norm": 0.1404084861278534, + "learning_rate": 0.0004511216618679879, + "loss": 2.6478, + "step": 18072 + }, + { + "epoch": 0.5359250363253566, + "grad_norm": 0.11745335906744003, + "learning_rate": 0.0004510748364814907, + "loss": 2.6506, + "step": 18073 + }, + { + "epoch": 0.535954689677668, + "grad_norm": 0.1428527683019638, + "learning_rate": 0.00045102801152823444, + "loss": 2.6993, + "step": 18074 + }, + { + "epoch": 0.5359843430299795, + "grad_norm": 0.14482741057872772, + "learning_rate": 0.0004509811870086336, + "loss": 2.668, + "step": 18075 + }, + { + "epoch": 0.536013996382291, + "grad_norm": 0.1264214962720871, + "learning_rate": 0.00045093436292310265, + "loss": 2.684, + "step": 18076 + }, + { + "epoch": 0.5360436497346025, + "grad_norm": 0.11230643838644028, + "learning_rate": 0.0004508875392720564, + "loss": 2.6274, + "step": 18077 + }, + { + "epoch": 0.5360733030869139, + "grad_norm": 0.12376879155635834, + "learning_rate": 0.0004508407160559094, + "loss": 2.6888, + "step": 18078 + }, + { + "epoch": 0.5361029564392255, + "grad_norm": 0.1213722974061966, + "learning_rate": 0.00045079389327507625, + "loss": 2.6442, + "step": 18079 + }, + { + "epoch": 0.536132609791537, + "grad_norm": 0.1199861541390419, + "learning_rate": 0.0004507470709299716, + "loss": 2.6522, + "step": 18080 + }, + { + "epoch": 0.5361622631438484, + "grad_norm": 0.1061263307929039, + "learning_rate": 0.00045070024902101014, + "loss": 2.6772, + "step": 18081 + }, + { + "epoch": 0.5361919164961599, + "grad_norm": 0.10939841717481613, + "learning_rate": 0.00045065342754860657, + "loss": 2.6655, + "step": 18082 + }, + { + "epoch": 0.5362215698484714, + "grad_norm": 0.11281681805849075, + "learning_rate": 0.0004506066065131752, + "loss": 2.6378, + "step": 18083 + }, + { + "epoch": 0.5362512232007829, + "grad_norm": 0.123131163418293, + "learning_rate": 0.00045055978591513083, + "loss": 2.6955, + "step": 18084 + }, + { + "epoch": 0.5362808765530943, + "grad_norm": 0.13317649066448212, + "learning_rate": 0.0004505129657548881, + "loss": 2.6402, + "step": 18085 + }, + { + "epoch": 0.5363105299054058, + "grad_norm": 0.10985060781240463, + "learning_rate": 0.00045046614603286143, + "loss": 2.6406, + "step": 18086 + }, + { + "epoch": 0.5363401832577173, + "grad_norm": 0.11633016914129257, + "learning_rate": 0.00045041932674946556, + "loss": 2.6871, + "step": 18087 + }, + { + "epoch": 0.5363698366100288, + "grad_norm": 0.11119720339775085, + "learning_rate": 0.00045037250790511515, + "loss": 2.6443, + "step": 18088 + }, + { + "epoch": 0.5363994899623402, + "grad_norm": 0.1188775822520256, + "learning_rate": 0.00045032568950022467, + "loss": 2.6745, + "step": 18089 + }, + { + "epoch": 0.5364291433146517, + "grad_norm": 0.11772772669792175, + "learning_rate": 0.0004502788715352087, + "loss": 2.6631, + "step": 18090 + }, + { + "epoch": 0.5364587966669632, + "grad_norm": 0.11927571892738342, + "learning_rate": 0.0004502320540104819, + "loss": 2.6591, + "step": 18091 + }, + { + "epoch": 0.5364884500192747, + "grad_norm": 0.11308746784925461, + "learning_rate": 0.0004501852369264589, + "loss": 2.6325, + "step": 18092 + }, + { + "epoch": 0.5365181033715861, + "grad_norm": 0.12137749046087265, + "learning_rate": 0.0004501384202835541, + "loss": 2.6526, + "step": 18093 + }, + { + "epoch": 0.5365477567238977, + "grad_norm": 0.11025030165910721, + "learning_rate": 0.00045009160408218213, + "loss": 2.6797, + "step": 18094 + }, + { + "epoch": 0.5365774100762091, + "grad_norm": 0.12288988381624222, + "learning_rate": 0.0004500447883227575, + "loss": 2.6569, + "step": 18095 + }, + { + "epoch": 0.5366070634285206, + "grad_norm": 0.12570762634277344, + "learning_rate": 0.00044999797300569494, + "loss": 2.6546, + "step": 18096 + }, + { + "epoch": 0.536636716780832, + "grad_norm": 0.10466638952493668, + "learning_rate": 0.00044995115813140893, + "loss": 2.6735, + "step": 18097 + }, + { + "epoch": 0.5366663701331436, + "grad_norm": 0.11943002790212631, + "learning_rate": 0.0004499043437003139, + "loss": 2.691, + "step": 18098 + }, + { + "epoch": 0.536696023485455, + "grad_norm": 0.11505813896656036, + "learning_rate": 0.00044985752971282445, + "loss": 2.6613, + "step": 18099 + }, + { + "epoch": 0.5367256768377665, + "grad_norm": 0.112599678337574, + "learning_rate": 0.0004498107161693553, + "loss": 2.6623, + "step": 18100 + }, + { + "epoch": 0.536755330190078, + "grad_norm": 0.1338929384946823, + "learning_rate": 0.0004497639030703209, + "loss": 2.6545, + "step": 18101 + }, + { + "epoch": 0.5367849835423895, + "grad_norm": 0.12384327501058578, + "learning_rate": 0.0004497170904161357, + "loss": 2.6854, + "step": 18102 + }, + { + "epoch": 0.536814636894701, + "grad_norm": 0.1289389729499817, + "learning_rate": 0.0004496702782072145, + "loss": 2.678, + "step": 18103 + }, + { + "epoch": 0.5368442902470124, + "grad_norm": 0.15715867280960083, + "learning_rate": 0.0004496234664439714, + "loss": 2.6362, + "step": 18104 + }, + { + "epoch": 0.5368739435993239, + "grad_norm": 0.1370094120502472, + "learning_rate": 0.00044957665512682115, + "loss": 2.6488, + "step": 18105 + }, + { + "epoch": 0.5369035969516354, + "grad_norm": 0.14372287690639496, + "learning_rate": 0.00044952984425617837, + "loss": 2.6711, + "step": 18106 + }, + { + "epoch": 0.5369332503039469, + "grad_norm": 0.13940131664276123, + "learning_rate": 0.00044948303383245743, + "loss": 2.6671, + "step": 18107 + }, + { + "epoch": 0.5369629036562583, + "grad_norm": 0.10571373999118805, + "learning_rate": 0.0004494362238560729, + "loss": 2.6689, + "step": 18108 + }, + { + "epoch": 0.5369925570085698, + "grad_norm": 0.13432936370372772, + "learning_rate": 0.0004493894143274394, + "loss": 2.6751, + "step": 18109 + }, + { + "epoch": 0.5370222103608813, + "grad_norm": 0.14199745655059814, + "learning_rate": 0.00044934260524697123, + "loss": 2.7045, + "step": 18110 + }, + { + "epoch": 0.5370518637131928, + "grad_norm": 0.11970999836921692, + "learning_rate": 0.000449295796615083, + "loss": 2.6442, + "step": 18111 + }, + { + "epoch": 0.5370815170655042, + "grad_norm": 0.14664186537265778, + "learning_rate": 0.0004492489884321889, + "loss": 2.6505, + "step": 18112 + }, + { + "epoch": 0.5371111704178158, + "grad_norm": 0.11128430813550949, + "learning_rate": 0.00044920218069870405, + "loss": 2.6391, + "step": 18113 + }, + { + "epoch": 0.5371408237701272, + "grad_norm": 0.09722130000591278, + "learning_rate": 0.0004491553734150426, + "loss": 2.6704, + "step": 18114 + }, + { + "epoch": 0.5371704771224387, + "grad_norm": 0.11805082112550735, + "learning_rate": 0.00044910856658161895, + "loss": 2.6668, + "step": 18115 + }, + { + "epoch": 0.5372001304747501, + "grad_norm": 0.1047189012169838, + "learning_rate": 0.0004490617601988477, + "loss": 2.664, + "step": 18116 + }, + { + "epoch": 0.5372297838270617, + "grad_norm": 0.10629633814096451, + "learning_rate": 0.00044901495426714335, + "loss": 2.6517, + "step": 18117 + }, + { + "epoch": 0.5372594371793731, + "grad_norm": 0.10695091634988785, + "learning_rate": 0.0004489681487869203, + "loss": 2.6629, + "step": 18118 + }, + { + "epoch": 0.5372890905316846, + "grad_norm": 0.10138742625713348, + "learning_rate": 0.00044892134375859306, + "loss": 2.6485, + "step": 18119 + }, + { + "epoch": 0.537318743883996, + "grad_norm": 0.11225525289773941, + "learning_rate": 0.0004488745391825761, + "loss": 2.6354, + "step": 18120 + }, + { + "epoch": 0.5373483972363076, + "grad_norm": 0.11197853833436966, + "learning_rate": 0.00044882773505928395, + "loss": 2.668, + "step": 18121 + }, + { + "epoch": 0.5373780505886191, + "grad_norm": 0.12565116584300995, + "learning_rate": 0.0004487809313891309, + "loss": 2.6787, + "step": 18122 + }, + { + "epoch": 0.5374077039409305, + "grad_norm": 0.1197289302945137, + "learning_rate": 0.00044873412817253146, + "loss": 2.7015, + "step": 18123 + }, + { + "epoch": 0.537437357293242, + "grad_norm": 0.11281360685825348, + "learning_rate": 0.00044868732540990017, + "loss": 2.6596, + "step": 18124 + }, + { + "epoch": 0.5374670106455535, + "grad_norm": 0.10286291688680649, + "learning_rate": 0.0004486405231016513, + "loss": 2.6547, + "step": 18125 + }, + { + "epoch": 0.537496663997865, + "grad_norm": 0.11238762736320496, + "learning_rate": 0.00044859372124819946, + "loss": 2.6771, + "step": 18126 + }, + { + "epoch": 0.5375263173501764, + "grad_norm": 0.1198241338133812, + "learning_rate": 0.0004485469198499591, + "loss": 2.6234, + "step": 18127 + }, + { + "epoch": 0.537555970702488, + "grad_norm": 0.12705005705356598, + "learning_rate": 0.0004485001189073446, + "loss": 2.639, + "step": 18128 + }, + { + "epoch": 0.5375856240547994, + "grad_norm": 0.12420939654111862, + "learning_rate": 0.00044845331842077035, + "loss": 2.6556, + "step": 18129 + }, + { + "epoch": 0.5376152774071109, + "grad_norm": 0.13425204157829285, + "learning_rate": 0.0004484065183906509, + "loss": 2.6731, + "step": 18130 + }, + { + "epoch": 0.5376449307594223, + "grad_norm": 0.12295174598693848, + "learning_rate": 0.0004483597188174006, + "loss": 2.6467, + "step": 18131 + }, + { + "epoch": 0.5376745841117339, + "grad_norm": 0.1367443948984146, + "learning_rate": 0.00044831291970143374, + "loss": 2.6586, + "step": 18132 + }, + { + "epoch": 0.5377042374640453, + "grad_norm": 0.14375156164169312, + "learning_rate": 0.00044826612104316486, + "loss": 2.6795, + "step": 18133 + }, + { + "epoch": 0.5377338908163568, + "grad_norm": 0.13182055950164795, + "learning_rate": 0.00044821932284300843, + "loss": 2.684, + "step": 18134 + }, + { + "epoch": 0.5377635441686682, + "grad_norm": 0.14153365790843964, + "learning_rate": 0.0004481725251013787, + "loss": 2.6837, + "step": 18135 + }, + { + "epoch": 0.5377931975209798, + "grad_norm": 0.12621723115444183, + "learning_rate": 0.0004481257278186902, + "loss": 2.6744, + "step": 18136 + }, + { + "epoch": 0.5378228508732912, + "grad_norm": 0.14418016374111176, + "learning_rate": 0.0004480789309953574, + "loss": 2.6726, + "step": 18137 + }, + { + "epoch": 0.5378525042256027, + "grad_norm": 0.1424104869365692, + "learning_rate": 0.0004480321346317942, + "loss": 2.7258, + "step": 18138 + }, + { + "epoch": 0.5378821575779141, + "grad_norm": 0.12695975601673126, + "learning_rate": 0.0004479853387284156, + "loss": 2.6918, + "step": 18139 + }, + { + "epoch": 0.5379118109302257, + "grad_norm": 0.11066446453332901, + "learning_rate": 0.0004479385432856358, + "loss": 2.6456, + "step": 18140 + }, + { + "epoch": 0.5379414642825372, + "grad_norm": 0.12099581956863403, + "learning_rate": 0.00044789174830386924, + "loss": 2.6615, + "step": 18141 + }, + { + "epoch": 0.5379711176348486, + "grad_norm": 0.13526880741119385, + "learning_rate": 0.00044784495378353005, + "loss": 2.6629, + "step": 18142 + }, + { + "epoch": 0.5380007709871601, + "grad_norm": 0.11203330755233765, + "learning_rate": 0.0004477981597250328, + "loss": 2.6666, + "step": 18143 + }, + { + "epoch": 0.5380304243394716, + "grad_norm": 0.1220257505774498, + "learning_rate": 0.00044775136612879176, + "loss": 2.7025, + "step": 18144 + }, + { + "epoch": 0.5380600776917831, + "grad_norm": 0.13120286166667938, + "learning_rate": 0.0004477045729952213, + "loss": 2.6375, + "step": 18145 + }, + { + "epoch": 0.5380897310440945, + "grad_norm": 0.10948437452316284, + "learning_rate": 0.00044765778032473594, + "loss": 2.6643, + "step": 18146 + }, + { + "epoch": 0.538119384396406, + "grad_norm": 0.10996796190738678, + "learning_rate": 0.00044761098811774993, + "loss": 2.6875, + "step": 18147 + }, + { + "epoch": 0.5381490377487175, + "grad_norm": 0.11716100573539734, + "learning_rate": 0.00044756419637467755, + "loss": 2.6711, + "step": 18148 + }, + { + "epoch": 0.538178691101029, + "grad_norm": 0.10323002189397812, + "learning_rate": 0.0004475174050959332, + "loss": 2.6731, + "step": 18149 + }, + { + "epoch": 0.5382083444533404, + "grad_norm": 0.10973704606294632, + "learning_rate": 0.000447470614281931, + "loss": 2.6678, + "step": 18150 + }, + { + "epoch": 0.538237997805652, + "grad_norm": 0.10881586372852325, + "learning_rate": 0.00044742382393308584, + "loss": 2.6918, + "step": 18151 + }, + { + "epoch": 0.5382676511579634, + "grad_norm": 0.12240100651979446, + "learning_rate": 0.0004473770340498117, + "loss": 2.6482, + "step": 18152 + }, + { + "epoch": 0.5382973045102749, + "grad_norm": 0.12436062842607498, + "learning_rate": 0.0004473302446325229, + "loss": 2.6691, + "step": 18153 + }, + { + "epoch": 0.5383269578625863, + "grad_norm": 0.11443617194890976, + "learning_rate": 0.00044728345568163384, + "loss": 2.6285, + "step": 18154 + }, + { + "epoch": 0.5383566112148979, + "grad_norm": 0.10940881818532944, + "learning_rate": 0.00044723666719755886, + "loss": 2.661, + "step": 18155 + }, + { + "epoch": 0.5383862645672093, + "grad_norm": 0.10746129602193832, + "learning_rate": 0.0004471898791807122, + "loss": 2.662, + "step": 18156 + }, + { + "epoch": 0.5384159179195208, + "grad_norm": 0.10424131900072098, + "learning_rate": 0.00044714309163150826, + "loss": 2.6651, + "step": 18157 + }, + { + "epoch": 0.5384455712718322, + "grad_norm": 0.10578027367591858, + "learning_rate": 0.0004470963045503614, + "loss": 2.6709, + "step": 18158 + }, + { + "epoch": 0.5384752246241438, + "grad_norm": 0.11309948563575745, + "learning_rate": 0.0004470495179376857, + "loss": 2.6689, + "step": 18159 + }, + { + "epoch": 0.5385048779764552, + "grad_norm": 0.10216556489467621, + "learning_rate": 0.00044700273179389566, + "loss": 2.6724, + "step": 18160 + }, + { + "epoch": 0.5385345313287667, + "grad_norm": 0.11108755320310593, + "learning_rate": 0.00044695594611940555, + "loss": 2.6802, + "step": 18161 + }, + { + "epoch": 0.5385641846810783, + "grad_norm": 0.11691587418317795, + "learning_rate": 0.00044690916091462966, + "loss": 2.6362, + "step": 18162 + }, + { + "epoch": 0.5385938380333897, + "grad_norm": 0.10698574036359787, + "learning_rate": 0.00044686237617998204, + "loss": 2.6647, + "step": 18163 + }, + { + "epoch": 0.5386234913857012, + "grad_norm": 0.1100633293390274, + "learning_rate": 0.0004468155919158774, + "loss": 2.6579, + "step": 18164 + }, + { + "epoch": 0.5386531447380126, + "grad_norm": 0.12294768542051315, + "learning_rate": 0.0004467688081227298, + "loss": 2.6692, + "step": 18165 + }, + { + "epoch": 0.5386827980903242, + "grad_norm": 0.13183698058128357, + "learning_rate": 0.00044672202480095354, + "loss": 2.6917, + "step": 18166 + }, + { + "epoch": 0.5387124514426356, + "grad_norm": 0.14232237637043, + "learning_rate": 0.00044667524195096285, + "loss": 2.6897, + "step": 18167 + }, + { + "epoch": 0.5387421047949471, + "grad_norm": 0.1166234090924263, + "learning_rate": 0.00044662845957317224, + "loss": 2.6248, + "step": 18168 + }, + { + "epoch": 0.5387717581472585, + "grad_norm": 0.12345250695943832, + "learning_rate": 0.0004465816776679956, + "loss": 2.6412, + "step": 18169 + }, + { + "epoch": 0.5388014114995701, + "grad_norm": 0.12859036028385162, + "learning_rate": 0.0004465348962358474, + "loss": 2.6685, + "step": 18170 + }, + { + "epoch": 0.5388310648518815, + "grad_norm": 0.12869465351104736, + "learning_rate": 0.0004464881152771419, + "loss": 2.6756, + "step": 18171 + }, + { + "epoch": 0.538860718204193, + "grad_norm": 0.11166363209486008, + "learning_rate": 0.00044644133479229334, + "loss": 2.6523, + "step": 18172 + }, + { + "epoch": 0.5388903715565044, + "grad_norm": 0.12022367119789124, + "learning_rate": 0.00044639455478171587, + "loss": 2.6434, + "step": 18173 + }, + { + "epoch": 0.538920024908816, + "grad_norm": 0.11099974066019058, + "learning_rate": 0.00044634777524582385, + "loss": 2.6839, + "step": 18174 + }, + { + "epoch": 0.5389496782611274, + "grad_norm": 0.11561932414770126, + "learning_rate": 0.0004463009961850316, + "loss": 2.6725, + "step": 18175 + }, + { + "epoch": 0.5389793316134389, + "grad_norm": 0.1221252828836441, + "learning_rate": 0.00044625421759975293, + "loss": 2.6799, + "step": 18176 + }, + { + "epoch": 0.5390089849657503, + "grad_norm": 0.11153195798397064, + "learning_rate": 0.0004462074394904026, + "loss": 2.6909, + "step": 18177 + }, + { + "epoch": 0.5390386383180619, + "grad_norm": 0.12145757675170898, + "learning_rate": 0.00044616066185739465, + "loss": 2.6209, + "step": 18178 + }, + { + "epoch": 0.5390682916703733, + "grad_norm": 0.11202506721019745, + "learning_rate": 0.0004461138847011434, + "loss": 2.6827, + "step": 18179 + }, + { + "epoch": 0.5390979450226848, + "grad_norm": 0.12129592895507812, + "learning_rate": 0.0004460671080220628, + "loss": 2.6284, + "step": 18180 + }, + { + "epoch": 0.5391275983749962, + "grad_norm": 0.12233272939920425, + "learning_rate": 0.0004460203318205672, + "loss": 2.6489, + "step": 18181 + }, + { + "epoch": 0.5391572517273078, + "grad_norm": 0.13098223507404327, + "learning_rate": 0.00044597355609707085, + "loss": 2.6804, + "step": 18182 + }, + { + "epoch": 0.5391869050796193, + "grad_norm": 0.1220940425992012, + "learning_rate": 0.00044592678085198797, + "loss": 2.6724, + "step": 18183 + }, + { + "epoch": 0.5392165584319307, + "grad_norm": 0.10816039890050888, + "learning_rate": 0.00044588000608573273, + "loss": 2.6666, + "step": 18184 + }, + { + "epoch": 0.5392462117842423, + "grad_norm": 0.13623470067977905, + "learning_rate": 0.00044583323179871935, + "loss": 2.6476, + "step": 18185 + }, + { + "epoch": 0.5392758651365537, + "grad_norm": 0.12714077532291412, + "learning_rate": 0.00044578645799136195, + "loss": 2.6668, + "step": 18186 + }, + { + "epoch": 0.5393055184888652, + "grad_norm": 0.13974528014659882, + "learning_rate": 0.0004457396846640748, + "loss": 2.6521, + "step": 18187 + }, + { + "epoch": 0.5393351718411766, + "grad_norm": 0.12643732130527496, + "learning_rate": 0.000445692911817272, + "loss": 2.6406, + "step": 18188 + }, + { + "epoch": 0.5393648251934882, + "grad_norm": 0.1307019144296646, + "learning_rate": 0.00044564613945136767, + "loss": 2.6885, + "step": 18189 + }, + { + "epoch": 0.5393944785457996, + "grad_norm": 0.12580262124538422, + "learning_rate": 0.00044559936756677625, + "loss": 2.6816, + "step": 18190 + }, + { + "epoch": 0.5394241318981111, + "grad_norm": 0.11279749125242233, + "learning_rate": 0.00044555259616391174, + "loss": 2.6414, + "step": 18191 + }, + { + "epoch": 0.5394537852504225, + "grad_norm": 0.10336748510599136, + "learning_rate": 0.0004455058252431884, + "loss": 2.6656, + "step": 18192 + }, + { + "epoch": 0.5394834386027341, + "grad_norm": 0.12729232013225555, + "learning_rate": 0.0004454590548050203, + "loss": 2.6686, + "step": 18193 + }, + { + "epoch": 0.5395130919550455, + "grad_norm": 0.13418829441070557, + "learning_rate": 0.0004454122848498216, + "loss": 2.6877, + "step": 18194 + }, + { + "epoch": 0.539542745307357, + "grad_norm": 0.11655790358781815, + "learning_rate": 0.0004453655153780065, + "loss": 2.6694, + "step": 18195 + }, + { + "epoch": 0.5395723986596684, + "grad_norm": 0.12955084443092346, + "learning_rate": 0.00044531874638998925, + "loss": 2.6411, + "step": 18196 + }, + { + "epoch": 0.53960205201198, + "grad_norm": 0.11843645572662354, + "learning_rate": 0.00044527197788618377, + "loss": 2.6633, + "step": 18197 + }, + { + "epoch": 0.5396317053642914, + "grad_norm": 0.12415667623281479, + "learning_rate": 0.00044522520986700436, + "loss": 2.6262, + "step": 18198 + }, + { + "epoch": 0.5396613587166029, + "grad_norm": 0.12191101163625717, + "learning_rate": 0.00044517844233286506, + "loss": 2.6953, + "step": 18199 + }, + { + "epoch": 0.5396910120689143, + "grad_norm": 0.11529311537742615, + "learning_rate": 0.0004451316752841801, + "loss": 2.6706, + "step": 18200 + }, + { + "epoch": 0.5397206654212259, + "grad_norm": 0.13652241230010986, + "learning_rate": 0.0004450849087213636, + "loss": 2.6495, + "step": 18201 + }, + { + "epoch": 0.5397503187735373, + "grad_norm": 0.13266639411449432, + "learning_rate": 0.0004450381426448295, + "loss": 2.655, + "step": 18202 + }, + { + "epoch": 0.5397799721258488, + "grad_norm": 0.10062903165817261, + "learning_rate": 0.0004449913770549922, + "loss": 2.7075, + "step": 18203 + }, + { + "epoch": 0.5398096254781604, + "grad_norm": 0.13400188088417053, + "learning_rate": 0.00044494461195226573, + "loss": 2.6648, + "step": 18204 + }, + { + "epoch": 0.5398392788304718, + "grad_norm": 0.1363297700881958, + "learning_rate": 0.0004448978473370641, + "loss": 2.6595, + "step": 18205 + }, + { + "epoch": 0.5398689321827833, + "grad_norm": 0.15810342133045197, + "learning_rate": 0.00044485108320980164, + "loss": 2.681, + "step": 18206 + }, + { + "epoch": 0.5398985855350947, + "grad_norm": 0.13617421686649323, + "learning_rate": 0.00044480431957089224, + "loss": 2.6336, + "step": 18207 + }, + { + "epoch": 0.5399282388874063, + "grad_norm": 0.13994595408439636, + "learning_rate": 0.0004447575564207501, + "loss": 2.6661, + "step": 18208 + }, + { + "epoch": 0.5399578922397177, + "grad_norm": 0.1323477327823639, + "learning_rate": 0.00044471079375978914, + "loss": 2.7055, + "step": 18209 + }, + { + "epoch": 0.5399875455920292, + "grad_norm": 0.11330918967723846, + "learning_rate": 0.00044466403158842366, + "loss": 2.6636, + "step": 18210 + }, + { + "epoch": 0.5400171989443406, + "grad_norm": 0.12283828109502792, + "learning_rate": 0.0004446172699070677, + "loss": 2.6695, + "step": 18211 + }, + { + "epoch": 0.5400468522966522, + "grad_norm": 0.10515979677438736, + "learning_rate": 0.0004445705087161353, + "loss": 2.6331, + "step": 18212 + }, + { + "epoch": 0.5400765056489636, + "grad_norm": 0.1157299056649208, + "learning_rate": 0.00044452374801604065, + "loss": 2.6675, + "step": 18213 + }, + { + "epoch": 0.5401061590012751, + "grad_norm": 0.1109519749879837, + "learning_rate": 0.00044447698780719766, + "loss": 2.6784, + "step": 18214 + }, + { + "epoch": 0.5401358123535865, + "grad_norm": 0.10735996067523956, + "learning_rate": 0.00044443022809002023, + "loss": 2.6376, + "step": 18215 + }, + { + "epoch": 0.5401654657058981, + "grad_norm": 0.10086089372634888, + "learning_rate": 0.00044438346886492287, + "loss": 2.6968, + "step": 18216 + }, + { + "epoch": 0.5401951190582095, + "grad_norm": 0.11037059128284454, + "learning_rate": 0.0004443367101323196, + "loss": 2.6791, + "step": 18217 + }, + { + "epoch": 0.540224772410521, + "grad_norm": 0.11519010365009308, + "learning_rate": 0.0004442899518926242, + "loss": 2.6642, + "step": 18218 + }, + { + "epoch": 0.5402544257628324, + "grad_norm": 0.10538835823535919, + "learning_rate": 0.0004442431941462508, + "loss": 2.6714, + "step": 18219 + }, + { + "epoch": 0.540284079115144, + "grad_norm": 0.10236771404743195, + "learning_rate": 0.0004441964368936135, + "loss": 2.6338, + "step": 18220 + }, + { + "epoch": 0.5403137324674554, + "grad_norm": 0.11577337980270386, + "learning_rate": 0.0004441496801351263, + "loss": 2.6597, + "step": 18221 + }, + { + "epoch": 0.5403433858197669, + "grad_norm": 0.13791801035404205, + "learning_rate": 0.0004441029238712033, + "loss": 2.6887, + "step": 18222 + }, + { + "epoch": 0.5403730391720784, + "grad_norm": 0.13147050142288208, + "learning_rate": 0.00044405616810225853, + "loss": 2.6904, + "step": 18223 + }, + { + "epoch": 0.5404026925243899, + "grad_norm": 0.10753215104341507, + "learning_rate": 0.0004440094128287061, + "loss": 2.6665, + "step": 18224 + }, + { + "epoch": 0.5404323458767014, + "grad_norm": 0.12127262353897095, + "learning_rate": 0.00044396265805095975, + "loss": 2.6826, + "step": 18225 + }, + { + "epoch": 0.5404619992290128, + "grad_norm": 0.12766540050506592, + "learning_rate": 0.00044391590376943377, + "loss": 2.6924, + "step": 18226 + }, + { + "epoch": 0.5404916525813244, + "grad_norm": 0.11580908298492432, + "learning_rate": 0.00044386914998454204, + "loss": 2.6555, + "step": 18227 + }, + { + "epoch": 0.5405213059336358, + "grad_norm": 0.1285502314567566, + "learning_rate": 0.00044382239669669857, + "loss": 2.6686, + "step": 18228 + }, + { + "epoch": 0.5405509592859473, + "grad_norm": 0.12287063151597977, + "learning_rate": 0.0004437756439063174, + "loss": 2.6641, + "step": 18229 + }, + { + "epoch": 0.5405806126382587, + "grad_norm": 0.12049433588981628, + "learning_rate": 0.0004437288916138127, + "loss": 2.6565, + "step": 18230 + }, + { + "epoch": 0.5406102659905703, + "grad_norm": 0.1111472100019455, + "learning_rate": 0.00044368213981959827, + "loss": 2.6124, + "step": 18231 + }, + { + "epoch": 0.5406399193428817, + "grad_norm": 0.10856245458126068, + "learning_rate": 0.0004436353885240881, + "loss": 2.6617, + "step": 18232 + }, + { + "epoch": 0.5406695726951932, + "grad_norm": 0.11641545593738556, + "learning_rate": 0.0004435886377276963, + "loss": 2.6834, + "step": 18233 + }, + { + "epoch": 0.5406992260475046, + "grad_norm": 0.12487591058015823, + "learning_rate": 0.0004435418874308369, + "loss": 2.6629, + "step": 18234 + }, + { + "epoch": 0.5407288793998162, + "grad_norm": 0.12158077955245972, + "learning_rate": 0.0004434951376339237, + "loss": 2.6618, + "step": 18235 + }, + { + "epoch": 0.5407585327521276, + "grad_norm": 0.11407236754894257, + "learning_rate": 0.00044344838833737066, + "loss": 2.6842, + "step": 18236 + }, + { + "epoch": 0.5407881861044391, + "grad_norm": 0.11348257213830948, + "learning_rate": 0.0004434016395415919, + "loss": 2.655, + "step": 18237 + }, + { + "epoch": 0.5408178394567505, + "grad_norm": 0.11683596670627594, + "learning_rate": 0.00044335489124700135, + "loss": 2.6502, + "step": 18238 + }, + { + "epoch": 0.5408474928090621, + "grad_norm": 0.1262694150209427, + "learning_rate": 0.000443308143454013, + "loss": 2.6774, + "step": 18239 + }, + { + "epoch": 0.5408771461613735, + "grad_norm": 0.13333284854888916, + "learning_rate": 0.00044326139616304066, + "loss": 2.6302, + "step": 18240 + }, + { + "epoch": 0.540906799513685, + "grad_norm": 0.13037684559822083, + "learning_rate": 0.0004432146493744984, + "loss": 2.6565, + "step": 18241 + }, + { + "epoch": 0.5409364528659965, + "grad_norm": 0.12322396785020828, + "learning_rate": 0.00044316790308880026, + "loss": 2.6861, + "step": 18242 + }, + { + "epoch": 0.540966106218308, + "grad_norm": 0.11826194822788239, + "learning_rate": 0.00044312115730636, + "loss": 2.6633, + "step": 18243 + }, + { + "epoch": 0.5409957595706194, + "grad_norm": 0.1318531036376953, + "learning_rate": 0.00044307441202759185, + "loss": 2.6867, + "step": 18244 + }, + { + "epoch": 0.5410254129229309, + "grad_norm": 0.118806391954422, + "learning_rate": 0.00044302766725290936, + "loss": 2.6248, + "step": 18245 + }, + { + "epoch": 0.5410550662752425, + "grad_norm": 0.12340467423200607, + "learning_rate": 0.00044298092298272675, + "loss": 2.6761, + "step": 18246 + }, + { + "epoch": 0.5410847196275539, + "grad_norm": 0.13586702942848206, + "learning_rate": 0.00044293417921745785, + "loss": 2.638, + "step": 18247 + }, + { + "epoch": 0.5411143729798654, + "grad_norm": 0.11283491551876068, + "learning_rate": 0.0004428874359575165, + "loss": 2.6372, + "step": 18248 + }, + { + "epoch": 0.5411440263321768, + "grad_norm": 0.13320186734199524, + "learning_rate": 0.00044284069320331675, + "loss": 2.6175, + "step": 18249 + }, + { + "epoch": 0.5411736796844884, + "grad_norm": 0.14260338246822357, + "learning_rate": 0.0004427939509552725, + "loss": 2.6697, + "step": 18250 + }, + { + "epoch": 0.5412033330367998, + "grad_norm": 0.1328626573085785, + "learning_rate": 0.0004427472092137977, + "loss": 2.645, + "step": 18251 + }, + { + "epoch": 0.5412329863891113, + "grad_norm": 0.11928336322307587, + "learning_rate": 0.0004427004679793062, + "loss": 2.659, + "step": 18252 + }, + { + "epoch": 0.5412626397414227, + "grad_norm": 0.11296726018190384, + "learning_rate": 0.00044265372725221173, + "loss": 2.6545, + "step": 18253 + }, + { + "epoch": 0.5412922930937343, + "grad_norm": 0.11350255459547043, + "learning_rate": 0.0004426069870329282, + "loss": 2.6728, + "step": 18254 + }, + { + "epoch": 0.5413219464460457, + "grad_norm": 0.13609164953231812, + "learning_rate": 0.00044256024732187006, + "loss": 2.6705, + "step": 18255 + }, + { + "epoch": 0.5413515997983572, + "grad_norm": 0.12507306039333344, + "learning_rate": 0.00044251350811945055, + "loss": 2.6643, + "step": 18256 + }, + { + "epoch": 0.5413812531506687, + "grad_norm": 0.12534502148628235, + "learning_rate": 0.00044246676942608386, + "loss": 2.6788, + "step": 18257 + }, + { + "epoch": 0.5414109065029802, + "grad_norm": 0.11554423719644547, + "learning_rate": 0.00044242003124218376, + "loss": 2.6792, + "step": 18258 + }, + { + "epoch": 0.5414405598552916, + "grad_norm": 0.12070640176534653, + "learning_rate": 0.00044237329356816427, + "loss": 2.6374, + "step": 18259 + }, + { + "epoch": 0.5414702132076031, + "grad_norm": 0.11964424699544907, + "learning_rate": 0.000442326556404439, + "loss": 2.6735, + "step": 18260 + }, + { + "epoch": 0.5414998665599146, + "grad_norm": 0.12076397985219955, + "learning_rate": 0.0004422798197514221, + "loss": 2.6439, + "step": 18261 + }, + { + "epoch": 0.5415295199122261, + "grad_norm": 0.11498205363750458, + "learning_rate": 0.00044223308360952733, + "loss": 2.6945, + "step": 18262 + }, + { + "epoch": 0.5415591732645375, + "grad_norm": 0.11828740686178207, + "learning_rate": 0.00044218634797916844, + "loss": 2.6616, + "step": 18263 + }, + { + "epoch": 0.541588826616849, + "grad_norm": 0.12993045151233673, + "learning_rate": 0.00044213961286075935, + "loss": 2.6771, + "step": 18264 + }, + { + "epoch": 0.5416184799691605, + "grad_norm": 0.11949057877063751, + "learning_rate": 0.000442092878254714, + "loss": 2.647, + "step": 18265 + }, + { + "epoch": 0.541648133321472, + "grad_norm": 0.10610070079565048, + "learning_rate": 0.00044204614416144605, + "loss": 2.6333, + "step": 18266 + }, + { + "epoch": 0.5416777866737835, + "grad_norm": 0.12252742797136307, + "learning_rate": 0.0004419994105813694, + "loss": 2.6598, + "step": 18267 + }, + { + "epoch": 0.5417074400260949, + "grad_norm": 0.11222263425588608, + "learning_rate": 0.000441952677514898, + "loss": 2.6273, + "step": 18268 + }, + { + "epoch": 0.5417370933784065, + "grad_norm": 0.12978242337703705, + "learning_rate": 0.0004419059449624456, + "loss": 2.6903, + "step": 18269 + }, + { + "epoch": 0.5417667467307179, + "grad_norm": 0.11092893034219742, + "learning_rate": 0.00044185921292442604, + "loss": 2.6443, + "step": 18270 + }, + { + "epoch": 0.5417964000830294, + "grad_norm": 0.12386525422334671, + "learning_rate": 0.00044181248140125315, + "loss": 2.6646, + "step": 18271 + }, + { + "epoch": 0.5418260534353408, + "grad_norm": 0.13134214282035828, + "learning_rate": 0.00044176575039334083, + "loss": 2.6719, + "step": 18272 + }, + { + "epoch": 0.5418557067876524, + "grad_norm": 0.12863394618034363, + "learning_rate": 0.00044171901990110266, + "loss": 2.6836, + "step": 18273 + }, + { + "epoch": 0.5418853601399638, + "grad_norm": 0.11190430074930191, + "learning_rate": 0.0004416722899249526, + "loss": 2.6494, + "step": 18274 + }, + { + "epoch": 0.5419150134922753, + "grad_norm": 0.12376557290554047, + "learning_rate": 0.00044162556046530434, + "loss": 2.6888, + "step": 18275 + }, + { + "epoch": 0.5419446668445868, + "grad_norm": 0.09454295039176941, + "learning_rate": 0.00044157883152257186, + "loss": 2.6584, + "step": 18276 + }, + { + "epoch": 0.5419743201968983, + "grad_norm": 0.12065865099430084, + "learning_rate": 0.0004415321030971688, + "loss": 2.6822, + "step": 18277 + }, + { + "epoch": 0.5420039735492097, + "grad_norm": 0.13614393770694733, + "learning_rate": 0.000441485375189509, + "loss": 2.6554, + "step": 18278 + }, + { + "epoch": 0.5420336269015212, + "grad_norm": 0.11403029412031174, + "learning_rate": 0.00044143864780000634, + "loss": 2.6615, + "step": 18279 + }, + { + "epoch": 0.5420632802538327, + "grad_norm": 0.13451746106147766, + "learning_rate": 0.0004413919209290743, + "loss": 2.653, + "step": 18280 + }, + { + "epoch": 0.5420929336061442, + "grad_norm": 0.10676513612270355, + "learning_rate": 0.000441345194577127, + "loss": 2.68, + "step": 18281 + }, + { + "epoch": 0.5421225869584556, + "grad_norm": 0.10898466408252716, + "learning_rate": 0.0004412984687445781, + "loss": 2.6772, + "step": 18282 + }, + { + "epoch": 0.5421522403107671, + "grad_norm": 0.12616531550884247, + "learning_rate": 0.00044125174343184135, + "loss": 2.6737, + "step": 18283 + }, + { + "epoch": 0.5421818936630786, + "grad_norm": 0.11665784567594528, + "learning_rate": 0.0004412050186393304, + "loss": 2.67, + "step": 18284 + }, + { + "epoch": 0.5422115470153901, + "grad_norm": 0.1128685250878334, + "learning_rate": 0.00044115829436745915, + "loss": 2.6541, + "step": 18285 + }, + { + "epoch": 0.5422412003677015, + "grad_norm": 0.12259289622306824, + "learning_rate": 0.0004411115706166413, + "loss": 2.6554, + "step": 18286 + }, + { + "epoch": 0.542270853720013, + "grad_norm": 0.11106941848993301, + "learning_rate": 0.00044106484738729056, + "loss": 2.6591, + "step": 18287 + }, + { + "epoch": 0.5423005070723246, + "grad_norm": 0.1253669708967209, + "learning_rate": 0.0004410181246798208, + "loss": 2.7055, + "step": 18288 + }, + { + "epoch": 0.542330160424636, + "grad_norm": 0.12501384317874908, + "learning_rate": 0.00044097140249464576, + "loss": 2.6517, + "step": 18289 + }, + { + "epoch": 0.5423598137769475, + "grad_norm": 0.11865300685167313, + "learning_rate": 0.0004409246808321789, + "loss": 2.6765, + "step": 18290 + }, + { + "epoch": 0.542389467129259, + "grad_norm": 0.11351841688156128, + "learning_rate": 0.00044087795969283415, + "loss": 2.6478, + "step": 18291 + }, + { + "epoch": 0.5424191204815705, + "grad_norm": 0.10737727582454681, + "learning_rate": 0.00044083123907702525, + "loss": 2.6682, + "step": 18292 + }, + { + "epoch": 0.5424487738338819, + "grad_norm": 0.11484482139348984, + "learning_rate": 0.00044078451898516577, + "loss": 2.6369, + "step": 18293 + }, + { + "epoch": 0.5424784271861934, + "grad_norm": 0.11086220294237137, + "learning_rate": 0.0004407377994176696, + "loss": 2.6645, + "step": 18294 + }, + { + "epoch": 0.5425080805385049, + "grad_norm": 0.11157050728797913, + "learning_rate": 0.00044069108037495046, + "loss": 2.6486, + "step": 18295 + }, + { + "epoch": 0.5425377338908164, + "grad_norm": 0.11089063435792923, + "learning_rate": 0.000440644361857422, + "loss": 2.6587, + "step": 18296 + }, + { + "epoch": 0.5425673872431278, + "grad_norm": 0.10488598048686981, + "learning_rate": 0.00044059764386549785, + "loss": 2.6443, + "step": 18297 + }, + { + "epoch": 0.5425970405954393, + "grad_norm": 0.11453048884868622, + "learning_rate": 0.0004405509263995917, + "loss": 2.6396, + "step": 18298 + }, + { + "epoch": 0.5426266939477508, + "grad_norm": 0.11632204800844193, + "learning_rate": 0.00044050420946011736, + "loss": 2.6558, + "step": 18299 + }, + { + "epoch": 0.5426563473000623, + "grad_norm": 0.12802903354167938, + "learning_rate": 0.00044045749304748863, + "loss": 2.6828, + "step": 18300 + }, + { + "epoch": 0.5426860006523737, + "grad_norm": 0.10562671720981598, + "learning_rate": 0.00044041077716211886, + "loss": 2.6385, + "step": 18301 + }, + { + "epoch": 0.5427156540046852, + "grad_norm": 0.11743706464767456, + "learning_rate": 0.0004403640618044218, + "loss": 2.6805, + "step": 18302 + }, + { + "epoch": 0.5427453073569967, + "grad_norm": 0.12292532622814178, + "learning_rate": 0.00044031734697481137, + "loss": 2.6673, + "step": 18303 + }, + { + "epoch": 0.5427749607093082, + "grad_norm": 0.12901516258716583, + "learning_rate": 0.000440270632673701, + "loss": 2.6564, + "step": 18304 + }, + { + "epoch": 0.5428046140616196, + "grad_norm": 0.13685472309589386, + "learning_rate": 0.0004402239189015044, + "loss": 2.6941, + "step": 18305 + }, + { + "epoch": 0.5428342674139311, + "grad_norm": 0.11741481721401215, + "learning_rate": 0.0004401772056586352, + "loss": 2.6599, + "step": 18306 + }, + { + "epoch": 0.5428639207662426, + "grad_norm": 0.12195463478565216, + "learning_rate": 0.00044013049294550717, + "loss": 2.6598, + "step": 18307 + }, + { + "epoch": 0.5428935741185541, + "grad_norm": 0.10296769440174103, + "learning_rate": 0.00044008378076253397, + "loss": 2.6719, + "step": 18308 + }, + { + "epoch": 0.5429232274708656, + "grad_norm": 0.11882335692644119, + "learning_rate": 0.0004400370691101291, + "loss": 2.6649, + "step": 18309 + }, + { + "epoch": 0.542952880823177, + "grad_norm": 0.10912232846021652, + "learning_rate": 0.00043999035798870646, + "loss": 2.6325, + "step": 18310 + }, + { + "epoch": 0.5429825341754886, + "grad_norm": 0.11422689259052277, + "learning_rate": 0.0004399436473986793, + "loss": 2.6881, + "step": 18311 + }, + { + "epoch": 0.5430121875278, + "grad_norm": 0.12123214453458786, + "learning_rate": 0.00043989693734046155, + "loss": 2.6734, + "step": 18312 + }, + { + "epoch": 0.5430418408801115, + "grad_norm": 0.10615039616823196, + "learning_rate": 0.00043985022781446666, + "loss": 2.6753, + "step": 18313 + }, + { + "epoch": 0.543071494232423, + "grad_norm": 0.12455429881811142, + "learning_rate": 0.00043980351882110837, + "loss": 2.6099, + "step": 18314 + }, + { + "epoch": 0.5431011475847345, + "grad_norm": 0.12894824147224426, + "learning_rate": 0.00043975681036080024, + "loss": 2.7031, + "step": 18315 + }, + { + "epoch": 0.5431308009370459, + "grad_norm": 0.15140745043754578, + "learning_rate": 0.00043971010243395585, + "loss": 2.6749, + "step": 18316 + }, + { + "epoch": 0.5431604542893574, + "grad_norm": 0.15544012188911438, + "learning_rate": 0.0004396633950409891, + "loss": 2.6942, + "step": 18317 + }, + { + "epoch": 0.5431901076416689, + "grad_norm": 0.1449604481458664, + "learning_rate": 0.0004396166881823131, + "loss": 2.6754, + "step": 18318 + }, + { + "epoch": 0.5432197609939804, + "grad_norm": 0.1414208710193634, + "learning_rate": 0.0004395699818583415, + "loss": 2.6474, + "step": 18319 + }, + { + "epoch": 0.5432494143462918, + "grad_norm": 0.13571518659591675, + "learning_rate": 0.00043952327606948844, + "loss": 2.6812, + "step": 18320 + }, + { + "epoch": 0.5432790676986033, + "grad_norm": 0.13330933451652527, + "learning_rate": 0.00043947657081616696, + "loss": 2.6591, + "step": 18321 + }, + { + "epoch": 0.5433087210509148, + "grad_norm": 0.13741900026798248, + "learning_rate": 0.0004394298660987909, + "loss": 2.6575, + "step": 18322 + }, + { + "epoch": 0.5433383744032263, + "grad_norm": 0.13154064118862152, + "learning_rate": 0.00043938316191777384, + "loss": 2.6608, + "step": 18323 + }, + { + "epoch": 0.5433680277555377, + "grad_norm": 0.1242017075419426, + "learning_rate": 0.0004393364582735292, + "loss": 2.686, + "step": 18324 + }, + { + "epoch": 0.5433976811078493, + "grad_norm": 0.11834674328565598, + "learning_rate": 0.00043928975516647063, + "loss": 2.6484, + "step": 18325 + }, + { + "epoch": 0.5434273344601607, + "grad_norm": 0.1292089819908142, + "learning_rate": 0.00043924305259701173, + "loss": 2.6482, + "step": 18326 + }, + { + "epoch": 0.5434569878124722, + "grad_norm": 0.11155138909816742, + "learning_rate": 0.0004391963505655661, + "loss": 2.6537, + "step": 18327 + }, + { + "epoch": 0.5434866411647836, + "grad_norm": 0.11191894859075546, + "learning_rate": 0.0004391496490725472, + "loss": 2.6792, + "step": 18328 + }, + { + "epoch": 0.5435162945170952, + "grad_norm": 0.12486520409584045, + "learning_rate": 0.00043910294811836856, + "loss": 2.6833, + "step": 18329 + }, + { + "epoch": 0.5435459478694067, + "grad_norm": 0.11973556131124496, + "learning_rate": 0.0004390562477034437, + "loss": 2.6948, + "step": 18330 + }, + { + "epoch": 0.5435756012217181, + "grad_norm": 0.10666562616825104, + "learning_rate": 0.0004390095478281862, + "loss": 2.6933, + "step": 18331 + }, + { + "epoch": 0.5436052545740296, + "grad_norm": 0.11420293152332306, + "learning_rate": 0.00043896284849300973, + "loss": 2.6522, + "step": 18332 + }, + { + "epoch": 0.5436349079263411, + "grad_norm": 0.1234796941280365, + "learning_rate": 0.00043891614969832765, + "loss": 2.6638, + "step": 18333 + }, + { + "epoch": 0.5436645612786526, + "grad_norm": 0.11494337022304535, + "learning_rate": 0.0004388694514445536, + "loss": 2.6926, + "step": 18334 + }, + { + "epoch": 0.543694214630964, + "grad_norm": 0.13217145204544067, + "learning_rate": 0.000438822753732101, + "loss": 2.6617, + "step": 18335 + }, + { + "epoch": 0.5437238679832755, + "grad_norm": 0.12293112277984619, + "learning_rate": 0.0004387760565613835, + "loss": 2.6814, + "step": 18336 + }, + { + "epoch": 0.543753521335587, + "grad_norm": 0.10691268742084503, + "learning_rate": 0.0004387293599328144, + "loss": 2.7102, + "step": 18337 + }, + { + "epoch": 0.5437831746878985, + "grad_norm": 0.1195131316781044, + "learning_rate": 0.0004386826638468076, + "loss": 2.7012, + "step": 18338 + }, + { + "epoch": 0.5438128280402099, + "grad_norm": 0.13592980802059174, + "learning_rate": 0.00043863596830377613, + "loss": 2.6522, + "step": 18339 + }, + { + "epoch": 0.5438424813925214, + "grad_norm": 0.132618710398674, + "learning_rate": 0.00043858927330413374, + "loss": 2.6666, + "step": 18340 + }, + { + "epoch": 0.5438721347448329, + "grad_norm": 0.12955041229724884, + "learning_rate": 0.0004385425788482938, + "loss": 2.6627, + "step": 18341 + }, + { + "epoch": 0.5439017880971444, + "grad_norm": 0.11551106721162796, + "learning_rate": 0.0004384958849366699, + "loss": 2.6613, + "step": 18342 + }, + { + "epoch": 0.5439314414494558, + "grad_norm": 0.12544748187065125, + "learning_rate": 0.00043844919156967553, + "loss": 2.6913, + "step": 18343 + }, + { + "epoch": 0.5439610948017674, + "grad_norm": 0.0993833839893341, + "learning_rate": 0.000438402498747724, + "loss": 2.6593, + "step": 18344 + }, + { + "epoch": 0.5439907481540788, + "grad_norm": 0.11101894825696945, + "learning_rate": 0.00043835580647122907, + "loss": 2.6488, + "step": 18345 + }, + { + "epoch": 0.5440204015063903, + "grad_norm": 0.1162545382976532, + "learning_rate": 0.000438309114740604, + "loss": 2.6476, + "step": 18346 + }, + { + "epoch": 0.5440500548587017, + "grad_norm": 0.10437044501304626, + "learning_rate": 0.0004382624235562624, + "loss": 2.6934, + "step": 18347 + }, + { + "epoch": 0.5440797082110133, + "grad_norm": 0.11916698515415192, + "learning_rate": 0.00043821573291861766, + "loss": 2.664, + "step": 18348 + }, + { + "epoch": 0.5441093615633248, + "grad_norm": 0.1008216068148613, + "learning_rate": 0.0004381690428280831, + "loss": 2.6059, + "step": 18349 + }, + { + "epoch": 0.5441390149156362, + "grad_norm": 0.11749057471752167, + "learning_rate": 0.0004381223532850723, + "loss": 2.6513, + "step": 18350 + }, + { + "epoch": 0.5441686682679477, + "grad_norm": 0.12080414593219757, + "learning_rate": 0.00043807566428999867, + "loss": 2.6802, + "step": 18351 + }, + { + "epoch": 0.5441983216202592, + "grad_norm": 0.12852247059345245, + "learning_rate": 0.00043802897584327573, + "loss": 2.6554, + "step": 18352 + }, + { + "epoch": 0.5442279749725707, + "grad_norm": 0.1346663236618042, + "learning_rate": 0.00043798228794531674, + "loss": 2.6319, + "step": 18353 + }, + { + "epoch": 0.5442576283248821, + "grad_norm": 0.11869416385889053, + "learning_rate": 0.0004379356005965353, + "loss": 2.6723, + "step": 18354 + }, + { + "epoch": 0.5442872816771936, + "grad_norm": 0.10913562774658203, + "learning_rate": 0.00043788891379734486, + "loss": 2.6837, + "step": 18355 + }, + { + "epoch": 0.5443169350295051, + "grad_norm": 0.1277826428413391, + "learning_rate": 0.0004378422275481587, + "loss": 2.6852, + "step": 18356 + }, + { + "epoch": 0.5443465883818166, + "grad_norm": 0.1327304244041443, + "learning_rate": 0.00043779554184938995, + "loss": 2.651, + "step": 18357 + }, + { + "epoch": 0.544376241734128, + "grad_norm": 0.11690416932106018, + "learning_rate": 0.00043774885670145274, + "loss": 2.6554, + "step": 18358 + }, + { + "epoch": 0.5444058950864396, + "grad_norm": 0.11381781101226807, + "learning_rate": 0.00043770217210476, + "loss": 2.6587, + "step": 18359 + }, + { + "epoch": 0.544435548438751, + "grad_norm": 0.13094587624073029, + "learning_rate": 0.00043765548805972524, + "loss": 2.684, + "step": 18360 + }, + { + "epoch": 0.5444652017910625, + "grad_norm": 0.1462332308292389, + "learning_rate": 0.00043760880456676177, + "loss": 2.6638, + "step": 18361 + }, + { + "epoch": 0.5444948551433739, + "grad_norm": 0.13913194835186005, + "learning_rate": 0.00043756212162628306, + "loss": 2.6552, + "step": 18362 + }, + { + "epoch": 0.5445245084956855, + "grad_norm": 0.10958810150623322, + "learning_rate": 0.00043751543923870246, + "loss": 2.682, + "step": 18363 + }, + { + "epoch": 0.5445541618479969, + "grad_norm": 0.14136429131031036, + "learning_rate": 0.00043746875740443344, + "loss": 2.6796, + "step": 18364 + }, + { + "epoch": 0.5445838152003084, + "grad_norm": 0.13438980281352997, + "learning_rate": 0.00043742207612388934, + "loss": 2.674, + "step": 18365 + }, + { + "epoch": 0.5446134685526198, + "grad_norm": 0.12501278519630432, + "learning_rate": 0.00043737539539748346, + "loss": 2.6473, + "step": 18366 + }, + { + "epoch": 0.5446431219049314, + "grad_norm": 0.12199166417121887, + "learning_rate": 0.00043732871522562917, + "loss": 2.6604, + "step": 18367 + }, + { + "epoch": 0.5446727752572428, + "grad_norm": 0.13182322680950165, + "learning_rate": 0.00043728203560873985, + "loss": 2.681, + "step": 18368 + }, + { + "epoch": 0.5447024286095543, + "grad_norm": 0.12385497242212296, + "learning_rate": 0.0004372353565472289, + "loss": 2.6764, + "step": 18369 + }, + { + "epoch": 0.5447320819618658, + "grad_norm": 0.11965971440076828, + "learning_rate": 0.0004371886780415095, + "loss": 2.6565, + "step": 18370 + }, + { + "epoch": 0.5447617353141773, + "grad_norm": 0.12371186912059784, + "learning_rate": 0.0004371420000919952, + "loss": 2.6495, + "step": 18371 + }, + { + "epoch": 0.5447913886664888, + "grad_norm": 0.11771297454833984, + "learning_rate": 0.0004370953226990994, + "loss": 2.6632, + "step": 18372 + }, + { + "epoch": 0.5448210420188002, + "grad_norm": 0.1203012615442276, + "learning_rate": 0.00043704864586323523, + "loss": 2.637, + "step": 18373 + }, + { + "epoch": 0.5448506953711117, + "grad_norm": 0.1103818267583847, + "learning_rate": 0.00043700196958481615, + "loss": 2.6427, + "step": 18374 + }, + { + "epoch": 0.5448803487234232, + "grad_norm": 0.10578575730323792, + "learning_rate": 0.0004369552938642554, + "loss": 2.6523, + "step": 18375 + }, + { + "epoch": 0.5449100020757347, + "grad_norm": 0.11553186178207397, + "learning_rate": 0.0004369086187019665, + "loss": 2.65, + "step": 18376 + }, + { + "epoch": 0.5449396554280461, + "grad_norm": 0.13036799430847168, + "learning_rate": 0.0004368619440983625, + "loss": 2.6811, + "step": 18377 + }, + { + "epoch": 0.5449693087803577, + "grad_norm": 0.1431598663330078, + "learning_rate": 0.0004368152700538568, + "loss": 2.6522, + "step": 18378 + }, + { + "epoch": 0.5449989621326691, + "grad_norm": 0.15721216797828674, + "learning_rate": 0.0004367685965688627, + "loss": 2.6241, + "step": 18379 + }, + { + "epoch": 0.5450286154849806, + "grad_norm": 0.12284648418426514, + "learning_rate": 0.0004367219236437936, + "loss": 2.6672, + "step": 18380 + }, + { + "epoch": 0.545058268837292, + "grad_norm": 0.12280022352933884, + "learning_rate": 0.0004366752512790627, + "loss": 2.6465, + "step": 18381 + }, + { + "epoch": 0.5450879221896036, + "grad_norm": 0.1188652440905571, + "learning_rate": 0.00043662857947508336, + "loss": 2.6259, + "step": 18382 + }, + { + "epoch": 0.545117575541915, + "grad_norm": 0.09389045089483261, + "learning_rate": 0.0004365819082322686, + "loss": 2.6704, + "step": 18383 + }, + { + "epoch": 0.5451472288942265, + "grad_norm": 0.10754238814115524, + "learning_rate": 0.00043653523755103206, + "loss": 2.6731, + "step": 18384 + }, + { + "epoch": 0.5451768822465379, + "grad_norm": 0.13128520548343658, + "learning_rate": 0.000436488567431787, + "loss": 2.6643, + "step": 18385 + }, + { + "epoch": 0.5452065355988495, + "grad_norm": 0.11725231260061264, + "learning_rate": 0.00043644189787494657, + "loss": 2.6895, + "step": 18386 + }, + { + "epoch": 0.5452361889511609, + "grad_norm": 0.12091650813817978, + "learning_rate": 0.000436395228880924, + "loss": 2.6268, + "step": 18387 + }, + { + "epoch": 0.5452658423034724, + "grad_norm": 0.13207842409610748, + "learning_rate": 0.00043634856045013257, + "loss": 2.641, + "step": 18388 + }, + { + "epoch": 0.5452954956557838, + "grad_norm": 0.11283282190561295, + "learning_rate": 0.0004363018925829856, + "loss": 2.6623, + "step": 18389 + }, + { + "epoch": 0.5453251490080954, + "grad_norm": 0.11691804230213165, + "learning_rate": 0.0004362552252798963, + "loss": 2.6333, + "step": 18390 + }, + { + "epoch": 0.5453548023604069, + "grad_norm": 0.1187114417552948, + "learning_rate": 0.00043620855854127784, + "loss": 2.6322, + "step": 18391 + }, + { + "epoch": 0.5453844557127183, + "grad_norm": 0.12518107891082764, + "learning_rate": 0.0004361618923675436, + "loss": 2.6542, + "step": 18392 + }, + { + "epoch": 0.5454141090650299, + "grad_norm": 0.12239182740449905, + "learning_rate": 0.0004361152267591069, + "loss": 2.6634, + "step": 18393 + }, + { + "epoch": 0.5454437624173413, + "grad_norm": 0.12543842196464539, + "learning_rate": 0.00043606856171638067, + "loss": 2.6544, + "step": 18394 + }, + { + "epoch": 0.5454734157696528, + "grad_norm": 0.13053631782531738, + "learning_rate": 0.00043602189723977833, + "loss": 2.6804, + "step": 18395 + }, + { + "epoch": 0.5455030691219642, + "grad_norm": 0.11226744949817657, + "learning_rate": 0.0004359752333297128, + "loss": 2.6152, + "step": 18396 + }, + { + "epoch": 0.5455327224742758, + "grad_norm": 0.1274213343858719, + "learning_rate": 0.000435928569986598, + "loss": 2.6643, + "step": 18397 + }, + { + "epoch": 0.5455623758265872, + "grad_norm": 0.13348761200904846, + "learning_rate": 0.00043588190721084654, + "loss": 2.6578, + "step": 18398 + }, + { + "epoch": 0.5455920291788987, + "grad_norm": 0.09674414247274399, + "learning_rate": 0.00043583524500287175, + "loss": 2.6559, + "step": 18399 + }, + { + "epoch": 0.5456216825312101, + "grad_norm": 0.1319756954908371, + "learning_rate": 0.00043578858336308694, + "loss": 2.7026, + "step": 18400 + }, + { + "epoch": 0.5456513358835217, + "grad_norm": 0.15043818950653076, + "learning_rate": 0.00043574192229190524, + "loss": 2.6795, + "step": 18401 + }, + { + "epoch": 0.5456809892358331, + "grad_norm": 0.13281336426734924, + "learning_rate": 0.00043569526178973984, + "loss": 2.6636, + "step": 18402 + }, + { + "epoch": 0.5457106425881446, + "grad_norm": 0.1462584286928177, + "learning_rate": 0.0004356486018570041, + "loss": 2.6544, + "step": 18403 + }, + { + "epoch": 0.545740295940456, + "grad_norm": 0.13005651533603668, + "learning_rate": 0.00043560194249411084, + "loss": 2.6583, + "step": 18404 + }, + { + "epoch": 0.5457699492927676, + "grad_norm": 0.12943924963474274, + "learning_rate": 0.00043555528370147346, + "loss": 2.6649, + "step": 18405 + }, + { + "epoch": 0.545799602645079, + "grad_norm": 0.11749288439750671, + "learning_rate": 0.0004355086254795051, + "loss": 2.6889, + "step": 18406 + }, + { + "epoch": 0.5458292559973905, + "grad_norm": 0.13323096930980682, + "learning_rate": 0.00043546196782861895, + "loss": 2.6814, + "step": 18407 + }, + { + "epoch": 0.5458589093497019, + "grad_norm": 0.11889364570379257, + "learning_rate": 0.00043541531074922814, + "loss": 2.6515, + "step": 18408 + }, + { + "epoch": 0.5458885627020135, + "grad_norm": 0.10966600477695465, + "learning_rate": 0.0004353686542417458, + "loss": 2.6591, + "step": 18409 + }, + { + "epoch": 0.5459182160543249, + "grad_norm": 0.11906415969133377, + "learning_rate": 0.0004353219983065851, + "loss": 2.6559, + "step": 18410 + }, + { + "epoch": 0.5459478694066364, + "grad_norm": 0.10808035731315613, + "learning_rate": 0.0004352753429441593, + "loss": 2.6621, + "step": 18411 + }, + { + "epoch": 0.545977522758948, + "grad_norm": 0.12533868849277496, + "learning_rate": 0.00043522868815488135, + "loss": 2.6283, + "step": 18412 + }, + { + "epoch": 0.5460071761112594, + "grad_norm": 0.1091431975364685, + "learning_rate": 0.0004351820339391646, + "loss": 2.6451, + "step": 18413 + }, + { + "epoch": 0.5460368294635709, + "grad_norm": 0.10092774778604507, + "learning_rate": 0.00043513538029742215, + "loss": 2.6965, + "step": 18414 + }, + { + "epoch": 0.5460664828158823, + "grad_norm": 0.11619032919406891, + "learning_rate": 0.0004350887272300669, + "loss": 2.6505, + "step": 18415 + }, + { + "epoch": 0.5460961361681939, + "grad_norm": 0.12073671072721481, + "learning_rate": 0.0004350420747375121, + "loss": 2.656, + "step": 18416 + }, + { + "epoch": 0.5461257895205053, + "grad_norm": 0.1254105269908905, + "learning_rate": 0.0004349954228201709, + "loss": 2.6536, + "step": 18417 + }, + { + "epoch": 0.5461554428728168, + "grad_norm": 0.1217104122042656, + "learning_rate": 0.0004349487714784564, + "loss": 2.6845, + "step": 18418 + }, + { + "epoch": 0.5461850962251282, + "grad_norm": 0.0999780222773552, + "learning_rate": 0.00043490212071278165, + "loss": 2.6571, + "step": 18419 + }, + { + "epoch": 0.5462147495774398, + "grad_norm": 0.12507736682891846, + "learning_rate": 0.00043485547052356, + "loss": 2.6801, + "step": 18420 + }, + { + "epoch": 0.5462444029297512, + "grad_norm": 0.11018966883420944, + "learning_rate": 0.00043480882091120415, + "loss": 2.624, + "step": 18421 + }, + { + "epoch": 0.5462740562820627, + "grad_norm": 0.10687576234340668, + "learning_rate": 0.0004347621718761272, + "loss": 2.6375, + "step": 18422 + }, + { + "epoch": 0.5463037096343741, + "grad_norm": 0.11168351024389267, + "learning_rate": 0.0004347155234187426, + "loss": 2.6612, + "step": 18423 + }, + { + "epoch": 0.5463333629866857, + "grad_norm": 0.12238464504480362, + "learning_rate": 0.0004346688755394634, + "loss": 2.6447, + "step": 18424 + }, + { + "epoch": 0.5463630163389971, + "grad_norm": 0.10191617161035538, + "learning_rate": 0.00043462222823870237, + "loss": 2.668, + "step": 18425 + }, + { + "epoch": 0.5463926696913086, + "grad_norm": 0.11349092423915863, + "learning_rate": 0.0004345755815168727, + "loss": 2.7007, + "step": 18426 + }, + { + "epoch": 0.54642232304362, + "grad_norm": 0.1199786365032196, + "learning_rate": 0.00043452893537438753, + "loss": 2.6697, + "step": 18427 + }, + { + "epoch": 0.5464519763959316, + "grad_norm": 0.12257687002420425, + "learning_rate": 0.00043448228981165995, + "loss": 2.6454, + "step": 18428 + }, + { + "epoch": 0.546481629748243, + "grad_norm": 0.12022137641906738, + "learning_rate": 0.0004344356448291028, + "loss": 2.6508, + "step": 18429 + }, + { + "epoch": 0.5465112831005545, + "grad_norm": 0.10603133589029312, + "learning_rate": 0.0004343890004271294, + "loss": 2.6435, + "step": 18430 + }, + { + "epoch": 0.5465409364528659, + "grad_norm": 0.1292305439710617, + "learning_rate": 0.0004343423566061527, + "loss": 2.7104, + "step": 18431 + }, + { + "epoch": 0.5465705898051775, + "grad_norm": 0.12767037749290466, + "learning_rate": 0.0004342957133665856, + "loss": 2.6335, + "step": 18432 + }, + { + "epoch": 0.546600243157489, + "grad_norm": 0.1249396875500679, + "learning_rate": 0.00043424907070884123, + "loss": 2.6765, + "step": 18433 + }, + { + "epoch": 0.5466298965098004, + "grad_norm": 0.11621648073196411, + "learning_rate": 0.0004342024286333326, + "loss": 2.6617, + "step": 18434 + }, + { + "epoch": 0.546659549862112, + "grad_norm": 0.10758412629365921, + "learning_rate": 0.0004341557871404727, + "loss": 2.662, + "step": 18435 + }, + { + "epoch": 0.5466892032144234, + "grad_norm": 0.12685364484786987, + "learning_rate": 0.00043410914623067466, + "loss": 2.6811, + "step": 18436 + }, + { + "epoch": 0.5467188565667349, + "grad_norm": 0.1112569272518158, + "learning_rate": 0.0004340625059043515, + "loss": 2.6676, + "step": 18437 + }, + { + "epoch": 0.5467485099190463, + "grad_norm": 0.09881052374839783, + "learning_rate": 0.0004340158661619161, + "loss": 2.6488, + "step": 18438 + }, + { + "epoch": 0.5467781632713579, + "grad_norm": 0.11360117048025131, + "learning_rate": 0.0004339692270037816, + "loss": 2.6343, + "step": 18439 + }, + { + "epoch": 0.5468078166236693, + "grad_norm": 0.11271155625581741, + "learning_rate": 0.0004339225884303609, + "loss": 2.6466, + "step": 18440 + }, + { + "epoch": 0.5468374699759808, + "grad_norm": 0.11164578795433044, + "learning_rate": 0.0004338759504420672, + "loss": 2.6615, + "step": 18441 + }, + { + "epoch": 0.5468671233282922, + "grad_norm": 0.11901866644620895, + "learning_rate": 0.0004338293130393131, + "loss": 2.6668, + "step": 18442 + }, + { + "epoch": 0.5468967766806038, + "grad_norm": 0.11522367596626282, + "learning_rate": 0.0004337826762225118, + "loss": 2.6189, + "step": 18443 + }, + { + "epoch": 0.5469264300329152, + "grad_norm": 0.12647338211536407, + "learning_rate": 0.0004337360399920763, + "loss": 2.6493, + "step": 18444 + }, + { + "epoch": 0.5469560833852267, + "grad_norm": 0.1359853744506836, + "learning_rate": 0.0004336894043484195, + "loss": 2.667, + "step": 18445 + }, + { + "epoch": 0.5469857367375381, + "grad_norm": 0.12068354338407516, + "learning_rate": 0.0004336427692919545, + "loss": 2.6441, + "step": 18446 + }, + { + "epoch": 0.5470153900898497, + "grad_norm": 0.1156538650393486, + "learning_rate": 0.0004335961348230941, + "loss": 2.6513, + "step": 18447 + }, + { + "epoch": 0.5470450434421611, + "grad_norm": 0.11989566683769226, + "learning_rate": 0.0004335495009422512, + "loss": 2.6767, + "step": 18448 + }, + { + "epoch": 0.5470746967944726, + "grad_norm": 0.12468577176332474, + "learning_rate": 0.000433502867649839, + "loss": 2.6497, + "step": 18449 + }, + { + "epoch": 0.547104350146784, + "grad_norm": 0.12567371129989624, + "learning_rate": 0.00043345623494627037, + "loss": 2.6357, + "step": 18450 + }, + { + "epoch": 0.5471340034990956, + "grad_norm": 0.12672968208789825, + "learning_rate": 0.0004334096028319582, + "loss": 2.6685, + "step": 18451 + }, + { + "epoch": 0.547163656851407, + "grad_norm": 0.11788138002157211, + "learning_rate": 0.00043336297130731546, + "loss": 2.6732, + "step": 18452 + }, + { + "epoch": 0.5471933102037185, + "grad_norm": 0.14177305996418, + "learning_rate": 0.000433316340372755, + "loss": 2.6464, + "step": 18453 + }, + { + "epoch": 0.5472229635560301, + "grad_norm": 0.14532989263534546, + "learning_rate": 0.0004332697100286898, + "loss": 2.6493, + "step": 18454 + }, + { + "epoch": 0.5472526169083415, + "grad_norm": 0.16985675692558289, + "learning_rate": 0.0004332230802755327, + "loss": 2.6894, + "step": 18455 + }, + { + "epoch": 0.547282270260653, + "grad_norm": 0.14631015062332153, + "learning_rate": 0.0004331764511136967, + "loss": 2.6444, + "step": 18456 + }, + { + "epoch": 0.5473119236129644, + "grad_norm": 0.128211110830307, + "learning_rate": 0.00043312982254359474, + "loss": 2.6685, + "step": 18457 + }, + { + "epoch": 0.547341576965276, + "grad_norm": 0.13103339076042175, + "learning_rate": 0.00043308319456563976, + "loss": 2.6496, + "step": 18458 + }, + { + "epoch": 0.5473712303175874, + "grad_norm": 0.1386798769235611, + "learning_rate": 0.0004330365671802445, + "loss": 2.6372, + "step": 18459 + }, + { + "epoch": 0.5474008836698989, + "grad_norm": 0.12046252191066742, + "learning_rate": 0.0004329899403878219, + "loss": 2.6852, + "step": 18460 + }, + { + "epoch": 0.5474305370222103, + "grad_norm": 0.12393698841333389, + "learning_rate": 0.0004329433141887846, + "loss": 2.6602, + "step": 18461 + }, + { + "epoch": 0.5474601903745219, + "grad_norm": 0.10682790726423264, + "learning_rate": 0.0004328966885835462, + "loss": 2.6388, + "step": 18462 + }, + { + "epoch": 0.5474898437268333, + "grad_norm": 0.13397324085235596, + "learning_rate": 0.00043285006357251897, + "loss": 2.6949, + "step": 18463 + }, + { + "epoch": 0.5475194970791448, + "grad_norm": 0.12338291853666306, + "learning_rate": 0.00043280343915611604, + "loss": 2.6478, + "step": 18464 + }, + { + "epoch": 0.5475491504314562, + "grad_norm": 0.11474689841270447, + "learning_rate": 0.0004327568153347501, + "loss": 2.6234, + "step": 18465 + }, + { + "epoch": 0.5475788037837678, + "grad_norm": 0.1262824684381485, + "learning_rate": 0.0004327101921088341, + "loss": 2.6419, + "step": 18466 + }, + { + "epoch": 0.5476084571360792, + "grad_norm": 0.13379131257534027, + "learning_rate": 0.00043266356947878093, + "loss": 2.6674, + "step": 18467 + }, + { + "epoch": 0.5476381104883907, + "grad_norm": 0.11438284069299698, + "learning_rate": 0.00043261694744500345, + "loss": 2.656, + "step": 18468 + }, + { + "epoch": 0.5476677638407021, + "grad_norm": 0.12341662496328354, + "learning_rate": 0.0004325703260079145, + "loss": 2.6859, + "step": 18469 + }, + { + "epoch": 0.5476974171930137, + "grad_norm": 0.13329851627349854, + "learning_rate": 0.00043252370516792685, + "loss": 2.6311, + "step": 18470 + }, + { + "epoch": 0.5477270705453251, + "grad_norm": 0.108563631772995, + "learning_rate": 0.00043247708492545335, + "loss": 2.6583, + "step": 18471 + }, + { + "epoch": 0.5477567238976366, + "grad_norm": 0.11199215799570084, + "learning_rate": 0.00043243046528090677, + "loss": 2.6423, + "step": 18472 + }, + { + "epoch": 0.547786377249948, + "grad_norm": 0.10610699653625488, + "learning_rate": 0.0004323838462347001, + "loss": 2.6127, + "step": 18473 + }, + { + "epoch": 0.5478160306022596, + "grad_norm": 0.11565817147493362, + "learning_rate": 0.00043233722778724597, + "loss": 2.6776, + "step": 18474 + }, + { + "epoch": 0.5478456839545711, + "grad_norm": 0.11371300369501114, + "learning_rate": 0.0004322906099389574, + "loss": 2.6854, + "step": 18475 + }, + { + "epoch": 0.5478753373068825, + "grad_norm": 0.1267012357711792, + "learning_rate": 0.00043224399269024713, + "loss": 2.6846, + "step": 18476 + }, + { + "epoch": 0.5479049906591941, + "grad_norm": 0.10736361891031265, + "learning_rate": 0.0004321973760415279, + "loss": 2.66, + "step": 18477 + }, + { + "epoch": 0.5479346440115055, + "grad_norm": 0.10378644615411758, + "learning_rate": 0.00043215075999321253, + "loss": 2.6609, + "step": 18478 + }, + { + "epoch": 0.547964297363817, + "grad_norm": 0.10940949618816376, + "learning_rate": 0.00043210414454571393, + "loss": 2.6554, + "step": 18479 + }, + { + "epoch": 0.5479939507161284, + "grad_norm": 0.11451999843120575, + "learning_rate": 0.00043205752969944475, + "loss": 2.6582, + "step": 18480 + }, + { + "epoch": 0.54802360406844, + "grad_norm": 0.11436969041824341, + "learning_rate": 0.0004320109154548177, + "loss": 2.6665, + "step": 18481 + }, + { + "epoch": 0.5480532574207514, + "grad_norm": 0.09512311965227127, + "learning_rate": 0.0004319643018122458, + "loss": 2.635, + "step": 18482 + }, + { + "epoch": 0.5480829107730629, + "grad_norm": 0.11098861694335938, + "learning_rate": 0.00043191768877214157, + "loss": 2.6611, + "step": 18483 + }, + { + "epoch": 0.5481125641253743, + "grad_norm": 0.11878786981105804, + "learning_rate": 0.0004318710763349179, + "loss": 2.6914, + "step": 18484 + }, + { + "epoch": 0.5481422174776859, + "grad_norm": 0.13008762896060944, + "learning_rate": 0.00043182446450098756, + "loss": 2.6758, + "step": 18485 + }, + { + "epoch": 0.5481718708299973, + "grad_norm": 0.13086526095867157, + "learning_rate": 0.0004317778532707634, + "loss": 2.6556, + "step": 18486 + }, + { + "epoch": 0.5482015241823088, + "grad_norm": 0.13664524257183075, + "learning_rate": 0.00043173124264465776, + "loss": 2.662, + "step": 18487 + }, + { + "epoch": 0.5482311775346203, + "grad_norm": 0.12633544206619263, + "learning_rate": 0.0004316846326230839, + "loss": 2.6644, + "step": 18488 + }, + { + "epoch": 0.5482608308869318, + "grad_norm": 0.12664762139320374, + "learning_rate": 0.0004316380232064543, + "loss": 2.68, + "step": 18489 + }, + { + "epoch": 0.5482904842392432, + "grad_norm": 0.12286826968193054, + "learning_rate": 0.0004315914143951819, + "loss": 2.6272, + "step": 18490 + }, + { + "epoch": 0.5483201375915547, + "grad_norm": 0.13718393445014954, + "learning_rate": 0.0004315448061896791, + "loss": 2.6409, + "step": 18491 + }, + { + "epoch": 0.5483497909438662, + "grad_norm": 0.14006762206554413, + "learning_rate": 0.00043149819859035883, + "loss": 2.6696, + "step": 18492 + }, + { + "epoch": 0.5483794442961777, + "grad_norm": 0.11818905919790268, + "learning_rate": 0.00043145159159763374, + "loss": 2.6498, + "step": 18493 + }, + { + "epoch": 0.5484090976484891, + "grad_norm": 0.13914723694324493, + "learning_rate": 0.00043140498521191664, + "loss": 2.6768, + "step": 18494 + }, + { + "epoch": 0.5484387510008006, + "grad_norm": 0.14145278930664062, + "learning_rate": 0.0004313583794336201, + "loss": 2.6569, + "step": 18495 + }, + { + "epoch": 0.5484684043531122, + "grad_norm": 0.1238338053226471, + "learning_rate": 0.00043131177426315704, + "loss": 2.656, + "step": 18496 + }, + { + "epoch": 0.5484980577054236, + "grad_norm": 0.13224166631698608, + "learning_rate": 0.0004312651697009399, + "loss": 2.688, + "step": 18497 + }, + { + "epoch": 0.5485277110577351, + "grad_norm": 0.12087791413068771, + "learning_rate": 0.00043121856574738143, + "loss": 2.6699, + "step": 18498 + }, + { + "epoch": 0.5485573644100465, + "grad_norm": 0.12140781432390213, + "learning_rate": 0.00043117196240289446, + "loss": 2.6201, + "step": 18499 + }, + { + "epoch": 0.5485870177623581, + "grad_norm": 0.10937787592411041, + "learning_rate": 0.0004311253596678914, + "loss": 2.6636, + "step": 18500 + }, + { + "epoch": 0.5486166711146695, + "grad_norm": 0.12268460541963577, + "learning_rate": 0.00043107875754278517, + "loss": 2.6586, + "step": 18501 + }, + { + "epoch": 0.548646324466981, + "grad_norm": 0.12405385822057724, + "learning_rate": 0.00043103215602798846, + "loss": 2.6553, + "step": 18502 + }, + { + "epoch": 0.5486759778192924, + "grad_norm": 0.11815039813518524, + "learning_rate": 0.0004309855551239138, + "loss": 2.6945, + "step": 18503 + }, + { + "epoch": 0.548705631171604, + "grad_norm": 0.13001950085163116, + "learning_rate": 0.0004309389548309739, + "loss": 2.649, + "step": 18504 + }, + { + "epoch": 0.5487352845239154, + "grad_norm": 0.13643896579742432, + "learning_rate": 0.00043089235514958145, + "loss": 2.6442, + "step": 18505 + }, + { + "epoch": 0.5487649378762269, + "grad_norm": 0.11952794343233109, + "learning_rate": 0.000430845756080149, + "loss": 2.6472, + "step": 18506 + }, + { + "epoch": 0.5487945912285384, + "grad_norm": 0.11724235862493515, + "learning_rate": 0.00043079915762308943, + "loss": 2.6657, + "step": 18507 + }, + { + "epoch": 0.5488242445808499, + "grad_norm": 0.1222744882106781, + "learning_rate": 0.000430752559778815, + "loss": 2.657, + "step": 18508 + }, + { + "epoch": 0.5488538979331613, + "grad_norm": 0.12166638672351837, + "learning_rate": 0.0004307059625477386, + "loss": 2.6598, + "step": 18509 + }, + { + "epoch": 0.5488835512854728, + "grad_norm": 0.12207379937171936, + "learning_rate": 0.00043065936593027275, + "loss": 2.6634, + "step": 18510 + }, + { + "epoch": 0.5489132046377843, + "grad_norm": 0.10917166620492935, + "learning_rate": 0.00043061276992683017, + "loss": 2.6815, + "step": 18511 + }, + { + "epoch": 0.5489428579900958, + "grad_norm": 0.11003045737743378, + "learning_rate": 0.00043056617453782333, + "loss": 2.6698, + "step": 18512 + }, + { + "epoch": 0.5489725113424072, + "grad_norm": 0.10154377669095993, + "learning_rate": 0.00043051957976366495, + "loss": 2.6656, + "step": 18513 + }, + { + "epoch": 0.5490021646947187, + "grad_norm": 0.1176241934299469, + "learning_rate": 0.00043047298560476766, + "loss": 2.6835, + "step": 18514 + }, + { + "epoch": 0.5490318180470302, + "grad_norm": 0.1143464520573616, + "learning_rate": 0.00043042639206154407, + "loss": 2.6696, + "step": 18515 + }, + { + "epoch": 0.5490614713993417, + "grad_norm": 0.11618509143590927, + "learning_rate": 0.0004303797991344066, + "loss": 2.6474, + "step": 18516 + }, + { + "epoch": 0.5490911247516532, + "grad_norm": 0.1068492978811264, + "learning_rate": 0.0004303332068237682, + "loss": 2.6714, + "step": 18517 + }, + { + "epoch": 0.5491207781039646, + "grad_norm": 0.10705582052469254, + "learning_rate": 0.000430286615130041, + "loss": 2.6645, + "step": 18518 + }, + { + "epoch": 0.5491504314562762, + "grad_norm": 0.11041225492954254, + "learning_rate": 0.00043024002405363785, + "loss": 2.6553, + "step": 18519 + }, + { + "epoch": 0.5491800848085876, + "grad_norm": 0.11345241963863373, + "learning_rate": 0.0004301934335949713, + "loss": 2.6846, + "step": 18520 + }, + { + "epoch": 0.5492097381608991, + "grad_norm": 0.11960411071777344, + "learning_rate": 0.0004301468437544538, + "loss": 2.6929, + "step": 18521 + }, + { + "epoch": 0.5492393915132106, + "grad_norm": 0.13662785291671753, + "learning_rate": 0.000430100254532498, + "loss": 2.6668, + "step": 18522 + }, + { + "epoch": 0.5492690448655221, + "grad_norm": 0.14665763080120087, + "learning_rate": 0.00043005366592951646, + "loss": 2.6717, + "step": 18523 + }, + { + "epoch": 0.5492986982178335, + "grad_norm": 0.14187367260456085, + "learning_rate": 0.0004300070779459218, + "loss": 2.6488, + "step": 18524 + }, + { + "epoch": 0.549328351570145, + "grad_norm": 0.13558173179626465, + "learning_rate": 0.00042996049058212615, + "loss": 2.674, + "step": 18525 + }, + { + "epoch": 0.5493580049224565, + "grad_norm": 0.1400189846754074, + "learning_rate": 0.0004299139038385426, + "loss": 2.6656, + "step": 18526 + }, + { + "epoch": 0.549387658274768, + "grad_norm": 0.12464770674705505, + "learning_rate": 0.0004298673177155835, + "loss": 2.6868, + "step": 18527 + }, + { + "epoch": 0.5494173116270794, + "grad_norm": 0.1248757466673851, + "learning_rate": 0.0004298207322136615, + "loss": 2.6537, + "step": 18528 + }, + { + "epoch": 0.5494469649793909, + "grad_norm": 0.13308526575565338, + "learning_rate": 0.00042977414733318874, + "loss": 2.6794, + "step": 18529 + }, + { + "epoch": 0.5494766183317024, + "grad_norm": 0.1272946000099182, + "learning_rate": 0.000429727563074578, + "loss": 2.6376, + "step": 18530 + }, + { + "epoch": 0.5495062716840139, + "grad_norm": 0.12500904500484467, + "learning_rate": 0.00042968097943824177, + "loss": 2.6848, + "step": 18531 + }, + { + "epoch": 0.5495359250363253, + "grad_norm": 0.13467800617218018, + "learning_rate": 0.00042963439642459245, + "loss": 2.673, + "step": 18532 + }, + { + "epoch": 0.5495655783886368, + "grad_norm": 0.1264791190624237, + "learning_rate": 0.00042958781403404275, + "loss": 2.6414, + "step": 18533 + }, + { + "epoch": 0.5495952317409483, + "grad_norm": 0.1278533935546875, + "learning_rate": 0.0004295412322670051, + "loss": 2.6465, + "step": 18534 + }, + { + "epoch": 0.5496248850932598, + "grad_norm": 0.12747541069984436, + "learning_rate": 0.0004294946511238918, + "loss": 2.6735, + "step": 18535 + }, + { + "epoch": 0.5496545384455712, + "grad_norm": 0.12469340860843658, + "learning_rate": 0.0004294480706051155, + "loss": 2.6703, + "step": 18536 + }, + { + "epoch": 0.5496841917978827, + "grad_norm": 0.11186837404966354, + "learning_rate": 0.0004294014907110886, + "loss": 2.6372, + "step": 18537 + }, + { + "epoch": 0.5497138451501943, + "grad_norm": 0.11503982543945312, + "learning_rate": 0.00042935491144222357, + "loss": 2.649, + "step": 18538 + }, + { + "epoch": 0.5497434985025057, + "grad_norm": 0.11227124184370041, + "learning_rate": 0.000429308332798933, + "loss": 2.6555, + "step": 18539 + }, + { + "epoch": 0.5497731518548172, + "grad_norm": 0.11067485064268112, + "learning_rate": 0.00042926175478162924, + "loss": 2.6252, + "step": 18540 + }, + { + "epoch": 0.5498028052071287, + "grad_norm": 0.10245733708143234, + "learning_rate": 0.0004292151773907249, + "loss": 2.6274, + "step": 18541 + }, + { + "epoch": 0.5498324585594402, + "grad_norm": 0.11216060072183609, + "learning_rate": 0.0004291686006266322, + "loss": 2.6469, + "step": 18542 + }, + { + "epoch": 0.5498621119117516, + "grad_norm": 0.09802559018135071, + "learning_rate": 0.0004291220244897637, + "loss": 2.6833, + "step": 18543 + }, + { + "epoch": 0.5498917652640631, + "grad_norm": 0.10673448443412781, + "learning_rate": 0.0004290754489805319, + "loss": 2.6689, + "step": 18544 + }, + { + "epoch": 0.5499214186163746, + "grad_norm": 0.10284487903118134, + "learning_rate": 0.00042902887409934924, + "loss": 2.6249, + "step": 18545 + }, + { + "epoch": 0.5499510719686861, + "grad_norm": 0.10670758783817291, + "learning_rate": 0.00042898229984662806, + "loss": 2.6424, + "step": 18546 + }, + { + "epoch": 0.5499807253209975, + "grad_norm": 0.1078253835439682, + "learning_rate": 0.0004289357262227807, + "loss": 2.6265, + "step": 18547 + }, + { + "epoch": 0.550010378673309, + "grad_norm": 0.09635476022958755, + "learning_rate": 0.0004288891532282198, + "loss": 2.6666, + "step": 18548 + }, + { + "epoch": 0.5500400320256205, + "grad_norm": 0.10925573110580444, + "learning_rate": 0.0004288425808633575, + "loss": 2.6172, + "step": 18549 + }, + { + "epoch": 0.550069685377932, + "grad_norm": 0.11291433870792389, + "learning_rate": 0.00042879600912860646, + "loss": 2.6309, + "step": 18550 + }, + { + "epoch": 0.5500993387302434, + "grad_norm": 0.1201251670718193, + "learning_rate": 0.00042874943802437884, + "loss": 2.6757, + "step": 18551 + }, + { + "epoch": 0.550128992082555, + "grad_norm": 0.1146964579820633, + "learning_rate": 0.0004287028675510873, + "loss": 2.6565, + "step": 18552 + }, + { + "epoch": 0.5501586454348664, + "grad_norm": 0.12246357649564743, + "learning_rate": 0.0004286562977091441, + "loss": 2.6475, + "step": 18553 + }, + { + "epoch": 0.5501882987871779, + "grad_norm": 0.13378338515758514, + "learning_rate": 0.0004286097284989616, + "loss": 2.6728, + "step": 18554 + }, + { + "epoch": 0.5502179521394893, + "grad_norm": 0.11223205924034119, + "learning_rate": 0.00042856315992095237, + "loss": 2.6483, + "step": 18555 + }, + { + "epoch": 0.5502476054918009, + "grad_norm": 0.11045198887586594, + "learning_rate": 0.0004285165919755285, + "loss": 2.6232, + "step": 18556 + }, + { + "epoch": 0.5502772588441123, + "grad_norm": 0.1136055439710617, + "learning_rate": 0.0004284700246631025, + "loss": 2.6562, + "step": 18557 + }, + { + "epoch": 0.5503069121964238, + "grad_norm": 0.10496906191110611, + "learning_rate": 0.0004284234579840866, + "loss": 2.6406, + "step": 18558 + }, + { + "epoch": 0.5503365655487353, + "grad_norm": 0.1064295843243599, + "learning_rate": 0.0004283768919388934, + "loss": 2.6926, + "step": 18559 + }, + { + "epoch": 0.5503662189010468, + "grad_norm": 0.10461051762104034, + "learning_rate": 0.00042833032652793505, + "loss": 2.6302, + "step": 18560 + }, + { + "epoch": 0.5503958722533583, + "grad_norm": 0.09742846339941025, + "learning_rate": 0.0004282837617516239, + "loss": 2.6705, + "step": 18561 + }, + { + "epoch": 0.5504255256056697, + "grad_norm": 0.12088185548782349, + "learning_rate": 0.00042823719761037255, + "loss": 2.6495, + "step": 18562 + }, + { + "epoch": 0.5504551789579812, + "grad_norm": 0.14330698549747467, + "learning_rate": 0.000428190634104593, + "loss": 2.681, + "step": 18563 + }, + { + "epoch": 0.5504848323102927, + "grad_norm": 0.13430869579315186, + "learning_rate": 0.0004281440712346975, + "loss": 2.6535, + "step": 18564 + }, + { + "epoch": 0.5505144856626042, + "grad_norm": 0.1328255534172058, + "learning_rate": 0.00042809750900109876, + "loss": 2.6912, + "step": 18565 + }, + { + "epoch": 0.5505441390149156, + "grad_norm": 0.11279679834842682, + "learning_rate": 0.00042805094740420914, + "loss": 2.6215, + "step": 18566 + }, + { + "epoch": 0.5505737923672271, + "grad_norm": 0.10440744459629059, + "learning_rate": 0.0004280043864444405, + "loss": 2.7036, + "step": 18567 + }, + { + "epoch": 0.5506034457195386, + "grad_norm": 0.13060154020786285, + "learning_rate": 0.00042795782612220544, + "loss": 2.6588, + "step": 18568 + }, + { + "epoch": 0.5506330990718501, + "grad_norm": 0.11901485174894333, + "learning_rate": 0.00042791126643791617, + "loss": 2.6681, + "step": 18569 + }, + { + "epoch": 0.5506627524241615, + "grad_norm": 0.11243471503257751, + "learning_rate": 0.000427864707391985, + "loss": 2.6688, + "step": 18570 + }, + { + "epoch": 0.550692405776473, + "grad_norm": 0.1022353544831276, + "learning_rate": 0.00042781814898482426, + "loss": 2.7028, + "step": 18571 + }, + { + "epoch": 0.5507220591287845, + "grad_norm": 0.11757034808397293, + "learning_rate": 0.0004277715912168463, + "loss": 2.6585, + "step": 18572 + }, + { + "epoch": 0.550751712481096, + "grad_norm": 0.11646566540002823, + "learning_rate": 0.0004277250340884632, + "loss": 2.6559, + "step": 18573 + }, + { + "epoch": 0.5507813658334074, + "grad_norm": 0.11186657100915909, + "learning_rate": 0.00042767847760008727, + "loss": 2.6833, + "step": 18574 + }, + { + "epoch": 0.550811019185719, + "grad_norm": 0.11447838693857193, + "learning_rate": 0.0004276319217521309, + "loss": 2.6889, + "step": 18575 + }, + { + "epoch": 0.5508406725380304, + "grad_norm": 0.11717908829450607, + "learning_rate": 0.00042758536654500623, + "loss": 2.6542, + "step": 18576 + }, + { + "epoch": 0.5508703258903419, + "grad_norm": 0.11302062124013901, + "learning_rate": 0.00042753881197912547, + "loss": 2.6517, + "step": 18577 + }, + { + "epoch": 0.5508999792426534, + "grad_norm": 0.10685698688030243, + "learning_rate": 0.0004274922580549011, + "loss": 2.6796, + "step": 18578 + }, + { + "epoch": 0.5509296325949649, + "grad_norm": 0.12548603117465973, + "learning_rate": 0.00042744570477274525, + "loss": 2.6547, + "step": 18579 + }, + { + "epoch": 0.5509592859472764, + "grad_norm": 0.1302674412727356, + "learning_rate": 0.0004273991521330701, + "loss": 2.6642, + "step": 18580 + }, + { + "epoch": 0.5509889392995878, + "grad_norm": 0.1226942390203476, + "learning_rate": 0.000427352600136288, + "loss": 2.6634, + "step": 18581 + }, + { + "epoch": 0.5510185926518993, + "grad_norm": 0.12544456124305725, + "learning_rate": 0.000427306048782811, + "loss": 2.6674, + "step": 18582 + }, + { + "epoch": 0.5510482460042108, + "grad_norm": 0.10684595257043839, + "learning_rate": 0.00042725949807305154, + "loss": 2.6339, + "step": 18583 + }, + { + "epoch": 0.5510778993565223, + "grad_norm": 0.11747884750366211, + "learning_rate": 0.0004272129480074216, + "loss": 2.6599, + "step": 18584 + }, + { + "epoch": 0.5511075527088337, + "grad_norm": 0.11205948144197464, + "learning_rate": 0.00042716639858633357, + "loss": 2.6897, + "step": 18585 + }, + { + "epoch": 0.5511372060611452, + "grad_norm": 0.1155514046549797, + "learning_rate": 0.0004271198498101995, + "loss": 2.6665, + "step": 18586 + }, + { + "epoch": 0.5511668594134567, + "grad_norm": 0.10495004057884216, + "learning_rate": 0.0004270733016794317, + "loss": 2.6683, + "step": 18587 + }, + { + "epoch": 0.5511965127657682, + "grad_norm": 0.11205582320690155, + "learning_rate": 0.0004270267541944423, + "loss": 2.6593, + "step": 18588 + }, + { + "epoch": 0.5512261661180796, + "grad_norm": 0.11007298529148102, + "learning_rate": 0.0004269802073556437, + "loss": 2.6844, + "step": 18589 + }, + { + "epoch": 0.5512558194703912, + "grad_norm": 0.1032559871673584, + "learning_rate": 0.0004269336611634475, + "loss": 2.6891, + "step": 18590 + }, + { + "epoch": 0.5512854728227026, + "grad_norm": 0.13401155173778534, + "learning_rate": 0.0004268871156182665, + "loss": 2.6545, + "step": 18591 + }, + { + "epoch": 0.5513151261750141, + "grad_norm": 0.11776817589998245, + "learning_rate": 0.0004268405707205127, + "loss": 2.6511, + "step": 18592 + }, + { + "epoch": 0.5513447795273255, + "grad_norm": 0.10135069489479065, + "learning_rate": 0.00042679402647059826, + "loss": 2.635, + "step": 18593 + }, + { + "epoch": 0.551374432879637, + "grad_norm": 0.10159451514482498, + "learning_rate": 0.00042674748286893516, + "loss": 2.6707, + "step": 18594 + }, + { + "epoch": 0.5514040862319485, + "grad_norm": 0.10266918689012527, + "learning_rate": 0.00042670093991593574, + "loss": 2.6518, + "step": 18595 + }, + { + "epoch": 0.55143373958426, + "grad_norm": 0.1149747222661972, + "learning_rate": 0.00042665439761201194, + "loss": 2.6342, + "step": 18596 + }, + { + "epoch": 0.5514633929365714, + "grad_norm": 0.12624715268611908, + "learning_rate": 0.00042660785595757616, + "loss": 2.6603, + "step": 18597 + }, + { + "epoch": 0.551493046288883, + "grad_norm": 0.12327062338590622, + "learning_rate": 0.0004265613149530404, + "loss": 2.683, + "step": 18598 + }, + { + "epoch": 0.5515226996411945, + "grad_norm": 0.11460784077644348, + "learning_rate": 0.0004265147745988168, + "loss": 2.6458, + "step": 18599 + }, + { + "epoch": 0.5515523529935059, + "grad_norm": 0.10688420385122299, + "learning_rate": 0.0004264682348953176, + "loss": 2.6837, + "step": 18600 + }, + { + "epoch": 0.5515820063458174, + "grad_norm": 0.1055818572640419, + "learning_rate": 0.00042642169584295467, + "loss": 2.6542, + "step": 18601 + }, + { + "epoch": 0.5516116596981289, + "grad_norm": 0.0988185778260231, + "learning_rate": 0.0004263751574421402, + "loss": 2.6794, + "step": 18602 + }, + { + "epoch": 0.5516413130504404, + "grad_norm": 0.11468001455068588, + "learning_rate": 0.00042632861969328623, + "loss": 2.646, + "step": 18603 + }, + { + "epoch": 0.5516709664027518, + "grad_norm": 0.13895666599273682, + "learning_rate": 0.0004262820825968052, + "loss": 2.6644, + "step": 18604 + }, + { + "epoch": 0.5517006197550633, + "grad_norm": 0.14010018110275269, + "learning_rate": 0.000426235546153109, + "loss": 2.6475, + "step": 18605 + }, + { + "epoch": 0.5517302731073748, + "grad_norm": 0.12039764970541, + "learning_rate": 0.00042618901036260964, + "loss": 2.6877, + "step": 18606 + }, + { + "epoch": 0.5517599264596863, + "grad_norm": 0.11869272589683533, + "learning_rate": 0.00042614247522571925, + "loss": 2.6645, + "step": 18607 + }, + { + "epoch": 0.5517895798119977, + "grad_norm": 0.12496870756149292, + "learning_rate": 0.00042609594074284997, + "loss": 2.6748, + "step": 18608 + }, + { + "epoch": 0.5518192331643093, + "grad_norm": 0.13668113946914673, + "learning_rate": 0.00042604940691441374, + "loss": 2.6599, + "step": 18609 + }, + { + "epoch": 0.5518488865166207, + "grad_norm": 0.10923022776842117, + "learning_rate": 0.00042600287374082284, + "loss": 2.6582, + "step": 18610 + }, + { + "epoch": 0.5518785398689322, + "grad_norm": 0.1181633248925209, + "learning_rate": 0.00042595634122248913, + "loss": 2.6607, + "step": 18611 + }, + { + "epoch": 0.5519081932212436, + "grad_norm": 0.12165538966655731, + "learning_rate": 0.00042590980935982463, + "loss": 2.6671, + "step": 18612 + }, + { + "epoch": 0.5519378465735552, + "grad_norm": 0.11973220854997635, + "learning_rate": 0.0004258632781532416, + "loss": 2.6725, + "step": 18613 + }, + { + "epoch": 0.5519674999258666, + "grad_norm": 0.1318296492099762, + "learning_rate": 0.00042581674760315194, + "loss": 2.6976, + "step": 18614 + }, + { + "epoch": 0.5519971532781781, + "grad_norm": 0.1346285343170166, + "learning_rate": 0.00042577021770996763, + "loss": 2.6539, + "step": 18615 + }, + { + "epoch": 0.5520268066304895, + "grad_norm": 0.1286294013261795, + "learning_rate": 0.00042572368847410073, + "loss": 2.6784, + "step": 18616 + }, + { + "epoch": 0.5520564599828011, + "grad_norm": 0.13616278767585754, + "learning_rate": 0.00042567715989596345, + "loss": 2.6646, + "step": 18617 + }, + { + "epoch": 0.5520861133351125, + "grad_norm": 0.1275232881307602, + "learning_rate": 0.0004256306319759677, + "loss": 2.6739, + "step": 18618 + }, + { + "epoch": 0.552115766687424, + "grad_norm": 0.11262361705303192, + "learning_rate": 0.0004255841047145254, + "loss": 2.6494, + "step": 18619 + }, + { + "epoch": 0.5521454200397355, + "grad_norm": 0.1358823925256729, + "learning_rate": 0.0004255375781120486, + "loss": 2.668, + "step": 18620 + }, + { + "epoch": 0.552175073392047, + "grad_norm": 0.13427092134952545, + "learning_rate": 0.00042549105216894956, + "loss": 2.6721, + "step": 18621 + }, + { + "epoch": 0.5522047267443585, + "grad_norm": 0.12387257069349289, + "learning_rate": 0.00042544452688563986, + "loss": 2.6379, + "step": 18622 + }, + { + "epoch": 0.5522343800966699, + "grad_norm": 0.13264669477939606, + "learning_rate": 0.00042539800226253164, + "loss": 2.6659, + "step": 18623 + }, + { + "epoch": 0.5522640334489815, + "grad_norm": 0.10754110664129257, + "learning_rate": 0.000425351478300037, + "loss": 2.639, + "step": 18624 + }, + { + "epoch": 0.5522936868012929, + "grad_norm": 0.12374896556138992, + "learning_rate": 0.00042530495499856774, + "loss": 2.6496, + "step": 18625 + }, + { + "epoch": 0.5523233401536044, + "grad_norm": 0.13004080951213837, + "learning_rate": 0.000425258432358536, + "loss": 2.6698, + "step": 18626 + }, + { + "epoch": 0.5523529935059158, + "grad_norm": 0.10933125019073486, + "learning_rate": 0.0004252119103803537, + "loss": 2.6693, + "step": 18627 + }, + { + "epoch": 0.5523826468582274, + "grad_norm": 0.09815064072608948, + "learning_rate": 0.00042516538906443277, + "loss": 2.6724, + "step": 18628 + }, + { + "epoch": 0.5524123002105388, + "grad_norm": 0.10278838127851486, + "learning_rate": 0.00042511886841118486, + "loss": 2.64, + "step": 18629 + }, + { + "epoch": 0.5524419535628503, + "grad_norm": 0.12043853104114532, + "learning_rate": 0.0004250723484210225, + "loss": 2.6705, + "step": 18630 + }, + { + "epoch": 0.5524716069151617, + "grad_norm": 0.11914807558059692, + "learning_rate": 0.0004250258290943574, + "loss": 2.6837, + "step": 18631 + }, + { + "epoch": 0.5525012602674733, + "grad_norm": 0.10553532093763351, + "learning_rate": 0.00042497931043160135, + "loss": 2.6496, + "step": 18632 + }, + { + "epoch": 0.5525309136197847, + "grad_norm": 0.10886240750551224, + "learning_rate": 0.00042493279243316644, + "loss": 2.6122, + "step": 18633 + }, + { + "epoch": 0.5525605669720962, + "grad_norm": 0.12070481479167938, + "learning_rate": 0.00042488627509946455, + "loss": 2.6604, + "step": 18634 + }, + { + "epoch": 0.5525902203244076, + "grad_norm": 0.14112569391727448, + "learning_rate": 0.0004248397584309075, + "loss": 2.6445, + "step": 18635 + }, + { + "epoch": 0.5526198736767192, + "grad_norm": 0.13192997872829437, + "learning_rate": 0.00042479324242790736, + "loss": 2.6599, + "step": 18636 + }, + { + "epoch": 0.5526495270290306, + "grad_norm": 0.10888597369194031, + "learning_rate": 0.00042474672709087594, + "loss": 2.6466, + "step": 18637 + }, + { + "epoch": 0.5526791803813421, + "grad_norm": 0.10581660270690918, + "learning_rate": 0.00042470021242022526, + "loss": 2.6723, + "step": 18638 + }, + { + "epoch": 0.5527088337336535, + "grad_norm": 0.12847042083740234, + "learning_rate": 0.00042465369841636705, + "loss": 2.6683, + "step": 18639 + }, + { + "epoch": 0.5527384870859651, + "grad_norm": 0.12932200729846954, + "learning_rate": 0.00042460718507971324, + "loss": 2.6588, + "step": 18640 + }, + { + "epoch": 0.5527681404382766, + "grad_norm": 0.10413297265768051, + "learning_rate": 0.00042456067241067574, + "loss": 2.6431, + "step": 18641 + }, + { + "epoch": 0.552797793790588, + "grad_norm": 0.1237955167889595, + "learning_rate": 0.0004245141604096664, + "loss": 2.6878, + "step": 18642 + }, + { + "epoch": 0.5528274471428996, + "grad_norm": 0.13458462059497833, + "learning_rate": 0.0004244676490770972, + "loss": 2.6305, + "step": 18643 + }, + { + "epoch": 0.552857100495211, + "grad_norm": 0.12380695343017578, + "learning_rate": 0.0004244211384133799, + "loss": 2.6497, + "step": 18644 + }, + { + "epoch": 0.5528867538475225, + "grad_norm": 0.1318221539258957, + "learning_rate": 0.00042437462841892637, + "loss": 2.6685, + "step": 18645 + }, + { + "epoch": 0.5529164071998339, + "grad_norm": 0.13742399215698242, + "learning_rate": 0.00042432811909414857, + "loss": 2.6826, + "step": 18646 + }, + { + "epoch": 0.5529460605521455, + "grad_norm": 0.10458040237426758, + "learning_rate": 0.00042428161043945824, + "loss": 2.6318, + "step": 18647 + }, + { + "epoch": 0.5529757139044569, + "grad_norm": 0.1297779232263565, + "learning_rate": 0.0004242351024552673, + "loss": 2.6612, + "step": 18648 + }, + { + "epoch": 0.5530053672567684, + "grad_norm": 0.12118101865053177, + "learning_rate": 0.0004241885951419875, + "loss": 2.6927, + "step": 18649 + }, + { + "epoch": 0.5530350206090798, + "grad_norm": 0.11622268706560135, + "learning_rate": 0.0004241420885000307, + "loss": 2.6706, + "step": 18650 + }, + { + "epoch": 0.5530646739613914, + "grad_norm": 0.12614892423152924, + "learning_rate": 0.0004240955825298086, + "loss": 2.6688, + "step": 18651 + }, + { + "epoch": 0.5530943273137028, + "grad_norm": 0.12364668399095535, + "learning_rate": 0.00042404907723173323, + "loss": 2.6789, + "step": 18652 + }, + { + "epoch": 0.5531239806660143, + "grad_norm": 0.1138501763343811, + "learning_rate": 0.0004240025726062164, + "loss": 2.6355, + "step": 18653 + }, + { + "epoch": 0.5531536340183257, + "grad_norm": 0.1311633288860321, + "learning_rate": 0.00042395606865366974, + "loss": 2.6271, + "step": 18654 + }, + { + "epoch": 0.5531832873706373, + "grad_norm": 0.1122538223862648, + "learning_rate": 0.000423909565374505, + "loss": 2.6856, + "step": 18655 + }, + { + "epoch": 0.5532129407229487, + "grad_norm": 0.11322169005870819, + "learning_rate": 0.0004238630627691343, + "loss": 2.6688, + "step": 18656 + }, + { + "epoch": 0.5532425940752602, + "grad_norm": 0.10360359400510788, + "learning_rate": 0.00042381656083796926, + "loss": 2.6471, + "step": 18657 + }, + { + "epoch": 0.5532722474275716, + "grad_norm": 0.1102796122431755, + "learning_rate": 0.00042377005958142163, + "loss": 2.6771, + "step": 18658 + }, + { + "epoch": 0.5533019007798832, + "grad_norm": 0.11878730356693268, + "learning_rate": 0.0004237235589999033, + "loss": 2.6316, + "step": 18659 + }, + { + "epoch": 0.5533315541321946, + "grad_norm": 0.1120305061340332, + "learning_rate": 0.0004236770590938259, + "loss": 2.7013, + "step": 18660 + }, + { + "epoch": 0.5533612074845061, + "grad_norm": 0.11348570883274078, + "learning_rate": 0.00042363055986360115, + "loss": 2.673, + "step": 18661 + }, + { + "epoch": 0.5533908608368177, + "grad_norm": 0.11847617477178574, + "learning_rate": 0.0004235840613096409, + "loss": 2.648, + "step": 18662 + }, + { + "epoch": 0.5534205141891291, + "grad_norm": 0.12368586659431458, + "learning_rate": 0.00042353756343235696, + "loss": 2.6855, + "step": 18663 + }, + { + "epoch": 0.5534501675414406, + "grad_norm": 0.1105254516005516, + "learning_rate": 0.00042349106623216105, + "loss": 2.6815, + "step": 18664 + }, + { + "epoch": 0.553479820893752, + "grad_norm": 0.11833681166172028, + "learning_rate": 0.0004234445697094648, + "loss": 2.6639, + "step": 18665 + }, + { + "epoch": 0.5535094742460636, + "grad_norm": 0.10934220254421234, + "learning_rate": 0.00042339807386468023, + "loss": 2.6251, + "step": 18666 + }, + { + "epoch": 0.553539127598375, + "grad_norm": 0.12076275795698166, + "learning_rate": 0.00042335157869821866, + "loss": 2.6395, + "step": 18667 + }, + { + "epoch": 0.5535687809506865, + "grad_norm": 0.15486779808998108, + "learning_rate": 0.00042330508421049184, + "loss": 2.6567, + "step": 18668 + }, + { + "epoch": 0.5535984343029979, + "grad_norm": 0.11861676722764969, + "learning_rate": 0.00042325859040191196, + "loss": 2.6629, + "step": 18669 + }, + { + "epoch": 0.5536280876553095, + "grad_norm": 0.12805801630020142, + "learning_rate": 0.00042321209727289033, + "loss": 2.6825, + "step": 18670 + }, + { + "epoch": 0.5536577410076209, + "grad_norm": 0.11933567374944687, + "learning_rate": 0.00042316560482383883, + "loss": 2.6161, + "step": 18671 + }, + { + "epoch": 0.5536873943599324, + "grad_norm": 0.13736605644226074, + "learning_rate": 0.000423119113055169, + "loss": 2.6487, + "step": 18672 + }, + { + "epoch": 0.5537170477122438, + "grad_norm": 0.14094145596027374, + "learning_rate": 0.0004230726219672927, + "loss": 2.6599, + "step": 18673 + }, + { + "epoch": 0.5537467010645554, + "grad_norm": 0.14647875726222992, + "learning_rate": 0.00042302613156062146, + "loss": 2.6445, + "step": 18674 + }, + { + "epoch": 0.5537763544168668, + "grad_norm": 0.13877083361148834, + "learning_rate": 0.0004229796418355671, + "loss": 2.6554, + "step": 18675 + }, + { + "epoch": 0.5538060077691783, + "grad_norm": 0.1374383270740509, + "learning_rate": 0.00042293315279254127, + "loss": 2.6501, + "step": 18676 + }, + { + "epoch": 0.5538356611214897, + "grad_norm": 0.12716995179653168, + "learning_rate": 0.00042288666443195556, + "loss": 2.641, + "step": 18677 + }, + { + "epoch": 0.5538653144738013, + "grad_norm": 0.1379184126853943, + "learning_rate": 0.0004228401767542217, + "loss": 2.6769, + "step": 18678 + }, + { + "epoch": 0.5538949678261127, + "grad_norm": 0.12969408929347992, + "learning_rate": 0.0004227936897597512, + "loss": 2.6701, + "step": 18679 + }, + { + "epoch": 0.5539246211784242, + "grad_norm": 0.12427078187465668, + "learning_rate": 0.0004227472034489559, + "loss": 2.6424, + "step": 18680 + }, + { + "epoch": 0.5539542745307356, + "grad_norm": 0.10653313994407654, + "learning_rate": 0.0004227007178222473, + "loss": 2.6649, + "step": 18681 + }, + { + "epoch": 0.5539839278830472, + "grad_norm": 0.11694467812776566, + "learning_rate": 0.0004226542328800372, + "loss": 2.6644, + "step": 18682 + }, + { + "epoch": 0.5540135812353587, + "grad_norm": 0.11866483837366104, + "learning_rate": 0.00042260774862273707, + "loss": 2.6466, + "step": 18683 + }, + { + "epoch": 0.5540432345876701, + "grad_norm": 0.12864448130130768, + "learning_rate": 0.0004225612650507587, + "loss": 2.6783, + "step": 18684 + }, + { + "epoch": 0.5540728879399817, + "grad_norm": 0.11786926537752151, + "learning_rate": 0.0004225147821645136, + "loss": 2.678, + "step": 18685 + }, + { + "epoch": 0.5541025412922931, + "grad_norm": 0.1059906929731369, + "learning_rate": 0.0004224682999644135, + "loss": 2.6583, + "step": 18686 + }, + { + "epoch": 0.5541321946446046, + "grad_norm": 0.11822595447301865, + "learning_rate": 0.0004224218184508698, + "loss": 2.666, + "step": 18687 + }, + { + "epoch": 0.554161847996916, + "grad_norm": 0.10695113241672516, + "learning_rate": 0.0004223753376242942, + "loss": 2.6431, + "step": 18688 + }, + { + "epoch": 0.5541915013492276, + "grad_norm": 0.10175736993551254, + "learning_rate": 0.0004223288574850983, + "loss": 2.6746, + "step": 18689 + }, + { + "epoch": 0.554221154701539, + "grad_norm": 0.10690216720104218, + "learning_rate": 0.0004222823780336937, + "loss": 2.6469, + "step": 18690 + }, + { + "epoch": 0.5542508080538505, + "grad_norm": 0.10671897232532501, + "learning_rate": 0.00042223589927049203, + "loss": 2.6796, + "step": 18691 + }, + { + "epoch": 0.5542804614061619, + "grad_norm": 0.09780076146125793, + "learning_rate": 0.0004221894211959048, + "loss": 2.6553, + "step": 18692 + }, + { + "epoch": 0.5543101147584735, + "grad_norm": 0.10943811386823654, + "learning_rate": 0.0004221429438103435, + "loss": 2.6974, + "step": 18693 + }, + { + "epoch": 0.5543397681107849, + "grad_norm": 0.12689073383808136, + "learning_rate": 0.00042209646711421987, + "loss": 2.658, + "step": 18694 + }, + { + "epoch": 0.5543694214630964, + "grad_norm": 0.11880248785018921, + "learning_rate": 0.00042204999110794547, + "loss": 2.6505, + "step": 18695 + }, + { + "epoch": 0.5543990748154078, + "grad_norm": 0.10707200318574905, + "learning_rate": 0.00042200351579193174, + "loss": 2.7098, + "step": 18696 + }, + { + "epoch": 0.5544287281677194, + "grad_norm": 0.10685323923826218, + "learning_rate": 0.00042195704116659036, + "loss": 2.6528, + "step": 18697 + }, + { + "epoch": 0.5544583815200308, + "grad_norm": 0.09715773165225983, + "learning_rate": 0.00042191056723233267, + "loss": 2.6491, + "step": 18698 + }, + { + "epoch": 0.5544880348723423, + "grad_norm": 0.10733028501272202, + "learning_rate": 0.0004218640939895703, + "loss": 2.6571, + "step": 18699 + }, + { + "epoch": 0.5545176882246537, + "grad_norm": 0.11156773567199707, + "learning_rate": 0.00042181762143871484, + "loss": 2.667, + "step": 18700 + }, + { + "epoch": 0.5545473415769653, + "grad_norm": 0.10278249531984329, + "learning_rate": 0.0004217711495801777, + "loss": 2.6559, + "step": 18701 + }, + { + "epoch": 0.5545769949292767, + "grad_norm": 0.10448473691940308, + "learning_rate": 0.0004217246784143705, + "loss": 2.6709, + "step": 18702 + }, + { + "epoch": 0.5546066482815882, + "grad_norm": 0.11463803052902222, + "learning_rate": 0.00042167820794170464, + "loss": 2.668, + "step": 18703 + }, + { + "epoch": 0.5546363016338998, + "grad_norm": 0.12164616584777832, + "learning_rate": 0.00042163173816259187, + "loss": 2.6219, + "step": 18704 + }, + { + "epoch": 0.5546659549862112, + "grad_norm": 0.1008865088224411, + "learning_rate": 0.00042158526907744336, + "loss": 2.6676, + "step": 18705 + }, + { + "epoch": 0.5546956083385227, + "grad_norm": 0.11039257794618607, + "learning_rate": 0.0004215388006866706, + "loss": 2.6356, + "step": 18706 + }, + { + "epoch": 0.5547252616908341, + "grad_norm": 0.11241055279970169, + "learning_rate": 0.0004214923329906855, + "loss": 2.6799, + "step": 18707 + }, + { + "epoch": 0.5547549150431457, + "grad_norm": 0.110847108066082, + "learning_rate": 0.00042144586598989915, + "loss": 2.6382, + "step": 18708 + }, + { + "epoch": 0.5547845683954571, + "grad_norm": 0.10679349303245544, + "learning_rate": 0.0004213993996847232, + "loss": 2.6897, + "step": 18709 + }, + { + "epoch": 0.5548142217477686, + "grad_norm": 0.10302310436964035, + "learning_rate": 0.00042135293407556895, + "loss": 2.6425, + "step": 18710 + }, + { + "epoch": 0.55484387510008, + "grad_norm": 0.10358895361423492, + "learning_rate": 0.0004213064691628481, + "loss": 2.6506, + "step": 18711 + }, + { + "epoch": 0.5548735284523916, + "grad_norm": 0.10792340338230133, + "learning_rate": 0.00042126000494697194, + "loss": 2.6603, + "step": 18712 + }, + { + "epoch": 0.554903181804703, + "grad_norm": 0.12902623414993286, + "learning_rate": 0.0004212135414283519, + "loss": 2.6862, + "step": 18713 + }, + { + "epoch": 0.5549328351570145, + "grad_norm": 0.15219733119010925, + "learning_rate": 0.0004211670786073996, + "loss": 2.6012, + "step": 18714 + }, + { + "epoch": 0.554962488509326, + "grad_norm": 0.13656887412071228, + "learning_rate": 0.0004211206164845262, + "loss": 2.6488, + "step": 18715 + }, + { + "epoch": 0.5549921418616375, + "grad_norm": 0.10783156007528305, + "learning_rate": 0.00042107415506014334, + "loss": 2.6623, + "step": 18716 + }, + { + "epoch": 0.5550217952139489, + "grad_norm": 0.12318123877048492, + "learning_rate": 0.0004210276943346624, + "loss": 2.6982, + "step": 18717 + }, + { + "epoch": 0.5550514485662604, + "grad_norm": 0.13544581830501556, + "learning_rate": 0.0004209812343084947, + "loss": 2.6673, + "step": 18718 + }, + { + "epoch": 0.5550811019185719, + "grad_norm": 0.13764610886573792, + "learning_rate": 0.0004209347749820517, + "loss": 2.6392, + "step": 18719 + }, + { + "epoch": 0.5551107552708834, + "grad_norm": 0.12416181713342667, + "learning_rate": 0.00042088831635574494, + "loss": 2.6364, + "step": 18720 + }, + { + "epoch": 0.5551404086231948, + "grad_norm": 0.1097990944981575, + "learning_rate": 0.00042084185842998566, + "loss": 2.677, + "step": 18721 + }, + { + "epoch": 0.5551700619755063, + "grad_norm": 0.15166804194450378, + "learning_rate": 0.0004207954012051854, + "loss": 2.6788, + "step": 18722 + }, + { + "epoch": 0.5551997153278178, + "grad_norm": 0.138314887881279, + "learning_rate": 0.0004207489446817554, + "loss": 2.6562, + "step": 18723 + }, + { + "epoch": 0.5552293686801293, + "grad_norm": 0.11347472667694092, + "learning_rate": 0.0004207024888601072, + "loss": 2.6615, + "step": 18724 + }, + { + "epoch": 0.5552590220324408, + "grad_norm": 0.13036373257637024, + "learning_rate": 0.000420656033740652, + "loss": 2.6798, + "step": 18725 + }, + { + "epoch": 0.5552886753847522, + "grad_norm": 0.15916180610656738, + "learning_rate": 0.00042060957932380124, + "loss": 2.668, + "step": 18726 + }, + { + "epoch": 0.5553183287370638, + "grad_norm": 0.13815909624099731, + "learning_rate": 0.0004205631256099662, + "loss": 2.6455, + "step": 18727 + }, + { + "epoch": 0.5553479820893752, + "grad_norm": 0.1122511699795723, + "learning_rate": 0.00042051667259955847, + "loss": 2.6803, + "step": 18728 + }, + { + "epoch": 0.5553776354416867, + "grad_norm": 0.11302804946899414, + "learning_rate": 0.00042047022029298914, + "loss": 2.6693, + "step": 18729 + }, + { + "epoch": 0.5554072887939981, + "grad_norm": 0.14182049036026, + "learning_rate": 0.00042042376869066967, + "loss": 2.69, + "step": 18730 + }, + { + "epoch": 0.5554369421463097, + "grad_norm": 0.1183805763721466, + "learning_rate": 0.0004203773177930115, + "loss": 2.6608, + "step": 18731 + }, + { + "epoch": 0.5554665954986211, + "grad_norm": 0.11059907078742981, + "learning_rate": 0.00042033086760042554, + "loss": 2.6584, + "step": 18732 + }, + { + "epoch": 0.5554962488509326, + "grad_norm": 0.1109241172671318, + "learning_rate": 0.00042028441811332363, + "loss": 2.6714, + "step": 18733 + }, + { + "epoch": 0.555525902203244, + "grad_norm": 0.11365145444869995, + "learning_rate": 0.00042023796933211686, + "loss": 2.6477, + "step": 18734 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.12168054282665253, + "learning_rate": 0.0004201915212572167, + "loss": 2.6853, + "step": 18735 + }, + { + "epoch": 0.555585208907867, + "grad_norm": 0.09855128079652786, + "learning_rate": 0.00042014507388903424, + "loss": 2.6632, + "step": 18736 + }, + { + "epoch": 0.5556148622601785, + "grad_norm": 0.10766778886318207, + "learning_rate": 0.00042009862722798075, + "loss": 2.6269, + "step": 18737 + }, + { + "epoch": 0.55564451561249, + "grad_norm": 0.10246209055185318, + "learning_rate": 0.00042005218127446774, + "loss": 2.6412, + "step": 18738 + }, + { + "epoch": 0.5556741689648015, + "grad_norm": 0.08980298042297363, + "learning_rate": 0.0004200057360289063, + "loss": 2.6075, + "step": 18739 + }, + { + "epoch": 0.5557038223171129, + "grad_norm": 0.11161834746599197, + "learning_rate": 0.00041995929149170786, + "loss": 2.6691, + "step": 18740 + }, + { + "epoch": 0.5557334756694244, + "grad_norm": 0.10035392642021179, + "learning_rate": 0.00041991284766328363, + "loss": 2.6455, + "step": 18741 + }, + { + "epoch": 0.5557631290217359, + "grad_norm": 0.11461172252893448, + "learning_rate": 0.000419866404544045, + "loss": 2.6864, + "step": 18742 + }, + { + "epoch": 0.5557927823740474, + "grad_norm": 0.13642393052577972, + "learning_rate": 0.000419819962134403, + "loss": 2.6817, + "step": 18743 + }, + { + "epoch": 0.5558224357263588, + "grad_norm": 0.11986473947763443, + "learning_rate": 0.000419773520434769, + "loss": 2.6567, + "step": 18744 + }, + { + "epoch": 0.5558520890786703, + "grad_norm": 0.12357880175113678, + "learning_rate": 0.00041972707944555403, + "loss": 2.6405, + "step": 18745 + }, + { + "epoch": 0.5558817424309819, + "grad_norm": 0.11841101944446564, + "learning_rate": 0.0004196806391671698, + "loss": 2.6259, + "step": 18746 + }, + { + "epoch": 0.5559113957832933, + "grad_norm": 0.1297348439693451, + "learning_rate": 0.00041963419960002726, + "loss": 2.6603, + "step": 18747 + }, + { + "epoch": 0.5559410491356048, + "grad_norm": 0.1468082219362259, + "learning_rate": 0.00041958776074453764, + "loss": 2.6561, + "step": 18748 + }, + { + "epoch": 0.5559707024879162, + "grad_norm": 0.09235488623380661, + "learning_rate": 0.0004195413226011122, + "loss": 2.6279, + "step": 18749 + }, + { + "epoch": 0.5560003558402278, + "grad_norm": 0.11533965915441513, + "learning_rate": 0.00041949488517016223, + "loss": 2.6485, + "step": 18750 + }, + { + "epoch": 0.5560300091925392, + "grad_norm": 0.13027256727218628, + "learning_rate": 0.0004194484484520988, + "loss": 2.7014, + "step": 18751 + }, + { + "epoch": 0.5560596625448507, + "grad_norm": 0.11379370838403702, + "learning_rate": 0.0004194020124473333, + "loss": 2.6844, + "step": 18752 + }, + { + "epoch": 0.5560893158971622, + "grad_norm": 0.12834815680980682, + "learning_rate": 0.0004193555771562767, + "loss": 2.6676, + "step": 18753 + }, + { + "epoch": 0.5561189692494737, + "grad_norm": 0.10496724396944046, + "learning_rate": 0.00041930914257934035, + "loss": 2.71, + "step": 18754 + }, + { + "epoch": 0.5561486226017851, + "grad_norm": 0.13029490411281586, + "learning_rate": 0.00041926270871693534, + "loss": 2.6554, + "step": 18755 + }, + { + "epoch": 0.5561782759540966, + "grad_norm": 0.1267295628786087, + "learning_rate": 0.0004192162755694729, + "loss": 2.6689, + "step": 18756 + }, + { + "epoch": 0.556207929306408, + "grad_norm": 0.1265234649181366, + "learning_rate": 0.0004191698431373643, + "loss": 2.6662, + "step": 18757 + }, + { + "epoch": 0.5562375826587196, + "grad_norm": 0.12064522504806519, + "learning_rate": 0.0004191234114210204, + "loss": 2.6722, + "step": 18758 + }, + { + "epoch": 0.556267236011031, + "grad_norm": 0.1130671352148056, + "learning_rate": 0.00041907698042085274, + "loss": 2.6737, + "step": 18759 + }, + { + "epoch": 0.5562968893633425, + "grad_norm": 0.12204524874687195, + "learning_rate": 0.0004190305501372723, + "loss": 2.6496, + "step": 18760 + }, + { + "epoch": 0.556326542715654, + "grad_norm": 0.1193331703543663, + "learning_rate": 0.0004189841205706902, + "loss": 2.6692, + "step": 18761 + }, + { + "epoch": 0.5563561960679655, + "grad_norm": 0.1170082539319992, + "learning_rate": 0.0004189376917215178, + "loss": 2.6592, + "step": 18762 + }, + { + "epoch": 0.5563858494202769, + "grad_norm": 0.12969368696212769, + "learning_rate": 0.0004188912635901659, + "loss": 2.652, + "step": 18763 + }, + { + "epoch": 0.5564155027725884, + "grad_norm": 0.11781095713376999, + "learning_rate": 0.00041884483617704577, + "loss": 2.6473, + "step": 18764 + }, + { + "epoch": 0.5564451561248999, + "grad_norm": 0.10693107545375824, + "learning_rate": 0.0004187984094825686, + "loss": 2.705, + "step": 18765 + }, + { + "epoch": 0.5564748094772114, + "grad_norm": 0.10613524913787842, + "learning_rate": 0.0004187519835071454, + "loss": 2.6211, + "step": 18766 + }, + { + "epoch": 0.5565044628295229, + "grad_norm": 0.10255411267280579, + "learning_rate": 0.0004187055582511873, + "loss": 2.6506, + "step": 18767 + }, + { + "epoch": 0.5565341161818343, + "grad_norm": 0.11938349157571793, + "learning_rate": 0.00041865913371510555, + "loss": 2.6402, + "step": 18768 + }, + { + "epoch": 0.5565637695341459, + "grad_norm": 0.12184028327465057, + "learning_rate": 0.00041861270989931117, + "loss": 2.6686, + "step": 18769 + }, + { + "epoch": 0.5565934228864573, + "grad_norm": 0.1090468168258667, + "learning_rate": 0.0004185662868042151, + "loss": 2.6417, + "step": 18770 + }, + { + "epoch": 0.5566230762387688, + "grad_norm": 0.10424765199422836, + "learning_rate": 0.0004185198644302283, + "loss": 2.6901, + "step": 18771 + }, + { + "epoch": 0.5566527295910803, + "grad_norm": 0.11616259813308716, + "learning_rate": 0.00041847344277776236, + "loss": 2.6352, + "step": 18772 + }, + { + "epoch": 0.5566823829433918, + "grad_norm": 0.1096283346414566, + "learning_rate": 0.00041842702184722815, + "loss": 2.6599, + "step": 18773 + }, + { + "epoch": 0.5567120362957032, + "grad_norm": 0.10883237421512604, + "learning_rate": 0.0004183806016390366, + "loss": 2.6405, + "step": 18774 + }, + { + "epoch": 0.5567416896480147, + "grad_norm": 0.1216520145535469, + "learning_rate": 0.00041833418215359876, + "loss": 2.6646, + "step": 18775 + }, + { + "epoch": 0.5567713430003262, + "grad_norm": 0.11834314465522766, + "learning_rate": 0.0004182877633913258, + "loss": 2.6332, + "step": 18776 + }, + { + "epoch": 0.5568009963526377, + "grad_norm": 0.1148541197180748, + "learning_rate": 0.0004182413453526287, + "loss": 2.6649, + "step": 18777 + }, + { + "epoch": 0.5568306497049491, + "grad_norm": 0.10903136432170868, + "learning_rate": 0.00041819492803791853, + "loss": 2.6203, + "step": 18778 + }, + { + "epoch": 0.5568603030572606, + "grad_norm": 0.11085806787014008, + "learning_rate": 0.00041814851144760635, + "loss": 2.6614, + "step": 18779 + }, + { + "epoch": 0.5568899564095721, + "grad_norm": 0.12856067717075348, + "learning_rate": 0.0004181020955821032, + "loss": 2.6707, + "step": 18780 + }, + { + "epoch": 0.5569196097618836, + "grad_norm": 0.1270325779914856, + "learning_rate": 0.00041805568044182, + "loss": 2.643, + "step": 18781 + }, + { + "epoch": 0.556949263114195, + "grad_norm": 0.12742388248443604, + "learning_rate": 0.00041800926602716774, + "loss": 2.6373, + "step": 18782 + }, + { + "epoch": 0.5569789164665065, + "grad_norm": 0.11829089373350143, + "learning_rate": 0.00041796285233855756, + "loss": 2.6812, + "step": 18783 + }, + { + "epoch": 0.557008569818818, + "grad_norm": 0.1130073145031929, + "learning_rate": 0.0004179164393764003, + "loss": 2.6281, + "step": 18784 + }, + { + "epoch": 0.5570382231711295, + "grad_norm": 0.10707933455705643, + "learning_rate": 0.0004178700271411071, + "loss": 2.6554, + "step": 18785 + }, + { + "epoch": 0.557067876523441, + "grad_norm": 0.12270566821098328, + "learning_rate": 0.000417823615633089, + "loss": 2.6237, + "step": 18786 + }, + { + "epoch": 0.5570975298757525, + "grad_norm": 0.12978266179561615, + "learning_rate": 0.0004177772048527568, + "loss": 2.6581, + "step": 18787 + }, + { + "epoch": 0.557127183228064, + "grad_norm": 0.11464279145002365, + "learning_rate": 0.0004177307948005217, + "loss": 2.6614, + "step": 18788 + }, + { + "epoch": 0.5571568365803754, + "grad_norm": 0.10224858671426773, + "learning_rate": 0.00041768438547679445, + "loss": 2.6669, + "step": 18789 + }, + { + "epoch": 0.5571864899326869, + "grad_norm": 0.12119394540786743, + "learning_rate": 0.0004176379768819861, + "loss": 2.6936, + "step": 18790 + }, + { + "epoch": 0.5572161432849984, + "grad_norm": 0.11211510747671127, + "learning_rate": 0.0004175915690165076, + "loss": 2.6455, + "step": 18791 + }, + { + "epoch": 0.5572457966373099, + "grad_norm": 0.13082227110862732, + "learning_rate": 0.00041754516188077, + "loss": 2.6612, + "step": 18792 + }, + { + "epoch": 0.5572754499896213, + "grad_norm": 0.11549527943134308, + "learning_rate": 0.000417498755475184, + "loss": 2.6607, + "step": 18793 + }, + { + "epoch": 0.5573051033419328, + "grad_norm": 0.11466653645038605, + "learning_rate": 0.0004174523498001607, + "loss": 2.6349, + "step": 18794 + }, + { + "epoch": 0.5573347566942443, + "grad_norm": 0.1114039346575737, + "learning_rate": 0.00041740594485611103, + "loss": 2.6693, + "step": 18795 + }, + { + "epoch": 0.5573644100465558, + "grad_norm": 0.10757361352443695, + "learning_rate": 0.0004173595406434459, + "loss": 2.6579, + "step": 18796 + }, + { + "epoch": 0.5573940633988672, + "grad_norm": 0.11775334179401398, + "learning_rate": 0.0004173131371625761, + "loss": 2.6471, + "step": 18797 + }, + { + "epoch": 0.5574237167511787, + "grad_norm": 0.10130616277456284, + "learning_rate": 0.0004172667344139128, + "loss": 2.6283, + "step": 18798 + }, + { + "epoch": 0.5574533701034902, + "grad_norm": 0.11874649673700333, + "learning_rate": 0.0004172203323978667, + "loss": 2.712, + "step": 18799 + }, + { + "epoch": 0.5574830234558017, + "grad_norm": 0.11308437585830688, + "learning_rate": 0.00041717393111484897, + "loss": 2.6575, + "step": 18800 + }, + { + "epoch": 0.5575126768081131, + "grad_norm": 0.1101621687412262, + "learning_rate": 0.00041712753056527015, + "loss": 2.6612, + "step": 18801 + }, + { + "epoch": 0.5575423301604246, + "grad_norm": 0.11257553100585938, + "learning_rate": 0.0004170811307495412, + "loss": 2.6774, + "step": 18802 + }, + { + "epoch": 0.5575719835127361, + "grad_norm": 0.11134203523397446, + "learning_rate": 0.00041703473166807323, + "loss": 2.658, + "step": 18803 + }, + { + "epoch": 0.5576016368650476, + "grad_norm": 0.12692882120609283, + "learning_rate": 0.00041698833332127686, + "loss": 2.653, + "step": 18804 + }, + { + "epoch": 0.557631290217359, + "grad_norm": 0.11641029268503189, + "learning_rate": 0.000416941935709563, + "loss": 2.6593, + "step": 18805 + }, + { + "epoch": 0.5576609435696706, + "grad_norm": 0.11206728965044022, + "learning_rate": 0.00041689553883334266, + "loss": 2.6769, + "step": 18806 + }, + { + "epoch": 0.5576905969219821, + "grad_norm": 0.10507034510374069, + "learning_rate": 0.0004168491426930266, + "loss": 2.671, + "step": 18807 + }, + { + "epoch": 0.5577202502742935, + "grad_norm": 0.11241676658391953, + "learning_rate": 0.00041680274728902555, + "loss": 2.6902, + "step": 18808 + }, + { + "epoch": 0.557749903626605, + "grad_norm": 0.10399741679430008, + "learning_rate": 0.0004167563526217505, + "loss": 2.6528, + "step": 18809 + }, + { + "epoch": 0.5577795569789165, + "grad_norm": 0.11315475404262543, + "learning_rate": 0.000416709958691612, + "loss": 2.6217, + "step": 18810 + }, + { + "epoch": 0.557809210331228, + "grad_norm": 0.12005587667226791, + "learning_rate": 0.0004166635654990215, + "loss": 2.6641, + "step": 18811 + }, + { + "epoch": 0.5578388636835394, + "grad_norm": 0.126426562666893, + "learning_rate": 0.00041661717304438924, + "loss": 2.688, + "step": 18812 + }, + { + "epoch": 0.5578685170358509, + "grad_norm": 0.1251785010099411, + "learning_rate": 0.0004165707813281262, + "loss": 2.6431, + "step": 18813 + }, + { + "epoch": 0.5578981703881624, + "grad_norm": 0.1286834329366684, + "learning_rate": 0.0004165243903506433, + "loss": 2.6607, + "step": 18814 + }, + { + "epoch": 0.5579278237404739, + "grad_norm": 0.11659617722034454, + "learning_rate": 0.00041647800011235123, + "loss": 2.6582, + "step": 18815 + }, + { + "epoch": 0.5579574770927853, + "grad_norm": 0.1344568431377411, + "learning_rate": 0.00041643161061366075, + "loss": 2.6555, + "step": 18816 + }, + { + "epoch": 0.5579871304450968, + "grad_norm": 0.13879437744617462, + "learning_rate": 0.00041638522185498275, + "loss": 2.6376, + "step": 18817 + }, + { + "epoch": 0.5580167837974083, + "grad_norm": 0.12444859743118286, + "learning_rate": 0.0004163388338367281, + "loss": 2.6819, + "step": 18818 + }, + { + "epoch": 0.5580464371497198, + "grad_norm": 0.10953623056411743, + "learning_rate": 0.00041629244655930724, + "loss": 2.6953, + "step": 18819 + }, + { + "epoch": 0.5580760905020312, + "grad_norm": 0.13554716110229492, + "learning_rate": 0.0004162460600231312, + "loss": 2.6614, + "step": 18820 + }, + { + "epoch": 0.5581057438543428, + "grad_norm": 0.12677313387393951, + "learning_rate": 0.0004161996742286107, + "loss": 2.6449, + "step": 18821 + }, + { + "epoch": 0.5581353972066542, + "grad_norm": 0.1300523281097412, + "learning_rate": 0.00041615328917615643, + "loss": 2.6557, + "step": 18822 + }, + { + "epoch": 0.5581650505589657, + "grad_norm": 0.1018727570772171, + "learning_rate": 0.000416106904866179, + "loss": 2.6241, + "step": 18823 + }, + { + "epoch": 0.5581947039112771, + "grad_norm": 0.12337352335453033, + "learning_rate": 0.00041606052129908956, + "loss": 2.6536, + "step": 18824 + }, + { + "epoch": 0.5582243572635887, + "grad_norm": 0.10880471020936966, + "learning_rate": 0.0004160141384752986, + "loss": 2.6424, + "step": 18825 + }, + { + "epoch": 0.5582540106159001, + "grad_norm": 0.11664125323295593, + "learning_rate": 0.0004159677563952168, + "loss": 2.6312, + "step": 18826 + }, + { + "epoch": 0.5582836639682116, + "grad_norm": 0.1158258467912674, + "learning_rate": 0.0004159213750592549, + "loss": 2.6767, + "step": 18827 + }, + { + "epoch": 0.5583133173205231, + "grad_norm": 0.11830519884824753, + "learning_rate": 0.00041587499446782384, + "loss": 2.6886, + "step": 18828 + }, + { + "epoch": 0.5583429706728346, + "grad_norm": 0.12172611057758331, + "learning_rate": 0.00041582861462133406, + "loss": 2.6635, + "step": 18829 + }, + { + "epoch": 0.5583726240251461, + "grad_norm": 0.1190258264541626, + "learning_rate": 0.00041578223552019624, + "loss": 2.6467, + "step": 18830 + }, + { + "epoch": 0.5584022773774575, + "grad_norm": 0.10855535417795181, + "learning_rate": 0.00041573585716482125, + "loss": 2.6423, + "step": 18831 + }, + { + "epoch": 0.558431930729769, + "grad_norm": 0.10495711863040924, + "learning_rate": 0.00041568947955561967, + "loss": 2.6811, + "step": 18832 + }, + { + "epoch": 0.5584615840820805, + "grad_norm": 0.1250522881746292, + "learning_rate": 0.00041564310269300226, + "loss": 2.658, + "step": 18833 + }, + { + "epoch": 0.558491237434392, + "grad_norm": 0.12334340065717697, + "learning_rate": 0.0004155967265773797, + "loss": 2.6752, + "step": 18834 + }, + { + "epoch": 0.5585208907867034, + "grad_norm": 0.11013906449079514, + "learning_rate": 0.0004155503512091626, + "loss": 2.6651, + "step": 18835 + }, + { + "epoch": 0.558550544139015, + "grad_norm": 0.11840593814849854, + "learning_rate": 0.00041550397658876137, + "loss": 2.6701, + "step": 18836 + }, + { + "epoch": 0.5585801974913264, + "grad_norm": 0.11022132635116577, + "learning_rate": 0.0004154576027165872, + "loss": 2.6568, + "step": 18837 + }, + { + "epoch": 0.5586098508436379, + "grad_norm": 0.13372494280338287, + "learning_rate": 0.00041541122959305043, + "loss": 2.6579, + "step": 18838 + }, + { + "epoch": 0.5586395041959493, + "grad_norm": 0.15598727762699127, + "learning_rate": 0.0004153648572185618, + "loss": 2.7037, + "step": 18839 + }, + { + "epoch": 0.5586691575482609, + "grad_norm": 0.14951838552951813, + "learning_rate": 0.0004153184855935319, + "loss": 2.684, + "step": 18840 + }, + { + "epoch": 0.5586988109005723, + "grad_norm": 0.12001579999923706, + "learning_rate": 0.00041527211471837125, + "loss": 2.6432, + "step": 18841 + }, + { + "epoch": 0.5587284642528838, + "grad_norm": 0.11117486655712128, + "learning_rate": 0.00041522574459349057, + "loss": 2.6607, + "step": 18842 + }, + { + "epoch": 0.5587581176051952, + "grad_norm": 0.13318143784999847, + "learning_rate": 0.0004151793752193005, + "loss": 2.6603, + "step": 18843 + }, + { + "epoch": 0.5587877709575068, + "grad_norm": 0.13722844421863556, + "learning_rate": 0.0004151330065962116, + "loss": 2.6928, + "step": 18844 + }, + { + "epoch": 0.5588174243098182, + "grad_norm": 0.11549080163240433, + "learning_rate": 0.0004150866387246346, + "loss": 2.6602, + "step": 18845 + }, + { + "epoch": 0.5588470776621297, + "grad_norm": 0.12824800610542297, + "learning_rate": 0.0004150402716049799, + "loss": 2.6279, + "step": 18846 + }, + { + "epoch": 0.5588767310144411, + "grad_norm": 0.11481111496686935, + "learning_rate": 0.00041499390523765816, + "loss": 2.6319, + "step": 18847 + }, + { + "epoch": 0.5589063843667527, + "grad_norm": 0.1058037281036377, + "learning_rate": 0.00041494753962308004, + "loss": 2.6578, + "step": 18848 + }, + { + "epoch": 0.5589360377190642, + "grad_norm": 0.12836652994155884, + "learning_rate": 0.0004149011747616559, + "loss": 2.651, + "step": 18849 + }, + { + "epoch": 0.5589656910713756, + "grad_norm": 0.10983728617429733, + "learning_rate": 0.00041485481065379657, + "loss": 2.6824, + "step": 18850 + }, + { + "epoch": 0.5589953444236871, + "grad_norm": 0.11936930567026138, + "learning_rate": 0.0004148084472999125, + "loss": 2.6187, + "step": 18851 + }, + { + "epoch": 0.5590249977759986, + "grad_norm": 0.11739924550056458, + "learning_rate": 0.00041476208470041427, + "loss": 2.6574, + "step": 18852 + }, + { + "epoch": 0.5590546511283101, + "grad_norm": 0.11641807854175568, + "learning_rate": 0.00041471572285571237, + "loss": 2.651, + "step": 18853 + }, + { + "epoch": 0.5590843044806215, + "grad_norm": 0.11014597862958908, + "learning_rate": 0.00041466936176621746, + "loss": 2.6341, + "step": 18854 + }, + { + "epoch": 0.559113957832933, + "grad_norm": 0.12445235252380371, + "learning_rate": 0.00041462300143233996, + "loss": 2.6101, + "step": 18855 + }, + { + "epoch": 0.5591436111852445, + "grad_norm": 0.12424451857805252, + "learning_rate": 0.0004145766418544905, + "loss": 2.6385, + "step": 18856 + }, + { + "epoch": 0.559173264537556, + "grad_norm": 0.11585196852684021, + "learning_rate": 0.0004145302830330795, + "loss": 2.6342, + "step": 18857 + }, + { + "epoch": 0.5592029178898674, + "grad_norm": 0.1132708340883255, + "learning_rate": 0.00041448392496851747, + "loss": 2.6442, + "step": 18858 + }, + { + "epoch": 0.559232571242179, + "grad_norm": 0.13018745183944702, + "learning_rate": 0.000414437567661215, + "loss": 2.6943, + "step": 18859 + }, + { + "epoch": 0.5592622245944904, + "grad_norm": 0.12137921154499054, + "learning_rate": 0.0004143912111115825, + "loss": 2.6305, + "step": 18860 + }, + { + "epoch": 0.5592918779468019, + "grad_norm": 0.11209193617105484, + "learning_rate": 0.00041434485532003054, + "loss": 2.6592, + "step": 18861 + }, + { + "epoch": 0.5593215312991133, + "grad_norm": 0.11822634935379028, + "learning_rate": 0.00041429850028696947, + "loss": 2.6608, + "step": 18862 + }, + { + "epoch": 0.5593511846514249, + "grad_norm": 0.10517312586307526, + "learning_rate": 0.00041425214601281, + "loss": 2.6472, + "step": 18863 + }, + { + "epoch": 0.5593808380037363, + "grad_norm": 0.11048772186040878, + "learning_rate": 0.0004142057924979626, + "loss": 2.685, + "step": 18864 + }, + { + "epoch": 0.5594104913560478, + "grad_norm": 0.12174319475889206, + "learning_rate": 0.00041415943974283757, + "loss": 2.6999, + "step": 18865 + }, + { + "epoch": 0.5594401447083592, + "grad_norm": 0.13102547824382782, + "learning_rate": 0.0004141130877478455, + "loss": 2.6582, + "step": 18866 + }, + { + "epoch": 0.5594697980606708, + "grad_norm": 0.13322195410728455, + "learning_rate": 0.00041406673651339665, + "loss": 2.6638, + "step": 18867 + }, + { + "epoch": 0.5594994514129822, + "grad_norm": 0.13272221386432648, + "learning_rate": 0.0004140203860399017, + "loss": 2.6435, + "step": 18868 + }, + { + "epoch": 0.5595291047652937, + "grad_norm": 0.10998839884996414, + "learning_rate": 0.00041397403632777093, + "loss": 2.6661, + "step": 18869 + }, + { + "epoch": 0.5595587581176052, + "grad_norm": 0.12361843883991241, + "learning_rate": 0.00041392768737741483, + "loss": 2.622, + "step": 18870 + }, + { + "epoch": 0.5595884114699167, + "grad_norm": 0.14564669132232666, + "learning_rate": 0.0004138813391892438, + "loss": 2.6694, + "step": 18871 + }, + { + "epoch": 0.5596180648222282, + "grad_norm": 0.1589134931564331, + "learning_rate": 0.00041383499176366834, + "loss": 2.6498, + "step": 18872 + }, + { + "epoch": 0.5596477181745396, + "grad_norm": 0.11691656708717346, + "learning_rate": 0.00041378864510109895, + "loss": 2.6377, + "step": 18873 + }, + { + "epoch": 0.5596773715268512, + "grad_norm": 0.12182316184043884, + "learning_rate": 0.0004137422992019458, + "loss": 2.6174, + "step": 18874 + }, + { + "epoch": 0.5597070248791626, + "grad_norm": 0.11911146342754364, + "learning_rate": 0.00041369595406661906, + "loss": 2.668, + "step": 18875 + }, + { + "epoch": 0.5597366782314741, + "grad_norm": 0.12094727158546448, + "learning_rate": 0.00041364960969552983, + "loss": 2.7129, + "step": 18876 + }, + { + "epoch": 0.5597663315837855, + "grad_norm": 0.12291692197322845, + "learning_rate": 0.00041360326608908817, + "loss": 2.6944, + "step": 18877 + }, + { + "epoch": 0.5597959849360971, + "grad_norm": 0.13325455784797668, + "learning_rate": 0.00041355692324770434, + "loss": 2.6914, + "step": 18878 + }, + { + "epoch": 0.5598256382884085, + "grad_norm": 0.12875120341777802, + "learning_rate": 0.00041351058117178877, + "loss": 2.6349, + "step": 18879 + }, + { + "epoch": 0.55985529164072, + "grad_norm": 0.11060430854558945, + "learning_rate": 0.0004134642398617518, + "loss": 2.6473, + "step": 18880 + }, + { + "epoch": 0.5598849449930314, + "grad_norm": 0.12127097696065903, + "learning_rate": 0.00041341789931800387, + "loss": 2.6364, + "step": 18881 + }, + { + "epoch": 0.559914598345343, + "grad_norm": 0.11022595316171646, + "learning_rate": 0.00041337155954095534, + "loss": 2.67, + "step": 18882 + }, + { + "epoch": 0.5599442516976544, + "grad_norm": 0.11820097267627716, + "learning_rate": 0.0004133252205310166, + "loss": 2.6785, + "step": 18883 + }, + { + "epoch": 0.5599739050499659, + "grad_norm": 0.1188754066824913, + "learning_rate": 0.00041327888228859775, + "loss": 2.6601, + "step": 18884 + }, + { + "epoch": 0.5600035584022773, + "grad_norm": 0.11524835973978043, + "learning_rate": 0.00041323254481410944, + "loss": 2.6827, + "step": 18885 + }, + { + "epoch": 0.5600332117545889, + "grad_norm": 0.11411368101835251, + "learning_rate": 0.0004131862081079617, + "loss": 2.6858, + "step": 18886 + }, + { + "epoch": 0.5600628651069003, + "grad_norm": 0.1120716780424118, + "learning_rate": 0.0004131398721705649, + "loss": 2.6489, + "step": 18887 + }, + { + "epoch": 0.5600925184592118, + "grad_norm": 0.10623801499605179, + "learning_rate": 0.0004130935370023296, + "loss": 2.6724, + "step": 18888 + }, + { + "epoch": 0.5601221718115232, + "grad_norm": 0.12231239676475525, + "learning_rate": 0.00041304720260366593, + "loss": 2.6782, + "step": 18889 + }, + { + "epoch": 0.5601518251638348, + "grad_norm": 0.11754009127616882, + "learning_rate": 0.00041300086897498416, + "loss": 2.6627, + "step": 18890 + }, + { + "epoch": 0.5601814785161463, + "grad_norm": 0.11463424563407898, + "learning_rate": 0.00041295453611669465, + "loss": 2.6547, + "step": 18891 + }, + { + "epoch": 0.5602111318684577, + "grad_norm": 0.1259148269891739, + "learning_rate": 0.0004129082040292077, + "loss": 2.6253, + "step": 18892 + }, + { + "epoch": 0.5602407852207693, + "grad_norm": 0.09915214776992798, + "learning_rate": 0.00041286187271293354, + "loss": 2.6564, + "step": 18893 + }, + { + "epoch": 0.5602704385730807, + "grad_norm": 0.11083835363388062, + "learning_rate": 0.00041281554216828257, + "loss": 2.6602, + "step": 18894 + }, + { + "epoch": 0.5603000919253922, + "grad_norm": 0.10734007507562637, + "learning_rate": 0.0004127692123956648, + "loss": 2.6576, + "step": 18895 + }, + { + "epoch": 0.5603297452777036, + "grad_norm": 0.11369059979915619, + "learning_rate": 0.00041272288339549057, + "loss": 2.6726, + "step": 18896 + }, + { + "epoch": 0.5603593986300152, + "grad_norm": 0.11691151559352875, + "learning_rate": 0.0004126765551681703, + "loss": 2.6416, + "step": 18897 + }, + { + "epoch": 0.5603890519823266, + "grad_norm": 0.11807571351528168, + "learning_rate": 0.0004126302277141141, + "loss": 2.6698, + "step": 18898 + }, + { + "epoch": 0.5604187053346381, + "grad_norm": 0.11059946566820145, + "learning_rate": 0.00041258390103373217, + "loss": 2.6601, + "step": 18899 + }, + { + "epoch": 0.5604483586869495, + "grad_norm": 0.11043122410774231, + "learning_rate": 0.0004125375751274347, + "loss": 2.6797, + "step": 18900 + }, + { + "epoch": 0.5604780120392611, + "grad_norm": 0.11869234591722488, + "learning_rate": 0.0004124912499956322, + "loss": 2.6349, + "step": 18901 + }, + { + "epoch": 0.5605076653915725, + "grad_norm": 0.11751625686883926, + "learning_rate": 0.00041244492563873464, + "loss": 2.6522, + "step": 18902 + }, + { + "epoch": 0.560537318743884, + "grad_norm": 0.10600338876247406, + "learning_rate": 0.0004123986020571523, + "loss": 2.6459, + "step": 18903 + }, + { + "epoch": 0.5605669720961954, + "grad_norm": 0.10009054094552994, + "learning_rate": 0.00041235227925129553, + "loss": 2.6575, + "step": 18904 + }, + { + "epoch": 0.560596625448507, + "grad_norm": 0.09989242255687714, + "learning_rate": 0.0004123059572215742, + "loss": 2.6308, + "step": 18905 + }, + { + "epoch": 0.5606262788008184, + "grad_norm": 0.1034647673368454, + "learning_rate": 0.0004122596359683987, + "loss": 2.6571, + "step": 18906 + }, + { + "epoch": 0.5606559321531299, + "grad_norm": 0.1159796193242073, + "learning_rate": 0.00041221331549217923, + "loss": 2.6953, + "step": 18907 + }, + { + "epoch": 0.5606855855054413, + "grad_norm": 0.1408625692129135, + "learning_rate": 0.00041216699579332583, + "loss": 2.6534, + "step": 18908 + }, + { + "epoch": 0.5607152388577529, + "grad_norm": 0.1214267835021019, + "learning_rate": 0.0004121206768722488, + "loss": 2.6209, + "step": 18909 + }, + { + "epoch": 0.5607448922100643, + "grad_norm": 0.10256348550319672, + "learning_rate": 0.0004120743587293583, + "loss": 2.6656, + "step": 18910 + }, + { + "epoch": 0.5607745455623758, + "grad_norm": 0.11321251094341278, + "learning_rate": 0.0004120280413650645, + "loss": 2.6518, + "step": 18911 + }, + { + "epoch": 0.5608041989146874, + "grad_norm": 0.13063043355941772, + "learning_rate": 0.00041198172477977733, + "loss": 2.6787, + "step": 18912 + }, + { + "epoch": 0.5608338522669988, + "grad_norm": 0.12159550935029984, + "learning_rate": 0.00041193540897390703, + "loss": 2.6553, + "step": 18913 + }, + { + "epoch": 0.5608635056193103, + "grad_norm": 0.12103990465402603, + "learning_rate": 0.0004118890939478639, + "loss": 2.6649, + "step": 18914 + }, + { + "epoch": 0.5608931589716217, + "grad_norm": 0.11972180008888245, + "learning_rate": 0.0004118427797020581, + "loss": 2.6712, + "step": 18915 + }, + { + "epoch": 0.5609228123239333, + "grad_norm": 0.11592014878988266, + "learning_rate": 0.00041179646623689954, + "loss": 2.6372, + "step": 18916 + }, + { + "epoch": 0.5609524656762447, + "grad_norm": 0.1082436665892601, + "learning_rate": 0.00041175015355279836, + "loss": 2.6597, + "step": 18917 + }, + { + "epoch": 0.5609821190285562, + "grad_norm": 0.12244050204753876, + "learning_rate": 0.0004117038416501648, + "loss": 2.6945, + "step": 18918 + }, + { + "epoch": 0.5610117723808676, + "grad_norm": 0.12948842346668243, + "learning_rate": 0.0004116575305294088, + "loss": 2.6598, + "step": 18919 + }, + { + "epoch": 0.5610414257331792, + "grad_norm": 0.11819623410701752, + "learning_rate": 0.00041161122019094055, + "loss": 2.6338, + "step": 18920 + }, + { + "epoch": 0.5610710790854906, + "grad_norm": 0.12464351952075958, + "learning_rate": 0.00041156491063517025, + "loss": 2.6747, + "step": 18921 + }, + { + "epoch": 0.5611007324378021, + "grad_norm": 0.11596237868070602, + "learning_rate": 0.00041151860186250773, + "loss": 2.6462, + "step": 18922 + }, + { + "epoch": 0.5611303857901135, + "grad_norm": 0.11011696606874466, + "learning_rate": 0.00041147229387336316, + "loss": 2.6386, + "step": 18923 + }, + { + "epoch": 0.5611600391424251, + "grad_norm": 0.1089383214712143, + "learning_rate": 0.00041142598666814664, + "loss": 2.6065, + "step": 18924 + }, + { + "epoch": 0.5611896924947365, + "grad_norm": 0.10348797589540482, + "learning_rate": 0.0004113796802472682, + "loss": 2.6438, + "step": 18925 + }, + { + "epoch": 0.561219345847048, + "grad_norm": 0.12800975143909454, + "learning_rate": 0.0004113333746111378, + "loss": 2.6077, + "step": 18926 + }, + { + "epoch": 0.5612489991993594, + "grad_norm": 0.1396535336971283, + "learning_rate": 0.0004112870697601657, + "loss": 2.6128, + "step": 18927 + }, + { + "epoch": 0.561278652551671, + "grad_norm": 0.11650652438402176, + "learning_rate": 0.0004112407656947618, + "loss": 2.6434, + "step": 18928 + }, + { + "epoch": 0.5613083059039824, + "grad_norm": 0.13375632464885712, + "learning_rate": 0.0004111944624153362, + "loss": 2.683, + "step": 18929 + }, + { + "epoch": 0.5613379592562939, + "grad_norm": 0.11826319992542267, + "learning_rate": 0.00041114815992229883, + "loss": 2.649, + "step": 18930 + }, + { + "epoch": 0.5613676126086053, + "grad_norm": 0.10454254597425461, + "learning_rate": 0.0004111018582160598, + "loss": 2.6653, + "step": 18931 + }, + { + "epoch": 0.5613972659609169, + "grad_norm": 0.13118384778499603, + "learning_rate": 0.0004110555572970291, + "loss": 2.614, + "step": 18932 + }, + { + "epoch": 0.5614269193132284, + "grad_norm": 0.13687071204185486, + "learning_rate": 0.0004110092571656167, + "loss": 2.685, + "step": 18933 + }, + { + "epoch": 0.5614565726655398, + "grad_norm": 0.13166864216327667, + "learning_rate": 0.00041096295782223257, + "loss": 2.6834, + "step": 18934 + }, + { + "epoch": 0.5614862260178514, + "grad_norm": 0.12636980414390564, + "learning_rate": 0.0004109166592672867, + "loss": 2.6553, + "step": 18935 + }, + { + "epoch": 0.5615158793701628, + "grad_norm": 0.11553282290697098, + "learning_rate": 0.00041087036150118915, + "loss": 2.6265, + "step": 18936 + }, + { + "epoch": 0.5615455327224743, + "grad_norm": 0.12314147502183914, + "learning_rate": 0.00041082406452434985, + "loss": 2.6569, + "step": 18937 + }, + { + "epoch": 0.5615751860747857, + "grad_norm": 0.13182979822158813, + "learning_rate": 0.00041077776833717885, + "loss": 2.6544, + "step": 18938 + }, + { + "epoch": 0.5616048394270973, + "grad_norm": 0.11891800910234451, + "learning_rate": 0.00041073147294008565, + "loss": 2.6363, + "step": 18939 + }, + { + "epoch": 0.5616344927794087, + "grad_norm": 0.11019252240657806, + "learning_rate": 0.00041068517833348084, + "loss": 2.6472, + "step": 18940 + }, + { + "epoch": 0.5616641461317202, + "grad_norm": 0.12202446162700653, + "learning_rate": 0.00041063888451777414, + "loss": 2.6369, + "step": 18941 + }, + { + "epoch": 0.5616937994840316, + "grad_norm": 0.12015582621097565, + "learning_rate": 0.0004105925914933756, + "loss": 2.6566, + "step": 18942 + }, + { + "epoch": 0.5617234528363432, + "grad_norm": 0.1283552050590515, + "learning_rate": 0.00041054629926069475, + "loss": 2.6758, + "step": 18943 + }, + { + "epoch": 0.5617531061886546, + "grad_norm": 0.13819588720798492, + "learning_rate": 0.0004105000078201419, + "loss": 2.666, + "step": 18944 + }, + { + "epoch": 0.5617827595409661, + "grad_norm": 0.1350407600402832, + "learning_rate": 0.00041045371717212683, + "loss": 2.6735, + "step": 18945 + }, + { + "epoch": 0.5618124128932775, + "grad_norm": 0.1313459277153015, + "learning_rate": 0.0004104074273170594, + "loss": 2.7104, + "step": 18946 + }, + { + "epoch": 0.5618420662455891, + "grad_norm": 0.11654561758041382, + "learning_rate": 0.00041036113825534964, + "loss": 2.6544, + "step": 18947 + }, + { + "epoch": 0.5618717195979005, + "grad_norm": 0.13490451872348785, + "learning_rate": 0.0004103148499874074, + "loss": 2.6342, + "step": 18948 + }, + { + "epoch": 0.561901372950212, + "grad_norm": 0.1297486424446106, + "learning_rate": 0.0004102685625136426, + "loss": 2.6624, + "step": 18949 + }, + { + "epoch": 0.5619310263025235, + "grad_norm": 0.12425536662340164, + "learning_rate": 0.00041022227583446504, + "loss": 2.6216, + "step": 18950 + }, + { + "epoch": 0.561960679654835, + "grad_norm": 0.12281766533851624, + "learning_rate": 0.0004101759899502846, + "loss": 2.6212, + "step": 18951 + }, + { + "epoch": 0.5619903330071464, + "grad_norm": 0.11711497604846954, + "learning_rate": 0.0004101297048615109, + "loss": 2.6909, + "step": 18952 + }, + { + "epoch": 0.5620199863594579, + "grad_norm": 0.11455237865447998, + "learning_rate": 0.00041008342056855453, + "loss": 2.6669, + "step": 18953 + }, + { + "epoch": 0.5620496397117695, + "grad_norm": 0.10816741734743118, + "learning_rate": 0.0004100371370718248, + "loss": 2.6405, + "step": 18954 + }, + { + "epoch": 0.5620792930640809, + "grad_norm": 0.10775966942310333, + "learning_rate": 0.00040999085437173156, + "loss": 2.6388, + "step": 18955 + }, + { + "epoch": 0.5621089464163924, + "grad_norm": 0.12914451956748962, + "learning_rate": 0.00040994457246868485, + "loss": 2.7045, + "step": 18956 + }, + { + "epoch": 0.5621385997687038, + "grad_norm": 0.11977265030145645, + "learning_rate": 0.00040989829136309436, + "loss": 2.643, + "step": 18957 + }, + { + "epoch": 0.5621682531210154, + "grad_norm": 0.1282794177532196, + "learning_rate": 0.00040985201105536994, + "loss": 2.6729, + "step": 18958 + }, + { + "epoch": 0.5621979064733268, + "grad_norm": 0.11680933833122253, + "learning_rate": 0.0004098057315459216, + "loss": 2.6555, + "step": 18959 + }, + { + "epoch": 0.5622275598256383, + "grad_norm": 0.1149979904294014, + "learning_rate": 0.00040975945283515885, + "loss": 2.6753, + "step": 18960 + }, + { + "epoch": 0.5622572131779497, + "grad_norm": 0.11769989132881165, + "learning_rate": 0.0004097131749234917, + "loss": 2.6328, + "step": 18961 + }, + { + "epoch": 0.5622868665302613, + "grad_norm": 0.13412810862064362, + "learning_rate": 0.00040966689781132983, + "loss": 2.663, + "step": 18962 + }, + { + "epoch": 0.5623165198825727, + "grad_norm": 0.1031407043337822, + "learning_rate": 0.00040962062149908307, + "loss": 2.6508, + "step": 18963 + }, + { + "epoch": 0.5623461732348842, + "grad_norm": 0.11788634210824966, + "learning_rate": 0.0004095743459871612, + "loss": 2.6441, + "step": 18964 + }, + { + "epoch": 0.5623758265871956, + "grad_norm": 0.12645141780376434, + "learning_rate": 0.000409528071275974, + "loss": 2.6292, + "step": 18965 + }, + { + "epoch": 0.5624054799395072, + "grad_norm": 0.10690168291330338, + "learning_rate": 0.00040948179736593126, + "loss": 2.6402, + "step": 18966 + }, + { + "epoch": 0.5624351332918186, + "grad_norm": 0.11721745878458023, + "learning_rate": 0.0004094355242574428, + "loss": 2.6293, + "step": 18967 + }, + { + "epoch": 0.5624647866441301, + "grad_norm": 0.11985684931278229, + "learning_rate": 0.0004093892519509183, + "loss": 2.6851, + "step": 18968 + }, + { + "epoch": 0.5624944399964416, + "grad_norm": 0.1236533522605896, + "learning_rate": 0.00040934298044676754, + "loss": 2.6449, + "step": 18969 + }, + { + "epoch": 0.5625240933487531, + "grad_norm": 0.13234390318393707, + "learning_rate": 0.00040929670974540033, + "loss": 2.6473, + "step": 18970 + }, + { + "epoch": 0.5625537467010645, + "grad_norm": 0.1360880732536316, + "learning_rate": 0.0004092504398472263, + "loss": 2.6518, + "step": 18971 + }, + { + "epoch": 0.562583400053376, + "grad_norm": 0.12678252160549164, + "learning_rate": 0.00040920417075265516, + "loss": 2.6478, + "step": 18972 + }, + { + "epoch": 0.5626130534056875, + "grad_norm": 0.10502857714891434, + "learning_rate": 0.0004091579024620966, + "loss": 2.6754, + "step": 18973 + }, + { + "epoch": 0.562642706757999, + "grad_norm": 0.1076209619641304, + "learning_rate": 0.00040911163497596046, + "loss": 2.6721, + "step": 18974 + }, + { + "epoch": 0.5626723601103105, + "grad_norm": 0.11913634091615677, + "learning_rate": 0.00040906536829465645, + "loss": 2.6609, + "step": 18975 + }, + { + "epoch": 0.5627020134626219, + "grad_norm": 0.10963285714387894, + "learning_rate": 0.00040901910241859427, + "loss": 2.6366, + "step": 18976 + }, + { + "epoch": 0.5627316668149335, + "grad_norm": 0.11022462695837021, + "learning_rate": 0.0004089728373481834, + "loss": 2.6258, + "step": 18977 + }, + { + "epoch": 0.5627613201672449, + "grad_norm": 0.09826027601957321, + "learning_rate": 0.00040892657308383353, + "loss": 2.6709, + "step": 18978 + }, + { + "epoch": 0.5627909735195564, + "grad_norm": 0.12836094200611115, + "learning_rate": 0.0004088803096259547, + "loss": 2.6315, + "step": 18979 + }, + { + "epoch": 0.5628206268718678, + "grad_norm": 0.10804089903831482, + "learning_rate": 0.00040883404697495643, + "loss": 2.6236, + "step": 18980 + }, + { + "epoch": 0.5628502802241794, + "grad_norm": 0.10641662031412125, + "learning_rate": 0.00040878778513124824, + "loss": 2.6461, + "step": 18981 + }, + { + "epoch": 0.5628799335764908, + "grad_norm": 0.10022521018981934, + "learning_rate": 0.0004087415240952399, + "loss": 2.6698, + "step": 18982 + }, + { + "epoch": 0.5629095869288023, + "grad_norm": 0.10987338423728943, + "learning_rate": 0.00040869526386734094, + "loss": 2.6513, + "step": 18983 + }, + { + "epoch": 0.5629392402811138, + "grad_norm": 0.10281826555728912, + "learning_rate": 0.0004086490044479612, + "loss": 2.6621, + "step": 18984 + }, + { + "epoch": 0.5629688936334253, + "grad_norm": 0.11200665682554245, + "learning_rate": 0.0004086027458375101, + "loss": 2.6502, + "step": 18985 + }, + { + "epoch": 0.5629985469857367, + "grad_norm": 0.11472837626934052, + "learning_rate": 0.00040855648803639745, + "loss": 2.6968, + "step": 18986 + }, + { + "epoch": 0.5630282003380482, + "grad_norm": 0.11546892672777176, + "learning_rate": 0.00040851023104503294, + "loss": 2.6639, + "step": 18987 + }, + { + "epoch": 0.5630578536903597, + "grad_norm": 0.10564026981592178, + "learning_rate": 0.00040846397486382586, + "loss": 2.6719, + "step": 18988 + }, + { + "epoch": 0.5630875070426712, + "grad_norm": 0.1044110655784607, + "learning_rate": 0.00040841771949318595, + "loss": 2.68, + "step": 18989 + }, + { + "epoch": 0.5631171603949826, + "grad_norm": 0.11211175471544266, + "learning_rate": 0.0004083714649335229, + "loss": 2.6431, + "step": 18990 + }, + { + "epoch": 0.5631468137472941, + "grad_norm": 0.45642000436782837, + "learning_rate": 0.00040832521118524623, + "loss": 2.695, + "step": 18991 + }, + { + "epoch": 0.5631764670996056, + "grad_norm": 0.11306135356426239, + "learning_rate": 0.0004082789582487656, + "loss": 2.6749, + "step": 18992 + }, + { + "epoch": 0.5632061204519171, + "grad_norm": 0.1306128352880478, + "learning_rate": 0.0004082327061244905, + "loss": 2.635, + "step": 18993 + }, + { + "epoch": 0.5632357738042286, + "grad_norm": 0.12808310985565186, + "learning_rate": 0.00040818645481283057, + "loss": 2.6604, + "step": 18994 + }, + { + "epoch": 0.56326542715654, + "grad_norm": 0.10077085345983505, + "learning_rate": 0.0004081402043141953, + "loss": 2.6616, + "step": 18995 + }, + { + "epoch": 0.5632950805088516, + "grad_norm": 0.11595115810632706, + "learning_rate": 0.00040809395462899434, + "loss": 2.6847, + "step": 18996 + }, + { + "epoch": 0.563324733861163, + "grad_norm": 0.10899404436349869, + "learning_rate": 0.00040804770575763726, + "loss": 2.6628, + "step": 18997 + }, + { + "epoch": 0.5633543872134745, + "grad_norm": 0.10832979530096054, + "learning_rate": 0.00040800145770053347, + "loss": 2.6683, + "step": 18998 + }, + { + "epoch": 0.563384040565786, + "grad_norm": 0.1282690465450287, + "learning_rate": 0.0004079552104580925, + "loss": 2.6637, + "step": 18999 + }, + { + "epoch": 0.5634136939180975, + "grad_norm": 0.10238691419363022, + "learning_rate": 0.000407908964030724, + "loss": 2.6699, + "step": 19000 + }, + { + "epoch": 0.5634433472704089, + "grad_norm": 0.10167035460472107, + "learning_rate": 0.00040786271841883743, + "loss": 2.6576, + "step": 19001 + }, + { + "epoch": 0.5634730006227204, + "grad_norm": 0.11882318556308746, + "learning_rate": 0.00040781647362284225, + "loss": 2.667, + "step": 19002 + }, + { + "epoch": 0.5635026539750319, + "grad_norm": 0.11115999519824982, + "learning_rate": 0.0004077702296431481, + "loss": 2.6441, + "step": 19003 + }, + { + "epoch": 0.5635323073273434, + "grad_norm": 0.10856377333402634, + "learning_rate": 0.0004077239864801642, + "loss": 2.6587, + "step": 19004 + }, + { + "epoch": 0.5635619606796548, + "grad_norm": 0.10260740667581558, + "learning_rate": 0.00040767774413430036, + "loss": 2.6652, + "step": 19005 + }, + { + "epoch": 0.5635916140319663, + "grad_norm": 0.11157497018575668, + "learning_rate": 0.000407631502605966, + "loss": 2.6421, + "step": 19006 + }, + { + "epoch": 0.5636212673842778, + "grad_norm": 0.0991833508014679, + "learning_rate": 0.00040758526189557046, + "loss": 2.6937, + "step": 19007 + }, + { + "epoch": 0.5636509207365893, + "grad_norm": 0.10184457153081894, + "learning_rate": 0.0004075390220035235, + "loss": 2.668, + "step": 19008 + }, + { + "epoch": 0.5636805740889007, + "grad_norm": 0.11131810396909714, + "learning_rate": 0.0004074927829302342, + "loss": 2.6391, + "step": 19009 + }, + { + "epoch": 0.5637102274412122, + "grad_norm": 0.11105599999427795, + "learning_rate": 0.0004074465446761122, + "loss": 2.6423, + "step": 19010 + }, + { + "epoch": 0.5637398807935237, + "grad_norm": 0.11561084538698196, + "learning_rate": 0.0004074003072415669, + "loss": 2.6396, + "step": 19011 + }, + { + "epoch": 0.5637695341458352, + "grad_norm": 0.11013134568929672, + "learning_rate": 0.0004073540706270078, + "loss": 2.6687, + "step": 19012 + }, + { + "epoch": 0.5637991874981466, + "grad_norm": 0.11796490848064423, + "learning_rate": 0.00040730783483284433, + "loss": 2.664, + "step": 19013 + }, + { + "epoch": 0.5638288408504581, + "grad_norm": 0.1057654619216919, + "learning_rate": 0.00040726159985948594, + "loss": 2.6024, + "step": 19014 + }, + { + "epoch": 0.5638584942027697, + "grad_norm": 0.107698954641819, + "learning_rate": 0.00040721536570734196, + "loss": 2.6403, + "step": 19015 + }, + { + "epoch": 0.5638881475550811, + "grad_norm": 0.11421805620193481, + "learning_rate": 0.00040716913237682183, + "loss": 2.6221, + "step": 19016 + }, + { + "epoch": 0.5639178009073926, + "grad_norm": 0.10801693052053452, + "learning_rate": 0.00040712289986833474, + "loss": 2.6338, + "step": 19017 + }, + { + "epoch": 0.563947454259704, + "grad_norm": 0.10837219655513763, + "learning_rate": 0.0004070766681822906, + "loss": 2.6723, + "step": 19018 + }, + { + "epoch": 0.5639771076120156, + "grad_norm": 0.11943544447422028, + "learning_rate": 0.0004070304373190984, + "loss": 2.6471, + "step": 19019 + }, + { + "epoch": 0.564006760964327, + "grad_norm": 0.1108146384358406, + "learning_rate": 0.0004069842072791677, + "loss": 2.686, + "step": 19020 + }, + { + "epoch": 0.5640364143166385, + "grad_norm": 0.11613841354846954, + "learning_rate": 0.00040693797806290786, + "loss": 2.6485, + "step": 19021 + }, + { + "epoch": 0.56406606766895, + "grad_norm": 0.10968046635389328, + "learning_rate": 0.00040689174967072805, + "loss": 2.6451, + "step": 19022 + }, + { + "epoch": 0.5640957210212615, + "grad_norm": 0.11303708702325821, + "learning_rate": 0.00040684552210303786, + "loss": 2.6269, + "step": 19023 + }, + { + "epoch": 0.5641253743735729, + "grad_norm": 0.10117454081773758, + "learning_rate": 0.0004067992953602466, + "loss": 2.6805, + "step": 19024 + }, + { + "epoch": 0.5641550277258844, + "grad_norm": 0.1143360584974289, + "learning_rate": 0.00040675306944276365, + "loss": 2.6384, + "step": 19025 + }, + { + "epoch": 0.5641846810781959, + "grad_norm": 0.09991158545017242, + "learning_rate": 0.00040670684435099814, + "loss": 2.6445, + "step": 19026 + }, + { + "epoch": 0.5642143344305074, + "grad_norm": 0.10906554758548737, + "learning_rate": 0.0004066606200853595, + "loss": 2.6301, + "step": 19027 + }, + { + "epoch": 0.5642439877828188, + "grad_norm": 0.10625959187746048, + "learning_rate": 0.0004066143966462572, + "loss": 2.666, + "step": 19028 + }, + { + "epoch": 0.5642736411351303, + "grad_norm": 0.11512085050344467, + "learning_rate": 0.0004065681740341004, + "loss": 2.6659, + "step": 19029 + }, + { + "epoch": 0.5643032944874418, + "grad_norm": 0.11709747463464737, + "learning_rate": 0.0004065219522492983, + "loss": 2.6586, + "step": 19030 + }, + { + "epoch": 0.5643329478397533, + "grad_norm": 0.10779066383838654, + "learning_rate": 0.0004064757312922605, + "loss": 2.6483, + "step": 19031 + }, + { + "epoch": 0.5643626011920647, + "grad_norm": 0.1053754910826683, + "learning_rate": 0.00040642951116339615, + "loss": 2.6562, + "step": 19032 + }, + { + "epoch": 0.5643922545443762, + "grad_norm": 0.11442887783050537, + "learning_rate": 0.0004063832918631146, + "loss": 2.6425, + "step": 19033 + }, + { + "epoch": 0.5644219078966877, + "grad_norm": 0.11680764704942703, + "learning_rate": 0.000406337073391825, + "loss": 2.6755, + "step": 19034 + }, + { + "epoch": 0.5644515612489992, + "grad_norm": 0.13558220863342285, + "learning_rate": 0.0004062908557499368, + "loss": 2.6411, + "step": 19035 + }, + { + "epoch": 0.5644812146013107, + "grad_norm": 0.0964268296957016, + "learning_rate": 0.00040624463893785905, + "loss": 2.6688, + "step": 19036 + }, + { + "epoch": 0.5645108679536222, + "grad_norm": 0.11232201755046844, + "learning_rate": 0.0004061984229560012, + "loss": 2.635, + "step": 19037 + }, + { + "epoch": 0.5645405213059337, + "grad_norm": 0.11176407337188721, + "learning_rate": 0.0004061522078047723, + "loss": 2.6402, + "step": 19038 + }, + { + "epoch": 0.5645701746582451, + "grad_norm": 0.10905805230140686, + "learning_rate": 0.0004061059934845818, + "loss": 2.6172, + "step": 19039 + }, + { + "epoch": 0.5645998280105566, + "grad_norm": 0.11757546663284302, + "learning_rate": 0.00040605977999583876, + "loss": 2.6506, + "step": 19040 + }, + { + "epoch": 0.5646294813628681, + "grad_norm": 0.1140090674161911, + "learning_rate": 0.00040601356733895255, + "loss": 2.6681, + "step": 19041 + }, + { + "epoch": 0.5646591347151796, + "grad_norm": 0.10219357162714005, + "learning_rate": 0.0004059673555143324, + "loss": 2.6696, + "step": 19042 + }, + { + "epoch": 0.564688788067491, + "grad_norm": 0.11839964240789413, + "learning_rate": 0.00040592114452238717, + "loss": 2.6364, + "step": 19043 + }, + { + "epoch": 0.5647184414198025, + "grad_norm": 0.11285068839788437, + "learning_rate": 0.0004058749343635265, + "loss": 2.6534, + "step": 19044 + }, + { + "epoch": 0.564748094772114, + "grad_norm": 0.12677115201950073, + "learning_rate": 0.00040582872503815956, + "loss": 2.6539, + "step": 19045 + }, + { + "epoch": 0.5647777481244255, + "grad_norm": 0.12343592196702957, + "learning_rate": 0.00040578251654669543, + "loss": 2.6232, + "step": 19046 + }, + { + "epoch": 0.5648074014767369, + "grad_norm": 0.13232634961605072, + "learning_rate": 0.00040573630888954325, + "loss": 2.6308, + "step": 19047 + }, + { + "epoch": 0.5648370548290484, + "grad_norm": 0.12441761791706085, + "learning_rate": 0.0004056901020671122, + "loss": 2.666, + "step": 19048 + }, + { + "epoch": 0.5648667081813599, + "grad_norm": 0.14078058302402496, + "learning_rate": 0.0004056438960798115, + "loss": 2.6832, + "step": 19049 + }, + { + "epoch": 0.5648963615336714, + "grad_norm": 0.12421552836894989, + "learning_rate": 0.00040559769092805034, + "loss": 2.6232, + "step": 19050 + }, + { + "epoch": 0.5649260148859828, + "grad_norm": 0.12617909908294678, + "learning_rate": 0.00040555148661223773, + "loss": 2.6659, + "step": 19051 + }, + { + "epoch": 0.5649556682382944, + "grad_norm": 0.12536118924617767, + "learning_rate": 0.0004055052831327831, + "loss": 2.6844, + "step": 19052 + }, + { + "epoch": 0.5649853215906058, + "grad_norm": 0.10995734483003616, + "learning_rate": 0.00040545908049009527, + "loss": 2.6728, + "step": 19053 + }, + { + "epoch": 0.5650149749429173, + "grad_norm": 0.12910787761211395, + "learning_rate": 0.0004054128786845835, + "loss": 2.6968, + "step": 19054 + }, + { + "epoch": 0.5650446282952287, + "grad_norm": 0.13470931351184845, + "learning_rate": 0.00040536667771665693, + "loss": 2.6786, + "step": 19055 + }, + { + "epoch": 0.5650742816475403, + "grad_norm": 0.12470319122076035, + "learning_rate": 0.0004053204775867246, + "loss": 2.6652, + "step": 19056 + }, + { + "epoch": 0.5651039349998518, + "grad_norm": 0.12797297537326813, + "learning_rate": 0.00040527427829519576, + "loss": 2.6789, + "step": 19057 + }, + { + "epoch": 0.5651335883521632, + "grad_norm": 0.12168292701244354, + "learning_rate": 0.00040522807984247946, + "loss": 2.6559, + "step": 19058 + }, + { + "epoch": 0.5651632417044747, + "grad_norm": 0.12425075471401215, + "learning_rate": 0.00040518188222898475, + "loss": 2.6825, + "step": 19059 + }, + { + "epoch": 0.5651928950567862, + "grad_norm": 0.12328741699457169, + "learning_rate": 0.0004051356854551208, + "loss": 2.6548, + "step": 19060 + }, + { + "epoch": 0.5652225484090977, + "grad_norm": 0.10922911763191223, + "learning_rate": 0.0004050894895212966, + "loss": 2.64, + "step": 19061 + }, + { + "epoch": 0.5652522017614091, + "grad_norm": 0.11555950343608856, + "learning_rate": 0.00040504329442792134, + "loss": 2.6448, + "step": 19062 + }, + { + "epoch": 0.5652818551137206, + "grad_norm": 0.1018524244427681, + "learning_rate": 0.00040499710017540404, + "loss": 2.6349, + "step": 19063 + }, + { + "epoch": 0.5653115084660321, + "grad_norm": 0.11800742894411087, + "learning_rate": 0.00040495090676415357, + "loss": 2.6779, + "step": 19064 + }, + { + "epoch": 0.5653411618183436, + "grad_norm": 0.11415454000234604, + "learning_rate": 0.00040490471419457917, + "loss": 2.6555, + "step": 19065 + }, + { + "epoch": 0.565370815170655, + "grad_norm": 0.10888538509607315, + "learning_rate": 0.00040485852246708996, + "loss": 2.6596, + "step": 19066 + }, + { + "epoch": 0.5654004685229665, + "grad_norm": 0.11194933205842972, + "learning_rate": 0.00040481233158209473, + "loss": 2.656, + "step": 19067 + }, + { + "epoch": 0.565430121875278, + "grad_norm": 0.12541049718856812, + "learning_rate": 0.00040476614154000257, + "loss": 2.6723, + "step": 19068 + }, + { + "epoch": 0.5654597752275895, + "grad_norm": 0.11732402443885803, + "learning_rate": 0.0004047199523412227, + "loss": 2.6482, + "step": 19069 + }, + { + "epoch": 0.5654894285799009, + "grad_norm": 0.11298680305480957, + "learning_rate": 0.000404673763986164, + "loss": 2.6516, + "step": 19070 + }, + { + "epoch": 0.5655190819322125, + "grad_norm": 0.11106070131063461, + "learning_rate": 0.0004046275764752355, + "loss": 2.6643, + "step": 19071 + }, + { + "epoch": 0.5655487352845239, + "grad_norm": 0.11434987932443619, + "learning_rate": 0.00040458138980884617, + "loss": 2.6596, + "step": 19072 + }, + { + "epoch": 0.5655783886368354, + "grad_norm": 0.09982044249773026, + "learning_rate": 0.0004045352039874051, + "loss": 2.6348, + "step": 19073 + }, + { + "epoch": 0.5656080419891468, + "grad_norm": 0.10711711645126343, + "learning_rate": 0.0004044890190113211, + "loss": 2.6495, + "step": 19074 + }, + { + "epoch": 0.5656376953414584, + "grad_norm": 0.11678794771432877, + "learning_rate": 0.0004044428348810032, + "loss": 2.6677, + "step": 19075 + }, + { + "epoch": 0.5656673486937698, + "grad_norm": 0.1247260645031929, + "learning_rate": 0.00040439665159686046, + "loss": 2.6285, + "step": 19076 + }, + { + "epoch": 0.5656970020460813, + "grad_norm": 0.11852776259183884, + "learning_rate": 0.00040435046915930173, + "loss": 2.6654, + "step": 19077 + }, + { + "epoch": 0.5657266553983928, + "grad_norm": 0.11488320678472519, + "learning_rate": 0.00040430428756873605, + "loss": 2.6318, + "step": 19078 + }, + { + "epoch": 0.5657563087507043, + "grad_norm": 0.11605816334486008, + "learning_rate": 0.0004042581068255723, + "loss": 2.6814, + "step": 19079 + }, + { + "epoch": 0.5657859621030158, + "grad_norm": 0.11202661693096161, + "learning_rate": 0.00040421192693021956, + "loss": 2.6578, + "step": 19080 + }, + { + "epoch": 0.5658156154553272, + "grad_norm": 0.13036125898361206, + "learning_rate": 0.0004041657478830863, + "loss": 2.6651, + "step": 19081 + }, + { + "epoch": 0.5658452688076387, + "grad_norm": 0.11000263690948486, + "learning_rate": 0.0004041195696845821, + "loss": 2.6363, + "step": 19082 + }, + { + "epoch": 0.5658749221599502, + "grad_norm": 0.13175998628139496, + "learning_rate": 0.0004040733923351154, + "loss": 2.6762, + "step": 19083 + }, + { + "epoch": 0.5659045755122617, + "grad_norm": 0.15580999851226807, + "learning_rate": 0.00040402721583509556, + "loss": 2.6767, + "step": 19084 + }, + { + "epoch": 0.5659342288645731, + "grad_norm": 0.15392382442951202, + "learning_rate": 0.000403981040184931, + "loss": 2.6517, + "step": 19085 + }, + { + "epoch": 0.5659638822168847, + "grad_norm": 0.12501253187656403, + "learning_rate": 0.00040393486538503083, + "loss": 2.6706, + "step": 19086 + }, + { + "epoch": 0.5659935355691961, + "grad_norm": 0.11399107426404953, + "learning_rate": 0.0004038886914358039, + "loss": 2.5953, + "step": 19087 + }, + { + "epoch": 0.5660231889215076, + "grad_norm": 0.1424490213394165, + "learning_rate": 0.0004038425183376591, + "loss": 2.6472, + "step": 19088 + }, + { + "epoch": 0.566052842273819, + "grad_norm": 0.14056836068630219, + "learning_rate": 0.00040379634609100535, + "loss": 2.6214, + "step": 19089 + }, + { + "epoch": 0.5660824956261306, + "grad_norm": 0.10852232575416565, + "learning_rate": 0.0004037501746962515, + "loss": 2.6404, + "step": 19090 + }, + { + "epoch": 0.566112148978442, + "grad_norm": 0.1213131994009018, + "learning_rate": 0.0004037040041538064, + "loss": 2.6733, + "step": 19091 + }, + { + "epoch": 0.5661418023307535, + "grad_norm": 0.1275511085987091, + "learning_rate": 0.00040365783446407876, + "loss": 2.6585, + "step": 19092 + }, + { + "epoch": 0.5661714556830649, + "grad_norm": 0.1373722404241562, + "learning_rate": 0.0004036116656274775, + "loss": 2.6708, + "step": 19093 + }, + { + "epoch": 0.5662011090353765, + "grad_norm": 0.1284709870815277, + "learning_rate": 0.00040356549764441145, + "loss": 2.6554, + "step": 19094 + }, + { + "epoch": 0.5662307623876879, + "grad_norm": 0.13267017900943756, + "learning_rate": 0.0004035193305152896, + "loss": 2.658, + "step": 19095 + }, + { + "epoch": 0.5662604157399994, + "grad_norm": 0.13038450479507446, + "learning_rate": 0.0004034731642405206, + "loss": 2.6381, + "step": 19096 + }, + { + "epoch": 0.5662900690923108, + "grad_norm": 0.1290382444858551, + "learning_rate": 0.0004034269988205133, + "loss": 2.6825, + "step": 19097 + }, + { + "epoch": 0.5663197224446224, + "grad_norm": 0.12769658863544464, + "learning_rate": 0.00040338083425567653, + "loss": 2.6635, + "step": 19098 + }, + { + "epoch": 0.5663493757969339, + "grad_norm": 0.1174214705824852, + "learning_rate": 0.00040333467054641905, + "loss": 2.6515, + "step": 19099 + }, + { + "epoch": 0.5663790291492453, + "grad_norm": 0.11282037198543549, + "learning_rate": 0.00040328850769314964, + "loss": 2.6211, + "step": 19100 + }, + { + "epoch": 0.5664086825015568, + "grad_norm": 0.11233066022396088, + "learning_rate": 0.00040324234569627716, + "loss": 2.6547, + "step": 19101 + }, + { + "epoch": 0.5664383358538683, + "grad_norm": 0.11127323657274246, + "learning_rate": 0.00040319618455621025, + "loss": 2.6602, + "step": 19102 + }, + { + "epoch": 0.5664679892061798, + "grad_norm": 0.11255166679620743, + "learning_rate": 0.0004031500242733578, + "loss": 2.6494, + "step": 19103 + }, + { + "epoch": 0.5664976425584912, + "grad_norm": 0.10752388834953308, + "learning_rate": 0.0004031038648481284, + "loss": 2.6908, + "step": 19104 + }, + { + "epoch": 0.5665272959108028, + "grad_norm": 0.09903734922409058, + "learning_rate": 0.00040305770628093094, + "loss": 2.6469, + "step": 19105 + }, + { + "epoch": 0.5665569492631142, + "grad_norm": 0.11689140647649765, + "learning_rate": 0.00040301154857217417, + "loss": 2.6625, + "step": 19106 + }, + { + "epoch": 0.5665866026154257, + "grad_norm": 0.11238765716552734, + "learning_rate": 0.00040296539172226663, + "loss": 2.627, + "step": 19107 + }, + { + "epoch": 0.5666162559677371, + "grad_norm": 0.10812829434871674, + "learning_rate": 0.0004029192357316174, + "loss": 2.6383, + "step": 19108 + }, + { + "epoch": 0.5666459093200487, + "grad_norm": 0.1022181510925293, + "learning_rate": 0.00040287308060063493, + "loss": 2.6432, + "step": 19109 + }, + { + "epoch": 0.5666755626723601, + "grad_norm": 0.11693846434354782, + "learning_rate": 0.000402826926329728, + "loss": 2.6672, + "step": 19110 + }, + { + "epoch": 0.5667052160246716, + "grad_norm": 0.11672473698854446, + "learning_rate": 0.0004027807729193054, + "loss": 2.6375, + "step": 19111 + }, + { + "epoch": 0.566734869376983, + "grad_norm": 0.11306370794773102, + "learning_rate": 0.0004027346203697757, + "loss": 2.6445, + "step": 19112 + }, + { + "epoch": 0.5667645227292946, + "grad_norm": 0.09986000508069992, + "learning_rate": 0.0004026884686815476, + "loss": 2.6812, + "step": 19113 + }, + { + "epoch": 0.566794176081606, + "grad_norm": 0.10296642780303955, + "learning_rate": 0.0004026423178550298, + "loss": 2.6243, + "step": 19114 + }, + { + "epoch": 0.5668238294339175, + "grad_norm": 0.11395805329084396, + "learning_rate": 0.00040259616789063105, + "loss": 2.6364, + "step": 19115 + }, + { + "epoch": 0.5668534827862289, + "grad_norm": 0.12937688827514648, + "learning_rate": 0.0004025500187887599, + "loss": 2.6557, + "step": 19116 + }, + { + "epoch": 0.5668831361385405, + "grad_norm": 0.12828829884529114, + "learning_rate": 0.00040250387054982506, + "loss": 2.6758, + "step": 19117 + }, + { + "epoch": 0.5669127894908519, + "grad_norm": 0.12860248982906342, + "learning_rate": 0.0004024577231742353, + "loss": 2.6488, + "step": 19118 + }, + { + "epoch": 0.5669424428431634, + "grad_norm": 0.11310023814439774, + "learning_rate": 0.000402411576662399, + "loss": 2.6531, + "step": 19119 + }, + { + "epoch": 0.566972096195475, + "grad_norm": 0.11172880977392197, + "learning_rate": 0.0004023654310147248, + "loss": 2.6534, + "step": 19120 + }, + { + "epoch": 0.5670017495477864, + "grad_norm": 0.12637895345687866, + "learning_rate": 0.0004023192862316216, + "loss": 2.6401, + "step": 19121 + }, + { + "epoch": 0.5670314029000979, + "grad_norm": 0.12678676843643188, + "learning_rate": 0.0004022731423134981, + "loss": 2.6477, + "step": 19122 + }, + { + "epoch": 0.5670610562524093, + "grad_norm": 0.1326330602169037, + "learning_rate": 0.0004022269992607625, + "loss": 2.6739, + "step": 19123 + }, + { + "epoch": 0.5670907096047209, + "grad_norm": 0.12944889068603516, + "learning_rate": 0.0004021808570738236, + "loss": 2.6385, + "step": 19124 + }, + { + "epoch": 0.5671203629570323, + "grad_norm": 0.11088196188211441, + "learning_rate": 0.00040213471575309005, + "loss": 2.6644, + "step": 19125 + }, + { + "epoch": 0.5671500163093438, + "grad_norm": 0.10427041351795197, + "learning_rate": 0.00040208857529897034, + "loss": 2.6426, + "step": 19126 + }, + { + "epoch": 0.5671796696616552, + "grad_norm": 0.12516789138317108, + "learning_rate": 0.0004020424357118732, + "loss": 2.6402, + "step": 19127 + }, + { + "epoch": 0.5672093230139668, + "grad_norm": 0.10302494466304779, + "learning_rate": 0.00040199629699220714, + "loss": 2.6478, + "step": 19128 + }, + { + "epoch": 0.5672389763662782, + "grad_norm": 0.10327247530221939, + "learning_rate": 0.0004019501591403806, + "loss": 2.6604, + "step": 19129 + }, + { + "epoch": 0.5672686297185897, + "grad_norm": 0.106324702501297, + "learning_rate": 0.00040190402215680224, + "loss": 2.6547, + "step": 19130 + }, + { + "epoch": 0.5672982830709011, + "grad_norm": 0.11421376466751099, + "learning_rate": 0.0004018578860418806, + "loss": 2.6688, + "step": 19131 + }, + { + "epoch": 0.5673279364232127, + "grad_norm": 0.09986788034439087, + "learning_rate": 0.00040181175079602417, + "loss": 2.6617, + "step": 19132 + }, + { + "epoch": 0.5673575897755241, + "grad_norm": 0.10475548356771469, + "learning_rate": 0.0004017656164196415, + "loss": 2.6886, + "step": 19133 + }, + { + "epoch": 0.5673872431278356, + "grad_norm": 0.10390019416809082, + "learning_rate": 0.0004017194829131412, + "loss": 2.648, + "step": 19134 + }, + { + "epoch": 0.567416896480147, + "grad_norm": 0.11015492677688599, + "learning_rate": 0.0004016733502769318, + "loss": 2.6682, + "step": 19135 + }, + { + "epoch": 0.5674465498324586, + "grad_norm": 0.09566337615251541, + "learning_rate": 0.0004016272185114217, + "loss": 2.6592, + "step": 19136 + }, + { + "epoch": 0.56747620318477, + "grad_norm": 0.09814610332250595, + "learning_rate": 0.00040158108761701957, + "loss": 2.6573, + "step": 19137 + }, + { + "epoch": 0.5675058565370815, + "grad_norm": 0.10255805402994156, + "learning_rate": 0.0004015349575941337, + "loss": 2.6577, + "step": 19138 + }, + { + "epoch": 0.5675355098893929, + "grad_norm": 0.11058967560529709, + "learning_rate": 0.00040148882844317287, + "loss": 2.6588, + "step": 19139 + }, + { + "epoch": 0.5675651632417045, + "grad_norm": 0.09426596015691757, + "learning_rate": 0.0004014427001645452, + "loss": 2.6288, + "step": 19140 + }, + { + "epoch": 0.567594816594016, + "grad_norm": 0.11772263050079346, + "learning_rate": 0.0004013965727586594, + "loss": 2.6848, + "step": 19141 + }, + { + "epoch": 0.5676244699463274, + "grad_norm": 0.1405010223388672, + "learning_rate": 0.0004013504462259238, + "loss": 2.6329, + "step": 19142 + }, + { + "epoch": 0.567654123298639, + "grad_norm": 0.11239787191152573, + "learning_rate": 0.000401304320566747, + "loss": 2.6432, + "step": 19143 + }, + { + "epoch": 0.5676837766509504, + "grad_norm": 0.11415871232748032, + "learning_rate": 0.00040125819578153734, + "loss": 2.6213, + "step": 19144 + }, + { + "epoch": 0.5677134300032619, + "grad_norm": 0.11309963464736938, + "learning_rate": 0.00040121207187070337, + "loss": 2.6616, + "step": 19145 + }, + { + "epoch": 0.5677430833555733, + "grad_norm": 0.09686270356178284, + "learning_rate": 0.0004011659488346533, + "loss": 2.649, + "step": 19146 + }, + { + "epoch": 0.5677727367078849, + "grad_norm": 0.12394453585147858, + "learning_rate": 0.00040111982667379584, + "loss": 2.6808, + "step": 19147 + }, + { + "epoch": 0.5678023900601963, + "grad_norm": 0.11391790956258774, + "learning_rate": 0.00040107370538853925, + "loss": 2.6362, + "step": 19148 + }, + { + "epoch": 0.5678320434125078, + "grad_norm": 0.11474558711051941, + "learning_rate": 0.0004010275849792921, + "loss": 2.6588, + "step": 19149 + }, + { + "epoch": 0.5678616967648192, + "grad_norm": 0.12975305318832397, + "learning_rate": 0.0004009814654464626, + "loss": 2.6058, + "step": 19150 + }, + { + "epoch": 0.5678913501171308, + "grad_norm": 0.11888197064399719, + "learning_rate": 0.0004009353467904592, + "loss": 2.693, + "step": 19151 + }, + { + "epoch": 0.5679210034694422, + "grad_norm": 0.11427053064107895, + "learning_rate": 0.0004008892290116903, + "loss": 2.6703, + "step": 19152 + }, + { + "epoch": 0.5679506568217537, + "grad_norm": 0.1161293163895607, + "learning_rate": 0.0004008431121105643, + "loss": 2.6115, + "step": 19153 + }, + { + "epoch": 0.5679803101740651, + "grad_norm": 0.1282462328672409, + "learning_rate": 0.00040079699608748954, + "loss": 2.6651, + "step": 19154 + }, + { + "epoch": 0.5680099635263767, + "grad_norm": 0.11133256554603577, + "learning_rate": 0.00040075088094287443, + "loss": 2.6963, + "step": 19155 + }, + { + "epoch": 0.5680396168786881, + "grad_norm": 0.11859627068042755, + "learning_rate": 0.00040070476667712743, + "loss": 2.6421, + "step": 19156 + }, + { + "epoch": 0.5680692702309996, + "grad_norm": 0.12275262176990509, + "learning_rate": 0.0004006586532906566, + "loss": 2.6814, + "step": 19157 + }, + { + "epoch": 0.568098923583311, + "grad_norm": 0.13394004106521606, + "learning_rate": 0.0004006125407838705, + "loss": 2.6457, + "step": 19158 + }, + { + "epoch": 0.5681285769356226, + "grad_norm": 0.1257414072751999, + "learning_rate": 0.0004005664291571772, + "loss": 2.636, + "step": 19159 + }, + { + "epoch": 0.568158230287934, + "grad_norm": 0.11830893158912659, + "learning_rate": 0.00040052031841098553, + "loss": 2.6335, + "step": 19160 + }, + { + "epoch": 0.5681878836402455, + "grad_norm": 0.10993937402963638, + "learning_rate": 0.0004004742085457034, + "loss": 2.6421, + "step": 19161 + }, + { + "epoch": 0.5682175369925571, + "grad_norm": 0.12594424188137054, + "learning_rate": 0.00040042809956173926, + "loss": 2.6513, + "step": 19162 + }, + { + "epoch": 0.5682471903448685, + "grad_norm": 0.1436309516429901, + "learning_rate": 0.00040038199145950136, + "loss": 2.6603, + "step": 19163 + }, + { + "epoch": 0.56827684369718, + "grad_norm": 0.13163840770721436, + "learning_rate": 0.00040033588423939805, + "loss": 2.6247, + "step": 19164 + }, + { + "epoch": 0.5683064970494914, + "grad_norm": 0.10903023928403854, + "learning_rate": 0.00040028977790183763, + "loss": 2.6896, + "step": 19165 + }, + { + "epoch": 0.568336150401803, + "grad_norm": 0.12519630789756775, + "learning_rate": 0.0004002436724472284, + "loss": 2.624, + "step": 19166 + }, + { + "epoch": 0.5683658037541144, + "grad_norm": 0.1256827414035797, + "learning_rate": 0.0004001975678759785, + "loss": 2.6772, + "step": 19167 + }, + { + "epoch": 0.5683954571064259, + "grad_norm": 0.11180617660284042, + "learning_rate": 0.00040015146418849625, + "loss": 2.6821, + "step": 19168 + }, + { + "epoch": 0.5684251104587373, + "grad_norm": 0.11387254297733307, + "learning_rate": 0.0004001053613851899, + "loss": 2.6439, + "step": 19169 + }, + { + "epoch": 0.5684547638110489, + "grad_norm": 0.11925309896469116, + "learning_rate": 0.0004000592594664677, + "loss": 2.6338, + "step": 19170 + }, + { + "epoch": 0.5684844171633603, + "grad_norm": 0.10139491409063339, + "learning_rate": 0.000400013158432738, + "loss": 2.6559, + "step": 19171 + }, + { + "epoch": 0.5685140705156718, + "grad_norm": 0.11464963108301163, + "learning_rate": 0.00039996705828440875, + "loss": 2.6401, + "step": 19172 + }, + { + "epoch": 0.5685437238679832, + "grad_norm": 0.12507101893424988, + "learning_rate": 0.0003999209590218885, + "loss": 2.6392, + "step": 19173 + }, + { + "epoch": 0.5685733772202948, + "grad_norm": 0.11067423224449158, + "learning_rate": 0.0003998748606455854, + "loss": 2.6683, + "step": 19174 + }, + { + "epoch": 0.5686030305726062, + "grad_norm": 0.10545314848423004, + "learning_rate": 0.00039982876315590747, + "loss": 2.6644, + "step": 19175 + }, + { + "epoch": 0.5686326839249177, + "grad_norm": 0.12020841240882874, + "learning_rate": 0.00039978266655326316, + "loss": 2.658, + "step": 19176 + }, + { + "epoch": 0.5686623372772291, + "grad_norm": 0.1344933658838272, + "learning_rate": 0.0003997365708380606, + "loss": 2.6617, + "step": 19177 + }, + { + "epoch": 0.5686919906295407, + "grad_norm": 0.12828762829303741, + "learning_rate": 0.0003996904760107078, + "loss": 2.6143, + "step": 19178 + }, + { + "epoch": 0.5687216439818521, + "grad_norm": 0.10895885527133942, + "learning_rate": 0.00039964438207161306, + "loss": 2.6436, + "step": 19179 + }, + { + "epoch": 0.5687512973341636, + "grad_norm": 0.11679906398057938, + "learning_rate": 0.00039959828902118457, + "loss": 2.6857, + "step": 19180 + }, + { + "epoch": 0.568780950686475, + "grad_norm": 0.1260145902633667, + "learning_rate": 0.0003995521968598304, + "loss": 2.6725, + "step": 19181 + }, + { + "epoch": 0.5688106040387866, + "grad_norm": 0.12193750590085983, + "learning_rate": 0.0003995061055879588, + "loss": 2.652, + "step": 19182 + }, + { + "epoch": 0.5688402573910981, + "grad_norm": 0.12244588881731033, + "learning_rate": 0.0003994600152059779, + "loss": 2.6798, + "step": 19183 + }, + { + "epoch": 0.5688699107434095, + "grad_norm": 0.12837207317352295, + "learning_rate": 0.0003994139257142959, + "loss": 2.6378, + "step": 19184 + }, + { + "epoch": 0.5688995640957211, + "grad_norm": 0.10756748169660568, + "learning_rate": 0.0003993678371133206, + "loss": 2.6748, + "step": 19185 + }, + { + "epoch": 0.5689292174480325, + "grad_norm": 0.12891604006290436, + "learning_rate": 0.0003993217494034605, + "loss": 2.6579, + "step": 19186 + }, + { + "epoch": 0.568958870800344, + "grad_norm": 0.11313849687576294, + "learning_rate": 0.00039927566258512375, + "loss": 2.6617, + "step": 19187 + }, + { + "epoch": 0.5689885241526554, + "grad_norm": 0.11866133660078049, + "learning_rate": 0.0003992295766587182, + "loss": 2.6314, + "step": 19188 + }, + { + "epoch": 0.569018177504967, + "grad_norm": 0.12406503409147263, + "learning_rate": 0.000399183491624652, + "loss": 2.66, + "step": 19189 + }, + { + "epoch": 0.5690478308572784, + "grad_norm": 0.11540552228689194, + "learning_rate": 0.0003991374074833333, + "loss": 2.6455, + "step": 19190 + }, + { + "epoch": 0.5690774842095899, + "grad_norm": 0.1258777529001236, + "learning_rate": 0.0003990913242351702, + "loss": 2.6542, + "step": 19191 + }, + { + "epoch": 0.5691071375619013, + "grad_norm": 0.1208655834197998, + "learning_rate": 0.00039904524188057075, + "loss": 2.6355, + "step": 19192 + }, + { + "epoch": 0.5691367909142129, + "grad_norm": 0.10621314495801926, + "learning_rate": 0.000398999160419943, + "loss": 2.676, + "step": 19193 + }, + { + "epoch": 0.5691664442665243, + "grad_norm": 0.11069932579994202, + "learning_rate": 0.0003989530798536951, + "loss": 2.7005, + "step": 19194 + }, + { + "epoch": 0.5691960976188358, + "grad_norm": 0.11441276222467422, + "learning_rate": 0.0003989070001822349, + "loss": 2.6173, + "step": 19195 + }, + { + "epoch": 0.5692257509711472, + "grad_norm": 0.11581496149301529, + "learning_rate": 0.00039886092140597063, + "loss": 2.6222, + "step": 19196 + }, + { + "epoch": 0.5692554043234588, + "grad_norm": 0.10348714888095856, + "learning_rate": 0.0003988148435253102, + "loss": 2.6581, + "step": 19197 + }, + { + "epoch": 0.5692850576757702, + "grad_norm": 0.12702879309654236, + "learning_rate": 0.00039876876654066165, + "loss": 2.6535, + "step": 19198 + }, + { + "epoch": 0.5693147110280817, + "grad_norm": 0.13260023295879364, + "learning_rate": 0.0003987226904524331, + "loss": 2.669, + "step": 19199 + }, + { + "epoch": 0.5693443643803932, + "grad_norm": 0.1331915706396103, + "learning_rate": 0.00039867661526103254, + "loss": 2.6435, + "step": 19200 + }, + { + "epoch": 0.5693740177327047, + "grad_norm": 0.1148630902171135, + "learning_rate": 0.0003986305409668679, + "loss": 2.6741, + "step": 19201 + }, + { + "epoch": 0.5694036710850162, + "grad_norm": 0.12570154666900635, + "learning_rate": 0.0003985844675703473, + "loss": 2.6581, + "step": 19202 + }, + { + "epoch": 0.5694333244373276, + "grad_norm": 0.14876045286655426, + "learning_rate": 0.0003985383950718786, + "loss": 2.656, + "step": 19203 + }, + { + "epoch": 0.5694629777896392, + "grad_norm": 0.14386774599552155, + "learning_rate": 0.0003984923234718699, + "loss": 2.6509, + "step": 19204 + }, + { + "epoch": 0.5694926311419506, + "grad_norm": 0.11894161254167557, + "learning_rate": 0.0003984462527707291, + "loss": 2.6422, + "step": 19205 + }, + { + "epoch": 0.5695222844942621, + "grad_norm": 0.10857166349887848, + "learning_rate": 0.00039840018296886405, + "loss": 2.651, + "step": 19206 + }, + { + "epoch": 0.5695519378465735, + "grad_norm": 0.10888514667749405, + "learning_rate": 0.00039835411406668287, + "loss": 2.62, + "step": 19207 + }, + { + "epoch": 0.5695815911988851, + "grad_norm": 0.11116675287485123, + "learning_rate": 0.0003983080460645935, + "loss": 2.6399, + "step": 19208 + }, + { + "epoch": 0.5696112445511965, + "grad_norm": 0.11909741908311844, + "learning_rate": 0.0003982619789630038, + "loss": 2.5921, + "step": 19209 + }, + { + "epoch": 0.569640897903508, + "grad_norm": 0.10830334573984146, + "learning_rate": 0.00039821591276232167, + "loss": 2.6693, + "step": 19210 + }, + { + "epoch": 0.5696705512558194, + "grad_norm": 0.11203356832265854, + "learning_rate": 0.00039816984746295504, + "loss": 2.6295, + "step": 19211 + }, + { + "epoch": 0.569700204608131, + "grad_norm": 0.10615746676921844, + "learning_rate": 0.00039812378306531204, + "loss": 2.6392, + "step": 19212 + }, + { + "epoch": 0.5697298579604424, + "grad_norm": 0.10390372574329376, + "learning_rate": 0.0003980777195698004, + "loss": 2.6662, + "step": 19213 + }, + { + "epoch": 0.5697595113127539, + "grad_norm": 0.1143045499920845, + "learning_rate": 0.00039803165697682805, + "loss": 2.6561, + "step": 19214 + }, + { + "epoch": 0.5697891646650654, + "grad_norm": 0.11322628706693649, + "learning_rate": 0.000397985595286803, + "loss": 2.6623, + "step": 19215 + }, + { + "epoch": 0.5698188180173769, + "grad_norm": 0.11198494583368301, + "learning_rate": 0.0003979395345001329, + "loss": 2.6501, + "step": 19216 + }, + { + "epoch": 0.5698484713696883, + "grad_norm": 0.10996908694505692, + "learning_rate": 0.0003978934746172257, + "loss": 2.6544, + "step": 19217 + }, + { + "epoch": 0.5698781247219998, + "grad_norm": 0.11213278025388718, + "learning_rate": 0.00039784741563848934, + "loss": 2.646, + "step": 19218 + }, + { + "epoch": 0.5699077780743113, + "grad_norm": 0.11686468869447708, + "learning_rate": 0.00039780135756433164, + "loss": 2.6538, + "step": 19219 + }, + { + "epoch": 0.5699374314266228, + "grad_norm": 0.10935286432504654, + "learning_rate": 0.0003977553003951605, + "loss": 2.7084, + "step": 19220 + }, + { + "epoch": 0.5699670847789342, + "grad_norm": 0.12512610852718353, + "learning_rate": 0.00039770924413138367, + "loss": 2.6627, + "step": 19221 + }, + { + "epoch": 0.5699967381312457, + "grad_norm": 0.1264522820711136, + "learning_rate": 0.00039766318877340915, + "loss": 2.67, + "step": 19222 + }, + { + "epoch": 0.5700263914835573, + "grad_norm": 0.1231018528342247, + "learning_rate": 0.0003976171343216446, + "loss": 2.6716, + "step": 19223 + }, + { + "epoch": 0.5700560448358687, + "grad_norm": 0.11051788926124573, + "learning_rate": 0.0003975710807764977, + "loss": 2.6344, + "step": 19224 + }, + { + "epoch": 0.5700856981881802, + "grad_norm": 0.10612327605485916, + "learning_rate": 0.0003975250281383767, + "loss": 2.6504, + "step": 19225 + }, + { + "epoch": 0.5701153515404916, + "grad_norm": 0.10451232641935349, + "learning_rate": 0.0003974789764076891, + "loss": 2.7192, + "step": 19226 + }, + { + "epoch": 0.5701450048928032, + "grad_norm": 0.10720300674438477, + "learning_rate": 0.0003974329255848428, + "loss": 2.6494, + "step": 19227 + }, + { + "epoch": 0.5701746582451146, + "grad_norm": 0.1269567906856537, + "learning_rate": 0.0003973868756702455, + "loss": 2.6733, + "step": 19228 + }, + { + "epoch": 0.5702043115974261, + "grad_norm": 0.12310220301151276, + "learning_rate": 0.00039734082666430503, + "loss": 2.6414, + "step": 19229 + }, + { + "epoch": 0.5702339649497375, + "grad_norm": 0.11408466100692749, + "learning_rate": 0.0003972947785674292, + "loss": 2.6915, + "step": 19230 + }, + { + "epoch": 0.5702636183020491, + "grad_norm": 0.1043701320886612, + "learning_rate": 0.0003972487313800257, + "loss": 2.6654, + "step": 19231 + }, + { + "epoch": 0.5702932716543605, + "grad_norm": 0.10880789160728455, + "learning_rate": 0.00039720268510250244, + "loss": 2.6333, + "step": 19232 + }, + { + "epoch": 0.570322925006672, + "grad_norm": 0.11131894588470459, + "learning_rate": 0.00039715663973526695, + "loss": 2.6208, + "step": 19233 + }, + { + "epoch": 0.5703525783589835, + "grad_norm": 0.10559418052434921, + "learning_rate": 0.0003971105952787271, + "loss": 2.6367, + "step": 19234 + }, + { + "epoch": 0.570382231711295, + "grad_norm": 0.10749156028032303, + "learning_rate": 0.0003970645517332905, + "loss": 2.6574, + "step": 19235 + }, + { + "epoch": 0.5704118850636064, + "grad_norm": 0.10573038458824158, + "learning_rate": 0.000397018509099365, + "loss": 2.6251, + "step": 19236 + }, + { + "epoch": 0.5704415384159179, + "grad_norm": 0.10749807953834534, + "learning_rate": 0.00039697246737735816, + "loss": 2.6499, + "step": 19237 + }, + { + "epoch": 0.5704711917682294, + "grad_norm": 0.11058001965284348, + "learning_rate": 0.0003969264265676779, + "loss": 2.6316, + "step": 19238 + }, + { + "epoch": 0.5705008451205409, + "grad_norm": 0.11669579148292542, + "learning_rate": 0.0003968803866707319, + "loss": 2.6291, + "step": 19239 + }, + { + "epoch": 0.5705304984728523, + "grad_norm": 0.09938310086727142, + "learning_rate": 0.00039683434768692774, + "loss": 2.6528, + "step": 19240 + }, + { + "epoch": 0.5705601518251638, + "grad_norm": 0.10380107164382935, + "learning_rate": 0.0003967883096166731, + "loss": 2.6704, + "step": 19241 + }, + { + "epoch": 0.5705898051774753, + "grad_norm": 0.09459760040044785, + "learning_rate": 0.0003967422724603759, + "loss": 2.6282, + "step": 19242 + }, + { + "epoch": 0.5706194585297868, + "grad_norm": 0.1031619980931282, + "learning_rate": 0.00039669623621844334, + "loss": 2.6613, + "step": 19243 + }, + { + "epoch": 0.5706491118820983, + "grad_norm": 0.1127002090215683, + "learning_rate": 0.00039665020089128345, + "loss": 2.6421, + "step": 19244 + }, + { + "epoch": 0.5706787652344097, + "grad_norm": 0.10252166539430618, + "learning_rate": 0.00039660416647930376, + "loss": 2.7138, + "step": 19245 + }, + { + "epoch": 0.5707084185867213, + "grad_norm": 0.11944554001092911, + "learning_rate": 0.00039655813298291193, + "loss": 2.6558, + "step": 19246 + }, + { + "epoch": 0.5707380719390327, + "grad_norm": 0.14483247697353363, + "learning_rate": 0.0003965121004025156, + "loss": 2.673, + "step": 19247 + }, + { + "epoch": 0.5707677252913442, + "grad_norm": 0.1396532654762268, + "learning_rate": 0.0003964660687385223, + "loss": 2.6464, + "step": 19248 + }, + { + "epoch": 0.5707973786436557, + "grad_norm": 0.1528111696243286, + "learning_rate": 0.0003964200379913397, + "loss": 2.6765, + "step": 19249 + }, + { + "epoch": 0.5708270319959672, + "grad_norm": 0.10507531464099884, + "learning_rate": 0.00039637400816137556, + "loss": 2.6777, + "step": 19250 + }, + { + "epoch": 0.5708566853482786, + "grad_norm": 0.129974827170372, + "learning_rate": 0.0003963279792490373, + "loss": 2.6481, + "step": 19251 + }, + { + "epoch": 0.5708863387005901, + "grad_norm": 0.13343355059623718, + "learning_rate": 0.0003962819512547326, + "loss": 2.6658, + "step": 19252 + }, + { + "epoch": 0.5709159920529016, + "grad_norm": 0.13866819441318512, + "learning_rate": 0.00039623592417886916, + "loss": 2.6818, + "step": 19253 + }, + { + "epoch": 0.5709456454052131, + "grad_norm": 0.13248272240161896, + "learning_rate": 0.0003961898980218543, + "loss": 2.6485, + "step": 19254 + }, + { + "epoch": 0.5709752987575245, + "grad_norm": 0.1283845603466034, + "learning_rate": 0.0003961438727840957, + "loss": 2.6605, + "step": 19255 + }, + { + "epoch": 0.571004952109836, + "grad_norm": 0.11109990626573563, + "learning_rate": 0.00039609784846600095, + "loss": 2.6934, + "step": 19256 + }, + { + "epoch": 0.5710346054621475, + "grad_norm": 0.1285131722688675, + "learning_rate": 0.0003960518250679776, + "loss": 2.6675, + "step": 19257 + }, + { + "epoch": 0.571064258814459, + "grad_norm": 0.11391790211200714, + "learning_rate": 0.0003960058025904332, + "loss": 2.6567, + "step": 19258 + }, + { + "epoch": 0.5710939121667704, + "grad_norm": 0.11179743707180023, + "learning_rate": 0.0003959597810337752, + "loss": 2.627, + "step": 19259 + }, + { + "epoch": 0.5711235655190819, + "grad_norm": 0.11566488444805145, + "learning_rate": 0.0003959137603984113, + "loss": 2.6677, + "step": 19260 + }, + { + "epoch": 0.5711532188713934, + "grad_norm": 0.11786086857318878, + "learning_rate": 0.0003958677406847489, + "loss": 2.6547, + "step": 19261 + }, + { + "epoch": 0.5711828722237049, + "grad_norm": 0.11370299756526947, + "learning_rate": 0.00039582172189319526, + "loss": 2.6177, + "step": 19262 + }, + { + "epoch": 0.5712125255760163, + "grad_norm": 0.12776552140712738, + "learning_rate": 0.0003957757040241585, + "loss": 2.673, + "step": 19263 + }, + { + "epoch": 0.5712421789283278, + "grad_norm": 0.09913061559200287, + "learning_rate": 0.00039572968707804565, + "loss": 2.6798, + "step": 19264 + }, + { + "epoch": 0.5712718322806394, + "grad_norm": 0.1151864156126976, + "learning_rate": 0.0003956836710552643, + "loss": 2.6515, + "step": 19265 + }, + { + "epoch": 0.5713014856329508, + "grad_norm": 0.11594992130994797, + "learning_rate": 0.0003956376559562219, + "loss": 2.6094, + "step": 19266 + }, + { + "epoch": 0.5713311389852623, + "grad_norm": 0.11025460064411163, + "learning_rate": 0.00039559164178132604, + "loss": 2.6514, + "step": 19267 + }, + { + "epoch": 0.5713607923375738, + "grad_norm": 0.11536701768636703, + "learning_rate": 0.0003955456285309841, + "loss": 2.6862, + "step": 19268 + }, + { + "epoch": 0.5713904456898853, + "grad_norm": 0.10705354809761047, + "learning_rate": 0.0003954996162056036, + "loss": 2.6592, + "step": 19269 + }, + { + "epoch": 0.5714200990421967, + "grad_norm": 0.10801150649785995, + "learning_rate": 0.000395453604805592, + "loss": 2.6792, + "step": 19270 + }, + { + "epoch": 0.5714497523945082, + "grad_norm": 0.12020807713270187, + "learning_rate": 0.00039540759433135655, + "loss": 2.6444, + "step": 19271 + }, + { + "epoch": 0.5714794057468197, + "grad_norm": 0.11908475309610367, + "learning_rate": 0.0003953615847833048, + "loss": 2.6709, + "step": 19272 + }, + { + "epoch": 0.5715090590991312, + "grad_norm": 0.1132887676358223, + "learning_rate": 0.00039531557616184423, + "loss": 2.6707, + "step": 19273 + }, + { + "epoch": 0.5715387124514426, + "grad_norm": 0.13296808302402496, + "learning_rate": 0.0003952695684673822, + "loss": 2.6704, + "step": 19274 + }, + { + "epoch": 0.5715683658037541, + "grad_norm": 0.13334393501281738, + "learning_rate": 0.0003952235617003259, + "loss": 2.667, + "step": 19275 + }, + { + "epoch": 0.5715980191560656, + "grad_norm": 0.11775391548871994, + "learning_rate": 0.00039517755586108316, + "loss": 2.6309, + "step": 19276 + }, + { + "epoch": 0.5716276725083771, + "grad_norm": 0.10675542056560516, + "learning_rate": 0.00039513155095006114, + "loss": 2.6667, + "step": 19277 + }, + { + "epoch": 0.5716573258606885, + "grad_norm": 0.1186765655875206, + "learning_rate": 0.00039508554696766717, + "loss": 2.6349, + "step": 19278 + }, + { + "epoch": 0.571686979213, + "grad_norm": 0.11904774606227875, + "learning_rate": 0.0003950395439143088, + "loss": 2.6357, + "step": 19279 + }, + { + "epoch": 0.5717166325653115, + "grad_norm": 0.10475192219018936, + "learning_rate": 0.0003949935417903932, + "loss": 2.6045, + "step": 19280 + }, + { + "epoch": 0.571746285917623, + "grad_norm": 0.10829892009496689, + "learning_rate": 0.000394947540596328, + "loss": 2.6453, + "step": 19281 + }, + { + "epoch": 0.5717759392699344, + "grad_norm": 0.12704196572303772, + "learning_rate": 0.0003949015403325202, + "loss": 2.6601, + "step": 19282 + }, + { + "epoch": 0.571805592622246, + "grad_norm": 0.11772282421588898, + "learning_rate": 0.00039485554099937724, + "loss": 2.6309, + "step": 19283 + }, + { + "epoch": 0.5718352459745574, + "grad_norm": 0.11189863085746765, + "learning_rate": 0.00039480954259730664, + "loss": 2.653, + "step": 19284 + }, + { + "epoch": 0.5718648993268689, + "grad_norm": 0.13238009810447693, + "learning_rate": 0.0003947635451267155, + "loss": 2.6507, + "step": 19285 + }, + { + "epoch": 0.5718945526791804, + "grad_norm": 0.12585213780403137, + "learning_rate": 0.0003947175485880112, + "loss": 2.6529, + "step": 19286 + }, + { + "epoch": 0.5719242060314919, + "grad_norm": 0.11718674004077911, + "learning_rate": 0.0003946715529816013, + "loss": 2.664, + "step": 19287 + }, + { + "epoch": 0.5719538593838034, + "grad_norm": 0.10485319048166275, + "learning_rate": 0.0003946255583078925, + "loss": 2.6612, + "step": 19288 + }, + { + "epoch": 0.5719835127361148, + "grad_norm": 0.10881859809160233, + "learning_rate": 0.00039457956456729265, + "loss": 2.6481, + "step": 19289 + }, + { + "epoch": 0.5720131660884263, + "grad_norm": 0.12339510768651962, + "learning_rate": 0.0003945335717602089, + "loss": 2.6478, + "step": 19290 + }, + { + "epoch": 0.5720428194407378, + "grad_norm": 0.09648031741380692, + "learning_rate": 0.0003944875798870486, + "loss": 2.6632, + "step": 19291 + }, + { + "epoch": 0.5720724727930493, + "grad_norm": 0.10933991521596909, + "learning_rate": 0.00039444158894821874, + "loss": 2.6521, + "step": 19292 + }, + { + "epoch": 0.5721021261453607, + "grad_norm": 0.11164295673370361, + "learning_rate": 0.00039439559894412674, + "loss": 2.6928, + "step": 19293 + }, + { + "epoch": 0.5721317794976722, + "grad_norm": 0.11385104060173035, + "learning_rate": 0.0003943496098751799, + "loss": 2.6497, + "step": 19294 + }, + { + "epoch": 0.5721614328499837, + "grad_norm": 0.1048683300614357, + "learning_rate": 0.00039430362174178526, + "loss": 2.6463, + "step": 19295 + }, + { + "epoch": 0.5721910862022952, + "grad_norm": 0.1132989153265953, + "learning_rate": 0.00039425763454435024, + "loss": 2.7063, + "step": 19296 + }, + { + "epoch": 0.5722207395546066, + "grad_norm": 0.09452251344919205, + "learning_rate": 0.0003942116482832821, + "loss": 2.6376, + "step": 19297 + }, + { + "epoch": 0.5722503929069181, + "grad_norm": 0.1125478744506836, + "learning_rate": 0.00039416566295898803, + "loss": 2.648, + "step": 19298 + }, + { + "epoch": 0.5722800462592296, + "grad_norm": 0.10975876450538635, + "learning_rate": 0.0003941196785718751, + "loss": 2.6236, + "step": 19299 + }, + { + "epoch": 0.5723096996115411, + "grad_norm": 0.10584122687578201, + "learning_rate": 0.0003940736951223505, + "loss": 2.6497, + "step": 19300 + }, + { + "epoch": 0.5723393529638525, + "grad_norm": 0.10399605333805084, + "learning_rate": 0.0003940277126108215, + "loss": 2.6366, + "step": 19301 + }, + { + "epoch": 0.572369006316164, + "grad_norm": 0.10026834905147552, + "learning_rate": 0.00039398173103769534, + "loss": 2.6651, + "step": 19302 + }, + { + "epoch": 0.5723986596684755, + "grad_norm": 0.11340028047561646, + "learning_rate": 0.0003939357504033792, + "loss": 2.6804, + "step": 19303 + }, + { + "epoch": 0.572428313020787, + "grad_norm": 0.1114497035741806, + "learning_rate": 0.0003938897707082802, + "loss": 2.6277, + "step": 19304 + }, + { + "epoch": 0.5724579663730984, + "grad_norm": 0.11530651897192001, + "learning_rate": 0.0003938437919528055, + "loss": 2.6545, + "step": 19305 + }, + { + "epoch": 0.57248761972541, + "grad_norm": 0.09793385863304138, + "learning_rate": 0.00039379781413736216, + "loss": 2.6571, + "step": 19306 + }, + { + "epoch": 0.5725172730777215, + "grad_norm": 0.11288858950138092, + "learning_rate": 0.0003937518372623574, + "loss": 2.6344, + "step": 19307 + }, + { + "epoch": 0.5725469264300329, + "grad_norm": 0.10891906172037125, + "learning_rate": 0.0003937058613281986, + "loss": 2.5979, + "step": 19308 + }, + { + "epoch": 0.5725765797823444, + "grad_norm": 0.10327854752540588, + "learning_rate": 0.0003936598863352924, + "loss": 2.6676, + "step": 19309 + }, + { + "epoch": 0.5726062331346559, + "grad_norm": 0.13810832798480988, + "learning_rate": 0.0003936139122840462, + "loss": 2.6574, + "step": 19310 + }, + { + "epoch": 0.5726358864869674, + "grad_norm": 0.13740162551403046, + "learning_rate": 0.0003935679391748671, + "loss": 2.6265, + "step": 19311 + }, + { + "epoch": 0.5726655398392788, + "grad_norm": 0.12184791266918182, + "learning_rate": 0.00039352196700816216, + "loss": 2.6448, + "step": 19312 + }, + { + "epoch": 0.5726951931915903, + "grad_norm": 0.11228246241807938, + "learning_rate": 0.00039347599578433846, + "loss": 2.624, + "step": 19313 + }, + { + "epoch": 0.5727248465439018, + "grad_norm": 0.11348354071378708, + "learning_rate": 0.000393430025503803, + "loss": 2.6599, + "step": 19314 + }, + { + "epoch": 0.5727544998962133, + "grad_norm": 0.1091584712266922, + "learning_rate": 0.0003933840561669631, + "loss": 2.6552, + "step": 19315 + }, + { + "epoch": 0.5727841532485247, + "grad_norm": 0.11200401186943054, + "learning_rate": 0.0003933380877742256, + "loss": 2.6279, + "step": 19316 + }, + { + "epoch": 0.5728138066008363, + "grad_norm": 0.12584786117076874, + "learning_rate": 0.00039329212032599776, + "loss": 2.6469, + "step": 19317 + }, + { + "epoch": 0.5728434599531477, + "grad_norm": 0.1269783079624176, + "learning_rate": 0.0003932461538226864, + "loss": 2.6258, + "step": 19318 + }, + { + "epoch": 0.5728731133054592, + "grad_norm": 0.15440711379051208, + "learning_rate": 0.0003932001882646988, + "loss": 2.6876, + "step": 19319 + }, + { + "epoch": 0.5729027666577706, + "grad_norm": 0.196815624833107, + "learning_rate": 0.00039315422365244183, + "loss": 2.7015, + "step": 19320 + }, + { + "epoch": 0.5729324200100822, + "grad_norm": 0.15546686947345734, + "learning_rate": 0.0003931082599863225, + "loss": 2.6393, + "step": 19321 + }, + { + "epoch": 0.5729620733623936, + "grad_norm": 0.12144456803798676, + "learning_rate": 0.0003930622972667479, + "loss": 2.6751, + "step": 19322 + }, + { + "epoch": 0.5729917267147051, + "grad_norm": 0.11957814544439316, + "learning_rate": 0.000393016335494125, + "loss": 2.645, + "step": 19323 + }, + { + "epoch": 0.5730213800670165, + "grad_norm": 0.13108451664447784, + "learning_rate": 0.00039297037466886074, + "loss": 2.6423, + "step": 19324 + }, + { + "epoch": 0.5730510334193281, + "grad_norm": 0.11868628859519958, + "learning_rate": 0.0003929244147913624, + "loss": 2.6521, + "step": 19325 + }, + { + "epoch": 0.5730806867716395, + "grad_norm": 0.1393517702817917, + "learning_rate": 0.0003928784558620366, + "loss": 2.6347, + "step": 19326 + }, + { + "epoch": 0.573110340123951, + "grad_norm": 0.1216718852519989, + "learning_rate": 0.0003928324978812902, + "loss": 2.63, + "step": 19327 + }, + { + "epoch": 0.5731399934762625, + "grad_norm": 0.11828551441431046, + "learning_rate": 0.0003927865408495307, + "loss": 2.6348, + "step": 19328 + }, + { + "epoch": 0.573169646828574, + "grad_norm": 0.12162365019321442, + "learning_rate": 0.0003927405847671649, + "loss": 2.6361, + "step": 19329 + }, + { + "epoch": 0.5731993001808855, + "grad_norm": 0.11463216692209244, + "learning_rate": 0.00039269462963459947, + "loss": 2.6147, + "step": 19330 + }, + { + "epoch": 0.5732289535331969, + "grad_norm": 0.09932571649551392, + "learning_rate": 0.00039264867545224155, + "loss": 2.6133, + "step": 19331 + }, + { + "epoch": 0.5732586068855084, + "grad_norm": 0.10516583919525146, + "learning_rate": 0.00039260272222049796, + "loss": 2.6636, + "step": 19332 + }, + { + "epoch": 0.5732882602378199, + "grad_norm": 0.10885009169578552, + "learning_rate": 0.0003925567699397758, + "loss": 2.6582, + "step": 19333 + }, + { + "epoch": 0.5733179135901314, + "grad_norm": 0.10207141190767288, + "learning_rate": 0.00039251081861048186, + "loss": 2.6656, + "step": 19334 + }, + { + "epoch": 0.5733475669424428, + "grad_norm": 0.11490512639284134, + "learning_rate": 0.00039246486823302305, + "loss": 2.6689, + "step": 19335 + }, + { + "epoch": 0.5733772202947544, + "grad_norm": 0.0924612358212471, + "learning_rate": 0.00039241891880780643, + "loss": 2.5855, + "step": 19336 + }, + { + "epoch": 0.5734068736470658, + "grad_norm": 0.09992140531539917, + "learning_rate": 0.00039237297033523864, + "loss": 2.646, + "step": 19337 + }, + { + "epoch": 0.5734365269993773, + "grad_norm": 0.10478194057941437, + "learning_rate": 0.0003923270228157266, + "loss": 2.6431, + "step": 19338 + }, + { + "epoch": 0.5734661803516887, + "grad_norm": 0.10760900378227234, + "learning_rate": 0.00039228107624967733, + "loss": 2.6199, + "step": 19339 + }, + { + "epoch": 0.5734958337040003, + "grad_norm": 0.1011427640914917, + "learning_rate": 0.0003922351306374975, + "loss": 2.6334, + "step": 19340 + }, + { + "epoch": 0.5735254870563117, + "grad_norm": 0.10893481224775314, + "learning_rate": 0.0003921891859795942, + "loss": 2.6559, + "step": 19341 + }, + { + "epoch": 0.5735551404086232, + "grad_norm": 0.13310657441616058, + "learning_rate": 0.0003921432422763741, + "loss": 2.6874, + "step": 19342 + }, + { + "epoch": 0.5735847937609346, + "grad_norm": 0.12744717299938202, + "learning_rate": 0.0003920972995282441, + "loss": 2.6647, + "step": 19343 + }, + { + "epoch": 0.5736144471132462, + "grad_norm": 0.1229575052857399, + "learning_rate": 0.00039205135773561113, + "loss": 2.6319, + "step": 19344 + }, + { + "epoch": 0.5736441004655576, + "grad_norm": 0.11623919755220413, + "learning_rate": 0.00039200541689888186, + "loss": 2.6847, + "step": 19345 + }, + { + "epoch": 0.5736737538178691, + "grad_norm": 0.14312687516212463, + "learning_rate": 0.0003919594770184633, + "loss": 2.6785, + "step": 19346 + }, + { + "epoch": 0.5737034071701805, + "grad_norm": 0.13768728077411652, + "learning_rate": 0.00039191353809476197, + "loss": 2.6317, + "step": 19347 + }, + { + "epoch": 0.5737330605224921, + "grad_norm": 0.13183733820915222, + "learning_rate": 0.00039186760012818484, + "loss": 2.6548, + "step": 19348 + }, + { + "epoch": 0.5737627138748036, + "grad_norm": 0.12115781009197235, + "learning_rate": 0.00039182166311913863, + "loss": 2.6529, + "step": 19349 + }, + { + "epoch": 0.573792367227115, + "grad_norm": 0.15204490721225739, + "learning_rate": 0.0003917757270680302, + "loss": 2.6448, + "step": 19350 + }, + { + "epoch": 0.5738220205794266, + "grad_norm": 0.1407202184200287, + "learning_rate": 0.00039172979197526624, + "loss": 2.6595, + "step": 19351 + }, + { + "epoch": 0.573851673931738, + "grad_norm": 0.11675474047660828, + "learning_rate": 0.00039168385784125364, + "loss": 2.6444, + "step": 19352 + }, + { + "epoch": 0.5738813272840495, + "grad_norm": 0.13913999497890472, + "learning_rate": 0.0003916379246663989, + "loss": 2.6437, + "step": 19353 + }, + { + "epoch": 0.5739109806363609, + "grad_norm": 0.12299040704965591, + "learning_rate": 0.00039159199245110903, + "loss": 2.6661, + "step": 19354 + }, + { + "epoch": 0.5739406339886725, + "grad_norm": 0.12126868963241577, + "learning_rate": 0.0003915460611957907, + "loss": 2.6281, + "step": 19355 + }, + { + "epoch": 0.5739702873409839, + "grad_norm": 0.12338121980428696, + "learning_rate": 0.0003915001309008507, + "loss": 2.6435, + "step": 19356 + }, + { + "epoch": 0.5739999406932954, + "grad_norm": 0.11908013373613358, + "learning_rate": 0.00039145420156669564, + "loss": 2.6558, + "step": 19357 + }, + { + "epoch": 0.5740295940456068, + "grad_norm": 0.1035909354686737, + "learning_rate": 0.00039140827319373217, + "loss": 2.6419, + "step": 19358 + }, + { + "epoch": 0.5740592473979184, + "grad_norm": 0.12810014188289642, + "learning_rate": 0.0003913623457823672, + "loss": 2.6309, + "step": 19359 + }, + { + "epoch": 0.5740889007502298, + "grad_norm": 0.10643458366394043, + "learning_rate": 0.0003913164193330072, + "loss": 2.6556, + "step": 19360 + }, + { + "epoch": 0.5741185541025413, + "grad_norm": 0.12028200924396515, + "learning_rate": 0.000391270493846059, + "loss": 2.6117, + "step": 19361 + }, + { + "epoch": 0.5741482074548527, + "grad_norm": 0.1295270472764969, + "learning_rate": 0.0003912245693219292, + "loss": 2.6806, + "step": 19362 + }, + { + "epoch": 0.5741778608071643, + "grad_norm": 0.12813487648963928, + "learning_rate": 0.00039117864576102457, + "loss": 2.6573, + "step": 19363 + }, + { + "epoch": 0.5742075141594757, + "grad_norm": 0.11194717139005661, + "learning_rate": 0.0003911327231637517, + "loss": 2.673, + "step": 19364 + }, + { + "epoch": 0.5742371675117872, + "grad_norm": 0.11340994387865067, + "learning_rate": 0.0003910868015305172, + "loss": 2.6838, + "step": 19365 + }, + { + "epoch": 0.5742668208640986, + "grad_norm": 0.11182618886232376, + "learning_rate": 0.0003910408808617276, + "loss": 2.6696, + "step": 19366 + }, + { + "epoch": 0.5742964742164102, + "grad_norm": 0.11890808492898941, + "learning_rate": 0.00039099496115779, + "loss": 2.6154, + "step": 19367 + }, + { + "epoch": 0.5743261275687216, + "grad_norm": 0.10071966797113419, + "learning_rate": 0.0003909490424191106, + "loss": 2.6522, + "step": 19368 + }, + { + "epoch": 0.5743557809210331, + "grad_norm": 0.11466411501169205, + "learning_rate": 0.0003909031246460962, + "loss": 2.6639, + "step": 19369 + }, + { + "epoch": 0.5743854342733447, + "grad_norm": 0.1169881522655487, + "learning_rate": 0.0003908572078391533, + "loss": 2.6434, + "step": 19370 + }, + { + "epoch": 0.5744150876256561, + "grad_norm": 0.10692629218101501, + "learning_rate": 0.0003908112919986886, + "loss": 2.6417, + "step": 19371 + }, + { + "epoch": 0.5744447409779676, + "grad_norm": 0.11389602720737457, + "learning_rate": 0.0003907653771251086, + "loss": 2.676, + "step": 19372 + }, + { + "epoch": 0.574474394330279, + "grad_norm": 0.12429782748222351, + "learning_rate": 0.00039071946321881995, + "loss": 2.6548, + "step": 19373 + }, + { + "epoch": 0.5745040476825906, + "grad_norm": 0.12395752221345901, + "learning_rate": 0.0003906735502802294, + "loss": 2.6563, + "step": 19374 + }, + { + "epoch": 0.574533701034902, + "grad_norm": 0.13056960701942444, + "learning_rate": 0.0003906276383097431, + "loss": 2.6368, + "step": 19375 + }, + { + "epoch": 0.5745633543872135, + "grad_norm": 0.11104335635900497, + "learning_rate": 0.0003905817273077679, + "loss": 2.6505, + "step": 19376 + }, + { + "epoch": 0.5745930077395249, + "grad_norm": 0.10565740615129471, + "learning_rate": 0.0003905358172747103, + "loss": 2.6558, + "step": 19377 + }, + { + "epoch": 0.5746226610918365, + "grad_norm": 0.12712053954601288, + "learning_rate": 0.0003904899082109768, + "loss": 2.62, + "step": 19378 + }, + { + "epoch": 0.5746523144441479, + "grad_norm": 0.1285586655139923, + "learning_rate": 0.0003904440001169739, + "loss": 2.6225, + "step": 19379 + }, + { + "epoch": 0.5746819677964594, + "grad_norm": 0.10692957043647766, + "learning_rate": 0.00039039809299310825, + "loss": 2.6548, + "step": 19380 + }, + { + "epoch": 0.5747116211487708, + "grad_norm": 0.11638255417346954, + "learning_rate": 0.00039035218683978627, + "loss": 2.6471, + "step": 19381 + }, + { + "epoch": 0.5747412745010824, + "grad_norm": 0.11056258529424667, + "learning_rate": 0.00039030628165741455, + "loss": 2.6382, + "step": 19382 + }, + { + "epoch": 0.5747709278533938, + "grad_norm": 0.1200743243098259, + "learning_rate": 0.0003902603774463995, + "loss": 2.6861, + "step": 19383 + }, + { + "epoch": 0.5748005812057053, + "grad_norm": 0.1144629418849945, + "learning_rate": 0.0003902144742071478, + "loss": 2.6472, + "step": 19384 + }, + { + "epoch": 0.5748302345580167, + "grad_norm": 0.11746950447559357, + "learning_rate": 0.0003901685719400656, + "loss": 2.6505, + "step": 19385 + }, + { + "epoch": 0.5748598879103283, + "grad_norm": 0.12326522171497345, + "learning_rate": 0.00039012267064555956, + "loss": 2.6612, + "step": 19386 + }, + { + "epoch": 0.5748895412626397, + "grad_norm": 0.12429303675889969, + "learning_rate": 0.00039007677032403616, + "loss": 2.6208, + "step": 19387 + }, + { + "epoch": 0.5749191946149512, + "grad_norm": 0.11935997009277344, + "learning_rate": 0.00039003087097590184, + "loss": 2.6423, + "step": 19388 + }, + { + "epoch": 0.5749488479672626, + "grad_norm": 0.11694694310426712, + "learning_rate": 0.000389984972601563, + "loss": 2.6436, + "step": 19389 + }, + { + "epoch": 0.5749785013195742, + "grad_norm": 0.13478022813796997, + "learning_rate": 0.00038993907520142615, + "loss": 2.6517, + "step": 19390 + }, + { + "epoch": 0.5750081546718857, + "grad_norm": 0.15243194997310638, + "learning_rate": 0.0003898931787758978, + "loss": 2.6609, + "step": 19391 + }, + { + "epoch": 0.5750378080241971, + "grad_norm": 0.14074426889419556, + "learning_rate": 0.00038984728332538385, + "loss": 2.6402, + "step": 19392 + }, + { + "epoch": 0.5750674613765087, + "grad_norm": 0.11738773435354233, + "learning_rate": 0.0003898013888502914, + "loss": 2.6378, + "step": 19393 + }, + { + "epoch": 0.5750971147288201, + "grad_norm": 0.10072604566812515, + "learning_rate": 0.0003897554953510265, + "loss": 2.6422, + "step": 19394 + }, + { + "epoch": 0.5751267680811316, + "grad_norm": 0.1343574821949005, + "learning_rate": 0.0003897096028279957, + "loss": 2.6585, + "step": 19395 + }, + { + "epoch": 0.575156421433443, + "grad_norm": 0.14756496250629425, + "learning_rate": 0.0003896637112816053, + "loss": 2.5935, + "step": 19396 + }, + { + "epoch": 0.5751860747857546, + "grad_norm": 0.11917838454246521, + "learning_rate": 0.0003896178207122616, + "loss": 2.6686, + "step": 19397 + }, + { + "epoch": 0.575215728138066, + "grad_norm": 0.12316970527172089, + "learning_rate": 0.0003895719311203711, + "loss": 2.6277, + "step": 19398 + }, + { + "epoch": 0.5752453814903775, + "grad_norm": 0.12117191404104233, + "learning_rate": 0.00038952604250634004, + "loss": 2.6462, + "step": 19399 + }, + { + "epoch": 0.5752750348426889, + "grad_norm": 0.13588780164718628, + "learning_rate": 0.00038948015487057485, + "loss": 2.6805, + "step": 19400 + }, + { + "epoch": 0.5753046881950005, + "grad_norm": 0.12338002771139145, + "learning_rate": 0.00038943426821348205, + "loss": 2.6348, + "step": 19401 + }, + { + "epoch": 0.5753343415473119, + "grad_norm": 0.13567951321601868, + "learning_rate": 0.0003893883825354676, + "loss": 2.6306, + "step": 19402 + }, + { + "epoch": 0.5753639948996234, + "grad_norm": 0.1277768611907959, + "learning_rate": 0.000389342497836938, + "loss": 2.6588, + "step": 19403 + }, + { + "epoch": 0.5753936482519348, + "grad_norm": 0.11540025472640991, + "learning_rate": 0.0003892966141182996, + "loss": 2.6551, + "step": 19404 + }, + { + "epoch": 0.5754233016042464, + "grad_norm": 0.12759585678577423, + "learning_rate": 0.0003892507313799586, + "loss": 2.6584, + "step": 19405 + }, + { + "epoch": 0.5754529549565578, + "grad_norm": 0.12196554243564606, + "learning_rate": 0.0003892048496223214, + "loss": 2.6177, + "step": 19406 + }, + { + "epoch": 0.5754826083088693, + "grad_norm": 0.11934535950422287, + "learning_rate": 0.0003891589688457944, + "loss": 2.6626, + "step": 19407 + }, + { + "epoch": 0.5755122616611807, + "grad_norm": 0.13771793246269226, + "learning_rate": 0.0003891130890507837, + "loss": 2.662, + "step": 19408 + }, + { + "epoch": 0.5755419150134923, + "grad_norm": 0.12276512384414673, + "learning_rate": 0.0003890672102376956, + "loss": 2.6546, + "step": 19409 + }, + { + "epoch": 0.5755715683658038, + "grad_norm": 0.11538024246692657, + "learning_rate": 0.00038902133240693637, + "loss": 2.6387, + "step": 19410 + }, + { + "epoch": 0.5756012217181152, + "grad_norm": 0.1300434023141861, + "learning_rate": 0.0003889754555589123, + "loss": 2.6465, + "step": 19411 + }, + { + "epoch": 0.5756308750704268, + "grad_norm": 0.09898010641336441, + "learning_rate": 0.00038892957969402986, + "loss": 2.6179, + "step": 19412 + }, + { + "epoch": 0.5756605284227382, + "grad_norm": 0.14008411765098572, + "learning_rate": 0.0003888837048126949, + "loss": 2.6358, + "step": 19413 + }, + { + "epoch": 0.5756901817750497, + "grad_norm": 0.13610495626926422, + "learning_rate": 0.0003888378309153137, + "loss": 2.6391, + "step": 19414 + }, + { + "epoch": 0.5757198351273611, + "grad_norm": 0.10816924273967743, + "learning_rate": 0.00038879195800229266, + "loss": 2.6421, + "step": 19415 + }, + { + "epoch": 0.5757494884796727, + "grad_norm": 0.12796495854854584, + "learning_rate": 0.00038874608607403796, + "loss": 2.6586, + "step": 19416 + }, + { + "epoch": 0.5757791418319841, + "grad_norm": 0.10870248079299927, + "learning_rate": 0.0003887002151309557, + "loss": 2.6354, + "step": 19417 + }, + { + "epoch": 0.5758087951842956, + "grad_norm": 0.12151496857404709, + "learning_rate": 0.00038865434517345205, + "loss": 2.6885, + "step": 19418 + }, + { + "epoch": 0.575838448536607, + "grad_norm": 0.10480844229459763, + "learning_rate": 0.0003886084762019334, + "loss": 2.6612, + "step": 19419 + }, + { + "epoch": 0.5758681018889186, + "grad_norm": 0.09530240297317505, + "learning_rate": 0.00038856260821680576, + "loss": 2.6408, + "step": 19420 + }, + { + "epoch": 0.57589775524123, + "grad_norm": 0.10526113957166672, + "learning_rate": 0.0003885167412184755, + "loss": 2.6731, + "step": 19421 + }, + { + "epoch": 0.5759274085935415, + "grad_norm": 0.12543103098869324, + "learning_rate": 0.00038847087520734857, + "loss": 2.6519, + "step": 19422 + }, + { + "epoch": 0.5759570619458529, + "grad_norm": 0.13789063692092896, + "learning_rate": 0.0003884250101838312, + "loss": 2.6626, + "step": 19423 + }, + { + "epoch": 0.5759867152981645, + "grad_norm": 0.1230255737900734, + "learning_rate": 0.0003883791461483295, + "loss": 2.6235, + "step": 19424 + }, + { + "epoch": 0.5760163686504759, + "grad_norm": 0.11035805940628052, + "learning_rate": 0.00038833328310124957, + "loss": 2.6266, + "step": 19425 + }, + { + "epoch": 0.5760460220027874, + "grad_norm": 0.10255854576826096, + "learning_rate": 0.0003882874210429976, + "loss": 2.6714, + "step": 19426 + }, + { + "epoch": 0.5760756753550988, + "grad_norm": 0.1088104322552681, + "learning_rate": 0.0003882415599739796, + "loss": 2.646, + "step": 19427 + }, + { + "epoch": 0.5761053287074104, + "grad_norm": 0.11408668756484985, + "learning_rate": 0.0003881956998946019, + "loss": 2.6464, + "step": 19428 + }, + { + "epoch": 0.5761349820597218, + "grad_norm": 0.09570440649986267, + "learning_rate": 0.00038814984080527053, + "loss": 2.6618, + "step": 19429 + }, + { + "epoch": 0.5761646354120333, + "grad_norm": 0.11112780123949051, + "learning_rate": 0.00038810398270639124, + "loss": 2.6304, + "step": 19430 + }, + { + "epoch": 0.5761942887643449, + "grad_norm": 0.10595784336328506, + "learning_rate": 0.00038805812559837056, + "loss": 2.6367, + "step": 19431 + }, + { + "epoch": 0.5762239421166563, + "grad_norm": 0.11862015724182129, + "learning_rate": 0.0003880122694816144, + "loss": 2.6728, + "step": 19432 + }, + { + "epoch": 0.5762535954689678, + "grad_norm": 0.09700033068656921, + "learning_rate": 0.0003879664143565289, + "loss": 2.6085, + "step": 19433 + }, + { + "epoch": 0.5762832488212792, + "grad_norm": 0.11443093419075012, + "learning_rate": 0.00038792056022351996, + "loss": 2.691, + "step": 19434 + }, + { + "epoch": 0.5763129021735908, + "grad_norm": 0.10441568493843079, + "learning_rate": 0.00038787470708299374, + "loss": 2.6348, + "step": 19435 + }, + { + "epoch": 0.5763425555259022, + "grad_norm": 0.10129394382238388, + "learning_rate": 0.0003878288549353561, + "loss": 2.6303, + "step": 19436 + }, + { + "epoch": 0.5763722088782137, + "grad_norm": 0.1108374148607254, + "learning_rate": 0.0003877830037810133, + "loss": 2.643, + "step": 19437 + }, + { + "epoch": 0.5764018622305251, + "grad_norm": 0.10295522958040237, + "learning_rate": 0.00038773715362037134, + "loss": 2.6341, + "step": 19438 + }, + { + "epoch": 0.5764315155828367, + "grad_norm": 0.1020868793129921, + "learning_rate": 0.0003876913044538362, + "loss": 2.6398, + "step": 19439 + }, + { + "epoch": 0.5764611689351481, + "grad_norm": 0.09605494141578674, + "learning_rate": 0.00038764545628181365, + "loss": 2.6775, + "step": 19440 + }, + { + "epoch": 0.5764908222874596, + "grad_norm": 0.10743282735347748, + "learning_rate": 0.00038759960910470995, + "loss": 2.6709, + "step": 19441 + }, + { + "epoch": 0.576520475639771, + "grad_norm": 0.11772850155830383, + "learning_rate": 0.00038755376292293095, + "loss": 2.636, + "step": 19442 + }, + { + "epoch": 0.5765501289920826, + "grad_norm": 0.09833525866270065, + "learning_rate": 0.0003875079177368825, + "loss": 2.6393, + "step": 19443 + }, + { + "epoch": 0.576579782344394, + "grad_norm": 0.09407735615968704, + "learning_rate": 0.00038746207354697107, + "loss": 2.5996, + "step": 19444 + }, + { + "epoch": 0.5766094356967055, + "grad_norm": 0.11385571956634521, + "learning_rate": 0.00038741623035360215, + "loss": 2.679, + "step": 19445 + }, + { + "epoch": 0.576639089049017, + "grad_norm": 0.12434366345405579, + "learning_rate": 0.00038737038815718185, + "loss": 2.6834, + "step": 19446 + }, + { + "epoch": 0.5766687424013285, + "grad_norm": 0.1361919343471527, + "learning_rate": 0.0003873245469581161, + "loss": 2.6645, + "step": 19447 + }, + { + "epoch": 0.5766983957536399, + "grad_norm": 0.13284531235694885, + "learning_rate": 0.00038727870675681086, + "loss": 2.6455, + "step": 19448 + }, + { + "epoch": 0.5767280491059514, + "grad_norm": 0.09762489795684814, + "learning_rate": 0.00038723286755367196, + "loss": 2.6426, + "step": 19449 + }, + { + "epoch": 0.5767577024582629, + "grad_norm": 0.12030588835477829, + "learning_rate": 0.00038718702934910555, + "loss": 2.6382, + "step": 19450 + }, + { + "epoch": 0.5767873558105744, + "grad_norm": 0.1230560690164566, + "learning_rate": 0.00038714119214351716, + "loss": 2.6374, + "step": 19451 + }, + { + "epoch": 0.5768170091628859, + "grad_norm": 0.13005845248699188, + "learning_rate": 0.000387095355937313, + "loss": 2.6638, + "step": 19452 + }, + { + "epoch": 0.5768466625151973, + "grad_norm": 0.12633271515369415, + "learning_rate": 0.00038704952073089876, + "loss": 2.668, + "step": 19453 + }, + { + "epoch": 0.5768763158675089, + "grad_norm": 0.11705752462148666, + "learning_rate": 0.00038700368652468044, + "loss": 2.6856, + "step": 19454 + }, + { + "epoch": 0.5769059692198203, + "grad_norm": 0.11877014487981796, + "learning_rate": 0.00038695785331906385, + "loss": 2.6703, + "step": 19455 + }, + { + "epoch": 0.5769356225721318, + "grad_norm": 0.11901199072599411, + "learning_rate": 0.0003869120211144548, + "loss": 2.6616, + "step": 19456 + }, + { + "epoch": 0.5769652759244432, + "grad_norm": 0.13044042885303497, + "learning_rate": 0.0003868661899112593, + "loss": 2.6568, + "step": 19457 + }, + { + "epoch": 0.5769949292767548, + "grad_norm": 0.11795078963041306, + "learning_rate": 0.0003868203597098831, + "loss": 2.629, + "step": 19458 + }, + { + "epoch": 0.5770245826290662, + "grad_norm": 0.11879060417413712, + "learning_rate": 0.000386774530510732, + "loss": 2.6398, + "step": 19459 + }, + { + "epoch": 0.5770542359813777, + "grad_norm": 0.11897258460521698, + "learning_rate": 0.0003867287023142121, + "loss": 2.6438, + "step": 19460 + }, + { + "epoch": 0.5770838893336891, + "grad_norm": 0.10247582197189331, + "learning_rate": 0.00038668287512072877, + "loss": 2.6038, + "step": 19461 + }, + { + "epoch": 0.5771135426860007, + "grad_norm": 0.10948524624109268, + "learning_rate": 0.00038663704893068807, + "loss": 2.6741, + "step": 19462 + }, + { + "epoch": 0.5771431960383121, + "grad_norm": 0.09881114959716797, + "learning_rate": 0.00038659122374449574, + "loss": 2.6271, + "step": 19463 + }, + { + "epoch": 0.5771728493906236, + "grad_norm": 0.11321376264095306, + "learning_rate": 0.0003865453995625576, + "loss": 2.6212, + "step": 19464 + }, + { + "epoch": 0.577202502742935, + "grad_norm": 0.11403248459100723, + "learning_rate": 0.00038649957638527946, + "loss": 2.6307, + "step": 19465 + }, + { + "epoch": 0.5772321560952466, + "grad_norm": 0.11894749104976654, + "learning_rate": 0.0003864537542130669, + "loss": 2.6807, + "step": 19466 + }, + { + "epoch": 0.577261809447558, + "grad_norm": 0.11022716760635376, + "learning_rate": 0.00038640793304632614, + "loss": 2.6861, + "step": 19467 + }, + { + "epoch": 0.5772914627998695, + "grad_norm": 0.1088608056306839, + "learning_rate": 0.00038636211288546244, + "loss": 2.6553, + "step": 19468 + }, + { + "epoch": 0.577321116152181, + "grad_norm": 0.10974281281232834, + "learning_rate": 0.0003863162937308815, + "loss": 2.6584, + "step": 19469 + }, + { + "epoch": 0.5773507695044925, + "grad_norm": 0.13320095837116241, + "learning_rate": 0.0003862704755829896, + "loss": 2.6515, + "step": 19470 + }, + { + "epoch": 0.5773804228568039, + "grad_norm": 0.1223340556025505, + "learning_rate": 0.0003862246584421921, + "loss": 2.6814, + "step": 19471 + }, + { + "epoch": 0.5774100762091154, + "grad_norm": 0.10517499595880508, + "learning_rate": 0.00038617884230889477, + "loss": 2.6358, + "step": 19472 + }, + { + "epoch": 0.577439729561427, + "grad_norm": 0.11494746804237366, + "learning_rate": 0.00038613302718350337, + "loss": 2.6377, + "step": 19473 + }, + { + "epoch": 0.5774693829137384, + "grad_norm": 0.12982439994812012, + "learning_rate": 0.00038608721306642346, + "loss": 2.6346, + "step": 19474 + }, + { + "epoch": 0.5774990362660499, + "grad_norm": 0.1179077997803688, + "learning_rate": 0.0003860413999580609, + "loss": 2.6149, + "step": 19475 + }, + { + "epoch": 0.5775286896183613, + "grad_norm": 0.1303606629371643, + "learning_rate": 0.00038599558785882133, + "loss": 2.663, + "step": 19476 + }, + { + "epoch": 0.5775583429706729, + "grad_norm": 0.11545130610466003, + "learning_rate": 0.0003859497767691105, + "loss": 2.6729, + "step": 19477 + }, + { + "epoch": 0.5775879963229843, + "grad_norm": 0.11583109200000763, + "learning_rate": 0.00038590396668933383, + "loss": 2.6464, + "step": 19478 + }, + { + "epoch": 0.5776176496752958, + "grad_norm": 0.1194763034582138, + "learning_rate": 0.00038585815761989705, + "loss": 2.6483, + "step": 19479 + }, + { + "epoch": 0.5776473030276073, + "grad_norm": 0.11138243973255157, + "learning_rate": 0.00038581234956120596, + "loss": 2.6351, + "step": 19480 + }, + { + "epoch": 0.5776769563799188, + "grad_norm": 0.1337277889251709, + "learning_rate": 0.0003857665425136661, + "loss": 2.6527, + "step": 19481 + }, + { + "epoch": 0.5777066097322302, + "grad_norm": 0.11725717782974243, + "learning_rate": 0.000385720736477683, + "loss": 2.6461, + "step": 19482 + }, + { + "epoch": 0.5777362630845417, + "grad_norm": 0.11703622341156006, + "learning_rate": 0.00038567493145366254, + "loss": 2.6803, + "step": 19483 + }, + { + "epoch": 0.5777659164368532, + "grad_norm": 0.13789579272270203, + "learning_rate": 0.0003856291274420102, + "loss": 2.6705, + "step": 19484 + }, + { + "epoch": 0.5777955697891647, + "grad_norm": 0.09799356013536453, + "learning_rate": 0.0003855833244431315, + "loss": 2.6625, + "step": 19485 + }, + { + "epoch": 0.5778252231414761, + "grad_norm": 0.10965905338525772, + "learning_rate": 0.0003855375224574321, + "loss": 2.6022, + "step": 19486 + }, + { + "epoch": 0.5778548764937876, + "grad_norm": 0.12960992753505707, + "learning_rate": 0.00038549172148531763, + "loss": 2.6706, + "step": 19487 + }, + { + "epoch": 0.5778845298460991, + "grad_norm": 0.14652380347251892, + "learning_rate": 0.0003854459215271937, + "loss": 2.6374, + "step": 19488 + }, + { + "epoch": 0.5779141831984106, + "grad_norm": 0.13805025815963745, + "learning_rate": 0.00038540012258346567, + "loss": 2.6929, + "step": 19489 + }, + { + "epoch": 0.577943836550722, + "grad_norm": 0.11489943414926529, + "learning_rate": 0.0003853543246545393, + "loss": 2.6742, + "step": 19490 + }, + { + "epoch": 0.5779734899030335, + "grad_norm": 0.12143974006175995, + "learning_rate": 0.00038530852774082, + "loss": 2.6495, + "step": 19491 + }, + { + "epoch": 0.578003143255345, + "grad_norm": 0.11905798316001892, + "learning_rate": 0.0003852627318427134, + "loss": 2.6706, + "step": 19492 + }, + { + "epoch": 0.5780327966076565, + "grad_norm": 0.11881939321756363, + "learning_rate": 0.000385216936960625, + "loss": 2.6714, + "step": 19493 + }, + { + "epoch": 0.578062449959968, + "grad_norm": 0.11013399064540863, + "learning_rate": 0.00038517114309496043, + "loss": 2.6871, + "step": 19494 + }, + { + "epoch": 0.5780921033122794, + "grad_norm": 0.11816149950027466, + "learning_rate": 0.0003851253502461248, + "loss": 2.6609, + "step": 19495 + }, + { + "epoch": 0.578121756664591, + "grad_norm": 0.11949175596237183, + "learning_rate": 0.00038507955841452407, + "loss": 2.6424, + "step": 19496 + }, + { + "epoch": 0.5781514100169024, + "grad_norm": 0.12041642516851425, + "learning_rate": 0.00038503376760056364, + "loss": 2.6583, + "step": 19497 + }, + { + "epoch": 0.5781810633692139, + "grad_norm": 0.11502896994352341, + "learning_rate": 0.000384987977804649, + "loss": 2.6048, + "step": 19498 + }, + { + "epoch": 0.5782107167215254, + "grad_norm": 0.13729652762413025, + "learning_rate": 0.0003849421890271855, + "loss": 2.6233, + "step": 19499 + }, + { + "epoch": 0.5782403700738369, + "grad_norm": 0.11980734020471573, + "learning_rate": 0.0003848964012685786, + "loss": 2.662, + "step": 19500 + }, + { + "epoch": 0.5782700234261483, + "grad_norm": 0.10644947737455368, + "learning_rate": 0.00038485061452923387, + "loss": 2.6543, + "step": 19501 + }, + { + "epoch": 0.5782996767784598, + "grad_norm": 0.12051268666982651, + "learning_rate": 0.0003848048288095566, + "loss": 2.6876, + "step": 19502 + }, + { + "epoch": 0.5783293301307713, + "grad_norm": 0.10860684514045715, + "learning_rate": 0.0003847590441099525, + "loss": 2.6319, + "step": 19503 + }, + { + "epoch": 0.5783589834830828, + "grad_norm": 0.11818987131118774, + "learning_rate": 0.00038471326043082677, + "loss": 2.6668, + "step": 19504 + }, + { + "epoch": 0.5783886368353942, + "grad_norm": 0.11390934884548187, + "learning_rate": 0.00038466747777258505, + "loss": 2.616, + "step": 19505 + }, + { + "epoch": 0.5784182901877057, + "grad_norm": 0.11310739070177078, + "learning_rate": 0.00038462169613563246, + "loss": 2.6124, + "step": 19506 + }, + { + "epoch": 0.5784479435400172, + "grad_norm": 0.11957553774118423, + "learning_rate": 0.0003845759155203746, + "loss": 2.6645, + "step": 19507 + }, + { + "epoch": 0.5784775968923287, + "grad_norm": 0.12948815524578094, + "learning_rate": 0.00038453013592721654, + "loss": 2.6217, + "step": 19508 + }, + { + "epoch": 0.5785072502446401, + "grad_norm": 0.12496540695428848, + "learning_rate": 0.00038448435735656436, + "loss": 2.6291, + "step": 19509 + }, + { + "epoch": 0.5785369035969516, + "grad_norm": 0.10788262635469437, + "learning_rate": 0.0003844385798088228, + "loss": 2.6579, + "step": 19510 + }, + { + "epoch": 0.5785665569492631, + "grad_norm": 0.12194786965847015, + "learning_rate": 0.0003843928032843975, + "loss": 2.6066, + "step": 19511 + }, + { + "epoch": 0.5785962103015746, + "grad_norm": 0.11599605530500412, + "learning_rate": 0.00038434702778369385, + "loss": 2.6285, + "step": 19512 + }, + { + "epoch": 0.578625863653886, + "grad_norm": 0.11787621676921844, + "learning_rate": 0.00038430125330711707, + "loss": 2.6803, + "step": 19513 + }, + { + "epoch": 0.5786555170061976, + "grad_norm": 0.1289430409669876, + "learning_rate": 0.0003842554798550725, + "loss": 2.6522, + "step": 19514 + }, + { + "epoch": 0.5786851703585091, + "grad_norm": 0.1238584816455841, + "learning_rate": 0.0003842097074279657, + "loss": 2.6601, + "step": 19515 + }, + { + "epoch": 0.5787148237108205, + "grad_norm": 0.12272839993238449, + "learning_rate": 0.0003841639360262016, + "loss": 2.6586, + "step": 19516 + }, + { + "epoch": 0.578744477063132, + "grad_norm": 0.1138760894536972, + "learning_rate": 0.00038411816565018587, + "loss": 2.6735, + "step": 19517 + }, + { + "epoch": 0.5787741304154435, + "grad_norm": 0.11996618658304214, + "learning_rate": 0.00038407239630032367, + "loss": 2.644, + "step": 19518 + }, + { + "epoch": 0.578803783767755, + "grad_norm": 0.12064214050769806, + "learning_rate": 0.0003840266279770202, + "loss": 2.6304, + "step": 19519 + }, + { + "epoch": 0.5788334371200664, + "grad_norm": 0.13461744785308838, + "learning_rate": 0.0003839808606806809, + "loss": 2.6362, + "step": 19520 + }, + { + "epoch": 0.5788630904723779, + "grad_norm": 0.11319662630558014, + "learning_rate": 0.00038393509441171083, + "loss": 2.6822, + "step": 19521 + }, + { + "epoch": 0.5788927438246894, + "grad_norm": 0.14319320023059845, + "learning_rate": 0.00038388932917051555, + "loss": 2.6498, + "step": 19522 + }, + { + "epoch": 0.5789223971770009, + "grad_norm": 0.12052781879901886, + "learning_rate": 0.0003838435649575003, + "loss": 2.6422, + "step": 19523 + }, + { + "epoch": 0.5789520505293123, + "grad_norm": 0.11894010007381439, + "learning_rate": 0.0003837978017730701, + "loss": 2.6337, + "step": 19524 + }, + { + "epoch": 0.5789817038816238, + "grad_norm": 0.13412964344024658, + "learning_rate": 0.0003837520396176303, + "loss": 2.6637, + "step": 19525 + }, + { + "epoch": 0.5790113572339353, + "grad_norm": 0.10993330180644989, + "learning_rate": 0.00038370627849158633, + "loss": 2.6545, + "step": 19526 + }, + { + "epoch": 0.5790410105862468, + "grad_norm": 0.11605039983987808, + "learning_rate": 0.0003836605183953431, + "loss": 2.609, + "step": 19527 + }, + { + "epoch": 0.5790706639385582, + "grad_norm": 0.09693606197834015, + "learning_rate": 0.00038361475932930587, + "loss": 2.6652, + "step": 19528 + }, + { + "epoch": 0.5791003172908697, + "grad_norm": 0.11938546597957611, + "learning_rate": 0.00038356900129388, + "loss": 2.6527, + "step": 19529 + }, + { + "epoch": 0.5791299706431812, + "grad_norm": 0.11057659238576889, + "learning_rate": 0.00038352324428947063, + "loss": 2.6461, + "step": 19530 + }, + { + "epoch": 0.5791596239954927, + "grad_norm": 0.1170998141169548, + "learning_rate": 0.00038347748831648286, + "loss": 2.6455, + "step": 19531 + }, + { + "epoch": 0.5791892773478041, + "grad_norm": 0.10756127536296844, + "learning_rate": 0.00038343173337532205, + "loss": 2.6012, + "step": 19532 + }, + { + "epoch": 0.5792189307001157, + "grad_norm": 0.11009372770786285, + "learning_rate": 0.0003833859794663932, + "loss": 2.6657, + "step": 19533 + }, + { + "epoch": 0.5792485840524271, + "grad_norm": 0.13092336058616638, + "learning_rate": 0.00038334022659010125, + "loss": 2.6345, + "step": 19534 + }, + { + "epoch": 0.5792782374047386, + "grad_norm": 0.11665292829275131, + "learning_rate": 0.00038329447474685186, + "loss": 2.6617, + "step": 19535 + }, + { + "epoch": 0.5793078907570501, + "grad_norm": 0.1274326741695404, + "learning_rate": 0.00038324872393705, + "loss": 2.6603, + "step": 19536 + }, + { + "epoch": 0.5793375441093616, + "grad_norm": 0.1276240199804306, + "learning_rate": 0.0003832029741611006, + "loss": 2.6381, + "step": 19537 + }, + { + "epoch": 0.5793671974616731, + "grad_norm": 0.1255955547094345, + "learning_rate": 0.000383157225419409, + "loss": 2.663, + "step": 19538 + }, + { + "epoch": 0.5793968508139845, + "grad_norm": 0.11798445135354996, + "learning_rate": 0.0003831114777123802, + "loss": 2.6683, + "step": 19539 + }, + { + "epoch": 0.579426504166296, + "grad_norm": 0.10424546897411346, + "learning_rate": 0.0003830657310404193, + "loss": 2.6689, + "step": 19540 + }, + { + "epoch": 0.5794561575186075, + "grad_norm": 0.1389923244714737, + "learning_rate": 0.0003830199854039314, + "loss": 2.663, + "step": 19541 + }, + { + "epoch": 0.579485810870919, + "grad_norm": 0.14360180497169495, + "learning_rate": 0.0003829742408033217, + "loss": 2.6464, + "step": 19542 + }, + { + "epoch": 0.5795154642232304, + "grad_norm": 0.13907381892204285, + "learning_rate": 0.0003829284972389952, + "loss": 2.6715, + "step": 19543 + }, + { + "epoch": 0.579545117575542, + "grad_norm": 0.1108674556016922, + "learning_rate": 0.00038288275471135686, + "loss": 2.6375, + "step": 19544 + }, + { + "epoch": 0.5795747709278534, + "grad_norm": 0.12198375165462494, + "learning_rate": 0.0003828370132208119, + "loss": 2.6522, + "step": 19545 + }, + { + "epoch": 0.5796044242801649, + "grad_norm": 0.11342551559209824, + "learning_rate": 0.0003827912727677652, + "loss": 2.6534, + "step": 19546 + }, + { + "epoch": 0.5796340776324763, + "grad_norm": 0.11937163770198822, + "learning_rate": 0.00038274553335262187, + "loss": 2.6747, + "step": 19547 + }, + { + "epoch": 0.5796637309847879, + "grad_norm": 0.13291499018669128, + "learning_rate": 0.000382699794975787, + "loss": 2.6579, + "step": 19548 + }, + { + "epoch": 0.5796933843370993, + "grad_norm": 0.12600351870059967, + "learning_rate": 0.00038265405763766565, + "loss": 2.6657, + "step": 19549 + }, + { + "epoch": 0.5797230376894108, + "grad_norm": 0.1113237515091896, + "learning_rate": 0.00038260832133866276, + "loss": 2.6283, + "step": 19550 + }, + { + "epoch": 0.5797526910417222, + "grad_norm": 0.11782833933830261, + "learning_rate": 0.00038256258607918326, + "loss": 2.6602, + "step": 19551 + }, + { + "epoch": 0.5797823443940338, + "grad_norm": 0.1095028817653656, + "learning_rate": 0.00038251685185963227, + "loss": 2.6387, + "step": 19552 + }, + { + "epoch": 0.5798119977463452, + "grad_norm": 0.13328905403614044, + "learning_rate": 0.00038247111868041484, + "loss": 2.657, + "step": 19553 + }, + { + "epoch": 0.5798416510986567, + "grad_norm": 0.10579025000333786, + "learning_rate": 0.0003824253865419357, + "loss": 2.6323, + "step": 19554 + }, + { + "epoch": 0.5798713044509681, + "grad_norm": 0.11903475970029831, + "learning_rate": 0.0003823796554446, + "loss": 2.6582, + "step": 19555 + }, + { + "epoch": 0.5799009578032797, + "grad_norm": 0.10193534940481186, + "learning_rate": 0.00038233392538881256, + "loss": 2.6444, + "step": 19556 + }, + { + "epoch": 0.5799306111555912, + "grad_norm": 0.12325400859117508, + "learning_rate": 0.0003822881963749784, + "loss": 2.646, + "step": 19557 + }, + { + "epoch": 0.5799602645079026, + "grad_norm": 0.10293885320425034, + "learning_rate": 0.00038224246840350254, + "loss": 2.6666, + "step": 19558 + }, + { + "epoch": 0.5799899178602141, + "grad_norm": 0.11059139668941498, + "learning_rate": 0.0003821967414747898, + "loss": 2.6309, + "step": 19559 + }, + { + "epoch": 0.5800195712125256, + "grad_norm": 0.09862256795167923, + "learning_rate": 0.000382151015589245, + "loss": 2.6058, + "step": 19560 + }, + { + "epoch": 0.5800492245648371, + "grad_norm": 0.12325049936771393, + "learning_rate": 0.00038210529074727336, + "loss": 2.6401, + "step": 19561 + }, + { + "epoch": 0.5800788779171485, + "grad_norm": 0.11798281222581863, + "learning_rate": 0.0003820595669492796, + "loss": 2.639, + "step": 19562 + }, + { + "epoch": 0.58010853126946, + "grad_norm": 0.10320879518985748, + "learning_rate": 0.00038201384419566856, + "loss": 2.6252, + "step": 19563 + }, + { + "epoch": 0.5801381846217715, + "grad_norm": 0.1309146136045456, + "learning_rate": 0.0003819681224868453, + "loss": 2.6467, + "step": 19564 + }, + { + "epoch": 0.580167837974083, + "grad_norm": 0.1194753348827362, + "learning_rate": 0.0003819224018232145, + "loss": 2.6352, + "step": 19565 + }, + { + "epoch": 0.5801974913263944, + "grad_norm": 0.11023514717817307, + "learning_rate": 0.0003818766822051811, + "loss": 2.6903, + "step": 19566 + }, + { + "epoch": 0.580227144678706, + "grad_norm": 0.11275868862867355, + "learning_rate": 0.0003818309636331499, + "loss": 2.6704, + "step": 19567 + }, + { + "epoch": 0.5802567980310174, + "grad_norm": 0.13014042377471924, + "learning_rate": 0.00038178524610752584, + "loss": 2.6471, + "step": 19568 + }, + { + "epoch": 0.5802864513833289, + "grad_norm": 0.14566177129745483, + "learning_rate": 0.00038173952962871374, + "loss": 2.6539, + "step": 19569 + }, + { + "epoch": 0.5803161047356403, + "grad_norm": 0.1256108433008194, + "learning_rate": 0.0003816938141971185, + "loss": 2.6857, + "step": 19570 + }, + { + "epoch": 0.5803457580879519, + "grad_norm": 0.11942218989133835, + "learning_rate": 0.0003816480998131447, + "loss": 2.647, + "step": 19571 + }, + { + "epoch": 0.5803754114402633, + "grad_norm": 0.10469261556863785, + "learning_rate": 0.00038160238647719723, + "loss": 2.6346, + "step": 19572 + }, + { + "epoch": 0.5804050647925748, + "grad_norm": 0.10900510847568512, + "learning_rate": 0.0003815566741896809, + "loss": 2.6607, + "step": 19573 + }, + { + "epoch": 0.5804347181448862, + "grad_norm": 0.11383621394634247, + "learning_rate": 0.00038151096295100073, + "loss": 2.6419, + "step": 19574 + }, + { + "epoch": 0.5804643714971978, + "grad_norm": 0.12020628899335861, + "learning_rate": 0.0003814652527615613, + "loss": 2.6616, + "step": 19575 + }, + { + "epoch": 0.5804940248495092, + "grad_norm": 0.099850133061409, + "learning_rate": 0.0003814195436217674, + "loss": 2.6538, + "step": 19576 + }, + { + "epoch": 0.5805236782018207, + "grad_norm": 0.11251019686460495, + "learning_rate": 0.0003813738355320238, + "loss": 2.6619, + "step": 19577 + }, + { + "epoch": 0.5805533315541322, + "grad_norm": 0.11305762082338333, + "learning_rate": 0.0003813281284927352, + "loss": 2.6613, + "step": 19578 + }, + { + "epoch": 0.5805829849064437, + "grad_norm": 0.11158851534128189, + "learning_rate": 0.0003812824225043064, + "loss": 2.6435, + "step": 19579 + }, + { + "epoch": 0.5806126382587552, + "grad_norm": 0.11086805909872055, + "learning_rate": 0.0003812367175671421, + "loss": 2.6377, + "step": 19580 + }, + { + "epoch": 0.5806422916110666, + "grad_norm": 0.10447854548692703, + "learning_rate": 0.0003811910136816472, + "loss": 2.6464, + "step": 19581 + }, + { + "epoch": 0.5806719449633782, + "grad_norm": 0.1271892488002777, + "learning_rate": 0.00038114531084822617, + "loss": 2.6374, + "step": 19582 + }, + { + "epoch": 0.5807015983156896, + "grad_norm": 0.1281929612159729, + "learning_rate": 0.0003810996090672838, + "loss": 2.6278, + "step": 19583 + }, + { + "epoch": 0.5807312516680011, + "grad_norm": 0.11880862712860107, + "learning_rate": 0.0003810539083392248, + "loss": 2.6468, + "step": 19584 + }, + { + "epoch": 0.5807609050203125, + "grad_norm": 0.1172463446855545, + "learning_rate": 0.0003810082086644539, + "loss": 2.6721, + "step": 19585 + }, + { + "epoch": 0.580790558372624, + "grad_norm": 0.1356494426727295, + "learning_rate": 0.0003809625100433756, + "loss": 2.7173, + "step": 19586 + }, + { + "epoch": 0.5808202117249355, + "grad_norm": 0.13647258281707764, + "learning_rate": 0.0003809168124763948, + "loss": 2.6428, + "step": 19587 + }, + { + "epoch": 0.580849865077247, + "grad_norm": 0.12709669768810272, + "learning_rate": 0.00038087111596391606, + "loss": 2.6621, + "step": 19588 + }, + { + "epoch": 0.5808795184295584, + "grad_norm": 0.1012185588479042, + "learning_rate": 0.00038082542050634405, + "loss": 2.6443, + "step": 19589 + }, + { + "epoch": 0.58090917178187, + "grad_norm": 0.13746051490306854, + "learning_rate": 0.0003807797261040834, + "loss": 2.6632, + "step": 19590 + }, + { + "epoch": 0.5809388251341814, + "grad_norm": 0.13720504939556122, + "learning_rate": 0.0003807340327575388, + "loss": 2.6813, + "step": 19591 + }, + { + "epoch": 0.5809684784864929, + "grad_norm": 0.1161862313747406, + "learning_rate": 0.00038068834046711474, + "loss": 2.6045, + "step": 19592 + }, + { + "epoch": 0.5809981318388043, + "grad_norm": 0.12387026846408844, + "learning_rate": 0.00038064264923321595, + "loss": 2.6681, + "step": 19593 + }, + { + "epoch": 0.5810277851911159, + "grad_norm": 0.1337394267320633, + "learning_rate": 0.00038059695905624693, + "loss": 2.6457, + "step": 19594 + }, + { + "epoch": 0.5810574385434273, + "grad_norm": 0.11285756528377533, + "learning_rate": 0.00038055126993661237, + "loss": 2.6286, + "step": 19595 + }, + { + "epoch": 0.5810870918957388, + "grad_norm": 0.11122097074985504, + "learning_rate": 0.00038050558187471676, + "loss": 2.6834, + "step": 19596 + }, + { + "epoch": 0.5811167452480502, + "grad_norm": 0.12491007894277573, + "learning_rate": 0.00038045989487096475, + "loss": 2.6637, + "step": 19597 + }, + { + "epoch": 0.5811463986003618, + "grad_norm": 0.11845148354768753, + "learning_rate": 0.00038041420892576106, + "loss": 2.6723, + "step": 19598 + }, + { + "epoch": 0.5811760519526733, + "grad_norm": 0.1309293806552887, + "learning_rate": 0.0003803685240395097, + "loss": 2.6688, + "step": 19599 + }, + { + "epoch": 0.5812057053049847, + "grad_norm": 0.10557299852371216, + "learning_rate": 0.0003803228402126159, + "loss": 2.6253, + "step": 19600 + }, + { + "epoch": 0.5812353586572963, + "grad_norm": 0.1372617930173874, + "learning_rate": 0.0003802771574454837, + "loss": 2.6533, + "step": 19601 + }, + { + "epoch": 0.5812650120096077, + "grad_norm": 0.10944807529449463, + "learning_rate": 0.0003802314757385181, + "loss": 2.6306, + "step": 19602 + }, + { + "epoch": 0.5812946653619192, + "grad_norm": 0.11456960439682007, + "learning_rate": 0.00038018579509212317, + "loss": 2.6392, + "step": 19603 + }, + { + "epoch": 0.5813243187142306, + "grad_norm": 0.12221448868513107, + "learning_rate": 0.0003801401155067036, + "loss": 2.6514, + "step": 19604 + }, + { + "epoch": 0.5813539720665422, + "grad_norm": 0.11751458793878555, + "learning_rate": 0.0003800944369826639, + "loss": 2.6514, + "step": 19605 + }, + { + "epoch": 0.5813836254188536, + "grad_norm": 0.10631205886602402, + "learning_rate": 0.0003800487595204085, + "loss": 2.6907, + "step": 19606 + }, + { + "epoch": 0.5814132787711651, + "grad_norm": 0.1103563904762268, + "learning_rate": 0.00038000308312034195, + "loss": 2.63, + "step": 19607 + }, + { + "epoch": 0.5814429321234765, + "grad_norm": 0.11793306469917297, + "learning_rate": 0.0003799574077828688, + "loss": 2.6445, + "step": 19608 + }, + { + "epoch": 0.5814725854757881, + "grad_norm": 0.11730562895536423, + "learning_rate": 0.00037991173350839327, + "loss": 2.6742, + "step": 19609 + }, + { + "epoch": 0.5815022388280995, + "grad_norm": 0.1161666288971901, + "learning_rate": 0.00037986606029732, + "loss": 2.6743, + "step": 19610 + }, + { + "epoch": 0.581531892180411, + "grad_norm": 0.12590932846069336, + "learning_rate": 0.0003798203881500534, + "loss": 2.6234, + "step": 19611 + }, + { + "epoch": 0.5815615455327224, + "grad_norm": 0.13253311812877655, + "learning_rate": 0.0003797747170669977, + "loss": 2.6456, + "step": 19612 + }, + { + "epoch": 0.581591198885034, + "grad_norm": 0.09952007234096527, + "learning_rate": 0.0003797290470485577, + "loss": 2.6312, + "step": 19613 + }, + { + "epoch": 0.5816208522373454, + "grad_norm": 0.11918166279792786, + "learning_rate": 0.00037968337809513757, + "loss": 2.6459, + "step": 19614 + }, + { + "epoch": 0.5816505055896569, + "grad_norm": 0.12419182062149048, + "learning_rate": 0.0003796377102071419, + "loss": 2.6461, + "step": 19615 + }, + { + "epoch": 0.5816801589419683, + "grad_norm": 0.11748187243938446, + "learning_rate": 0.0003795920433849748, + "loss": 2.6432, + "step": 19616 + }, + { + "epoch": 0.5817098122942799, + "grad_norm": 0.11802876740694046, + "learning_rate": 0.0003795463776290409, + "loss": 2.6659, + "step": 19617 + }, + { + "epoch": 0.5817394656465913, + "grad_norm": 0.12166939675807953, + "learning_rate": 0.0003795007129397445, + "loss": 2.6587, + "step": 19618 + }, + { + "epoch": 0.5817691189989028, + "grad_norm": 0.11821851879358292, + "learning_rate": 0.00037945504931749015, + "loss": 2.6476, + "step": 19619 + }, + { + "epoch": 0.5817987723512144, + "grad_norm": 0.11727902293205261, + "learning_rate": 0.0003794093867626818, + "loss": 2.6342, + "step": 19620 + }, + { + "epoch": 0.5818284257035258, + "grad_norm": 0.11505790799856186, + "learning_rate": 0.00037936372527572416, + "loss": 2.605, + "step": 19621 + }, + { + "epoch": 0.5818580790558373, + "grad_norm": 0.13255712389945984, + "learning_rate": 0.00037931806485702135, + "loss": 2.6695, + "step": 19622 + }, + { + "epoch": 0.5818877324081487, + "grad_norm": 0.10982976108789444, + "learning_rate": 0.0003792724055069778, + "loss": 2.6126, + "step": 19623 + }, + { + "epoch": 0.5819173857604603, + "grad_norm": 0.10810167342424393, + "learning_rate": 0.0003792267472259977, + "loss": 2.6153, + "step": 19624 + }, + { + "epoch": 0.5819470391127717, + "grad_norm": 0.13576816022396088, + "learning_rate": 0.00037918109001448564, + "loss": 2.6402, + "step": 19625 + }, + { + "epoch": 0.5819766924650832, + "grad_norm": 0.12185803055763245, + "learning_rate": 0.00037913543387284575, + "loss": 2.6632, + "step": 19626 + }, + { + "epoch": 0.5820063458173946, + "grad_norm": 0.1258208453655243, + "learning_rate": 0.00037908977880148233, + "loss": 2.6565, + "step": 19627 + }, + { + "epoch": 0.5820359991697062, + "grad_norm": 0.13882984220981598, + "learning_rate": 0.0003790441248007996, + "loss": 2.636, + "step": 19628 + }, + { + "epoch": 0.5820656525220176, + "grad_norm": 0.11266322433948517, + "learning_rate": 0.0003789984718712022, + "loss": 2.6296, + "step": 19629 + }, + { + "epoch": 0.5820953058743291, + "grad_norm": 0.12039415538311005, + "learning_rate": 0.00037895282001309383, + "loss": 2.6205, + "step": 19630 + }, + { + "epoch": 0.5821249592266405, + "grad_norm": 0.11639698594808578, + "learning_rate": 0.0003789071692268791, + "loss": 2.6486, + "step": 19631 + }, + { + "epoch": 0.5821546125789521, + "grad_norm": 0.10838550329208374, + "learning_rate": 0.0003788615195129621, + "loss": 2.6941, + "step": 19632 + }, + { + "epoch": 0.5821842659312635, + "grad_norm": 0.11718854308128357, + "learning_rate": 0.0003788158708717472, + "loss": 2.6364, + "step": 19633 + }, + { + "epoch": 0.582213919283575, + "grad_norm": 0.10691506415605545, + "learning_rate": 0.00037877022330363855, + "loss": 2.6405, + "step": 19634 + }, + { + "epoch": 0.5822435726358864, + "grad_norm": 0.11616811901330948, + "learning_rate": 0.0003787245768090403, + "loss": 2.6613, + "step": 19635 + }, + { + "epoch": 0.582273225988198, + "grad_norm": 0.09886162728071213, + "learning_rate": 0.000378678931388357, + "loss": 2.6561, + "step": 19636 + }, + { + "epoch": 0.5823028793405094, + "grad_norm": 0.11069640517234802, + "learning_rate": 0.00037863328704199214, + "loss": 2.6607, + "step": 19637 + }, + { + "epoch": 0.5823325326928209, + "grad_norm": 0.11746295541524887, + "learning_rate": 0.0003785876437703506, + "loss": 2.6503, + "step": 19638 + }, + { + "epoch": 0.5823621860451325, + "grad_norm": 0.10392686724662781, + "learning_rate": 0.0003785420015738363, + "loss": 2.6656, + "step": 19639 + }, + { + "epoch": 0.5823918393974439, + "grad_norm": 0.09750974923372269, + "learning_rate": 0.00037849636045285363, + "loss": 2.6373, + "step": 19640 + }, + { + "epoch": 0.5824214927497554, + "grad_norm": 0.12030292302370071, + "learning_rate": 0.0003784507204078064, + "loss": 2.6517, + "step": 19641 + }, + { + "epoch": 0.5824511461020668, + "grad_norm": 0.10799253731966019, + "learning_rate": 0.0003784050814390988, + "loss": 2.647, + "step": 19642 + }, + { + "epoch": 0.5824807994543784, + "grad_norm": 0.09709882736206055, + "learning_rate": 0.00037835944354713515, + "loss": 2.6433, + "step": 19643 + }, + { + "epoch": 0.5825104528066898, + "grad_norm": 0.11154521256685257, + "learning_rate": 0.00037831380673231953, + "loss": 2.6446, + "step": 19644 + }, + { + "epoch": 0.5825401061590013, + "grad_norm": 0.10991651564836502, + "learning_rate": 0.000378268170995056, + "loss": 2.6468, + "step": 19645 + }, + { + "epoch": 0.5825697595113127, + "grad_norm": 0.10335873067378998, + "learning_rate": 0.0003782225363357488, + "loss": 2.6048, + "step": 19646 + }, + { + "epoch": 0.5825994128636243, + "grad_norm": 0.10041223466396332, + "learning_rate": 0.0003781769027548019, + "loss": 2.6694, + "step": 19647 + }, + { + "epoch": 0.5826290662159357, + "grad_norm": 0.0948503166437149, + "learning_rate": 0.0003781312702526194, + "loss": 2.6517, + "step": 19648 + }, + { + "epoch": 0.5826587195682472, + "grad_norm": 0.09500992298126221, + "learning_rate": 0.0003780856388296054, + "loss": 2.6691, + "step": 19649 + }, + { + "epoch": 0.5826883729205586, + "grad_norm": 0.10974012315273285, + "learning_rate": 0.0003780400084861639, + "loss": 2.6497, + "step": 19650 + }, + { + "epoch": 0.5827180262728702, + "grad_norm": 0.11919844150543213, + "learning_rate": 0.0003779943792226992, + "loss": 2.653, + "step": 19651 + }, + { + "epoch": 0.5827476796251816, + "grad_norm": 0.12276715785264969, + "learning_rate": 0.0003779487510396152, + "loss": 2.6839, + "step": 19652 + }, + { + "epoch": 0.5827773329774931, + "grad_norm": 0.13956695795059204, + "learning_rate": 0.00037790312393731594, + "loss": 2.6323, + "step": 19653 + }, + { + "epoch": 0.5828069863298045, + "grad_norm": 0.13955792784690857, + "learning_rate": 0.0003778574979162055, + "loss": 2.6434, + "step": 19654 + }, + { + "epoch": 0.5828366396821161, + "grad_norm": 0.13315007090568542, + "learning_rate": 0.0003778118729766878, + "loss": 2.658, + "step": 19655 + }, + { + "epoch": 0.5828662930344275, + "grad_norm": 0.12855924665927887, + "learning_rate": 0.00037776624911916706, + "loss": 2.6276, + "step": 19656 + }, + { + "epoch": 0.582895946386739, + "grad_norm": 0.12324211001396179, + "learning_rate": 0.00037772062634404717, + "loss": 2.633, + "step": 19657 + }, + { + "epoch": 0.5829255997390504, + "grad_norm": 0.1085425466299057, + "learning_rate": 0.0003776750046517321, + "loss": 2.6708, + "step": 19658 + }, + { + "epoch": 0.582955253091362, + "grad_norm": 0.11897004395723343, + "learning_rate": 0.0003776293840426258, + "loss": 2.6452, + "step": 19659 + }, + { + "epoch": 0.5829849064436735, + "grad_norm": 0.13083882629871368, + "learning_rate": 0.0003775837645171324, + "loss": 2.65, + "step": 19660 + }, + { + "epoch": 0.5830145597959849, + "grad_norm": 0.12282082438468933, + "learning_rate": 0.00037753814607565566, + "loss": 2.6553, + "step": 19661 + }, + { + "epoch": 0.5830442131482965, + "grad_norm": 0.1223336011171341, + "learning_rate": 0.0003774925287185997, + "loss": 2.7035, + "step": 19662 + }, + { + "epoch": 0.5830738665006079, + "grad_norm": 0.13250772655010223, + "learning_rate": 0.00037744691244636833, + "loss": 2.6401, + "step": 19663 + }, + { + "epoch": 0.5831035198529194, + "grad_norm": 0.12138521671295166, + "learning_rate": 0.0003774012972593657, + "loss": 2.646, + "step": 19664 + }, + { + "epoch": 0.5831331732052308, + "grad_norm": 0.14166304469108582, + "learning_rate": 0.0003773556831579956, + "loss": 2.6511, + "step": 19665 + }, + { + "epoch": 0.5831628265575424, + "grad_norm": 0.14267301559448242, + "learning_rate": 0.000377310070142662, + "loss": 2.659, + "step": 19666 + }, + { + "epoch": 0.5831924799098538, + "grad_norm": 0.10755937546491623, + "learning_rate": 0.0003772644582137689, + "loss": 2.6585, + "step": 19667 + }, + { + "epoch": 0.5832221332621653, + "grad_norm": 0.1571464240550995, + "learning_rate": 0.00037721884737171996, + "loss": 2.6799, + "step": 19668 + }, + { + "epoch": 0.5832517866144767, + "grad_norm": 0.12636932730674744, + "learning_rate": 0.00037717323761691927, + "loss": 2.6307, + "step": 19669 + }, + { + "epoch": 0.5832814399667883, + "grad_norm": 0.14524893462657928, + "learning_rate": 0.0003771276289497705, + "loss": 2.6686, + "step": 19670 + }, + { + "epoch": 0.5833110933190997, + "grad_norm": 0.14202070236206055, + "learning_rate": 0.00037708202137067784, + "loss": 2.6479, + "step": 19671 + }, + { + "epoch": 0.5833407466714112, + "grad_norm": 0.14166271686553955, + "learning_rate": 0.0003770364148800448, + "loss": 2.647, + "step": 19672 + }, + { + "epoch": 0.5833704000237226, + "grad_norm": 0.12678134441375732, + "learning_rate": 0.0003769908094782756, + "loss": 2.6427, + "step": 19673 + }, + { + "epoch": 0.5834000533760342, + "grad_norm": 0.1132270097732544, + "learning_rate": 0.00037694520516577386, + "loss": 2.6368, + "step": 19674 + }, + { + "epoch": 0.5834297067283456, + "grad_norm": 0.12614679336547852, + "learning_rate": 0.0003768996019429434, + "loss": 2.6381, + "step": 19675 + }, + { + "epoch": 0.5834593600806571, + "grad_norm": 0.12938609719276428, + "learning_rate": 0.00037685399981018784, + "loss": 2.6541, + "step": 19676 + }, + { + "epoch": 0.5834890134329686, + "grad_norm": 0.1257435381412506, + "learning_rate": 0.0003768083987679115, + "loss": 2.6839, + "step": 19677 + }, + { + "epoch": 0.5835186667852801, + "grad_norm": 0.12047019600868225, + "learning_rate": 0.00037676279881651803, + "loss": 2.6421, + "step": 19678 + }, + { + "epoch": 0.5835483201375915, + "grad_norm": 0.11342289298772812, + "learning_rate": 0.00037671719995641107, + "loss": 2.6183, + "step": 19679 + }, + { + "epoch": 0.583577973489903, + "grad_norm": 0.1184619590640068, + "learning_rate": 0.0003766716021879944, + "loss": 2.6415, + "step": 19680 + }, + { + "epoch": 0.5836076268422146, + "grad_norm": 0.12231209874153137, + "learning_rate": 0.0003766260055116719, + "loss": 2.6672, + "step": 19681 + }, + { + "epoch": 0.583637280194526, + "grad_norm": 0.10133865475654602, + "learning_rate": 0.00037658040992784726, + "loss": 2.6752, + "step": 19682 + }, + { + "epoch": 0.5836669335468375, + "grad_norm": 0.12120361626148224, + "learning_rate": 0.0003765348154369243, + "loss": 2.6579, + "step": 19683 + }, + { + "epoch": 0.5836965868991489, + "grad_norm": 0.11071771383285522, + "learning_rate": 0.00037648922203930684, + "loss": 2.6776, + "step": 19684 + }, + { + "epoch": 0.5837262402514605, + "grad_norm": 0.10662604868412018, + "learning_rate": 0.0003764436297353985, + "loss": 2.6439, + "step": 19685 + }, + { + "epoch": 0.5837558936037719, + "grad_norm": 0.1120050773024559, + "learning_rate": 0.0003763980385256029, + "loss": 2.6364, + "step": 19686 + }, + { + "epoch": 0.5837855469560834, + "grad_norm": 0.11356577277183533, + "learning_rate": 0.0003763524484103239, + "loss": 2.6311, + "step": 19687 + }, + { + "epoch": 0.5838152003083948, + "grad_norm": 0.1090770959854126, + "learning_rate": 0.00037630685938996525, + "loss": 2.6282, + "step": 19688 + }, + { + "epoch": 0.5838448536607064, + "grad_norm": 0.1128591001033783, + "learning_rate": 0.0003762612714649304, + "loss": 2.6246, + "step": 19689 + }, + { + "epoch": 0.5838745070130178, + "grad_norm": 0.12309031933546066, + "learning_rate": 0.0003762156846356234, + "loss": 2.6753, + "step": 19690 + }, + { + "epoch": 0.5839041603653293, + "grad_norm": 0.13239732384681702, + "learning_rate": 0.0003761700989024478, + "loss": 2.6788, + "step": 19691 + }, + { + "epoch": 0.5839338137176407, + "grad_norm": 0.11629808694124222, + "learning_rate": 0.00037612451426580716, + "loss": 2.6858, + "step": 19692 + }, + { + "epoch": 0.5839634670699523, + "grad_norm": 0.11905461549758911, + "learning_rate": 0.00037607893072610525, + "loss": 2.6685, + "step": 19693 + }, + { + "epoch": 0.5839931204222637, + "grad_norm": 0.10628343373537064, + "learning_rate": 0.0003760333482837457, + "loss": 2.656, + "step": 19694 + }, + { + "epoch": 0.5840227737745752, + "grad_norm": 0.11998873203992844, + "learning_rate": 0.00037598776693913215, + "loss": 2.6875, + "step": 19695 + }, + { + "epoch": 0.5840524271268867, + "grad_norm": 0.13475456833839417, + "learning_rate": 0.0003759421866926682, + "loss": 2.6147, + "step": 19696 + }, + { + "epoch": 0.5840820804791982, + "grad_norm": 0.12578117847442627, + "learning_rate": 0.00037589660754475747, + "loss": 2.6555, + "step": 19697 + }, + { + "epoch": 0.5841117338315096, + "grad_norm": 0.11163012683391571, + "learning_rate": 0.0003758510294958035, + "loss": 2.6374, + "step": 19698 + }, + { + "epoch": 0.5841413871838211, + "grad_norm": 0.13683703541755676, + "learning_rate": 0.00037580545254621003, + "loss": 2.6605, + "step": 19699 + }, + { + "epoch": 0.5841710405361326, + "grad_norm": 0.12883424758911133, + "learning_rate": 0.0003757598766963806, + "loss": 2.6454, + "step": 19700 + }, + { + "epoch": 0.5842006938884441, + "grad_norm": 0.12295615673065186, + "learning_rate": 0.0003757143019467188, + "loss": 2.646, + "step": 19701 + }, + { + "epoch": 0.5842303472407556, + "grad_norm": 0.13360172510147095, + "learning_rate": 0.00037566872829762805, + "loss": 2.6587, + "step": 19702 + }, + { + "epoch": 0.584260000593067, + "grad_norm": 0.12284134328365326, + "learning_rate": 0.00037562315574951214, + "loss": 2.6339, + "step": 19703 + }, + { + "epoch": 0.5842896539453786, + "grad_norm": 0.12799328565597534, + "learning_rate": 0.00037557758430277455, + "loss": 2.6367, + "step": 19704 + }, + { + "epoch": 0.58431930729769, + "grad_norm": 0.13726283609867096, + "learning_rate": 0.00037553201395781893, + "loss": 2.6383, + "step": 19705 + }, + { + "epoch": 0.5843489606500015, + "grad_norm": 0.10484210401773453, + "learning_rate": 0.0003754864447150486, + "loss": 2.663, + "step": 19706 + }, + { + "epoch": 0.584378614002313, + "grad_norm": 0.11330057680606842, + "learning_rate": 0.00037544087657486716, + "loss": 2.6201, + "step": 19707 + }, + { + "epoch": 0.5844082673546245, + "grad_norm": 0.1073198989033699, + "learning_rate": 0.00037539530953767814, + "loss": 2.6369, + "step": 19708 + }, + { + "epoch": 0.5844379207069359, + "grad_norm": 0.11105192452669144, + "learning_rate": 0.00037534974360388504, + "loss": 2.6717, + "step": 19709 + }, + { + "epoch": 0.5844675740592474, + "grad_norm": 0.11602655053138733, + "learning_rate": 0.00037530417877389133, + "loss": 2.6325, + "step": 19710 + }, + { + "epoch": 0.5844972274115589, + "grad_norm": 0.10906466841697693, + "learning_rate": 0.00037525861504810056, + "loss": 2.6503, + "step": 19711 + }, + { + "epoch": 0.5845268807638704, + "grad_norm": 0.11568106710910797, + "learning_rate": 0.00037521305242691627, + "loss": 2.6841, + "step": 19712 + }, + { + "epoch": 0.5845565341161818, + "grad_norm": 0.12587924301624298, + "learning_rate": 0.00037516749091074167, + "loss": 2.6568, + "step": 19713 + }, + { + "epoch": 0.5845861874684933, + "grad_norm": 0.12025154381990433, + "learning_rate": 0.0003751219304999804, + "loss": 2.6547, + "step": 19714 + }, + { + "epoch": 0.5846158408208048, + "grad_norm": 0.10041997581720352, + "learning_rate": 0.00037507637119503566, + "loss": 2.6302, + "step": 19715 + }, + { + "epoch": 0.5846454941731163, + "grad_norm": 0.12521639466285706, + "learning_rate": 0.0003750308129963114, + "loss": 2.6398, + "step": 19716 + }, + { + "epoch": 0.5846751475254277, + "grad_norm": 0.10731647163629532, + "learning_rate": 0.0003749852559042106, + "loss": 2.5821, + "step": 19717 + }, + { + "epoch": 0.5847048008777392, + "grad_norm": 0.11125925183296204, + "learning_rate": 0.0003749396999191369, + "loss": 2.6456, + "step": 19718 + }, + { + "epoch": 0.5847344542300507, + "grad_norm": 0.10536640882492065, + "learning_rate": 0.00037489414504149354, + "loss": 2.6284, + "step": 19719 + }, + { + "epoch": 0.5847641075823622, + "grad_norm": 0.1148093119263649, + "learning_rate": 0.00037484859127168407, + "loss": 2.6242, + "step": 19720 + }, + { + "epoch": 0.5847937609346736, + "grad_norm": 0.11717062443494797, + "learning_rate": 0.0003748030386101118, + "loss": 2.6609, + "step": 19721 + }, + { + "epoch": 0.5848234142869851, + "grad_norm": 0.09664302319288254, + "learning_rate": 0.00037475748705718005, + "loss": 2.6492, + "step": 19722 + }, + { + "epoch": 0.5848530676392967, + "grad_norm": 0.11564967781305313, + "learning_rate": 0.00037471193661329247, + "loss": 2.6571, + "step": 19723 + }, + { + "epoch": 0.5848827209916081, + "grad_norm": 0.1051400899887085, + "learning_rate": 0.000374666387278852, + "loss": 2.6509, + "step": 19724 + }, + { + "epoch": 0.5849123743439196, + "grad_norm": 0.1075378805398941, + "learning_rate": 0.0003746208390542622, + "loss": 2.6419, + "step": 19725 + }, + { + "epoch": 0.584942027696231, + "grad_norm": 0.10233286023139954, + "learning_rate": 0.0003745752919399263, + "loss": 2.6254, + "step": 19726 + }, + { + "epoch": 0.5849716810485426, + "grad_norm": 0.11287493258714676, + "learning_rate": 0.0003745297459362479, + "loss": 2.6501, + "step": 19727 + }, + { + "epoch": 0.585001334400854, + "grad_norm": 0.1374124139547348, + "learning_rate": 0.0003744842010436299, + "loss": 2.6436, + "step": 19728 + }, + { + "epoch": 0.5850309877531655, + "grad_norm": 0.10793531686067581, + "learning_rate": 0.00037443865726247605, + "loss": 2.6488, + "step": 19729 + }, + { + "epoch": 0.585060641105477, + "grad_norm": 0.12276872247457504, + "learning_rate": 0.00037439311459318937, + "loss": 2.6261, + "step": 19730 + }, + { + "epoch": 0.5850902944577885, + "grad_norm": 0.12067700177431107, + "learning_rate": 0.0003743475730361732, + "loss": 2.6494, + "step": 19731 + }, + { + "epoch": 0.5851199478100999, + "grad_norm": 0.11777812987565994, + "learning_rate": 0.00037430203259183095, + "loss": 2.6193, + "step": 19732 + }, + { + "epoch": 0.5851496011624114, + "grad_norm": 0.11208172142505646, + "learning_rate": 0.00037425649326056575, + "loss": 2.6598, + "step": 19733 + }, + { + "epoch": 0.5851792545147229, + "grad_norm": 0.10940402746200562, + "learning_rate": 0.00037421095504278084, + "loss": 2.6154, + "step": 19734 + }, + { + "epoch": 0.5852089078670344, + "grad_norm": 0.12287119776010513, + "learning_rate": 0.0003741654179388795, + "loss": 2.6381, + "step": 19735 + }, + { + "epoch": 0.5852385612193458, + "grad_norm": 0.10664806514978409, + "learning_rate": 0.000374119881949265, + "loss": 2.6486, + "step": 19736 + }, + { + "epoch": 0.5852682145716573, + "grad_norm": 0.11879506707191467, + "learning_rate": 0.0003740743470743405, + "loss": 2.6444, + "step": 19737 + }, + { + "epoch": 0.5852978679239688, + "grad_norm": 0.10183969885110855, + "learning_rate": 0.00037402881331450937, + "loss": 2.6345, + "step": 19738 + }, + { + "epoch": 0.5853275212762803, + "grad_norm": 0.11262033134698868, + "learning_rate": 0.0003739832806701747, + "loss": 2.6236, + "step": 19739 + }, + { + "epoch": 0.5853571746285917, + "grad_norm": 0.11600568890571594, + "learning_rate": 0.0003739377491417397, + "loss": 2.6572, + "step": 19740 + }, + { + "epoch": 0.5853868279809032, + "grad_norm": 0.11821074783802032, + "learning_rate": 0.00037389221872960736, + "loss": 2.6467, + "step": 19741 + }, + { + "epoch": 0.5854164813332147, + "grad_norm": 0.1063748300075531, + "learning_rate": 0.00037384668943418135, + "loss": 2.6545, + "step": 19742 + }, + { + "epoch": 0.5854461346855262, + "grad_norm": 0.11172918230295181, + "learning_rate": 0.0003738011612558645, + "loss": 2.6474, + "step": 19743 + }, + { + "epoch": 0.5854757880378377, + "grad_norm": 0.10606884956359863, + "learning_rate": 0.00037375563419506, + "loss": 2.6628, + "step": 19744 + }, + { + "epoch": 0.5855054413901492, + "grad_norm": 0.1021859273314476, + "learning_rate": 0.000373710108252171, + "loss": 2.6349, + "step": 19745 + }, + { + "epoch": 0.5855350947424607, + "grad_norm": 0.10420259833335876, + "learning_rate": 0.0003736645834276007, + "loss": 2.6199, + "step": 19746 + }, + { + "epoch": 0.5855647480947721, + "grad_norm": 0.09480225294828415, + "learning_rate": 0.00037361905972175223, + "loss": 2.6428, + "step": 19747 + }, + { + "epoch": 0.5855944014470836, + "grad_norm": 0.10625213384628296, + "learning_rate": 0.0003735735371350286, + "loss": 2.6217, + "step": 19748 + }, + { + "epoch": 0.585624054799395, + "grad_norm": 0.11567696183919907, + "learning_rate": 0.00037352801566783306, + "loss": 2.5623, + "step": 19749 + }, + { + "epoch": 0.5856537081517066, + "grad_norm": 0.10966209322214127, + "learning_rate": 0.00037348249532056875, + "loss": 2.6371, + "step": 19750 + }, + { + "epoch": 0.585683361504018, + "grad_norm": 0.11140056699514389, + "learning_rate": 0.0003734369760936386, + "loss": 2.6629, + "step": 19751 + }, + { + "epoch": 0.5857130148563295, + "grad_norm": 0.11712604761123657, + "learning_rate": 0.00037339145798744565, + "loss": 2.6478, + "step": 19752 + }, + { + "epoch": 0.585742668208641, + "grad_norm": 0.11748136579990387, + "learning_rate": 0.0003733459410023931, + "loss": 2.6249, + "step": 19753 + }, + { + "epoch": 0.5857723215609525, + "grad_norm": 0.15614472329616547, + "learning_rate": 0.0003733004251388839, + "loss": 2.6432, + "step": 19754 + }, + { + "epoch": 0.5858019749132639, + "grad_norm": 0.1895846724510193, + "learning_rate": 0.0003732549103973213, + "loss": 2.6926, + "step": 19755 + }, + { + "epoch": 0.5858316282655754, + "grad_norm": 0.15384191274642944, + "learning_rate": 0.00037320939677810814, + "loss": 2.6271, + "step": 19756 + }, + { + "epoch": 0.5858612816178869, + "grad_norm": 0.11800000816583633, + "learning_rate": 0.0003731638842816476, + "loss": 2.6317, + "step": 19757 + }, + { + "epoch": 0.5858909349701984, + "grad_norm": 0.14721982181072235, + "learning_rate": 0.0003731183729083427, + "loss": 2.677, + "step": 19758 + }, + { + "epoch": 0.5859205883225098, + "grad_norm": 0.13210739195346832, + "learning_rate": 0.0003730728626585963, + "loss": 2.626, + "step": 19759 + }, + { + "epoch": 0.5859502416748213, + "grad_norm": 0.1265750676393509, + "learning_rate": 0.0003730273535328115, + "loss": 2.6292, + "step": 19760 + }, + { + "epoch": 0.5859798950271328, + "grad_norm": 0.12890279293060303, + "learning_rate": 0.00037298184553139136, + "loss": 2.6633, + "step": 19761 + }, + { + "epoch": 0.5860095483794443, + "grad_norm": 0.10857980698347092, + "learning_rate": 0.00037293633865473873, + "loss": 2.6639, + "step": 19762 + }, + { + "epoch": 0.5860392017317557, + "grad_norm": 0.138608917593956, + "learning_rate": 0.00037289083290325663, + "loss": 2.6464, + "step": 19763 + }, + { + "epoch": 0.5860688550840673, + "grad_norm": 0.09914080053567886, + "learning_rate": 0.00037284532827734797, + "loss": 2.6518, + "step": 19764 + }, + { + "epoch": 0.5860985084363788, + "grad_norm": 0.1364111751317978, + "learning_rate": 0.0003727998247774158, + "loss": 2.6343, + "step": 19765 + }, + { + "epoch": 0.5861281617886902, + "grad_norm": 0.10140244662761688, + "learning_rate": 0.000372754322403863, + "loss": 2.6652, + "step": 19766 + }, + { + "epoch": 0.5861578151410017, + "grad_norm": 0.1152353435754776, + "learning_rate": 0.00037270882115709243, + "loss": 2.669, + "step": 19767 + }, + { + "epoch": 0.5861874684933132, + "grad_norm": 0.11384327709674835, + "learning_rate": 0.0003726633210375072, + "loss": 2.6658, + "step": 19768 + }, + { + "epoch": 0.5862171218456247, + "grad_norm": 0.10558245331048965, + "learning_rate": 0.0003726178220455101, + "loss": 2.6272, + "step": 19769 + }, + { + "epoch": 0.5862467751979361, + "grad_norm": 0.11375334113836288, + "learning_rate": 0.00037257232418150407, + "loss": 2.6371, + "step": 19770 + }, + { + "epoch": 0.5862764285502476, + "grad_norm": 0.11349064111709595, + "learning_rate": 0.00037252682744589205, + "loss": 2.6589, + "step": 19771 + }, + { + "epoch": 0.5863060819025591, + "grad_norm": 0.11369866132736206, + "learning_rate": 0.00037248133183907675, + "loss": 2.6756, + "step": 19772 + }, + { + "epoch": 0.5863357352548706, + "grad_norm": 0.12150665372610092, + "learning_rate": 0.0003724358373614612, + "loss": 2.6563, + "step": 19773 + }, + { + "epoch": 0.586365388607182, + "grad_norm": 0.11085677891969681, + "learning_rate": 0.00037239034401344816, + "loss": 2.6443, + "step": 19774 + }, + { + "epoch": 0.5863950419594935, + "grad_norm": 0.12033818662166595, + "learning_rate": 0.00037234485179544054, + "loss": 2.6549, + "step": 19775 + }, + { + "epoch": 0.586424695311805, + "grad_norm": 0.114754818379879, + "learning_rate": 0.0003722993607078412, + "loss": 2.6593, + "step": 19776 + }, + { + "epoch": 0.5864543486641165, + "grad_norm": 0.10660982877016068, + "learning_rate": 0.0003722538707510529, + "loss": 2.6132, + "step": 19777 + }, + { + "epoch": 0.5864840020164279, + "grad_norm": 0.13187099993228912, + "learning_rate": 0.00037220838192547856, + "loss": 2.6675, + "step": 19778 + }, + { + "epoch": 0.5865136553687395, + "grad_norm": 0.10820295661687851, + "learning_rate": 0.00037216289423152096, + "loss": 2.6183, + "step": 19779 + }, + { + "epoch": 0.5865433087210509, + "grad_norm": 0.1176973283290863, + "learning_rate": 0.0003721174076695826, + "loss": 2.6199, + "step": 19780 + }, + { + "epoch": 0.5865729620733624, + "grad_norm": 0.11316713690757751, + "learning_rate": 0.0003720719222400668, + "loss": 2.6611, + "step": 19781 + }, + { + "epoch": 0.5866026154256738, + "grad_norm": 0.11386791616678238, + "learning_rate": 0.00037202643794337603, + "loss": 2.6261, + "step": 19782 + }, + { + "epoch": 0.5866322687779854, + "grad_norm": 0.11313889175653458, + "learning_rate": 0.0003719809547799131, + "loss": 2.692, + "step": 19783 + }, + { + "epoch": 0.5866619221302968, + "grad_norm": 0.1022413894534111, + "learning_rate": 0.00037193547275008083, + "loss": 2.6555, + "step": 19784 + }, + { + "epoch": 0.5866915754826083, + "grad_norm": 0.11171875894069672, + "learning_rate": 0.0003718899918542819, + "loss": 2.6552, + "step": 19785 + }, + { + "epoch": 0.5867212288349198, + "grad_norm": 0.11862301081418991, + "learning_rate": 0.00037184451209291915, + "loss": 2.636, + "step": 19786 + }, + { + "epoch": 0.5867508821872313, + "grad_norm": 0.11677003651857376, + "learning_rate": 0.00037179903346639515, + "loss": 2.6513, + "step": 19787 + }, + { + "epoch": 0.5867805355395428, + "grad_norm": 0.12182146310806274, + "learning_rate": 0.00037175355597511285, + "loss": 2.6518, + "step": 19788 + }, + { + "epoch": 0.5868101888918542, + "grad_norm": 0.12240917235612869, + "learning_rate": 0.00037170807961947475, + "loss": 2.6667, + "step": 19789 + }, + { + "epoch": 0.5868398422441657, + "grad_norm": 0.10188811272382736, + "learning_rate": 0.0003716626043998836, + "loss": 2.6604, + "step": 19790 + }, + { + "epoch": 0.5868694955964772, + "grad_norm": 0.13635309040546417, + "learning_rate": 0.00037161713031674213, + "loss": 2.6398, + "step": 19791 + }, + { + "epoch": 0.5868991489487887, + "grad_norm": 0.11779692769050598, + "learning_rate": 0.00037157165737045295, + "loss": 2.6373, + "step": 19792 + }, + { + "epoch": 0.5869288023011001, + "grad_norm": 0.12719951570034027, + "learning_rate": 0.0003715261855614187, + "loss": 2.6582, + "step": 19793 + }, + { + "epoch": 0.5869584556534116, + "grad_norm": 0.10340513288974762, + "learning_rate": 0.00037148071489004233, + "loss": 2.6676, + "step": 19794 + }, + { + "epoch": 0.5869881090057231, + "grad_norm": 0.09728484600782394, + "learning_rate": 0.0003714352453567262, + "loss": 2.6224, + "step": 19795 + }, + { + "epoch": 0.5870177623580346, + "grad_norm": 0.10460575670003891, + "learning_rate": 0.00037138977696187306, + "loss": 2.6595, + "step": 19796 + }, + { + "epoch": 0.587047415710346, + "grad_norm": 0.11893312633037567, + "learning_rate": 0.0003713443097058855, + "loss": 2.6705, + "step": 19797 + }, + { + "epoch": 0.5870770690626576, + "grad_norm": 0.11994489282369614, + "learning_rate": 0.00037129884358916624, + "loss": 2.65, + "step": 19798 + }, + { + "epoch": 0.587106722414969, + "grad_norm": 0.10876119136810303, + "learning_rate": 0.00037125337861211783, + "loss": 2.6383, + "step": 19799 + }, + { + "epoch": 0.5871363757672805, + "grad_norm": 0.12290357053279877, + "learning_rate": 0.00037120791477514276, + "loss": 2.6702, + "step": 19800 + }, + { + "epoch": 0.5871660291195919, + "grad_norm": 0.12309218943119049, + "learning_rate": 0.00037116245207864375, + "loss": 2.6955, + "step": 19801 + }, + { + "epoch": 0.5871956824719035, + "grad_norm": 0.12096235156059265, + "learning_rate": 0.0003711169905230233, + "loss": 2.6408, + "step": 19802 + }, + { + "epoch": 0.5872253358242149, + "grad_norm": 0.10710179805755615, + "learning_rate": 0.00037107153010868405, + "loss": 2.6377, + "step": 19803 + }, + { + "epoch": 0.5872549891765264, + "grad_norm": 0.12731090188026428, + "learning_rate": 0.0003710260708360285, + "loss": 2.641, + "step": 19804 + }, + { + "epoch": 0.5872846425288378, + "grad_norm": 0.10538174957036972, + "learning_rate": 0.0003709806127054592, + "loss": 2.666, + "step": 19805 + }, + { + "epoch": 0.5873142958811494, + "grad_norm": 0.11972087621688843, + "learning_rate": 0.0003709351557173788, + "loss": 2.6616, + "step": 19806 + }, + { + "epoch": 0.5873439492334609, + "grad_norm": 0.11263566464185715, + "learning_rate": 0.00037088969987218967, + "loss": 2.6257, + "step": 19807 + }, + { + "epoch": 0.5873736025857723, + "grad_norm": 0.11021771281957626, + "learning_rate": 0.0003708442451702945, + "loss": 2.6348, + "step": 19808 + }, + { + "epoch": 0.5874032559380838, + "grad_norm": 0.098647840321064, + "learning_rate": 0.00037079879161209574, + "loss": 2.676, + "step": 19809 + }, + { + "epoch": 0.5874329092903953, + "grad_norm": 0.1166568398475647, + "learning_rate": 0.0003707533391979958, + "loss": 2.6649, + "step": 19810 + }, + { + "epoch": 0.5874625626427068, + "grad_norm": 0.12340063601732254, + "learning_rate": 0.0003707078879283972, + "loss": 2.6571, + "step": 19811 + }, + { + "epoch": 0.5874922159950182, + "grad_norm": 0.10988689213991165, + "learning_rate": 0.0003706624378037025, + "loss": 2.6362, + "step": 19812 + }, + { + "epoch": 0.5875218693473298, + "grad_norm": 0.10719846189022064, + "learning_rate": 0.00037061698882431403, + "loss": 2.6597, + "step": 19813 + }, + { + "epoch": 0.5875515226996412, + "grad_norm": 0.09989340603351593, + "learning_rate": 0.0003705715409906344, + "loss": 2.6627, + "step": 19814 + }, + { + "epoch": 0.5875811760519527, + "grad_norm": 0.1262127161026001, + "learning_rate": 0.00037052609430306594, + "loss": 2.6539, + "step": 19815 + }, + { + "epoch": 0.5876108294042641, + "grad_norm": 0.11960414052009583, + "learning_rate": 0.00037048064876201125, + "loss": 2.6648, + "step": 19816 + }, + { + "epoch": 0.5876404827565757, + "grad_norm": 0.12231635302305222, + "learning_rate": 0.0003704352043678726, + "loss": 2.6381, + "step": 19817 + }, + { + "epoch": 0.5876701361088871, + "grad_norm": 0.11067966371774673, + "learning_rate": 0.0003703897611210522, + "loss": 2.6245, + "step": 19818 + }, + { + "epoch": 0.5876997894611986, + "grad_norm": 0.11738893389701843, + "learning_rate": 0.00037034431902195305, + "loss": 2.6551, + "step": 19819 + }, + { + "epoch": 0.58772944281351, + "grad_norm": 0.11673415452241898, + "learning_rate": 0.00037029887807097706, + "loss": 2.6388, + "step": 19820 + }, + { + "epoch": 0.5877590961658216, + "grad_norm": 0.13753588497638702, + "learning_rate": 0.00037025343826852685, + "loss": 2.6673, + "step": 19821 + }, + { + "epoch": 0.587788749518133, + "grad_norm": 0.1312762051820755, + "learning_rate": 0.0003702079996150046, + "loss": 2.6591, + "step": 19822 + }, + { + "epoch": 0.5878184028704445, + "grad_norm": 0.13433495163917542, + "learning_rate": 0.0003701625621108129, + "loss": 2.6365, + "step": 19823 + }, + { + "epoch": 0.5878480562227559, + "grad_norm": 0.12297741323709488, + "learning_rate": 0.00037011712575635395, + "loss": 2.6494, + "step": 19824 + }, + { + "epoch": 0.5878777095750675, + "grad_norm": 0.11040998995304108, + "learning_rate": 0.00037007169055203014, + "loss": 2.6613, + "step": 19825 + }, + { + "epoch": 0.5879073629273789, + "grad_norm": 0.1251634806394577, + "learning_rate": 0.00037002625649824395, + "loss": 2.6604, + "step": 19826 + }, + { + "epoch": 0.5879370162796904, + "grad_norm": 0.1167081668972969, + "learning_rate": 0.0003699808235953974, + "loss": 2.662, + "step": 19827 + }, + { + "epoch": 0.587966669632002, + "grad_norm": 0.13252335786819458, + "learning_rate": 0.000369935391843893, + "loss": 2.6222, + "step": 19828 + }, + { + "epoch": 0.5879963229843134, + "grad_norm": 0.11262532323598862, + "learning_rate": 0.0003698899612441331, + "loss": 2.6807, + "step": 19829 + }, + { + "epoch": 0.5880259763366249, + "grad_norm": 0.12057526409626007, + "learning_rate": 0.0003698445317965199, + "loss": 2.6347, + "step": 19830 + }, + { + "epoch": 0.5880556296889363, + "grad_norm": 0.12053602188825607, + "learning_rate": 0.00036979910350145554, + "loss": 2.6516, + "step": 19831 + }, + { + "epoch": 0.5880852830412479, + "grad_norm": 0.1353808045387268, + "learning_rate": 0.0003697536763593426, + "loss": 2.6474, + "step": 19832 + }, + { + "epoch": 0.5881149363935593, + "grad_norm": 0.1265631765127182, + "learning_rate": 0.0003697082503705832, + "loss": 2.6192, + "step": 19833 + }, + { + "epoch": 0.5881445897458708, + "grad_norm": 0.12488503754138947, + "learning_rate": 0.0003696628255355796, + "loss": 2.6691, + "step": 19834 + }, + { + "epoch": 0.5881742430981822, + "grad_norm": 0.11683911830186844, + "learning_rate": 0.00036961740185473415, + "loss": 2.6376, + "step": 19835 + }, + { + "epoch": 0.5882038964504938, + "grad_norm": 0.12244845181703568, + "learning_rate": 0.00036957197932844886, + "loss": 2.6266, + "step": 19836 + }, + { + "epoch": 0.5882335498028052, + "grad_norm": 0.128672257065773, + "learning_rate": 0.0003695265579571263, + "loss": 2.6492, + "step": 19837 + }, + { + "epoch": 0.5882632031551167, + "grad_norm": 0.12617598474025726, + "learning_rate": 0.0003694811377411683, + "loss": 2.5956, + "step": 19838 + }, + { + "epoch": 0.5882928565074281, + "grad_norm": 0.11138724535703659, + "learning_rate": 0.00036943571868097724, + "loss": 2.6587, + "step": 19839 + }, + { + "epoch": 0.5883225098597397, + "grad_norm": 0.1057351678609848, + "learning_rate": 0.00036939030077695525, + "loss": 2.6318, + "step": 19840 + }, + { + "epoch": 0.5883521632120511, + "grad_norm": 0.12244247645139694, + "learning_rate": 0.0003693448840295046, + "loss": 2.6506, + "step": 19841 + }, + { + "epoch": 0.5883818165643626, + "grad_norm": 0.10249675810337067, + "learning_rate": 0.0003692994684390275, + "loss": 2.6389, + "step": 19842 + }, + { + "epoch": 0.588411469916674, + "grad_norm": 0.10811838507652283, + "learning_rate": 0.00036925405400592605, + "loss": 2.6009, + "step": 19843 + }, + { + "epoch": 0.5884411232689856, + "grad_norm": 0.10444983094930649, + "learning_rate": 0.00036920864073060214, + "loss": 2.6518, + "step": 19844 + }, + { + "epoch": 0.588470776621297, + "grad_norm": 0.09594040364027023, + "learning_rate": 0.0003691632286134583, + "loss": 2.6447, + "step": 19845 + }, + { + "epoch": 0.5885004299736085, + "grad_norm": 0.1165303885936737, + "learning_rate": 0.0003691178176548966, + "loss": 2.656, + "step": 19846 + }, + { + "epoch": 0.58853008332592, + "grad_norm": 0.1008167415857315, + "learning_rate": 0.00036907240785531914, + "loss": 2.6422, + "step": 19847 + }, + { + "epoch": 0.5885597366782315, + "grad_norm": 0.12787207961082458, + "learning_rate": 0.00036902699921512796, + "loss": 2.622, + "step": 19848 + }, + { + "epoch": 0.588589390030543, + "grad_norm": 0.1318168044090271, + "learning_rate": 0.0003689815917347251, + "loss": 2.6389, + "step": 19849 + }, + { + "epoch": 0.5886190433828544, + "grad_norm": 0.10506650060415268, + "learning_rate": 0.0003689361854145128, + "loss": 2.6253, + "step": 19850 + }, + { + "epoch": 0.588648696735166, + "grad_norm": 0.10716147720813751, + "learning_rate": 0.00036889078025489306, + "loss": 2.6355, + "step": 19851 + }, + { + "epoch": 0.5886783500874774, + "grad_norm": 0.12114819884300232, + "learning_rate": 0.0003688453762562679, + "loss": 2.6647, + "step": 19852 + }, + { + "epoch": 0.5887080034397889, + "grad_norm": 0.1388777494430542, + "learning_rate": 0.00036879997341903955, + "loss": 2.6329, + "step": 19853 + }, + { + "epoch": 0.5887376567921003, + "grad_norm": 0.11122032254934311, + "learning_rate": 0.00036875457174361, + "loss": 2.6512, + "step": 19854 + }, + { + "epoch": 0.5887673101444119, + "grad_norm": 0.10860385000705719, + "learning_rate": 0.0003687091712303811, + "loss": 2.6532, + "step": 19855 + }, + { + "epoch": 0.5887969634967233, + "grad_norm": 0.11786460876464844, + "learning_rate": 0.0003686637718797551, + "loss": 2.6383, + "step": 19856 + }, + { + "epoch": 0.5888266168490348, + "grad_norm": 0.10979652404785156, + "learning_rate": 0.0003686183736921338, + "loss": 2.6572, + "step": 19857 + }, + { + "epoch": 0.5888562702013462, + "grad_norm": 0.10346294194459915, + "learning_rate": 0.00036857297666791945, + "loss": 2.6369, + "step": 19858 + }, + { + "epoch": 0.5888859235536578, + "grad_norm": 0.10494980216026306, + "learning_rate": 0.00036852758080751396, + "loss": 2.6357, + "step": 19859 + }, + { + "epoch": 0.5889155769059692, + "grad_norm": 0.11004228889942169, + "learning_rate": 0.00036848218611131934, + "loss": 2.6178, + "step": 19860 + }, + { + "epoch": 0.5889452302582807, + "grad_norm": 0.11526668071746826, + "learning_rate": 0.0003684367925797375, + "loss": 2.643, + "step": 19861 + }, + { + "epoch": 0.5889748836105921, + "grad_norm": 0.10251104831695557, + "learning_rate": 0.00036839140021317047, + "loss": 2.6697, + "step": 19862 + }, + { + "epoch": 0.5890045369629037, + "grad_norm": 0.12109819054603577, + "learning_rate": 0.0003683460090120202, + "loss": 2.6524, + "step": 19863 + }, + { + "epoch": 0.5890341903152151, + "grad_norm": 0.12288197129964828, + "learning_rate": 0.00036830061897668865, + "loss": 2.6612, + "step": 19864 + }, + { + "epoch": 0.5890638436675266, + "grad_norm": 0.13732300698757172, + "learning_rate": 0.0003682552301075777, + "loss": 2.6662, + "step": 19865 + }, + { + "epoch": 0.589093497019838, + "grad_norm": 0.13904564082622528, + "learning_rate": 0.00036820984240508925, + "loss": 2.6339, + "step": 19866 + }, + { + "epoch": 0.5891231503721496, + "grad_norm": 0.14103834331035614, + "learning_rate": 0.00036816445586962523, + "loss": 2.6679, + "step": 19867 + }, + { + "epoch": 0.5891528037244611, + "grad_norm": 0.10693228989839554, + "learning_rate": 0.00036811907050158767, + "loss": 2.6668, + "step": 19868 + }, + { + "epoch": 0.5891824570767725, + "grad_norm": 0.13069136440753937, + "learning_rate": 0.0003680736863013783, + "loss": 2.6277, + "step": 19869 + }, + { + "epoch": 0.5892121104290841, + "grad_norm": 0.14329057931900024, + "learning_rate": 0.0003680283032693991, + "loss": 2.6647, + "step": 19870 + }, + { + "epoch": 0.5892417637813955, + "grad_norm": 0.13832154870033264, + "learning_rate": 0.00036798292140605187, + "loss": 2.632, + "step": 19871 + }, + { + "epoch": 0.589271417133707, + "grad_norm": 0.11580269783735275, + "learning_rate": 0.0003679375407117386, + "loss": 2.6664, + "step": 19872 + }, + { + "epoch": 0.5893010704860184, + "grad_norm": 0.13108476996421814, + "learning_rate": 0.0003678921611868611, + "loss": 2.6418, + "step": 19873 + }, + { + "epoch": 0.58933072383833, + "grad_norm": 0.12680156528949738, + "learning_rate": 0.00036784678283182114, + "loss": 2.6313, + "step": 19874 + }, + { + "epoch": 0.5893603771906414, + "grad_norm": 0.12098252028226852, + "learning_rate": 0.0003678014056470208, + "loss": 2.6737, + "step": 19875 + }, + { + "epoch": 0.5893900305429529, + "grad_norm": 0.12756021320819855, + "learning_rate": 0.00036775602963286155, + "loss": 2.6375, + "step": 19876 + }, + { + "epoch": 0.5894196838952643, + "grad_norm": 0.11579417437314987, + "learning_rate": 0.0003677106547897453, + "loss": 2.6493, + "step": 19877 + }, + { + "epoch": 0.5894493372475759, + "grad_norm": 0.09852075576782227, + "learning_rate": 0.00036766528111807395, + "loss": 2.6628, + "step": 19878 + }, + { + "epoch": 0.5894789905998873, + "grad_norm": 0.12854284048080444, + "learning_rate": 0.0003676199086182493, + "loss": 2.6616, + "step": 19879 + }, + { + "epoch": 0.5895086439521988, + "grad_norm": 0.12962552905082703, + "learning_rate": 0.000367574537290673, + "loss": 2.6467, + "step": 19880 + }, + { + "epoch": 0.5895382973045102, + "grad_norm": 0.11072271317243576, + "learning_rate": 0.0003675291671357471, + "loss": 2.6063, + "step": 19881 + }, + { + "epoch": 0.5895679506568218, + "grad_norm": 0.11472344398498535, + "learning_rate": 0.000367483798153873, + "loss": 2.648, + "step": 19882 + }, + { + "epoch": 0.5895976040091332, + "grad_norm": 0.11995834112167358, + "learning_rate": 0.0003674384303454524, + "loss": 2.6356, + "step": 19883 + }, + { + "epoch": 0.5896272573614447, + "grad_norm": 0.10932906717061996, + "learning_rate": 0.0003673930637108874, + "loss": 2.6508, + "step": 19884 + }, + { + "epoch": 0.5896569107137561, + "grad_norm": 0.10468301922082901, + "learning_rate": 0.00036734769825057977, + "loss": 2.6459, + "step": 19885 + }, + { + "epoch": 0.5896865640660677, + "grad_norm": 0.10739171504974365, + "learning_rate": 0.000367302333964931, + "loss": 2.6367, + "step": 19886 + }, + { + "epoch": 0.5897162174183791, + "grad_norm": 0.11070436239242554, + "learning_rate": 0.0003672569708543427, + "loss": 2.6274, + "step": 19887 + }, + { + "epoch": 0.5897458707706906, + "grad_norm": 0.10561221092939377, + "learning_rate": 0.0003672116089192168, + "loss": 2.6397, + "step": 19888 + }, + { + "epoch": 0.5897755241230022, + "grad_norm": 0.11511698365211487, + "learning_rate": 0.0003671662481599549, + "loss": 2.6571, + "step": 19889 + }, + { + "epoch": 0.5898051774753136, + "grad_norm": 0.11026385426521301, + "learning_rate": 0.0003671208885769586, + "loss": 2.6326, + "step": 19890 + }, + { + "epoch": 0.5898348308276251, + "grad_norm": 0.11745373904705048, + "learning_rate": 0.00036707553017062975, + "loss": 2.6247, + "step": 19891 + }, + { + "epoch": 0.5898644841799365, + "grad_norm": 0.10719543695449829, + "learning_rate": 0.00036703017294136984, + "loss": 2.6608, + "step": 19892 + }, + { + "epoch": 0.5898941375322481, + "grad_norm": 0.11591050773859024, + "learning_rate": 0.0003669848168895806, + "loss": 2.6504, + "step": 19893 + }, + { + "epoch": 0.5899237908845595, + "grad_norm": 0.13362936675548553, + "learning_rate": 0.0003669394620156636, + "loss": 2.6291, + "step": 19894 + }, + { + "epoch": 0.589953444236871, + "grad_norm": 0.11733543127775192, + "learning_rate": 0.0003668941083200206, + "loss": 2.6204, + "step": 19895 + }, + { + "epoch": 0.5899830975891824, + "grad_norm": 0.12807048857212067, + "learning_rate": 0.00036684875580305287, + "loss": 2.6277, + "step": 19896 + }, + { + "epoch": 0.590012750941494, + "grad_norm": 0.1134832501411438, + "learning_rate": 0.00036680340446516234, + "loss": 2.6658, + "step": 19897 + }, + { + "epoch": 0.5900424042938054, + "grad_norm": 0.11817163228988647, + "learning_rate": 0.0003667580543067507, + "loss": 2.6634, + "step": 19898 + }, + { + "epoch": 0.5900720576461169, + "grad_norm": 0.10997985303401947, + "learning_rate": 0.0003667127053282192, + "loss": 2.6348, + "step": 19899 + }, + { + "epoch": 0.5901017109984283, + "grad_norm": 0.12037846446037292, + "learning_rate": 0.00036666735752996965, + "loss": 2.6394, + "step": 19900 + }, + { + "epoch": 0.5901313643507399, + "grad_norm": 0.12125828117132187, + "learning_rate": 0.00036662201091240356, + "loss": 2.6336, + "step": 19901 + }, + { + "epoch": 0.5901610177030513, + "grad_norm": 0.11400855332612991, + "learning_rate": 0.0003665766654759225, + "loss": 2.6565, + "step": 19902 + }, + { + "epoch": 0.5901906710553628, + "grad_norm": 0.11185396462678909, + "learning_rate": 0.00036653132122092786, + "loss": 2.6648, + "step": 19903 + }, + { + "epoch": 0.5902203244076742, + "grad_norm": 0.1400671899318695, + "learning_rate": 0.0003664859781478213, + "loss": 2.6045, + "step": 19904 + }, + { + "epoch": 0.5902499777599858, + "grad_norm": 0.1574503630399704, + "learning_rate": 0.0003664406362570043, + "loss": 2.6348, + "step": 19905 + }, + { + "epoch": 0.5902796311122972, + "grad_norm": 0.12352246791124344, + "learning_rate": 0.00036639529554887844, + "loss": 2.6644, + "step": 19906 + }, + { + "epoch": 0.5903092844646087, + "grad_norm": 0.11712236702442169, + "learning_rate": 0.00036634995602384513, + "loss": 2.64, + "step": 19907 + }, + { + "epoch": 0.5903389378169202, + "grad_norm": 0.12094437330961227, + "learning_rate": 0.00036630461768230593, + "loss": 2.6312, + "step": 19908 + }, + { + "epoch": 0.5903685911692317, + "grad_norm": 0.11603984236717224, + "learning_rate": 0.00036625928052466217, + "loss": 2.6551, + "step": 19909 + }, + { + "epoch": 0.5903982445215432, + "grad_norm": 0.11674855649471283, + "learning_rate": 0.0003662139445513156, + "loss": 2.6268, + "step": 19910 + }, + { + "epoch": 0.5904278978738546, + "grad_norm": 0.11474812775850296, + "learning_rate": 0.00036616860976266744, + "loss": 2.6299, + "step": 19911 + }, + { + "epoch": 0.5904575512261662, + "grad_norm": 0.11870084702968597, + "learning_rate": 0.0003661232761591192, + "loss": 2.6481, + "step": 19912 + }, + { + "epoch": 0.5904872045784776, + "grad_norm": 0.11924741417169571, + "learning_rate": 0.0003660779437410725, + "loss": 2.6111, + "step": 19913 + }, + { + "epoch": 0.5905168579307891, + "grad_norm": 0.1328674554824829, + "learning_rate": 0.0003660326125089284, + "loss": 2.6496, + "step": 19914 + }, + { + "epoch": 0.5905465112831005, + "grad_norm": 0.12869350612163544, + "learning_rate": 0.0003659872824630886, + "loss": 2.6421, + "step": 19915 + }, + { + "epoch": 0.5905761646354121, + "grad_norm": 0.11012173444032669, + "learning_rate": 0.00036594195360395437, + "loss": 2.6782, + "step": 19916 + }, + { + "epoch": 0.5906058179877235, + "grad_norm": 0.12863565981388092, + "learning_rate": 0.00036589662593192716, + "loss": 2.6909, + "step": 19917 + }, + { + "epoch": 0.590635471340035, + "grad_norm": 0.12897427380084991, + "learning_rate": 0.0003658512994474084, + "loss": 2.6622, + "step": 19918 + }, + { + "epoch": 0.5906651246923464, + "grad_norm": 0.10237526148557663, + "learning_rate": 0.00036580597415079944, + "loss": 2.6389, + "step": 19919 + }, + { + "epoch": 0.590694778044658, + "grad_norm": 0.10978689789772034, + "learning_rate": 0.00036576065004250156, + "loss": 2.6972, + "step": 19920 + }, + { + "epoch": 0.5907244313969694, + "grad_norm": 0.11026226729154587, + "learning_rate": 0.0003657153271229161, + "loss": 2.6506, + "step": 19921 + }, + { + "epoch": 0.5907540847492809, + "grad_norm": 0.11541200429201126, + "learning_rate": 0.0003656700053924443, + "loss": 2.6486, + "step": 19922 + }, + { + "epoch": 0.5907837381015923, + "grad_norm": 0.11265489459037781, + "learning_rate": 0.000365624684851488, + "loss": 2.6102, + "step": 19923 + }, + { + "epoch": 0.5908133914539039, + "grad_norm": 0.10796317458152771, + "learning_rate": 0.00036557936550044804, + "loss": 2.6412, + "step": 19924 + }, + { + "epoch": 0.5908430448062153, + "grad_norm": 0.10438485443592072, + "learning_rate": 0.0003655340473397259, + "loss": 2.6337, + "step": 19925 + }, + { + "epoch": 0.5908726981585268, + "grad_norm": 0.1145855188369751, + "learning_rate": 0.00036548873036972284, + "loss": 2.6286, + "step": 19926 + }, + { + "epoch": 0.5909023515108383, + "grad_norm": 0.1595737189054489, + "learning_rate": 0.00036544341459084014, + "loss": 2.6442, + "step": 19927 + }, + { + "epoch": 0.5909320048631498, + "grad_norm": 0.1139177531003952, + "learning_rate": 0.0003653981000034791, + "loss": 2.6388, + "step": 19928 + }, + { + "epoch": 0.5909616582154612, + "grad_norm": 0.11444319039583206, + "learning_rate": 0.00036535278660804107, + "loss": 2.6524, + "step": 19929 + }, + { + "epoch": 0.5909913115677727, + "grad_norm": 0.12922579050064087, + "learning_rate": 0.00036530747440492717, + "loss": 2.6543, + "step": 19930 + }, + { + "epoch": 0.5910209649200843, + "grad_norm": 0.1234854906797409, + "learning_rate": 0.00036526216339453877, + "loss": 2.6344, + "step": 19931 + }, + { + "epoch": 0.5910506182723957, + "grad_norm": 0.1149936243891716, + "learning_rate": 0.00036521685357727697, + "loss": 2.6501, + "step": 19932 + }, + { + "epoch": 0.5910802716247072, + "grad_norm": 0.11606951057910919, + "learning_rate": 0.000365171544953543, + "loss": 2.6783, + "step": 19933 + }, + { + "epoch": 0.5911099249770186, + "grad_norm": 0.10870331525802612, + "learning_rate": 0.0003651262375237382, + "loss": 2.6402, + "step": 19934 + }, + { + "epoch": 0.5911395783293302, + "grad_norm": 0.12472892552614212, + "learning_rate": 0.0003650809312882636, + "loss": 2.6846, + "step": 19935 + }, + { + "epoch": 0.5911692316816416, + "grad_norm": 0.12494160234928131, + "learning_rate": 0.00036503562624752063, + "loss": 2.6799, + "step": 19936 + }, + { + "epoch": 0.5911988850339531, + "grad_norm": 0.10819660872220993, + "learning_rate": 0.0003649903224019104, + "loss": 2.6425, + "step": 19937 + }, + { + "epoch": 0.5912285383862645, + "grad_norm": 0.12055107206106186, + "learning_rate": 0.00036494501975183405, + "loss": 2.6016, + "step": 19938 + }, + { + "epoch": 0.5912581917385761, + "grad_norm": 0.10808638483285904, + "learning_rate": 0.00036489971829769266, + "loss": 2.6424, + "step": 19939 + }, + { + "epoch": 0.5912878450908875, + "grad_norm": 0.10039549320936203, + "learning_rate": 0.0003648544180398875, + "loss": 2.6401, + "step": 19940 + }, + { + "epoch": 0.591317498443199, + "grad_norm": 0.12437521666288376, + "learning_rate": 0.0003648091189788197, + "loss": 2.6446, + "step": 19941 + }, + { + "epoch": 0.5913471517955105, + "grad_norm": 0.11227942258119583, + "learning_rate": 0.00036476382111489026, + "loss": 2.6619, + "step": 19942 + }, + { + "epoch": 0.591376805147822, + "grad_norm": 0.11906490474939346, + "learning_rate": 0.00036471852444850046, + "loss": 2.6087, + "step": 19943 + }, + { + "epoch": 0.5914064585001334, + "grad_norm": 0.10262100398540497, + "learning_rate": 0.00036467322898005127, + "loss": 2.6199, + "step": 19944 + }, + { + "epoch": 0.5914361118524449, + "grad_norm": 0.12132174521684647, + "learning_rate": 0.00036462793470994396, + "loss": 2.6565, + "step": 19945 + }, + { + "epoch": 0.5914657652047564, + "grad_norm": 0.11453790962696075, + "learning_rate": 0.00036458264163857947, + "loss": 2.6628, + "step": 19946 + }, + { + "epoch": 0.5914954185570679, + "grad_norm": 0.12582457065582275, + "learning_rate": 0.00036453734976635906, + "loss": 2.6799, + "step": 19947 + }, + { + "epoch": 0.5915250719093793, + "grad_norm": 0.10279274731874466, + "learning_rate": 0.00036449205909368335, + "loss": 2.6304, + "step": 19948 + }, + { + "epoch": 0.5915547252616908, + "grad_norm": 0.10700390487909317, + "learning_rate": 0.0003644467696209539, + "loss": 2.6924, + "step": 19949 + }, + { + "epoch": 0.5915843786140023, + "grad_norm": 0.11819335073232651, + "learning_rate": 0.0003644014813485716, + "loss": 2.6596, + "step": 19950 + }, + { + "epoch": 0.5916140319663138, + "grad_norm": 0.10232517868280411, + "learning_rate": 0.00036435619427693756, + "loss": 2.6567, + "step": 19951 + }, + { + "epoch": 0.5916436853186253, + "grad_norm": 0.10583235323429108, + "learning_rate": 0.0003643109084064526, + "loss": 2.6399, + "step": 19952 + }, + { + "epoch": 0.5916733386709367, + "grad_norm": 0.10569974035024643, + "learning_rate": 0.0003642656237375178, + "loss": 2.6876, + "step": 19953 + }, + { + "epoch": 0.5917029920232483, + "grad_norm": 0.11177893728017807, + "learning_rate": 0.00036422034027053425, + "loss": 2.6567, + "step": 19954 + }, + { + "epoch": 0.5917326453755597, + "grad_norm": 0.09477093070745468, + "learning_rate": 0.00036417505800590287, + "loss": 2.642, + "step": 19955 + }, + { + "epoch": 0.5917622987278712, + "grad_norm": 0.10281423479318619, + "learning_rate": 0.00036412977694402467, + "loss": 2.6349, + "step": 19956 + }, + { + "epoch": 0.5917919520801826, + "grad_norm": 0.09863948076963425, + "learning_rate": 0.0003640844970853007, + "loss": 2.6163, + "step": 19957 + }, + { + "epoch": 0.5918216054324942, + "grad_norm": 0.09742170572280884, + "learning_rate": 0.00036403921843013176, + "loss": 2.6588, + "step": 19958 + }, + { + "epoch": 0.5918512587848056, + "grad_norm": 0.11402551829814911, + "learning_rate": 0.00036399394097891887, + "loss": 2.6774, + "step": 19959 + }, + { + "epoch": 0.5918809121371171, + "grad_norm": 0.12063710391521454, + "learning_rate": 0.000363948664732063, + "loss": 2.6472, + "step": 19960 + }, + { + "epoch": 0.5919105654894286, + "grad_norm": 0.11853349953889847, + "learning_rate": 0.00036390338968996487, + "loss": 2.6689, + "step": 19961 + }, + { + "epoch": 0.5919402188417401, + "grad_norm": 0.10774955153465271, + "learning_rate": 0.00036385811585302574, + "loss": 2.6614, + "step": 19962 + }, + { + "epoch": 0.5919698721940515, + "grad_norm": 0.10653889179229736, + "learning_rate": 0.0003638128432216464, + "loss": 2.6439, + "step": 19963 + }, + { + "epoch": 0.591999525546363, + "grad_norm": 0.11803983151912689, + "learning_rate": 0.00036376757179622764, + "loss": 2.6307, + "step": 19964 + }, + { + "epoch": 0.5920291788986745, + "grad_norm": 0.1226734071969986, + "learning_rate": 0.0003637223015771705, + "loss": 2.6355, + "step": 19965 + }, + { + "epoch": 0.592058832250986, + "grad_norm": 0.10497566312551498, + "learning_rate": 0.00036367703256487573, + "loss": 2.6211, + "step": 19966 + }, + { + "epoch": 0.5920884856032974, + "grad_norm": 0.1182965561747551, + "learning_rate": 0.00036363176475974425, + "loss": 2.6229, + "step": 19967 + }, + { + "epoch": 0.5921181389556089, + "grad_norm": 0.13135099411010742, + "learning_rate": 0.000363586498162177, + "loss": 2.6273, + "step": 19968 + }, + { + "epoch": 0.5921477923079204, + "grad_norm": 0.11520887911319733, + "learning_rate": 0.00036354123277257454, + "loss": 2.6318, + "step": 19969 + }, + { + "epoch": 0.5921774456602319, + "grad_norm": 0.10200352221727371, + "learning_rate": 0.000363495968591338, + "loss": 2.6666, + "step": 19970 + }, + { + "epoch": 0.5922070990125433, + "grad_norm": 0.13039064407348633, + "learning_rate": 0.00036345070561886805, + "loss": 2.6322, + "step": 19971 + }, + { + "epoch": 0.5922367523648548, + "grad_norm": 0.13357257843017578, + "learning_rate": 0.0003634054438555655, + "loss": 2.6358, + "step": 19972 + }, + { + "epoch": 0.5922664057171664, + "grad_norm": 0.12251082062721252, + "learning_rate": 0.0003633601833018313, + "loss": 2.6358, + "step": 19973 + }, + { + "epoch": 0.5922960590694778, + "grad_norm": 0.13571950793266296, + "learning_rate": 0.0003633149239580659, + "loss": 2.6008, + "step": 19974 + }, + { + "epoch": 0.5923257124217893, + "grad_norm": 0.11945731192827225, + "learning_rate": 0.00036326966582467046, + "loss": 2.6311, + "step": 19975 + }, + { + "epoch": 0.5923553657741008, + "grad_norm": 0.1256895512342453, + "learning_rate": 0.0003632244089020457, + "loss": 2.6197, + "step": 19976 + }, + { + "epoch": 0.5923850191264123, + "grad_norm": 0.12110331654548645, + "learning_rate": 0.00036317915319059214, + "loss": 2.636, + "step": 19977 + }, + { + "epoch": 0.5924146724787237, + "grad_norm": 0.11421898007392883, + "learning_rate": 0.0003631338986907108, + "loss": 2.625, + "step": 19978 + }, + { + "epoch": 0.5924443258310352, + "grad_norm": 0.11289799958467484, + "learning_rate": 0.0003630886454028022, + "loss": 2.6549, + "step": 19979 + }, + { + "epoch": 0.5924739791833467, + "grad_norm": 0.1298692375421524, + "learning_rate": 0.0003630433933272671, + "loss": 2.64, + "step": 19980 + }, + { + "epoch": 0.5925036325356582, + "grad_norm": 0.10458506643772125, + "learning_rate": 0.00036299814246450624, + "loss": 2.6613, + "step": 19981 + }, + { + "epoch": 0.5925332858879696, + "grad_norm": 0.10601761937141418, + "learning_rate": 0.00036295289281492045, + "loss": 2.6169, + "step": 19982 + }, + { + "epoch": 0.5925629392402811, + "grad_norm": 0.10300064086914062, + "learning_rate": 0.00036290764437891024, + "loss": 2.5979, + "step": 19983 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 0.1214689165353775, + "learning_rate": 0.00036286239715687643, + "loss": 2.6207, + "step": 19984 + }, + { + "epoch": 0.5926222459449041, + "grad_norm": 0.11305259168148041, + "learning_rate": 0.00036281715114921964, + "loss": 2.5916, + "step": 19985 + }, + { + "epoch": 0.5926518992972155, + "grad_norm": 0.10711967945098877, + "learning_rate": 0.00036277190635634025, + "loss": 2.6285, + "step": 19986 + }, + { + "epoch": 0.592681552649527, + "grad_norm": 0.11474089324474335, + "learning_rate": 0.0003627266627786395, + "loss": 2.6586, + "step": 19987 + }, + { + "epoch": 0.5927112060018385, + "grad_norm": 0.10323729366064072, + "learning_rate": 0.0003626814204165176, + "loss": 2.6366, + "step": 19988 + }, + { + "epoch": 0.59274085935415, + "grad_norm": 0.10876939445734024, + "learning_rate": 0.0003626361792703754, + "loss": 2.6791, + "step": 19989 + }, + { + "epoch": 0.5927705127064614, + "grad_norm": 0.1046709194779396, + "learning_rate": 0.00036259093934061336, + "loss": 2.6811, + "step": 19990 + }, + { + "epoch": 0.592800166058773, + "grad_norm": 0.11565231531858444, + "learning_rate": 0.0003625457006276321, + "loss": 2.6768, + "step": 19991 + }, + { + "epoch": 0.5928298194110844, + "grad_norm": 0.10619446635246277, + "learning_rate": 0.0003625004631318323, + "loss": 2.658, + "step": 19992 + }, + { + "epoch": 0.5928594727633959, + "grad_norm": 0.11191577464342117, + "learning_rate": 0.0003624552268536144, + "loss": 2.6214, + "step": 19993 + }, + { + "epoch": 0.5928891261157074, + "grad_norm": 0.10639014840126038, + "learning_rate": 0.0003624099917933792, + "loss": 2.6672, + "step": 19994 + }, + { + "epoch": 0.5929187794680189, + "grad_norm": 0.10705802589654922, + "learning_rate": 0.0003623647579515271, + "loss": 2.6618, + "step": 19995 + }, + { + "epoch": 0.5929484328203304, + "grad_norm": 0.1128174364566803, + "learning_rate": 0.0003623195253284587, + "loss": 2.6702, + "step": 19996 + }, + { + "epoch": 0.5929780861726418, + "grad_norm": 0.09451280534267426, + "learning_rate": 0.00036227429392457456, + "loss": 2.6727, + "step": 19997 + }, + { + "epoch": 0.5930077395249533, + "grad_norm": 0.10462687909603119, + "learning_rate": 0.0003622290637402751, + "loss": 2.6498, + "step": 19998 + }, + { + "epoch": 0.5930373928772648, + "grad_norm": 0.09625688940286636, + "learning_rate": 0.00036218383477596084, + "loss": 2.628, + "step": 19999 + }, + { + "epoch": 0.5930670462295763, + "grad_norm": 0.09720411151647568, + "learning_rate": 0.0003621386070320325, + "loss": 2.6524, + "step": 20000 + }, + { + "epoch": 0.5930966995818877, + "grad_norm": 0.09745873510837555, + "learning_rate": 0.00036209338050889053, + "loss": 2.637, + "step": 20001 + }, + { + "epoch": 0.5931263529341992, + "grad_norm": 0.1050543487071991, + "learning_rate": 0.00036204815520693526, + "loss": 2.5998, + "step": 20002 + }, + { + "epoch": 0.5931560062865107, + "grad_norm": 0.10245309770107269, + "learning_rate": 0.00036200293112656723, + "loss": 2.6489, + "step": 20003 + }, + { + "epoch": 0.5931856596388222, + "grad_norm": 0.11940386146306992, + "learning_rate": 0.000361957708268187, + "loss": 2.6322, + "step": 20004 + }, + { + "epoch": 0.5932153129911336, + "grad_norm": 0.11901090294122696, + "learning_rate": 0.00036191248663219487, + "loss": 2.6272, + "step": 20005 + }, + { + "epoch": 0.5932449663434451, + "grad_norm": 0.11166422069072723, + "learning_rate": 0.00036186726621899155, + "loss": 2.6992, + "step": 20006 + }, + { + "epoch": 0.5932746196957566, + "grad_norm": 0.09993324428796768, + "learning_rate": 0.00036182204702897726, + "loss": 2.6492, + "step": 20007 + }, + { + "epoch": 0.5933042730480681, + "grad_norm": 0.10564039647579193, + "learning_rate": 0.0003617768290625524, + "loss": 2.6761, + "step": 20008 + }, + { + "epoch": 0.5933339264003795, + "grad_norm": 0.12341099232435226, + "learning_rate": 0.0003617316123201174, + "loss": 2.6307, + "step": 20009 + }, + { + "epoch": 0.593363579752691, + "grad_norm": 0.11282217502593994, + "learning_rate": 0.0003616863968020727, + "loss": 2.6758, + "step": 20010 + }, + { + "epoch": 0.5933932331050025, + "grad_norm": 0.11633404344320297, + "learning_rate": 0.00036164118250881877, + "loss": 2.6432, + "step": 20011 + }, + { + "epoch": 0.593422886457314, + "grad_norm": 0.11263176053762436, + "learning_rate": 0.0003615959694407558, + "loss": 2.686, + "step": 20012 + }, + { + "epoch": 0.5934525398096254, + "grad_norm": 0.11830393970012665, + "learning_rate": 0.0003615507575982843, + "loss": 2.6283, + "step": 20013 + }, + { + "epoch": 0.593482193161937, + "grad_norm": 0.11475873738527298, + "learning_rate": 0.0003615055469818047, + "loss": 2.6323, + "step": 20014 + }, + { + "epoch": 0.5935118465142485, + "grad_norm": 0.11114269495010376, + "learning_rate": 0.0003614603375917172, + "loss": 2.6282, + "step": 20015 + }, + { + "epoch": 0.5935414998665599, + "grad_norm": 0.1250135600566864, + "learning_rate": 0.0003614151294284224, + "loss": 2.6422, + "step": 20016 + }, + { + "epoch": 0.5935711532188714, + "grad_norm": 0.11626599729061127, + "learning_rate": 0.00036136992249232016, + "loss": 2.6614, + "step": 20017 + }, + { + "epoch": 0.5936008065711829, + "grad_norm": 0.10171009600162506, + "learning_rate": 0.0003613247167838111, + "loss": 2.6318, + "step": 20018 + }, + { + "epoch": 0.5936304599234944, + "grad_norm": 0.1196870505809784, + "learning_rate": 0.0003612795123032955, + "loss": 2.6247, + "step": 20019 + }, + { + "epoch": 0.5936601132758058, + "grad_norm": 0.10933621227741241, + "learning_rate": 0.0003612343090511736, + "loss": 2.6721, + "step": 20020 + }, + { + "epoch": 0.5936897666281173, + "grad_norm": 0.10100746154785156, + "learning_rate": 0.0003611891070278458, + "loss": 2.6548, + "step": 20021 + }, + { + "epoch": 0.5937194199804288, + "grad_norm": 0.11168466508388519, + "learning_rate": 0.00036114390623371217, + "loss": 2.6258, + "step": 20022 + }, + { + "epoch": 0.5937490733327403, + "grad_norm": 0.09620263427495956, + "learning_rate": 0.0003610987066691733, + "loss": 2.6394, + "step": 20023 + }, + { + "epoch": 0.5937787266850517, + "grad_norm": 0.10571812093257904, + "learning_rate": 0.000361053508334629, + "loss": 2.7, + "step": 20024 + }, + { + "epoch": 0.5938083800373632, + "grad_norm": 0.11236993223428726, + "learning_rate": 0.0003610083112304796, + "loss": 2.6642, + "step": 20025 + }, + { + "epoch": 0.5938380333896747, + "grad_norm": 0.1144600510597229, + "learning_rate": 0.00036096311535712566, + "loss": 2.6572, + "step": 20026 + }, + { + "epoch": 0.5938676867419862, + "grad_norm": 0.13131186366081238, + "learning_rate": 0.0003609179207149673, + "loss": 2.6392, + "step": 20027 + }, + { + "epoch": 0.5938973400942976, + "grad_norm": 0.11746922135353088, + "learning_rate": 0.0003608727273044045, + "loss": 2.603, + "step": 20028 + }, + { + "epoch": 0.5939269934466092, + "grad_norm": 0.1238892525434494, + "learning_rate": 0.0003608275351258376, + "loss": 2.6037, + "step": 20029 + }, + { + "epoch": 0.5939566467989206, + "grad_norm": 0.12385748326778412, + "learning_rate": 0.0003607823441796668, + "loss": 2.6202, + "step": 20030 + }, + { + "epoch": 0.5939863001512321, + "grad_norm": 0.1346609741449356, + "learning_rate": 0.00036073715446629216, + "loss": 2.5864, + "step": 20031 + }, + { + "epoch": 0.5940159535035435, + "grad_norm": 0.1137581542134285, + "learning_rate": 0.00036069196598611397, + "loss": 2.6594, + "step": 20032 + }, + { + "epoch": 0.5940456068558551, + "grad_norm": 0.10019026696681976, + "learning_rate": 0.00036064677873953244, + "loss": 2.6368, + "step": 20033 + }, + { + "epoch": 0.5940752602081665, + "grad_norm": 0.12062481790781021, + "learning_rate": 0.0003606015927269475, + "loss": 2.6451, + "step": 20034 + }, + { + "epoch": 0.594104913560478, + "grad_norm": 0.129110187292099, + "learning_rate": 0.0003605564079487594, + "loss": 2.6601, + "step": 20035 + }, + { + "epoch": 0.5941345669127895, + "grad_norm": 0.12851044535636902, + "learning_rate": 0.0003605112244053682, + "loss": 2.6513, + "step": 20036 + }, + { + "epoch": 0.594164220265101, + "grad_norm": 0.11953771114349365, + "learning_rate": 0.00036046604209717404, + "loss": 2.6755, + "step": 20037 + }, + { + "epoch": 0.5941938736174125, + "grad_norm": 0.1129336804151535, + "learning_rate": 0.00036042086102457696, + "loss": 2.6562, + "step": 20038 + }, + { + "epoch": 0.5942235269697239, + "grad_norm": 0.12952113151550293, + "learning_rate": 0.00036037568118797725, + "loss": 2.6771, + "step": 20039 + }, + { + "epoch": 0.5942531803220354, + "grad_norm": 0.10946396738290787, + "learning_rate": 0.0003603305025877748, + "loss": 2.6273, + "step": 20040 + }, + { + "epoch": 0.5942828336743469, + "grad_norm": 0.11059218645095825, + "learning_rate": 0.00036028532522436974, + "loss": 2.6682, + "step": 20041 + }, + { + "epoch": 0.5943124870266584, + "grad_norm": 0.13073045015335083, + "learning_rate": 0.00036024014909816205, + "loss": 2.6536, + "step": 20042 + }, + { + "epoch": 0.5943421403789698, + "grad_norm": 0.12720850110054016, + "learning_rate": 0.00036019497420955194, + "loss": 2.6935, + "step": 20043 + }, + { + "epoch": 0.5943717937312814, + "grad_norm": 0.11410247534513474, + "learning_rate": 0.00036014980055893933, + "loss": 2.6619, + "step": 20044 + }, + { + "epoch": 0.5944014470835928, + "grad_norm": 0.1137646809220314, + "learning_rate": 0.0003601046281467242, + "loss": 2.635, + "step": 20045 + }, + { + "epoch": 0.5944311004359043, + "grad_norm": 0.1128196194767952, + "learning_rate": 0.00036005945697330656, + "loss": 2.6345, + "step": 20046 + }, + { + "epoch": 0.5944607537882157, + "grad_norm": 0.12088619917631149, + "learning_rate": 0.00036001428703908643, + "loss": 2.6618, + "step": 20047 + }, + { + "epoch": 0.5944904071405273, + "grad_norm": 0.12764427065849304, + "learning_rate": 0.00035996911834446386, + "loss": 2.6359, + "step": 20048 + }, + { + "epoch": 0.5945200604928387, + "grad_norm": 0.12894250452518463, + "learning_rate": 0.0003599239508898389, + "loss": 2.6568, + "step": 20049 + }, + { + "epoch": 0.5945497138451502, + "grad_norm": 0.1162213459610939, + "learning_rate": 0.0003598787846756113, + "loss": 2.6229, + "step": 20050 + }, + { + "epoch": 0.5945793671974616, + "grad_norm": 0.12262137234210968, + "learning_rate": 0.0003598336197021809, + "loss": 2.6575, + "step": 20051 + }, + { + "epoch": 0.5946090205497732, + "grad_norm": 0.0978492796421051, + "learning_rate": 0.00035978845596994804, + "loss": 2.6251, + "step": 20052 + }, + { + "epoch": 0.5946386739020846, + "grad_norm": 0.10896463692188263, + "learning_rate": 0.00035974329347931245, + "loss": 2.642, + "step": 20053 + }, + { + "epoch": 0.5946683272543961, + "grad_norm": 0.11864002794027328, + "learning_rate": 0.00035969813223067426, + "loss": 2.6516, + "step": 20054 + }, + { + "epoch": 0.5946979806067076, + "grad_norm": 0.1085331067442894, + "learning_rate": 0.000359652972224433, + "loss": 2.6321, + "step": 20055 + }, + { + "epoch": 0.5947276339590191, + "grad_norm": 0.12195614725351334, + "learning_rate": 0.0003596078134609888, + "loss": 2.6602, + "step": 20056 + }, + { + "epoch": 0.5947572873113306, + "grad_norm": 0.12531618773937225, + "learning_rate": 0.0003595626559407415, + "loss": 2.6384, + "step": 20057 + }, + { + "epoch": 0.594786940663642, + "grad_norm": 0.10529521107673645, + "learning_rate": 0.000359517499664091, + "loss": 2.6397, + "step": 20058 + }, + { + "epoch": 0.5948165940159535, + "grad_norm": 0.11854533106088638, + "learning_rate": 0.0003594723446314371, + "loss": 2.6808, + "step": 20059 + }, + { + "epoch": 0.594846247368265, + "grad_norm": 0.10409338027238846, + "learning_rate": 0.00035942719084317975, + "loss": 2.6833, + "step": 20060 + }, + { + "epoch": 0.5948759007205765, + "grad_norm": 0.11090900748968124, + "learning_rate": 0.0003593820382997189, + "loss": 2.6516, + "step": 20061 + }, + { + "epoch": 0.5949055540728879, + "grad_norm": 0.11578895896673203, + "learning_rate": 0.00035933688700145403, + "loss": 2.636, + "step": 20062 + }, + { + "epoch": 0.5949352074251995, + "grad_norm": 0.11735657602548599, + "learning_rate": 0.00035929173694878533, + "loss": 2.6845, + "step": 20063 + }, + { + "epoch": 0.5949648607775109, + "grad_norm": 0.10626644641160965, + "learning_rate": 0.0003592465881421121, + "loss": 2.5982, + "step": 20064 + }, + { + "epoch": 0.5949945141298224, + "grad_norm": 0.11924320459365845, + "learning_rate": 0.0003592014405818349, + "loss": 2.638, + "step": 20065 + }, + { + "epoch": 0.5950241674821338, + "grad_norm": 0.10108830779790878, + "learning_rate": 0.0003591562942683529, + "loss": 2.6363, + "step": 20066 + }, + { + "epoch": 0.5950538208344454, + "grad_norm": 0.11696497350931168, + "learning_rate": 0.0003591111492020662, + "loss": 2.6424, + "step": 20067 + }, + { + "epoch": 0.5950834741867568, + "grad_norm": 0.1338619589805603, + "learning_rate": 0.0003590660053833744, + "loss": 2.6421, + "step": 20068 + }, + { + "epoch": 0.5951131275390683, + "grad_norm": 0.11386910825967789, + "learning_rate": 0.0003590208628126773, + "loss": 2.6348, + "step": 20069 + }, + { + "epoch": 0.5951427808913797, + "grad_norm": 0.10488924384117126, + "learning_rate": 0.0003589757214903747, + "loss": 2.6442, + "step": 20070 + }, + { + "epoch": 0.5951724342436913, + "grad_norm": 0.1131250187754631, + "learning_rate": 0.00035893058141686635, + "loss": 2.6586, + "step": 20071 + }, + { + "epoch": 0.5952020875960027, + "grad_norm": 0.12779520452022552, + "learning_rate": 0.00035888544259255183, + "loss": 2.6425, + "step": 20072 + }, + { + "epoch": 0.5952317409483142, + "grad_norm": 0.12356902658939362, + "learning_rate": 0.00035884030501783095, + "loss": 2.6547, + "step": 20073 + }, + { + "epoch": 0.5952613943006256, + "grad_norm": 0.12298174202442169, + "learning_rate": 0.0003587951686931034, + "loss": 2.6427, + "step": 20074 + }, + { + "epoch": 0.5952910476529372, + "grad_norm": 0.11531465500593185, + "learning_rate": 0.0003587500336187689, + "loss": 2.6554, + "step": 20075 + }, + { + "epoch": 0.5953207010052487, + "grad_norm": 0.11360201239585876, + "learning_rate": 0.00035870489979522704, + "loss": 2.648, + "step": 20076 + }, + { + "epoch": 0.5953503543575601, + "grad_norm": 0.10248484462499619, + "learning_rate": 0.0003586597672228774, + "loss": 2.6365, + "step": 20077 + }, + { + "epoch": 0.5953800077098717, + "grad_norm": 0.10569591820240021, + "learning_rate": 0.00035861463590211996, + "loss": 2.6658, + "step": 20078 + }, + { + "epoch": 0.5954096610621831, + "grad_norm": 0.09819256514310837, + "learning_rate": 0.00035856950583335425, + "loss": 2.6223, + "step": 20079 + }, + { + "epoch": 0.5954393144144946, + "grad_norm": 0.11172676086425781, + "learning_rate": 0.0003585243770169797, + "loss": 2.6454, + "step": 20080 + }, + { + "epoch": 0.595468967766806, + "grad_norm": 0.1208936795592308, + "learning_rate": 0.0003584792494533962, + "loss": 2.6531, + "step": 20081 + }, + { + "epoch": 0.5954986211191176, + "grad_norm": 0.11028222739696503, + "learning_rate": 0.00035843412314300326, + "loss": 2.5991, + "step": 20082 + }, + { + "epoch": 0.595528274471429, + "grad_norm": 0.11418810486793518, + "learning_rate": 0.0003583889980862004, + "loss": 2.6062, + "step": 20083 + }, + { + "epoch": 0.5955579278237405, + "grad_norm": 0.10229940712451935, + "learning_rate": 0.00035834387428338723, + "loss": 2.6258, + "step": 20084 + }, + { + "epoch": 0.5955875811760519, + "grad_norm": 0.11178275942802429, + "learning_rate": 0.0003582987517349634, + "loss": 2.6432, + "step": 20085 + }, + { + "epoch": 0.5956172345283635, + "grad_norm": 0.0992198958992958, + "learning_rate": 0.00035825363044132843, + "loss": 2.6612, + "step": 20086 + }, + { + "epoch": 0.5956468878806749, + "grad_norm": 0.1141747459769249, + "learning_rate": 0.00035820851040288185, + "loss": 2.6831, + "step": 20087 + }, + { + "epoch": 0.5956765412329864, + "grad_norm": 0.11958608031272888, + "learning_rate": 0.0003581633916200234, + "loss": 2.6751, + "step": 20088 + }, + { + "epoch": 0.5957061945852978, + "grad_norm": 0.10846661776304245, + "learning_rate": 0.00035811827409315235, + "loss": 2.6507, + "step": 20089 + }, + { + "epoch": 0.5957358479376094, + "grad_norm": 0.11538446694612503, + "learning_rate": 0.0003580731578226681, + "loss": 2.6174, + "step": 20090 + }, + { + "epoch": 0.5957655012899208, + "grad_norm": 0.11461220681667328, + "learning_rate": 0.0003580280428089707, + "loss": 2.6511, + "step": 20091 + }, + { + "epoch": 0.5957951546422323, + "grad_norm": 0.10538345575332642, + "learning_rate": 0.0003579829290524594, + "loss": 2.6608, + "step": 20092 + }, + { + "epoch": 0.5958248079945437, + "grad_norm": 0.10434237867593765, + "learning_rate": 0.0003579378165535335, + "loss": 2.6338, + "step": 20093 + }, + { + "epoch": 0.5958544613468553, + "grad_norm": 0.12333545833826065, + "learning_rate": 0.0003578927053125927, + "loss": 2.6186, + "step": 20094 + }, + { + "epoch": 0.5958841146991667, + "grad_norm": 0.14207766950130463, + "learning_rate": 0.0003578475953300363, + "loss": 2.6296, + "step": 20095 + }, + { + "epoch": 0.5959137680514782, + "grad_norm": 0.15278521180152893, + "learning_rate": 0.0003578024866062639, + "loss": 2.6037, + "step": 20096 + }, + { + "epoch": 0.5959434214037898, + "grad_norm": 0.12351417541503906, + "learning_rate": 0.0003577573791416748, + "loss": 2.647, + "step": 20097 + }, + { + "epoch": 0.5959730747561012, + "grad_norm": 0.1394651085138321, + "learning_rate": 0.00035771227293666865, + "loss": 2.602, + "step": 20098 + }, + { + "epoch": 0.5960027281084127, + "grad_norm": 0.12035269290208817, + "learning_rate": 0.0003576671679916448, + "loss": 2.6399, + "step": 20099 + }, + { + "epoch": 0.5960323814607241, + "grad_norm": 0.1271902471780777, + "learning_rate": 0.0003576220643070025, + "loss": 2.6151, + "step": 20100 + }, + { + "epoch": 0.5960620348130357, + "grad_norm": 0.12353105843067169, + "learning_rate": 0.00035757696188314125, + "loss": 2.6496, + "step": 20101 + }, + { + "epoch": 0.5960916881653471, + "grad_norm": 0.1370764523744583, + "learning_rate": 0.0003575318607204605, + "loss": 2.7081, + "step": 20102 + }, + { + "epoch": 0.5961213415176586, + "grad_norm": 0.11446201056241989, + "learning_rate": 0.0003574867608193594, + "loss": 2.6429, + "step": 20103 + }, + { + "epoch": 0.59615099486997, + "grad_norm": 0.12271072715520859, + "learning_rate": 0.0003574416621802377, + "loss": 2.6044, + "step": 20104 + }, + { + "epoch": 0.5961806482222816, + "grad_norm": 0.10581595450639725, + "learning_rate": 0.0003573965648034944, + "loss": 2.6505, + "step": 20105 + }, + { + "epoch": 0.596210301574593, + "grad_norm": 0.12199416011571884, + "learning_rate": 0.00035735146868952914, + "loss": 2.6484, + "step": 20106 + }, + { + "epoch": 0.5962399549269045, + "grad_norm": 0.12250353395938873, + "learning_rate": 0.000357306373838741, + "loss": 2.6472, + "step": 20107 + }, + { + "epoch": 0.5962696082792159, + "grad_norm": 0.11525881290435791, + "learning_rate": 0.0003572612802515295, + "loss": 2.6628, + "step": 20108 + }, + { + "epoch": 0.5962992616315275, + "grad_norm": 0.11739955097436905, + "learning_rate": 0.0003572161879282939, + "loss": 2.6347, + "step": 20109 + }, + { + "epoch": 0.5963289149838389, + "grad_norm": 0.12082089483737946, + "learning_rate": 0.0003571710968694334, + "loss": 2.621, + "step": 20110 + }, + { + "epoch": 0.5963585683361504, + "grad_norm": 0.10881038755178452, + "learning_rate": 0.00035712600707534734, + "loss": 2.6208, + "step": 20111 + }, + { + "epoch": 0.5963882216884618, + "grad_norm": 0.13152889907360077, + "learning_rate": 0.000357080918546435, + "loss": 2.6759, + "step": 20112 + }, + { + "epoch": 0.5964178750407734, + "grad_norm": 0.12835991382598877, + "learning_rate": 0.0003570358312830957, + "loss": 2.6378, + "step": 20113 + }, + { + "epoch": 0.5964475283930848, + "grad_norm": 0.11242073029279709, + "learning_rate": 0.0003569907452857286, + "loss": 2.6419, + "step": 20114 + }, + { + "epoch": 0.5964771817453963, + "grad_norm": 0.11440714448690414, + "learning_rate": 0.000356945660554733, + "loss": 2.6493, + "step": 20115 + }, + { + "epoch": 0.5965068350977077, + "grad_norm": 0.12423305213451385, + "learning_rate": 0.00035690057709050803, + "loss": 2.6608, + "step": 20116 + }, + { + "epoch": 0.5965364884500193, + "grad_norm": 0.11968832463026047, + "learning_rate": 0.00035685549489345315, + "loss": 2.6631, + "step": 20117 + }, + { + "epoch": 0.5965661418023308, + "grad_norm": 0.12778864800930023, + "learning_rate": 0.0003568104139639675, + "loss": 2.664, + "step": 20118 + }, + { + "epoch": 0.5965957951546422, + "grad_norm": 0.10816995799541473, + "learning_rate": 0.0003567653343024501, + "loss": 2.6548, + "step": 20119 + }, + { + "epoch": 0.5966254485069538, + "grad_norm": 0.11990462988615036, + "learning_rate": 0.00035672025590930036, + "loss": 2.6671, + "step": 20120 + }, + { + "epoch": 0.5966551018592652, + "grad_norm": 0.10212961584329605, + "learning_rate": 0.0003566751787849173, + "loss": 2.6421, + "step": 20121 + }, + { + "epoch": 0.5966847552115767, + "grad_norm": 0.12660810351371765, + "learning_rate": 0.0003566301029297001, + "loss": 2.6099, + "step": 20122 + }, + { + "epoch": 0.5967144085638881, + "grad_norm": 0.12338972091674805, + "learning_rate": 0.00035658502834404795, + "loss": 2.6298, + "step": 20123 + }, + { + "epoch": 0.5967440619161997, + "grad_norm": 0.1075013279914856, + "learning_rate": 0.00035653995502836, + "loss": 2.6069, + "step": 20124 + }, + { + "epoch": 0.5967737152685111, + "grad_norm": 0.09872215986251831, + "learning_rate": 0.0003564948829830353, + "loss": 2.635, + "step": 20125 + }, + { + "epoch": 0.5968033686208226, + "grad_norm": 0.11433493345975876, + "learning_rate": 0.0003564498122084733, + "loss": 2.6302, + "step": 20126 + }, + { + "epoch": 0.596833021973134, + "grad_norm": 0.11613169312477112, + "learning_rate": 0.0003564047427050726, + "loss": 2.6631, + "step": 20127 + }, + { + "epoch": 0.5968626753254456, + "grad_norm": 0.12472917139530182, + "learning_rate": 0.00035635967447323263, + "loss": 2.6628, + "step": 20128 + }, + { + "epoch": 0.596892328677757, + "grad_norm": 0.11553093045949936, + "learning_rate": 0.0003563146075133522, + "loss": 2.5976, + "step": 20129 + }, + { + "epoch": 0.5969219820300685, + "grad_norm": 0.09076964855194092, + "learning_rate": 0.0003562695418258308, + "loss": 2.6523, + "step": 20130 + }, + { + "epoch": 0.5969516353823799, + "grad_norm": 0.11093386262655258, + "learning_rate": 0.00035622447741106726, + "loss": 2.6733, + "step": 20131 + }, + { + "epoch": 0.5969812887346915, + "grad_norm": 0.09984730929136276, + "learning_rate": 0.0003561794142694607, + "loss": 2.655, + "step": 20132 + }, + { + "epoch": 0.5970109420870029, + "grad_norm": 0.10706076771020889, + "learning_rate": 0.00035613435240141, + "loss": 2.6401, + "step": 20133 + }, + { + "epoch": 0.5970405954393144, + "grad_norm": 0.1170295774936676, + "learning_rate": 0.00035608929180731434, + "loss": 2.6479, + "step": 20134 + }, + { + "epoch": 0.5970702487916258, + "grad_norm": 0.10317010432481766, + "learning_rate": 0.0003560442324875727, + "loss": 2.6552, + "step": 20135 + }, + { + "epoch": 0.5970999021439374, + "grad_norm": 0.11983870714902878, + "learning_rate": 0.0003559991744425841, + "loss": 2.6613, + "step": 20136 + }, + { + "epoch": 0.5971295554962488, + "grad_norm": 0.11650882661342621, + "learning_rate": 0.00035595411767274765, + "loss": 2.6838, + "step": 20137 + }, + { + "epoch": 0.5971592088485603, + "grad_norm": 0.08901964128017426, + "learning_rate": 0.00035590906217846215, + "loss": 2.6491, + "step": 20138 + }, + { + "epoch": 0.5971888622008719, + "grad_norm": 0.11871310323476791, + "learning_rate": 0.00035586400796012654, + "loss": 2.6399, + "step": 20139 + }, + { + "epoch": 0.5972185155531833, + "grad_norm": 0.11607687920331955, + "learning_rate": 0.0003558189550181399, + "loss": 2.672, + "step": 20140 + }, + { + "epoch": 0.5972481689054948, + "grad_norm": 0.10771343857049942, + "learning_rate": 0.0003557739033529012, + "loss": 2.6494, + "step": 20141 + }, + { + "epoch": 0.5972778222578062, + "grad_norm": 0.103902667760849, + "learning_rate": 0.0003557288529648093, + "loss": 2.6161, + "step": 20142 + }, + { + "epoch": 0.5973074756101178, + "grad_norm": 0.10986824333667755, + "learning_rate": 0.00035568380385426313, + "loss": 2.6485, + "step": 20143 + }, + { + "epoch": 0.5973371289624292, + "grad_norm": 0.10439343005418777, + "learning_rate": 0.00035563875602166175, + "loss": 2.6505, + "step": 20144 + }, + { + "epoch": 0.5973667823147407, + "grad_norm": 0.10187103599309921, + "learning_rate": 0.0003555937094674039, + "loss": 2.6633, + "step": 20145 + }, + { + "epoch": 0.5973964356670521, + "grad_norm": 0.12202727049589157, + "learning_rate": 0.0003555486641918886, + "loss": 2.6413, + "step": 20146 + }, + { + "epoch": 0.5974260890193637, + "grad_norm": 0.11651249974966049, + "learning_rate": 0.00035550362019551475, + "loss": 2.6461, + "step": 20147 + }, + { + "epoch": 0.5974557423716751, + "grad_norm": 0.1045837476849556, + "learning_rate": 0.00035545857747868104, + "loss": 2.6845, + "step": 20148 + }, + { + "epoch": 0.5974853957239866, + "grad_norm": 0.11352572590112686, + "learning_rate": 0.0003554135360417864, + "loss": 2.692, + "step": 20149 + }, + { + "epoch": 0.597515049076298, + "grad_norm": 0.11232852190732956, + "learning_rate": 0.0003553684958852298, + "loss": 2.6493, + "step": 20150 + }, + { + "epoch": 0.5975447024286096, + "grad_norm": 0.11160074919462204, + "learning_rate": 0.0003553234570094099, + "loss": 2.6526, + "step": 20151 + }, + { + "epoch": 0.597574355780921, + "grad_norm": 0.10749264806509018, + "learning_rate": 0.0003552784194147257, + "loss": 2.6193, + "step": 20152 + }, + { + "epoch": 0.5976040091332325, + "grad_norm": 0.10579986125230789, + "learning_rate": 0.00035523338310157595, + "loss": 2.6626, + "step": 20153 + }, + { + "epoch": 0.597633662485544, + "grad_norm": 0.14579203724861145, + "learning_rate": 0.00035518834807035947, + "loss": 2.6166, + "step": 20154 + }, + { + "epoch": 0.5976633158378555, + "grad_norm": 0.14128045737743378, + "learning_rate": 0.00035514331432147476, + "loss": 2.64, + "step": 20155 + }, + { + "epoch": 0.5976929691901669, + "grad_norm": 0.12603220343589783, + "learning_rate": 0.000355098281855321, + "loss": 2.6432, + "step": 20156 + }, + { + "epoch": 0.5977226225424784, + "grad_norm": 0.11642803996801376, + "learning_rate": 0.00035505325067229686, + "loss": 2.6193, + "step": 20157 + }, + { + "epoch": 0.5977522758947899, + "grad_norm": 0.13698528707027435, + "learning_rate": 0.00035500822077280127, + "loss": 2.6341, + "step": 20158 + }, + { + "epoch": 0.5977819292471014, + "grad_norm": 0.12313944101333618, + "learning_rate": 0.00035496319215723253, + "loss": 2.6806, + "step": 20159 + }, + { + "epoch": 0.5978115825994129, + "grad_norm": 0.1177816390991211, + "learning_rate": 0.0003549181648259897, + "loss": 2.6554, + "step": 20160 + }, + { + "epoch": 0.5978412359517243, + "grad_norm": 0.11773883551359177, + "learning_rate": 0.00035487313877947144, + "loss": 2.6481, + "step": 20161 + }, + { + "epoch": 0.5978708893040359, + "grad_norm": 0.11315100640058517, + "learning_rate": 0.00035482811401807635, + "loss": 2.6547, + "step": 20162 + }, + { + "epoch": 0.5979005426563473, + "grad_norm": 0.11394641548395157, + "learning_rate": 0.0003547830905422033, + "loss": 2.6168, + "step": 20163 + }, + { + "epoch": 0.5979301960086588, + "grad_norm": 0.11282819509506226, + "learning_rate": 0.00035473806835225095, + "loss": 2.6317, + "step": 20164 + }, + { + "epoch": 0.5979598493609702, + "grad_norm": 0.11186060309410095, + "learning_rate": 0.00035469304744861795, + "loss": 2.6164, + "step": 20165 + }, + { + "epoch": 0.5979895027132818, + "grad_norm": 0.10496434569358826, + "learning_rate": 0.0003546480278317029, + "loss": 2.6372, + "step": 20166 + }, + { + "epoch": 0.5980191560655932, + "grad_norm": 0.11133724451065063, + "learning_rate": 0.0003546030095019045, + "loss": 2.6292, + "step": 20167 + }, + { + "epoch": 0.5980488094179047, + "grad_norm": 0.11375187337398529, + "learning_rate": 0.00035455799245962135, + "loss": 2.6459, + "step": 20168 + }, + { + "epoch": 0.5980784627702161, + "grad_norm": 0.11680854856967926, + "learning_rate": 0.0003545129767052522, + "loss": 2.6265, + "step": 20169 + }, + { + "epoch": 0.5981081161225277, + "grad_norm": 0.11136779189109802, + "learning_rate": 0.0003544679622391956, + "loss": 2.6171, + "step": 20170 + }, + { + "epoch": 0.5981377694748391, + "grad_norm": 0.11586285382509232, + "learning_rate": 0.0003544229490618502, + "loss": 2.6108, + "step": 20171 + }, + { + "epoch": 0.5981674228271506, + "grad_norm": 0.11926114559173584, + "learning_rate": 0.00035437793717361455, + "loss": 2.631, + "step": 20172 + }, + { + "epoch": 0.598197076179462, + "grad_norm": 0.12139241397380829, + "learning_rate": 0.0003543329265748873, + "loss": 2.6425, + "step": 20173 + }, + { + "epoch": 0.5982267295317736, + "grad_norm": 0.10261267423629761, + "learning_rate": 0.000354287917266067, + "loss": 2.6334, + "step": 20174 + }, + { + "epoch": 0.598256382884085, + "grad_norm": 0.10878906399011612, + "learning_rate": 0.0003542429092475522, + "loss": 2.6232, + "step": 20175 + }, + { + "epoch": 0.5982860362363965, + "grad_norm": 0.10070725530385971, + "learning_rate": 0.0003541979025197415, + "loss": 2.6014, + "step": 20176 + }, + { + "epoch": 0.598315689588708, + "grad_norm": 0.10119456052780151, + "learning_rate": 0.00035415289708303334, + "loss": 2.5964, + "step": 20177 + }, + { + "epoch": 0.5983453429410195, + "grad_norm": 0.094924196600914, + "learning_rate": 0.0003541078929378263, + "loss": 2.6182, + "step": 20178 + }, + { + "epoch": 0.5983749962933309, + "grad_norm": 0.10940004885196686, + "learning_rate": 0.0003540628900845189, + "loss": 2.6301, + "step": 20179 + }, + { + "epoch": 0.5984046496456424, + "grad_norm": 0.10346134006977081, + "learning_rate": 0.0003540178885235096, + "loss": 2.6343, + "step": 20180 + }, + { + "epoch": 0.598434302997954, + "grad_norm": 0.09714556485414505, + "learning_rate": 0.00035397288825519697, + "loss": 2.6414, + "step": 20181 + }, + { + "epoch": 0.5984639563502654, + "grad_norm": 0.11284106969833374, + "learning_rate": 0.00035392788927997954, + "loss": 2.6309, + "step": 20182 + }, + { + "epoch": 0.5984936097025769, + "grad_norm": 0.14469479024410248, + "learning_rate": 0.0003538828915982557, + "loss": 2.6567, + "step": 20183 + }, + { + "epoch": 0.5985232630548883, + "grad_norm": 0.14603851735591888, + "learning_rate": 0.000353837895210424, + "loss": 2.6598, + "step": 20184 + }, + { + "epoch": 0.5985529164071999, + "grad_norm": 0.11966545879840851, + "learning_rate": 0.0003537929001168828, + "loss": 2.6132, + "step": 20185 + }, + { + "epoch": 0.5985825697595113, + "grad_norm": 0.1068367138504982, + "learning_rate": 0.00035374790631803057, + "loss": 2.6326, + "step": 20186 + }, + { + "epoch": 0.5986122231118228, + "grad_norm": 0.13609443604946136, + "learning_rate": 0.00035370291381426575, + "loss": 2.649, + "step": 20187 + }, + { + "epoch": 0.5986418764641342, + "grad_norm": 0.13040092587471008, + "learning_rate": 0.0003536579226059867, + "loss": 2.6437, + "step": 20188 + }, + { + "epoch": 0.5986715298164458, + "grad_norm": 0.13248446583747864, + "learning_rate": 0.00035361293269359185, + "loss": 2.6162, + "step": 20189 + }, + { + "epoch": 0.5987011831687572, + "grad_norm": 0.1032554879784584, + "learning_rate": 0.0003535679440774796, + "loss": 2.6298, + "step": 20190 + }, + { + "epoch": 0.5987308365210687, + "grad_norm": 0.10951979458332062, + "learning_rate": 0.0003535229567580484, + "loss": 2.6332, + "step": 20191 + }, + { + "epoch": 0.5987604898733802, + "grad_norm": 0.11338850855827332, + "learning_rate": 0.0003534779707356966, + "loss": 2.65, + "step": 20192 + }, + { + "epoch": 0.5987901432256917, + "grad_norm": 0.1168617382645607, + "learning_rate": 0.0003534329860108222, + "loss": 2.6729, + "step": 20193 + }, + { + "epoch": 0.5988197965780031, + "grad_norm": 0.11615411192178726, + "learning_rate": 0.0003533880025838241, + "loss": 2.6584, + "step": 20194 + }, + { + "epoch": 0.5988494499303146, + "grad_norm": 0.12014642357826233, + "learning_rate": 0.0003533430204551005, + "loss": 2.6227, + "step": 20195 + }, + { + "epoch": 0.5988791032826261, + "grad_norm": 0.10219278186559677, + "learning_rate": 0.0003532980396250496, + "loss": 2.6415, + "step": 20196 + }, + { + "epoch": 0.5989087566349376, + "grad_norm": 0.12268221378326416, + "learning_rate": 0.0003532530600940697, + "loss": 2.6485, + "step": 20197 + }, + { + "epoch": 0.598938409987249, + "grad_norm": 0.1118617057800293, + "learning_rate": 0.00035320808186255903, + "loss": 2.6383, + "step": 20198 + }, + { + "epoch": 0.5989680633395605, + "grad_norm": 0.11169522255659103, + "learning_rate": 0.00035316310493091615, + "loss": 2.6691, + "step": 20199 + }, + { + "epoch": 0.598997716691872, + "grad_norm": 0.11914315819740295, + "learning_rate": 0.0003531181292995391, + "loss": 2.6083, + "step": 20200 + }, + { + "epoch": 0.5990273700441835, + "grad_norm": 0.1038808599114418, + "learning_rate": 0.0003530731549688262, + "loss": 2.615, + "step": 20201 + }, + { + "epoch": 0.599057023396495, + "grad_norm": 0.10207422077655792, + "learning_rate": 0.00035302818193917577, + "loss": 2.6207, + "step": 20202 + }, + { + "epoch": 0.5990866767488064, + "grad_norm": 0.1104961559176445, + "learning_rate": 0.0003529832102109861, + "loss": 2.6241, + "step": 20203 + }, + { + "epoch": 0.599116330101118, + "grad_norm": 0.13146370649337769, + "learning_rate": 0.0003529382397846553, + "loss": 2.6547, + "step": 20204 + }, + { + "epoch": 0.5991459834534294, + "grad_norm": 0.09944000840187073, + "learning_rate": 0.0003528932706605816, + "loss": 2.6601, + "step": 20205 + }, + { + "epoch": 0.5991756368057409, + "grad_norm": 0.11251430213451385, + "learning_rate": 0.00035284830283916315, + "loss": 2.6513, + "step": 20206 + }, + { + "epoch": 0.5992052901580524, + "grad_norm": 0.1194351315498352, + "learning_rate": 0.00035280333632079825, + "loss": 2.6374, + "step": 20207 + }, + { + "epoch": 0.5992349435103639, + "grad_norm": 0.09652499854564667, + "learning_rate": 0.00035275837110588517, + "loss": 2.6682, + "step": 20208 + }, + { + "epoch": 0.5992645968626753, + "grad_norm": 0.11054470390081406, + "learning_rate": 0.0003527134071948219, + "loss": 2.6061, + "step": 20209 + }, + { + "epoch": 0.5992942502149868, + "grad_norm": 0.13029490411281586, + "learning_rate": 0.00035266844458800676, + "loss": 2.6373, + "step": 20210 + }, + { + "epoch": 0.5993239035672983, + "grad_norm": 0.11338941007852554, + "learning_rate": 0.0003526234832858378, + "loss": 2.6565, + "step": 20211 + }, + { + "epoch": 0.5993535569196098, + "grad_norm": 0.10581137239933014, + "learning_rate": 0.0003525785232887132, + "loss": 2.6286, + "step": 20212 + }, + { + "epoch": 0.5993832102719212, + "grad_norm": 0.11528106033802032, + "learning_rate": 0.0003525335645970312, + "loss": 2.6426, + "step": 20213 + }, + { + "epoch": 0.5994128636242327, + "grad_norm": 0.11919525265693665, + "learning_rate": 0.00035248860721118967, + "loss": 2.6656, + "step": 20214 + }, + { + "epoch": 0.5994425169765442, + "grad_norm": 0.11158280819654465, + "learning_rate": 0.0003524436511315869, + "loss": 2.68, + "step": 20215 + }, + { + "epoch": 0.5994721703288557, + "grad_norm": 0.1320263296365738, + "learning_rate": 0.00035239869635862085, + "loss": 2.6592, + "step": 20216 + }, + { + "epoch": 0.5995018236811671, + "grad_norm": 0.11005997657775879, + "learning_rate": 0.0003523537428926897, + "loss": 2.6372, + "step": 20217 + }, + { + "epoch": 0.5995314770334786, + "grad_norm": 0.12014110386371613, + "learning_rate": 0.0003523087907341915, + "loss": 2.6247, + "step": 20218 + }, + { + "epoch": 0.5995611303857901, + "grad_norm": 0.11358173936605453, + "learning_rate": 0.0003522638398835243, + "loss": 2.6438, + "step": 20219 + }, + { + "epoch": 0.5995907837381016, + "grad_norm": 0.1333373785018921, + "learning_rate": 0.00035221889034108613, + "loss": 2.6348, + "step": 20220 + }, + { + "epoch": 0.599620437090413, + "grad_norm": 0.13182123005390167, + "learning_rate": 0.0003521739421072751, + "loss": 2.6712, + "step": 20221 + }, + { + "epoch": 0.5996500904427245, + "grad_norm": 0.1231020987033844, + "learning_rate": 0.0003521289951824892, + "loss": 2.6764, + "step": 20222 + }, + { + "epoch": 0.5996797437950361, + "grad_norm": 0.1136503666639328, + "learning_rate": 0.0003520840495671265, + "loss": 2.6231, + "step": 20223 + }, + { + "epoch": 0.5997093971473475, + "grad_norm": 0.13260558247566223, + "learning_rate": 0.0003520391052615849, + "loss": 2.7045, + "step": 20224 + }, + { + "epoch": 0.599739050499659, + "grad_norm": 0.12093854695558548, + "learning_rate": 0.00035199416226626233, + "loss": 2.6481, + "step": 20225 + }, + { + "epoch": 0.5997687038519705, + "grad_norm": 0.12364253401756287, + "learning_rate": 0.0003519492205815569, + "loss": 2.6263, + "step": 20226 + }, + { + "epoch": 0.599798357204282, + "grad_norm": 0.12468527257442474, + "learning_rate": 0.0003519042802078665, + "loss": 2.633, + "step": 20227 + }, + { + "epoch": 0.5998280105565934, + "grad_norm": 0.12615947425365448, + "learning_rate": 0.00035185934114558915, + "loss": 2.6343, + "step": 20228 + }, + { + "epoch": 0.5998576639089049, + "grad_norm": 0.10749757289886475, + "learning_rate": 0.0003518144033951228, + "loss": 2.6113, + "step": 20229 + }, + { + "epoch": 0.5998873172612164, + "grad_norm": 0.11090793460607529, + "learning_rate": 0.0003517694669568654, + "loss": 2.6414, + "step": 20230 + }, + { + "epoch": 0.5999169706135279, + "grad_norm": 0.10743249952793121, + "learning_rate": 0.00035172453183121474, + "loss": 2.6343, + "step": 20231 + }, + { + "epoch": 0.5999466239658393, + "grad_norm": 0.1104784831404686, + "learning_rate": 0.0003516795980185685, + "loss": 2.6503, + "step": 20232 + }, + { + "epoch": 0.5999762773181508, + "grad_norm": 0.1232391893863678, + "learning_rate": 0.0003516346655193252, + "loss": 2.6398, + "step": 20233 + }, + { + "epoch": 0.6000059306704623, + "grad_norm": 0.09633384644985199, + "learning_rate": 0.00035158973433388246, + "loss": 2.6454, + "step": 20234 + }, + { + "epoch": 0.6000355840227738, + "grad_norm": 0.1301525980234146, + "learning_rate": 0.000351544804462638, + "loss": 2.6493, + "step": 20235 + }, + { + "epoch": 0.6000652373750852, + "grad_norm": 0.1217566430568695, + "learning_rate": 0.00035149987590598974, + "loss": 2.6635, + "step": 20236 + }, + { + "epoch": 0.6000948907273967, + "grad_norm": 0.10579726845026016, + "learning_rate": 0.00035145494866433563, + "loss": 2.658, + "step": 20237 + }, + { + "epoch": 0.6001245440797082, + "grad_norm": 0.11195562034845352, + "learning_rate": 0.00035141002273807344, + "loss": 2.6477, + "step": 20238 + }, + { + "epoch": 0.6001541974320197, + "grad_norm": 0.10584068298339844, + "learning_rate": 0.00035136509812760096, + "loss": 2.6285, + "step": 20239 + }, + { + "epoch": 0.6001838507843311, + "grad_norm": 0.10780323296785355, + "learning_rate": 0.00035132017483331614, + "loss": 2.6294, + "step": 20240 + }, + { + "epoch": 0.6002135041366427, + "grad_norm": 0.10951188951730728, + "learning_rate": 0.00035127525285561667, + "loss": 2.6743, + "step": 20241 + }, + { + "epoch": 0.6002431574889541, + "grad_norm": 0.1206858828663826, + "learning_rate": 0.00035123033219490034, + "loss": 2.6404, + "step": 20242 + }, + { + "epoch": 0.6002728108412656, + "grad_norm": 0.1181786060333252, + "learning_rate": 0.0003511854128515649, + "loss": 2.6318, + "step": 20243 + }, + { + "epoch": 0.6003024641935771, + "grad_norm": 0.0978492721915245, + "learning_rate": 0.0003511404948260083, + "loss": 2.6423, + "step": 20244 + }, + { + "epoch": 0.6003321175458886, + "grad_norm": 0.10788915306329727, + "learning_rate": 0.0003510955781186279, + "loss": 2.6241, + "step": 20245 + }, + { + "epoch": 0.6003617708982001, + "grad_norm": 0.09374283999204636, + "learning_rate": 0.0003510506627298219, + "loss": 2.6587, + "step": 20246 + }, + { + "epoch": 0.6003914242505115, + "grad_norm": 0.10755818337202072, + "learning_rate": 0.00035100574865998784, + "loss": 2.673, + "step": 20247 + }, + { + "epoch": 0.600421077602823, + "grad_norm": 0.12695468962192535, + "learning_rate": 0.00035096083590952344, + "loss": 2.6489, + "step": 20248 + }, + { + "epoch": 0.6004507309551345, + "grad_norm": 0.1255955547094345, + "learning_rate": 0.0003509159244788265, + "loss": 2.6485, + "step": 20249 + }, + { + "epoch": 0.600480384307446, + "grad_norm": 0.12344653159379959, + "learning_rate": 0.0003508710143682945, + "loss": 2.6027, + "step": 20250 + }, + { + "epoch": 0.6005100376597574, + "grad_norm": 0.11932523548603058, + "learning_rate": 0.0003508261055783255, + "loss": 2.6194, + "step": 20251 + }, + { + "epoch": 0.6005396910120689, + "grad_norm": 0.12854835391044617, + "learning_rate": 0.00035078119810931675, + "loss": 2.651, + "step": 20252 + }, + { + "epoch": 0.6005693443643804, + "grad_norm": 0.1321382373571396, + "learning_rate": 0.00035073629196166614, + "loss": 2.6292, + "step": 20253 + }, + { + "epoch": 0.6005989977166919, + "grad_norm": 0.11084889620542526, + "learning_rate": 0.00035069138713577134, + "loss": 2.6773, + "step": 20254 + }, + { + "epoch": 0.6006286510690033, + "grad_norm": 0.11379025131464005, + "learning_rate": 0.0003506464836320298, + "loss": 2.6687, + "step": 20255 + }, + { + "epoch": 0.6006583044213148, + "grad_norm": 0.12793223559856415, + "learning_rate": 0.0003506015814508394, + "loss": 2.6543, + "step": 20256 + }, + { + "epoch": 0.6006879577736263, + "grad_norm": 0.10210070759057999, + "learning_rate": 0.0003505566805925976, + "loss": 2.6496, + "step": 20257 + }, + { + "epoch": 0.6007176111259378, + "grad_norm": 0.12472901493310928, + "learning_rate": 0.0003505117810577019, + "loss": 2.6211, + "step": 20258 + }, + { + "epoch": 0.6007472644782492, + "grad_norm": 0.1396958976984024, + "learning_rate": 0.00035046688284655017, + "loss": 2.6207, + "step": 20259 + }, + { + "epoch": 0.6007769178305608, + "grad_norm": 0.13434650003910065, + "learning_rate": 0.0003504219859595399, + "loss": 2.5994, + "step": 20260 + }, + { + "epoch": 0.6008065711828722, + "grad_norm": 0.13327983021736145, + "learning_rate": 0.00035037709039706865, + "loss": 2.6175, + "step": 20261 + }, + { + "epoch": 0.6008362245351837, + "grad_norm": 0.12053579837083817, + "learning_rate": 0.00035033219615953395, + "loss": 2.6431, + "step": 20262 + }, + { + "epoch": 0.6008658778874952, + "grad_norm": 0.12638185918331146, + "learning_rate": 0.0003502873032473333, + "loss": 2.6769, + "step": 20263 + }, + { + "epoch": 0.6008955312398067, + "grad_norm": 0.11958207935094833, + "learning_rate": 0.00035024241166086415, + "loss": 2.6382, + "step": 20264 + }, + { + "epoch": 0.6009251845921182, + "grad_norm": 0.11119620501995087, + "learning_rate": 0.00035019752140052427, + "loss": 2.6188, + "step": 20265 + }, + { + "epoch": 0.6009548379444296, + "grad_norm": 0.12668603658676147, + "learning_rate": 0.000350152632466711, + "loss": 2.6448, + "step": 20266 + }, + { + "epoch": 0.6009844912967411, + "grad_norm": 0.14700616896152496, + "learning_rate": 0.000350107744859822, + "loss": 2.6442, + "step": 20267 + }, + { + "epoch": 0.6010141446490526, + "grad_norm": 0.13723580539226532, + "learning_rate": 0.0003500628585802547, + "loss": 2.6799, + "step": 20268 + }, + { + "epoch": 0.6010437980013641, + "grad_norm": 0.10311140865087509, + "learning_rate": 0.00035001797362840635, + "loss": 2.6314, + "step": 20269 + }, + { + "epoch": 0.6010734513536755, + "grad_norm": 0.1293434053659439, + "learning_rate": 0.0003499730900046746, + "loss": 2.6253, + "step": 20270 + }, + { + "epoch": 0.601103104705987, + "grad_norm": 0.13085894286632538, + "learning_rate": 0.0003499282077094568, + "loss": 2.6516, + "step": 20271 + }, + { + "epoch": 0.6011327580582985, + "grad_norm": 0.10692653805017471, + "learning_rate": 0.0003498833267431507, + "loss": 2.6567, + "step": 20272 + }, + { + "epoch": 0.60116241141061, + "grad_norm": 0.11877631396055222, + "learning_rate": 0.0003498384471061534, + "loss": 2.6434, + "step": 20273 + }, + { + "epoch": 0.6011920647629214, + "grad_norm": 0.10904540121555328, + "learning_rate": 0.00034979356879886244, + "loss": 2.6482, + "step": 20274 + }, + { + "epoch": 0.601221718115233, + "grad_norm": 0.13140186667442322, + "learning_rate": 0.00034974869182167524, + "loss": 2.6506, + "step": 20275 + }, + { + "epoch": 0.6012513714675444, + "grad_norm": 0.10688275098800659, + "learning_rate": 0.00034970381617498907, + "loss": 2.6206, + "step": 20276 + }, + { + "epoch": 0.6012810248198559, + "grad_norm": 0.11998014897108078, + "learning_rate": 0.0003496589418592015, + "loss": 2.6575, + "step": 20277 + }, + { + "epoch": 0.6013106781721673, + "grad_norm": 0.11410323530435562, + "learning_rate": 0.0003496140688747098, + "loss": 2.6321, + "step": 20278 + }, + { + "epoch": 0.6013403315244789, + "grad_norm": 0.10322313010692596, + "learning_rate": 0.00034956919722191137, + "loss": 2.6706, + "step": 20279 + }, + { + "epoch": 0.6013699848767903, + "grad_norm": 0.11214187741279602, + "learning_rate": 0.0003495243269012035, + "loss": 2.6635, + "step": 20280 + }, + { + "epoch": 0.6013996382291018, + "grad_norm": 0.10447907447814941, + "learning_rate": 0.0003494794579129835, + "loss": 2.6144, + "step": 20281 + }, + { + "epoch": 0.6014292915814132, + "grad_norm": 0.11262974888086319, + "learning_rate": 0.0003494345902576487, + "loss": 2.662, + "step": 20282 + }, + { + "epoch": 0.6014589449337248, + "grad_norm": 0.10741529613733292, + "learning_rate": 0.00034938972393559655, + "loss": 2.6682, + "step": 20283 + }, + { + "epoch": 0.6014885982860363, + "grad_norm": 0.11227589845657349, + "learning_rate": 0.000349344858947224, + "loss": 2.6631, + "step": 20284 + }, + { + "epoch": 0.6015182516383477, + "grad_norm": 0.10575180500745773, + "learning_rate": 0.00034929999529292877, + "loss": 2.6097, + "step": 20285 + }, + { + "epoch": 0.6015479049906592, + "grad_norm": 0.09778274595737457, + "learning_rate": 0.000349255132973108, + "loss": 2.6243, + "step": 20286 + }, + { + "epoch": 0.6015775583429707, + "grad_norm": 0.10017228126525879, + "learning_rate": 0.0003492102719881588, + "loss": 2.6724, + "step": 20287 + }, + { + "epoch": 0.6016072116952822, + "grad_norm": 0.10297021269798279, + "learning_rate": 0.00034916541233847865, + "loss": 2.613, + "step": 20288 + }, + { + "epoch": 0.6016368650475936, + "grad_norm": 0.10265689343214035, + "learning_rate": 0.0003491205540244646, + "loss": 2.6381, + "step": 20289 + }, + { + "epoch": 0.6016665183999051, + "grad_norm": 0.0974598079919815, + "learning_rate": 0.000349075697046514, + "loss": 2.6174, + "step": 20290 + }, + { + "epoch": 0.6016961717522166, + "grad_norm": 0.11699005216360092, + "learning_rate": 0.00034903084140502395, + "loss": 2.6937, + "step": 20291 + }, + { + "epoch": 0.6017258251045281, + "grad_norm": 0.11168202012777328, + "learning_rate": 0.00034898598710039167, + "loss": 2.655, + "step": 20292 + }, + { + "epoch": 0.6017554784568395, + "grad_norm": 0.11954768747091293, + "learning_rate": 0.00034894113413301445, + "loss": 2.6138, + "step": 20293 + }, + { + "epoch": 0.601785131809151, + "grad_norm": 0.11769351363182068, + "learning_rate": 0.00034889628250328944, + "loss": 2.6271, + "step": 20294 + }, + { + "epoch": 0.6018147851614625, + "grad_norm": 0.11496512591838837, + "learning_rate": 0.0003488514322116136, + "loss": 2.6552, + "step": 20295 + }, + { + "epoch": 0.601844438513774, + "grad_norm": 0.10660528391599655, + "learning_rate": 0.0003488065832583846, + "loss": 2.6249, + "step": 20296 + }, + { + "epoch": 0.6018740918660854, + "grad_norm": 0.12677253782749176, + "learning_rate": 0.00034876173564399885, + "loss": 2.6902, + "step": 20297 + }, + { + "epoch": 0.601903745218397, + "grad_norm": 0.11755955964326859, + "learning_rate": 0.00034871688936885417, + "loss": 2.6367, + "step": 20298 + }, + { + "epoch": 0.6019333985707084, + "grad_norm": 0.12958061695098877, + "learning_rate": 0.00034867204443334737, + "loss": 2.6453, + "step": 20299 + }, + { + "epoch": 0.6019630519230199, + "grad_norm": 0.13698363304138184, + "learning_rate": 0.0003486272008378756, + "loss": 2.629, + "step": 20300 + }, + { + "epoch": 0.6019927052753313, + "grad_norm": 0.09875982999801636, + "learning_rate": 0.00034858235858283595, + "loss": 2.6191, + "step": 20301 + }, + { + "epoch": 0.6020223586276429, + "grad_norm": 0.10725508630275726, + "learning_rate": 0.0003485375176686254, + "loss": 2.6277, + "step": 20302 + }, + { + "epoch": 0.6020520119799543, + "grad_norm": 0.10320580005645752, + "learning_rate": 0.0003484926780956412, + "loss": 2.6236, + "step": 20303 + }, + { + "epoch": 0.6020816653322658, + "grad_norm": 0.11461130529642105, + "learning_rate": 0.0003484478398642804, + "loss": 2.6362, + "step": 20304 + }, + { + "epoch": 0.6021113186845773, + "grad_norm": 0.12356731295585632, + "learning_rate": 0.00034840300297493985, + "loss": 2.6668, + "step": 20305 + }, + { + "epoch": 0.6021409720368888, + "grad_norm": 0.12881527841091156, + "learning_rate": 0.000348358167428017, + "loss": 2.6619, + "step": 20306 + }, + { + "epoch": 0.6021706253892003, + "grad_norm": 0.11936607211828232, + "learning_rate": 0.00034831333322390837, + "loss": 2.6376, + "step": 20307 + }, + { + "epoch": 0.6022002787415117, + "grad_norm": 0.12822069227695465, + "learning_rate": 0.00034826850036301127, + "loss": 2.6522, + "step": 20308 + }, + { + "epoch": 0.6022299320938233, + "grad_norm": 0.12988349795341492, + "learning_rate": 0.0003482236688457226, + "loss": 2.65, + "step": 20309 + }, + { + "epoch": 0.6022595854461347, + "grad_norm": 0.12192517518997192, + "learning_rate": 0.0003481788386724393, + "loss": 2.645, + "step": 20310 + }, + { + "epoch": 0.6022892387984462, + "grad_norm": 0.10978345572948456, + "learning_rate": 0.0003481340098435586, + "loss": 2.649, + "step": 20311 + }, + { + "epoch": 0.6023188921507576, + "grad_norm": 0.11821141093969345, + "learning_rate": 0.0003480891823594773, + "loss": 2.6009, + "step": 20312 + }, + { + "epoch": 0.6023485455030692, + "grad_norm": 0.11406664550304413, + "learning_rate": 0.0003480443562205923, + "loss": 2.6439, + "step": 20313 + }, + { + "epoch": 0.6023781988553806, + "grad_norm": 0.13119205832481384, + "learning_rate": 0.0003479995314273007, + "loss": 2.6474, + "step": 20314 + }, + { + "epoch": 0.6024078522076921, + "grad_norm": 0.120874784886837, + "learning_rate": 0.00034795470797999926, + "loss": 2.6566, + "step": 20315 + }, + { + "epoch": 0.6024375055600035, + "grad_norm": 0.11404795944690704, + "learning_rate": 0.000347909885879085, + "loss": 2.6516, + "step": 20316 + }, + { + "epoch": 0.6024671589123151, + "grad_norm": 0.1147289127111435, + "learning_rate": 0.00034786506512495495, + "loss": 2.659, + "step": 20317 + }, + { + "epoch": 0.6024968122646265, + "grad_norm": 0.10897018015384674, + "learning_rate": 0.00034782024571800577, + "loss": 2.6299, + "step": 20318 + }, + { + "epoch": 0.602526465616938, + "grad_norm": 0.11301790922880173, + "learning_rate": 0.00034777542765863435, + "loss": 2.6363, + "step": 20319 + }, + { + "epoch": 0.6025561189692494, + "grad_norm": 0.11674350500106812, + "learning_rate": 0.00034773061094723766, + "loss": 2.6483, + "step": 20320 + }, + { + "epoch": 0.602585772321561, + "grad_norm": 0.10661520808935165, + "learning_rate": 0.0003476857955842126, + "loss": 2.6411, + "step": 20321 + }, + { + "epoch": 0.6026154256738724, + "grad_norm": 0.11165256798267365, + "learning_rate": 0.00034764098156995593, + "loss": 2.6097, + "step": 20322 + }, + { + "epoch": 0.6026450790261839, + "grad_norm": 0.10886726528406143, + "learning_rate": 0.00034759616890486447, + "loss": 2.6527, + "step": 20323 + }, + { + "epoch": 0.6026747323784953, + "grad_norm": 0.11384744942188263, + "learning_rate": 0.0003475513575893351, + "loss": 2.6557, + "step": 20324 + }, + { + "epoch": 0.6027043857308069, + "grad_norm": 0.12506070733070374, + "learning_rate": 0.0003475065476237647, + "loss": 2.6815, + "step": 20325 + }, + { + "epoch": 0.6027340390831184, + "grad_norm": 0.11583711206912994, + "learning_rate": 0.00034746173900855, + "loss": 2.6089, + "step": 20326 + }, + { + "epoch": 0.6027636924354298, + "grad_norm": 0.1065087765455246, + "learning_rate": 0.00034741693174408783, + "loss": 2.5929, + "step": 20327 + }, + { + "epoch": 0.6027933457877414, + "grad_norm": 0.11406097561120987, + "learning_rate": 0.00034737212583077487, + "loss": 2.6685, + "step": 20328 + }, + { + "epoch": 0.6028229991400528, + "grad_norm": 0.1124628335237503, + "learning_rate": 0.0003473273212690079, + "loss": 2.6638, + "step": 20329 + }, + { + "epoch": 0.6028526524923643, + "grad_norm": 0.1053740456700325, + "learning_rate": 0.00034728251805918374, + "loss": 2.6546, + "step": 20330 + }, + { + "epoch": 0.6028823058446757, + "grad_norm": 0.11754568666219711, + "learning_rate": 0.0003472377162016991, + "loss": 2.626, + "step": 20331 + }, + { + "epoch": 0.6029119591969873, + "grad_norm": 0.11723420768976212, + "learning_rate": 0.0003471929156969507, + "loss": 2.6499, + "step": 20332 + }, + { + "epoch": 0.6029416125492987, + "grad_norm": 0.1082165390253067, + "learning_rate": 0.0003471481165453352, + "loss": 2.6625, + "step": 20333 + }, + { + "epoch": 0.6029712659016102, + "grad_norm": 0.09596295654773712, + "learning_rate": 0.00034710331874724954, + "loss": 2.6524, + "step": 20334 + }, + { + "epoch": 0.6030009192539216, + "grad_norm": 0.10922885686159134, + "learning_rate": 0.00034705852230309007, + "loss": 2.617, + "step": 20335 + }, + { + "epoch": 0.6030305726062332, + "grad_norm": 0.10384920239448547, + "learning_rate": 0.0003470137272132535, + "loss": 2.6603, + "step": 20336 + }, + { + "epoch": 0.6030602259585446, + "grad_norm": 0.1171317994594574, + "learning_rate": 0.0003469689334781368, + "loss": 2.6245, + "step": 20337 + }, + { + "epoch": 0.6030898793108561, + "grad_norm": 0.11241248995065689, + "learning_rate": 0.00034692414109813657, + "loss": 2.6422, + "step": 20338 + }, + { + "epoch": 0.6031195326631675, + "grad_norm": 0.10284680873155594, + "learning_rate": 0.00034687935007364924, + "loss": 2.6851, + "step": 20339 + }, + { + "epoch": 0.6031491860154791, + "grad_norm": 0.12707401812076569, + "learning_rate": 0.00034683456040507154, + "loss": 2.6469, + "step": 20340 + }, + { + "epoch": 0.6031788393677905, + "grad_norm": 0.12759923934936523, + "learning_rate": 0.00034678977209280014, + "loss": 2.6347, + "step": 20341 + }, + { + "epoch": 0.603208492720102, + "grad_norm": 0.11992006748914719, + "learning_rate": 0.00034674498513723157, + "loss": 2.6377, + "step": 20342 + }, + { + "epoch": 0.6032381460724134, + "grad_norm": 0.10036174952983856, + "learning_rate": 0.0003467001995387625, + "loss": 2.6276, + "step": 20343 + }, + { + "epoch": 0.603267799424725, + "grad_norm": 0.1266993284225464, + "learning_rate": 0.00034665541529778956, + "loss": 2.6089, + "step": 20344 + }, + { + "epoch": 0.6032974527770364, + "grad_norm": 0.1325516551733017, + "learning_rate": 0.00034661063241470916, + "loss": 2.6447, + "step": 20345 + }, + { + "epoch": 0.6033271061293479, + "grad_norm": 0.12998367846012115, + "learning_rate": 0.0003465658508899179, + "loss": 2.6416, + "step": 20346 + }, + { + "epoch": 0.6033567594816595, + "grad_norm": 0.11155975610017776, + "learning_rate": 0.00034652107072381236, + "loss": 2.6825, + "step": 20347 + }, + { + "epoch": 0.6033864128339709, + "grad_norm": 0.11122214794158936, + "learning_rate": 0.0003464762919167891, + "loss": 2.645, + "step": 20348 + }, + { + "epoch": 0.6034160661862824, + "grad_norm": 0.11838696151971817, + "learning_rate": 0.00034643151446924446, + "loss": 2.6305, + "step": 20349 + }, + { + "epoch": 0.6034457195385938, + "grad_norm": 0.10994050651788712, + "learning_rate": 0.0003463867383815753, + "loss": 2.6234, + "step": 20350 + }, + { + "epoch": 0.6034753728909054, + "grad_norm": 0.10058985650539398, + "learning_rate": 0.0003463419636541779, + "loss": 2.6255, + "step": 20351 + }, + { + "epoch": 0.6035050262432168, + "grad_norm": 0.1006472185254097, + "learning_rate": 0.00034629719028744885, + "loss": 2.6262, + "step": 20352 + }, + { + "epoch": 0.6035346795955283, + "grad_norm": 0.10936986654996872, + "learning_rate": 0.0003462524182817845, + "loss": 2.6733, + "step": 20353 + }, + { + "epoch": 0.6035643329478397, + "grad_norm": 0.10237261652946472, + "learning_rate": 0.0003462076476375814, + "loss": 2.6505, + "step": 20354 + }, + { + "epoch": 0.6035939863001513, + "grad_norm": 0.10730160027742386, + "learning_rate": 0.0003461628783552361, + "loss": 2.6353, + "step": 20355 + }, + { + "epoch": 0.6036236396524627, + "grad_norm": 0.11998501420021057, + "learning_rate": 0.0003461181104351447, + "loss": 2.6289, + "step": 20356 + }, + { + "epoch": 0.6036532930047742, + "grad_norm": 0.12297102808952332, + "learning_rate": 0.000346073343877704, + "loss": 2.6545, + "step": 20357 + }, + { + "epoch": 0.6036829463570856, + "grad_norm": 0.11848361045122147, + "learning_rate": 0.00034602857868331016, + "loss": 2.638, + "step": 20358 + }, + { + "epoch": 0.6037125997093972, + "grad_norm": 0.13089092075824738, + "learning_rate": 0.00034598381485235975, + "loss": 2.661, + "step": 20359 + }, + { + "epoch": 0.6037422530617086, + "grad_norm": 0.11710615456104279, + "learning_rate": 0.0003459390523852491, + "loss": 2.6331, + "step": 20360 + }, + { + "epoch": 0.6037719064140201, + "grad_norm": 0.11266770213842392, + "learning_rate": 0.00034589429128237443, + "loss": 2.6481, + "step": 20361 + }, + { + "epoch": 0.6038015597663315, + "grad_norm": 0.10685528069734573, + "learning_rate": 0.00034584953154413235, + "loss": 2.6432, + "step": 20362 + }, + { + "epoch": 0.6038312131186431, + "grad_norm": 0.1456855982542038, + "learning_rate": 0.00034580477317091917, + "loss": 2.6368, + "step": 20363 + }, + { + "epoch": 0.6038608664709545, + "grad_norm": 0.11068232357501984, + "learning_rate": 0.00034576001616313124, + "loss": 2.6504, + "step": 20364 + }, + { + "epoch": 0.603890519823266, + "grad_norm": 0.11902911216020584, + "learning_rate": 0.0003457152605211649, + "loss": 2.6248, + "step": 20365 + }, + { + "epoch": 0.6039201731755774, + "grad_norm": 0.10864734649658203, + "learning_rate": 0.0003456705062454163, + "loss": 2.6794, + "step": 20366 + }, + { + "epoch": 0.603949826527889, + "grad_norm": 0.10896829515695572, + "learning_rate": 0.0003456257533362818, + "loss": 2.6576, + "step": 20367 + }, + { + "epoch": 0.6039794798802005, + "grad_norm": 0.11061537265777588, + "learning_rate": 0.00034558100179415784, + "loss": 2.6322, + "step": 20368 + }, + { + "epoch": 0.6040091332325119, + "grad_norm": 0.10929840058088303, + "learning_rate": 0.0003455362516194406, + "loss": 2.6336, + "step": 20369 + }, + { + "epoch": 0.6040387865848235, + "grad_norm": 0.11882348358631134, + "learning_rate": 0.00034549150281252633, + "loss": 2.6543, + "step": 20370 + }, + { + "epoch": 0.6040684399371349, + "grad_norm": 0.11549876630306244, + "learning_rate": 0.0003454467553738113, + "loss": 2.6221, + "step": 20371 + }, + { + "epoch": 0.6040980932894464, + "grad_norm": 0.11027637869119644, + "learning_rate": 0.00034540200930369196, + "loss": 2.6559, + "step": 20372 + }, + { + "epoch": 0.6041277466417578, + "grad_norm": 0.13310988247394562, + "learning_rate": 0.0003453572646025642, + "loss": 2.6306, + "step": 20373 + }, + { + "epoch": 0.6041573999940694, + "grad_norm": 0.1105487197637558, + "learning_rate": 0.00034531252127082427, + "loss": 2.6311, + "step": 20374 + }, + { + "epoch": 0.6041870533463808, + "grad_norm": 0.10203899443149567, + "learning_rate": 0.00034526777930886864, + "loss": 2.6085, + "step": 20375 + }, + { + "epoch": 0.6042167066986923, + "grad_norm": 0.10955163836479187, + "learning_rate": 0.0003452230387170935, + "loss": 2.6428, + "step": 20376 + }, + { + "epoch": 0.6042463600510037, + "grad_norm": 0.1057506576180458, + "learning_rate": 0.00034517829949589485, + "loss": 2.6086, + "step": 20377 + }, + { + "epoch": 0.6042760134033153, + "grad_norm": 0.10812313109636307, + "learning_rate": 0.000345133561645669, + "loss": 2.6263, + "step": 20378 + }, + { + "epoch": 0.6043056667556267, + "grad_norm": 0.11595381796360016, + "learning_rate": 0.00034508882516681195, + "loss": 2.6279, + "step": 20379 + }, + { + "epoch": 0.6043353201079382, + "grad_norm": 0.105488620698452, + "learning_rate": 0.0003450440900597199, + "loss": 2.6649, + "step": 20380 + }, + { + "epoch": 0.6043649734602496, + "grad_norm": 0.12223092466592789, + "learning_rate": 0.0003449993563247892, + "loss": 2.6705, + "step": 20381 + }, + { + "epoch": 0.6043946268125612, + "grad_norm": 0.12544560432434082, + "learning_rate": 0.0003449546239624157, + "loss": 2.6001, + "step": 20382 + }, + { + "epoch": 0.6044242801648726, + "grad_norm": 0.1254255473613739, + "learning_rate": 0.0003449098929729957, + "loss": 2.6283, + "step": 20383 + }, + { + "epoch": 0.6044539335171841, + "grad_norm": 0.11560848355293274, + "learning_rate": 0.00034486516335692507, + "loss": 2.6444, + "step": 20384 + }, + { + "epoch": 0.6044835868694955, + "grad_norm": 0.12265177816152573, + "learning_rate": 0.00034482043511460013, + "loss": 2.6257, + "step": 20385 + }, + { + "epoch": 0.6045132402218071, + "grad_norm": 0.12660084664821625, + "learning_rate": 0.00034477570824641683, + "loss": 2.649, + "step": 20386 + }, + { + "epoch": 0.6045428935741185, + "grad_norm": 0.11364570260047913, + "learning_rate": 0.00034473098275277127, + "loss": 2.6507, + "step": 20387 + }, + { + "epoch": 0.60457254692643, + "grad_norm": 0.11047310382127762, + "learning_rate": 0.0003446862586340595, + "loss": 2.6379, + "step": 20388 + }, + { + "epoch": 0.6046022002787416, + "grad_norm": 0.12119269371032715, + "learning_rate": 0.0003446415358906776, + "loss": 2.6445, + "step": 20389 + }, + { + "epoch": 0.604631853631053, + "grad_norm": 0.11335710436105728, + "learning_rate": 0.0003445968145230216, + "loss": 2.6173, + "step": 20390 + }, + { + "epoch": 0.6046615069833645, + "grad_norm": 0.10973626375198364, + "learning_rate": 0.00034455209453148743, + "loss": 2.6448, + "step": 20391 + }, + { + "epoch": 0.6046911603356759, + "grad_norm": 0.11101756989955902, + "learning_rate": 0.0003445073759164712, + "loss": 2.6354, + "step": 20392 + }, + { + "epoch": 0.6047208136879875, + "grad_norm": 0.10380341112613678, + "learning_rate": 0.0003444626586783689, + "loss": 2.6376, + "step": 20393 + }, + { + "epoch": 0.6047504670402989, + "grad_norm": 0.10790623724460602, + "learning_rate": 0.00034441794281757636, + "loss": 2.6643, + "step": 20394 + }, + { + "epoch": 0.6047801203926104, + "grad_norm": 0.10580545663833618, + "learning_rate": 0.0003443732283344897, + "loss": 2.5926, + "step": 20395 + }, + { + "epoch": 0.6048097737449218, + "grad_norm": 0.11556481570005417, + "learning_rate": 0.00034432851522950476, + "loss": 2.6274, + "step": 20396 + }, + { + "epoch": 0.6048394270972334, + "grad_norm": 0.09892165660858154, + "learning_rate": 0.00034428380350301756, + "loss": 2.6404, + "step": 20397 + }, + { + "epoch": 0.6048690804495448, + "grad_norm": 0.1137261763215065, + "learning_rate": 0.000344239093155424, + "loss": 2.6139, + "step": 20398 + }, + { + "epoch": 0.6048987338018563, + "grad_norm": 0.11982957273721695, + "learning_rate": 0.00034419438418712014, + "loss": 2.6174, + "step": 20399 + }, + { + "epoch": 0.6049283871541677, + "grad_norm": 0.11608520150184631, + "learning_rate": 0.0003441496765985014, + "loss": 2.649, + "step": 20400 + }, + { + "epoch": 0.6049580405064793, + "grad_norm": 0.10374783724546432, + "learning_rate": 0.0003441049703899642, + "loss": 2.649, + "step": 20401 + }, + { + "epoch": 0.6049876938587907, + "grad_norm": 0.09341233968734741, + "learning_rate": 0.00034406026556190426, + "loss": 2.6098, + "step": 20402 + }, + { + "epoch": 0.6050173472111022, + "grad_norm": 0.11126933246850967, + "learning_rate": 0.00034401556211471765, + "loss": 2.6186, + "step": 20403 + }, + { + "epoch": 0.6050470005634137, + "grad_norm": 0.11182931810617447, + "learning_rate": 0.00034397086004879973, + "loss": 2.6494, + "step": 20404 + }, + { + "epoch": 0.6050766539157252, + "grad_norm": 0.10939933359622955, + "learning_rate": 0.0003439261593645467, + "loss": 2.6309, + "step": 20405 + }, + { + "epoch": 0.6051063072680366, + "grad_norm": 0.11492688208818436, + "learning_rate": 0.00034388146006235446, + "loss": 2.6562, + "step": 20406 + }, + { + "epoch": 0.6051359606203481, + "grad_norm": 0.11855010688304901, + "learning_rate": 0.00034383676214261847, + "loss": 2.6366, + "step": 20407 + }, + { + "epoch": 0.6051656139726596, + "grad_norm": 0.12673376500606537, + "learning_rate": 0.0003437920656057349, + "loss": 2.6371, + "step": 20408 + }, + { + "epoch": 0.6051952673249711, + "grad_norm": 0.11593881249427795, + "learning_rate": 0.0003437473704520994, + "loss": 2.6368, + "step": 20409 + }, + { + "epoch": 0.6052249206772826, + "grad_norm": 0.11636864393949509, + "learning_rate": 0.00034370267668210774, + "loss": 2.5909, + "step": 20410 + }, + { + "epoch": 0.605254574029594, + "grad_norm": 0.09999412298202515, + "learning_rate": 0.0003436579842961557, + "loss": 2.6625, + "step": 20411 + }, + { + "epoch": 0.6052842273819056, + "grad_norm": 0.11697347462177277, + "learning_rate": 0.00034361329329463906, + "loss": 2.6463, + "step": 20412 + }, + { + "epoch": 0.605313880734217, + "grad_norm": 0.10850725322961807, + "learning_rate": 0.00034356860367795326, + "loss": 2.6599, + "step": 20413 + }, + { + "epoch": 0.6053435340865285, + "grad_norm": 0.10270946472883224, + "learning_rate": 0.0003435239154464947, + "loss": 2.6488, + "step": 20414 + }, + { + "epoch": 0.6053731874388399, + "grad_norm": 0.1171230673789978, + "learning_rate": 0.00034347922860065863, + "loss": 2.6312, + "step": 20415 + }, + { + "epoch": 0.6054028407911515, + "grad_norm": 0.11930692195892334, + "learning_rate": 0.0003434345431408408, + "loss": 2.6363, + "step": 20416 + }, + { + "epoch": 0.6054324941434629, + "grad_norm": 0.1358983963727951, + "learning_rate": 0.000343389859067437, + "loss": 2.628, + "step": 20417 + }, + { + "epoch": 0.6054621474957744, + "grad_norm": 0.105249784886837, + "learning_rate": 0.0003433451763808429, + "loss": 2.612, + "step": 20418 + }, + { + "epoch": 0.6054918008480858, + "grad_norm": 0.10591508448123932, + "learning_rate": 0.00034330049508145413, + "loss": 2.6522, + "step": 20419 + }, + { + "epoch": 0.6055214542003974, + "grad_norm": 0.11984376609325409, + "learning_rate": 0.00034325581516966646, + "loss": 2.6387, + "step": 20420 + }, + { + "epoch": 0.6055511075527088, + "grad_norm": 0.10405966639518738, + "learning_rate": 0.0003432111366458754, + "loss": 2.6596, + "step": 20421 + }, + { + "epoch": 0.6055807609050203, + "grad_norm": 0.10481808334589005, + "learning_rate": 0.0003431664595104766, + "loss": 2.6357, + "step": 20422 + }, + { + "epoch": 0.6056104142573318, + "grad_norm": 0.10374592989683151, + "learning_rate": 0.0003431217837638657, + "loss": 2.6591, + "step": 20423 + }, + { + "epoch": 0.6056400676096433, + "grad_norm": 0.11054235696792603, + "learning_rate": 0.00034307710940643834, + "loss": 2.639, + "step": 20424 + }, + { + "epoch": 0.6056697209619547, + "grad_norm": 0.09234003722667694, + "learning_rate": 0.0003430324364385902, + "loss": 2.6085, + "step": 20425 + }, + { + "epoch": 0.6056993743142662, + "grad_norm": 0.10713043063879013, + "learning_rate": 0.0003429877648607166, + "loss": 2.631, + "step": 20426 + }, + { + "epoch": 0.6057290276665777, + "grad_norm": 0.12165336310863495, + "learning_rate": 0.0003429430946732134, + "loss": 2.6511, + "step": 20427 + }, + { + "epoch": 0.6057586810188892, + "grad_norm": 0.11461704224348068, + "learning_rate": 0.000342898425876476, + "loss": 2.6262, + "step": 20428 + }, + { + "epoch": 0.6057883343712006, + "grad_norm": 0.11707595735788345, + "learning_rate": 0.0003428537584709001, + "loss": 2.6518, + "step": 20429 + }, + { + "epoch": 0.6058179877235121, + "grad_norm": 0.10781227052211761, + "learning_rate": 0.0003428090924568811, + "loss": 2.6226, + "step": 20430 + }, + { + "epoch": 0.6058476410758237, + "grad_norm": 0.1115158349275589, + "learning_rate": 0.00034276442783481463, + "loss": 2.64, + "step": 20431 + }, + { + "epoch": 0.6058772944281351, + "grad_norm": 0.11324741691350937, + "learning_rate": 0.00034271976460509615, + "loss": 2.6315, + "step": 20432 + }, + { + "epoch": 0.6059069477804466, + "grad_norm": 0.11020393669605255, + "learning_rate": 0.000342675102768121, + "loss": 2.6476, + "step": 20433 + }, + { + "epoch": 0.605936601132758, + "grad_norm": 0.11682269722223282, + "learning_rate": 0.00034263044232428496, + "loss": 2.6504, + "step": 20434 + }, + { + "epoch": 0.6059662544850696, + "grad_norm": 0.11327515542507172, + "learning_rate": 0.0003425857832739833, + "loss": 2.6203, + "step": 20435 + }, + { + "epoch": 0.605995907837381, + "grad_norm": 0.11406546831130981, + "learning_rate": 0.0003425411256176115, + "loss": 2.6233, + "step": 20436 + }, + { + "epoch": 0.6060255611896925, + "grad_norm": 0.12006402760744095, + "learning_rate": 0.00034249646935556523, + "loss": 2.6452, + "step": 20437 + }, + { + "epoch": 0.606055214542004, + "grad_norm": 0.12193930149078369, + "learning_rate": 0.0003424518144882397, + "loss": 2.6448, + "step": 20438 + }, + { + "epoch": 0.6060848678943155, + "grad_norm": 0.09965700656175613, + "learning_rate": 0.0003424071610160301, + "loss": 2.665, + "step": 20439 + }, + { + "epoch": 0.6061145212466269, + "grad_norm": 0.12503471970558167, + "learning_rate": 0.0003423625089393324, + "loss": 2.6532, + "step": 20440 + }, + { + "epoch": 0.6061441745989384, + "grad_norm": 0.1142861396074295, + "learning_rate": 0.0003423178582585418, + "loss": 2.6216, + "step": 20441 + }, + { + "epoch": 0.6061738279512499, + "grad_norm": 0.1200508251786232, + "learning_rate": 0.00034227320897405354, + "loss": 2.6515, + "step": 20442 + }, + { + "epoch": 0.6062034813035614, + "grad_norm": 0.11857690662145615, + "learning_rate": 0.0003422285610862631, + "loss": 2.6575, + "step": 20443 + }, + { + "epoch": 0.6062331346558728, + "grad_norm": 0.10701267421245575, + "learning_rate": 0.00034218391459556587, + "loss": 2.6529, + "step": 20444 + }, + { + "epoch": 0.6062627880081843, + "grad_norm": 0.12230135500431061, + "learning_rate": 0.00034213926950235716, + "loss": 2.6534, + "step": 20445 + }, + { + "epoch": 0.6062924413604958, + "grad_norm": 0.11893764138221741, + "learning_rate": 0.0003420946258070322, + "loss": 2.6277, + "step": 20446 + }, + { + "epoch": 0.6063220947128073, + "grad_norm": 0.1211618110537529, + "learning_rate": 0.00034204998350998654, + "loss": 2.6726, + "step": 20447 + }, + { + "epoch": 0.6063517480651187, + "grad_norm": 0.10455277562141418, + "learning_rate": 0.0003420053426116155, + "loss": 2.6223, + "step": 20448 + }, + { + "epoch": 0.6063814014174302, + "grad_norm": 0.11456979066133499, + "learning_rate": 0.00034196070311231407, + "loss": 2.6377, + "step": 20449 + }, + { + "epoch": 0.6064110547697417, + "grad_norm": 0.10332958400249481, + "learning_rate": 0.0003419160650124779, + "loss": 2.6016, + "step": 20450 + }, + { + "epoch": 0.6064407081220532, + "grad_norm": 0.11167577654123306, + "learning_rate": 0.00034187142831250194, + "loss": 2.6334, + "step": 20451 + }, + { + "epoch": 0.6064703614743647, + "grad_norm": 0.11056109517812729, + "learning_rate": 0.0003418267930127816, + "loss": 2.6327, + "step": 20452 + }, + { + "epoch": 0.6065000148266761, + "grad_norm": 0.10873233526945114, + "learning_rate": 0.0003417821591137122, + "loss": 2.6525, + "step": 20453 + }, + { + "epoch": 0.6065296681789877, + "grad_norm": 0.11323218792676926, + "learning_rate": 0.0003417375266156891, + "loss": 2.6543, + "step": 20454 + }, + { + "epoch": 0.6065593215312991, + "grad_norm": 0.1307327300310135, + "learning_rate": 0.0003416928955191072, + "loss": 2.6352, + "step": 20455 + }, + { + "epoch": 0.6065889748836106, + "grad_norm": 0.13120277225971222, + "learning_rate": 0.000341648265824362, + "loss": 2.6499, + "step": 20456 + }, + { + "epoch": 0.606618628235922, + "grad_norm": 0.0959494560956955, + "learning_rate": 0.00034160363753184855, + "loss": 2.6284, + "step": 20457 + }, + { + "epoch": 0.6066482815882336, + "grad_norm": 0.12240410596132278, + "learning_rate": 0.00034155901064196215, + "loss": 2.6445, + "step": 20458 + }, + { + "epoch": 0.606677934940545, + "grad_norm": 0.11645375937223434, + "learning_rate": 0.0003415143851550978, + "loss": 2.6176, + "step": 20459 + }, + { + "epoch": 0.6067075882928565, + "grad_norm": 0.10514959692955017, + "learning_rate": 0.0003414697610716508, + "loss": 2.6292, + "step": 20460 + }, + { + "epoch": 0.606737241645168, + "grad_norm": 0.1105179637670517, + "learning_rate": 0.0003414251383920163, + "loss": 2.6366, + "step": 20461 + }, + { + "epoch": 0.6067668949974795, + "grad_norm": 0.11543643474578857, + "learning_rate": 0.00034138051711658936, + "loss": 2.6272, + "step": 20462 + }, + { + "epoch": 0.6067965483497909, + "grad_norm": 0.10418369621038437, + "learning_rate": 0.0003413358972457652, + "loss": 2.604, + "step": 20463 + }, + { + "epoch": 0.6068262017021024, + "grad_norm": 0.11044061928987503, + "learning_rate": 0.00034129127877993894, + "loss": 2.6695, + "step": 20464 + }, + { + "epoch": 0.6068558550544139, + "grad_norm": 0.10755127668380737, + "learning_rate": 0.0003412466617195055, + "loss": 2.6021, + "step": 20465 + }, + { + "epoch": 0.6068855084067254, + "grad_norm": 0.10158123821020126, + "learning_rate": 0.0003412020460648602, + "loss": 2.6328, + "step": 20466 + }, + { + "epoch": 0.6069151617590368, + "grad_norm": 0.11046972125768661, + "learning_rate": 0.00034115743181639804, + "loss": 2.6267, + "step": 20467 + }, + { + "epoch": 0.6069448151113483, + "grad_norm": 0.11208804696798325, + "learning_rate": 0.00034111281897451415, + "loss": 2.6187, + "step": 20468 + }, + { + "epoch": 0.6069744684636598, + "grad_norm": 0.11764735728502274, + "learning_rate": 0.00034106820753960354, + "loss": 2.6145, + "step": 20469 + }, + { + "epoch": 0.6070041218159713, + "grad_norm": 0.11632310599088669, + "learning_rate": 0.0003410235975120612, + "loss": 2.6476, + "step": 20470 + }, + { + "epoch": 0.6070337751682828, + "grad_norm": 0.10995849967002869, + "learning_rate": 0.00034097898889228216, + "loss": 2.6323, + "step": 20471 + }, + { + "epoch": 0.6070634285205943, + "grad_norm": 0.10931096225976944, + "learning_rate": 0.0003409343816806615, + "loss": 2.6492, + "step": 20472 + }, + { + "epoch": 0.6070930818729058, + "grad_norm": 0.11988181620836258, + "learning_rate": 0.0003408897758775942, + "loss": 2.6589, + "step": 20473 + }, + { + "epoch": 0.6071227352252172, + "grad_norm": 0.11757739633321762, + "learning_rate": 0.00034084517148347515, + "loss": 2.6372, + "step": 20474 + }, + { + "epoch": 0.6071523885775287, + "grad_norm": 0.1119283139705658, + "learning_rate": 0.0003408005684986996, + "loss": 2.6524, + "step": 20475 + }, + { + "epoch": 0.6071820419298402, + "grad_norm": 0.11368189007043839, + "learning_rate": 0.00034075596692366227, + "loss": 2.6331, + "step": 20476 + }, + { + "epoch": 0.6072116952821517, + "grad_norm": 0.10023186355829239, + "learning_rate": 0.0003407113667587581, + "loss": 2.6512, + "step": 20477 + }, + { + "epoch": 0.6072413486344631, + "grad_norm": 0.10923995822668076, + "learning_rate": 0.00034066676800438204, + "loss": 2.6491, + "step": 20478 + }, + { + "epoch": 0.6072710019867746, + "grad_norm": 0.11705108731985092, + "learning_rate": 0.00034062217066092924, + "loss": 2.5982, + "step": 20479 + }, + { + "epoch": 0.6073006553390861, + "grad_norm": 0.10872580111026764, + "learning_rate": 0.00034057757472879445, + "loss": 2.6749, + "step": 20480 + }, + { + "epoch": 0.6073303086913976, + "grad_norm": 0.11018414795398712, + "learning_rate": 0.00034053298020837264, + "loss": 2.6807, + "step": 20481 + }, + { + "epoch": 0.607359962043709, + "grad_norm": 0.11140335351228714, + "learning_rate": 0.00034048838710005867, + "loss": 2.6556, + "step": 20482 + }, + { + "epoch": 0.6073896153960205, + "grad_norm": 0.10726230591535568, + "learning_rate": 0.00034044379540424736, + "loss": 2.6185, + "step": 20483 + }, + { + "epoch": 0.607419268748332, + "grad_norm": 0.10912928730249405, + "learning_rate": 0.0003403992051213337, + "loss": 2.6142, + "step": 20484 + }, + { + "epoch": 0.6074489221006435, + "grad_norm": 0.11353874206542969, + "learning_rate": 0.0003403546162517124, + "loss": 2.6228, + "step": 20485 + }, + { + "epoch": 0.6074785754529549, + "grad_norm": 0.10629026591777802, + "learning_rate": 0.00034031002879577845, + "loss": 2.6418, + "step": 20486 + }, + { + "epoch": 0.6075082288052664, + "grad_norm": 0.11494464427232742, + "learning_rate": 0.0003402654427539266, + "loss": 2.6005, + "step": 20487 + }, + { + "epoch": 0.6075378821575779, + "grad_norm": 0.09876866638660431, + "learning_rate": 0.00034022085812655155, + "loss": 2.6527, + "step": 20488 + }, + { + "epoch": 0.6075675355098894, + "grad_norm": 0.11545614153146744, + "learning_rate": 0.00034017627491404837, + "loss": 2.6449, + "step": 20489 + }, + { + "epoch": 0.6075971888622008, + "grad_norm": 0.11006685346364975, + "learning_rate": 0.00034013169311681164, + "loss": 2.6181, + "step": 20490 + }, + { + "epoch": 0.6076268422145124, + "grad_norm": 0.09761400520801544, + "learning_rate": 0.00034008711273523606, + "loss": 2.6269, + "step": 20491 + }, + { + "epoch": 0.6076564955668239, + "grad_norm": 0.10438723862171173, + "learning_rate": 0.0003400425337697167, + "loss": 2.6334, + "step": 20492 + }, + { + "epoch": 0.6076861489191353, + "grad_norm": 0.1125410944223404, + "learning_rate": 0.0003399979562206482, + "loss": 2.636, + "step": 20493 + }, + { + "epoch": 0.6077158022714468, + "grad_norm": 0.09913789480924606, + "learning_rate": 0.0003399533800884252, + "loss": 2.6139, + "step": 20494 + }, + { + "epoch": 0.6077454556237583, + "grad_norm": 0.10186325758695602, + "learning_rate": 0.0003399088053734425, + "loss": 2.6412, + "step": 20495 + }, + { + "epoch": 0.6077751089760698, + "grad_norm": 0.11324357241392136, + "learning_rate": 0.00033986423207609494, + "loss": 2.6599, + "step": 20496 + }, + { + "epoch": 0.6078047623283812, + "grad_norm": 0.10428837686777115, + "learning_rate": 0.00033981966019677696, + "loss": 2.6476, + "step": 20497 + }, + { + "epoch": 0.6078344156806927, + "grad_norm": 0.1267957240343094, + "learning_rate": 0.0003397750897358834, + "loss": 2.6647, + "step": 20498 + }, + { + "epoch": 0.6078640690330042, + "grad_norm": 0.126112699508667, + "learning_rate": 0.00033973052069380887, + "loss": 2.6487, + "step": 20499 + }, + { + "epoch": 0.6078937223853157, + "grad_norm": 0.13673879206180573, + "learning_rate": 0.00033968595307094817, + "loss": 2.6207, + "step": 20500 + }, + { + "epoch": 0.6079233757376271, + "grad_norm": 0.1240946501493454, + "learning_rate": 0.0003396413868676959, + "loss": 2.6572, + "step": 20501 + }, + { + "epoch": 0.6079530290899386, + "grad_norm": 0.11203368008136749, + "learning_rate": 0.00033959682208444653, + "loss": 2.6405, + "step": 20502 + }, + { + "epoch": 0.6079826824422501, + "grad_norm": 0.12783141434192657, + "learning_rate": 0.000339552258721595, + "loss": 2.6398, + "step": 20503 + }, + { + "epoch": 0.6080123357945616, + "grad_norm": 0.1431606262922287, + "learning_rate": 0.00033950769677953545, + "loss": 2.6568, + "step": 20504 + }, + { + "epoch": 0.608041989146873, + "grad_norm": 0.1306733340024948, + "learning_rate": 0.000339463136258663, + "loss": 2.6449, + "step": 20505 + }, + { + "epoch": 0.6080716424991846, + "grad_norm": 0.11491525918245316, + "learning_rate": 0.0003394185771593721, + "loss": 2.6841, + "step": 20506 + }, + { + "epoch": 0.608101295851496, + "grad_norm": 0.1300993114709854, + "learning_rate": 0.00033937401948205733, + "loss": 2.6471, + "step": 20507 + }, + { + "epoch": 0.6081309492038075, + "grad_norm": 0.13339850306510925, + "learning_rate": 0.00033932946322711306, + "loss": 2.6281, + "step": 20508 + }, + { + "epoch": 0.6081606025561189, + "grad_norm": 0.12852595746517181, + "learning_rate": 0.00033928490839493395, + "loss": 2.6584, + "step": 20509 + }, + { + "epoch": 0.6081902559084305, + "grad_norm": 0.11838363111019135, + "learning_rate": 0.00033924035498591467, + "loss": 2.6388, + "step": 20510 + }, + { + "epoch": 0.6082199092607419, + "grad_norm": 0.1306668519973755, + "learning_rate": 0.0003391958030004495, + "loss": 2.6409, + "step": 20511 + }, + { + "epoch": 0.6082495626130534, + "grad_norm": 0.12482314556837082, + "learning_rate": 0.0003391512524389332, + "loss": 2.6485, + "step": 20512 + }, + { + "epoch": 0.6082792159653649, + "grad_norm": 0.11893883347511292, + "learning_rate": 0.0003391067033017602, + "loss": 2.6154, + "step": 20513 + }, + { + "epoch": 0.6083088693176764, + "grad_norm": 0.12231558561325073, + "learning_rate": 0.0003390621555893249, + "loss": 2.6435, + "step": 20514 + }, + { + "epoch": 0.6083385226699879, + "grad_norm": 0.14541307091712952, + "learning_rate": 0.00033901760930202187, + "loss": 2.6518, + "step": 20515 + }, + { + "epoch": 0.6083681760222993, + "grad_norm": 0.11220542341470718, + "learning_rate": 0.00033897306444024547, + "loss": 2.631, + "step": 20516 + }, + { + "epoch": 0.6083978293746108, + "grad_norm": 0.11339114606380463, + "learning_rate": 0.00033892852100439013, + "loss": 2.636, + "step": 20517 + }, + { + "epoch": 0.6084274827269223, + "grad_norm": 0.12001939117908478, + "learning_rate": 0.0003388839789948505, + "loss": 2.6471, + "step": 20518 + }, + { + "epoch": 0.6084571360792338, + "grad_norm": 0.10123323649168015, + "learning_rate": 0.0003388394384120209, + "loss": 2.6407, + "step": 20519 + }, + { + "epoch": 0.6084867894315452, + "grad_norm": 0.12269610911607742, + "learning_rate": 0.00033879489925629574, + "loss": 2.6373, + "step": 20520 + }, + { + "epoch": 0.6085164427838567, + "grad_norm": 0.11600463837385178, + "learning_rate": 0.00033875036152806944, + "loss": 2.6515, + "step": 20521 + }, + { + "epoch": 0.6085460961361682, + "grad_norm": 0.10905938595533371, + "learning_rate": 0.0003387058252277363, + "loss": 2.6011, + "step": 20522 + }, + { + "epoch": 0.6085757494884797, + "grad_norm": 0.10803326964378357, + "learning_rate": 0.0003386612903556908, + "loss": 2.6139, + "step": 20523 + }, + { + "epoch": 0.6086054028407911, + "grad_norm": 0.11888241767883301, + "learning_rate": 0.0003386167569123274, + "loss": 2.6349, + "step": 20524 + }, + { + "epoch": 0.6086350561931027, + "grad_norm": 0.09968675673007965, + "learning_rate": 0.00033857222489804017, + "loss": 2.6338, + "step": 20525 + }, + { + "epoch": 0.6086647095454141, + "grad_norm": 0.10674573481082916, + "learning_rate": 0.0003385276943132236, + "loss": 2.6829, + "step": 20526 + }, + { + "epoch": 0.6086943628977256, + "grad_norm": 0.10705333948135376, + "learning_rate": 0.00033848316515827203, + "loss": 2.6562, + "step": 20527 + }, + { + "epoch": 0.608724016250037, + "grad_norm": 0.11057012528181076, + "learning_rate": 0.00033843863743357976, + "loss": 2.6719, + "step": 20528 + }, + { + "epoch": 0.6087536696023486, + "grad_norm": 0.10633455961942673, + "learning_rate": 0.0003383941111395411, + "loss": 2.6639, + "step": 20529 + }, + { + "epoch": 0.60878332295466, + "grad_norm": 0.105035200715065, + "learning_rate": 0.0003383495862765502, + "loss": 2.6158, + "step": 20530 + }, + { + "epoch": 0.6088129763069715, + "grad_norm": 0.11111108958721161, + "learning_rate": 0.0003383050628450016, + "loss": 2.6154, + "step": 20531 + }, + { + "epoch": 0.6088426296592829, + "grad_norm": 0.1081593930721283, + "learning_rate": 0.00033826054084528935, + "loss": 2.6033, + "step": 20532 + }, + { + "epoch": 0.6088722830115945, + "grad_norm": 0.1264706403017044, + "learning_rate": 0.0003382160202778078, + "loss": 2.6236, + "step": 20533 + }, + { + "epoch": 0.608901936363906, + "grad_norm": 0.15181440114974976, + "learning_rate": 0.0003381715011429513, + "loss": 2.6292, + "step": 20534 + }, + { + "epoch": 0.6089315897162174, + "grad_norm": 0.1592717170715332, + "learning_rate": 0.0003381269834411138, + "loss": 2.6218, + "step": 20535 + }, + { + "epoch": 0.608961243068529, + "grad_norm": 0.10814967006444931, + "learning_rate": 0.00033808246717268966, + "loss": 2.6184, + "step": 20536 + }, + { + "epoch": 0.6089908964208404, + "grad_norm": 0.12726905941963196, + "learning_rate": 0.0003380379523380731, + "loss": 2.5894, + "step": 20537 + }, + { + "epoch": 0.6090205497731519, + "grad_norm": 0.1426015943288803, + "learning_rate": 0.0003379934389376582, + "loss": 2.6285, + "step": 20538 + }, + { + "epoch": 0.6090502031254633, + "grad_norm": 0.11310586333274841, + "learning_rate": 0.00033794892697183926, + "loss": 2.6423, + "step": 20539 + }, + { + "epoch": 0.6090798564777749, + "grad_norm": 0.13151727616786957, + "learning_rate": 0.00033790441644101035, + "loss": 2.65, + "step": 20540 + }, + { + "epoch": 0.6091095098300863, + "grad_norm": 0.13662338256835938, + "learning_rate": 0.0003378599073455658, + "loss": 2.6436, + "step": 20541 + }, + { + "epoch": 0.6091391631823978, + "grad_norm": 0.12085436284542084, + "learning_rate": 0.0003378153996858993, + "loss": 2.6381, + "step": 20542 + }, + { + "epoch": 0.6091688165347092, + "grad_norm": 0.1283618062734604, + "learning_rate": 0.0003377708934624054, + "loss": 2.6776, + "step": 20543 + }, + { + "epoch": 0.6091984698870208, + "grad_norm": 0.12594468891620636, + "learning_rate": 0.00033772638867547814, + "loss": 2.6547, + "step": 20544 + }, + { + "epoch": 0.6092281232393322, + "grad_norm": 0.11629980057477951, + "learning_rate": 0.0003376818853255117, + "loss": 2.6512, + "step": 20545 + }, + { + "epoch": 0.6092577765916437, + "grad_norm": 0.1303303986787796, + "learning_rate": 0.00033763738341289985, + "loss": 2.586, + "step": 20546 + }, + { + "epoch": 0.6092874299439551, + "grad_norm": 0.12635545432567596, + "learning_rate": 0.00033759288293803687, + "loss": 2.6213, + "step": 20547 + }, + { + "epoch": 0.6093170832962667, + "grad_norm": 0.12676557898521423, + "learning_rate": 0.00033754838390131684, + "loss": 2.645, + "step": 20548 + }, + { + "epoch": 0.6093467366485781, + "grad_norm": 0.12546713650226593, + "learning_rate": 0.00033750388630313365, + "loss": 2.6368, + "step": 20549 + }, + { + "epoch": 0.6093763900008896, + "grad_norm": 0.1213664785027504, + "learning_rate": 0.00033745939014388147, + "loss": 2.671, + "step": 20550 + }, + { + "epoch": 0.609406043353201, + "grad_norm": 0.11083411425352097, + "learning_rate": 0.0003374148954239544, + "loss": 2.615, + "step": 20551 + }, + { + "epoch": 0.6094356967055126, + "grad_norm": 0.1270342767238617, + "learning_rate": 0.00033737040214374634, + "loss": 2.6169, + "step": 20552 + }, + { + "epoch": 0.609465350057824, + "grad_norm": 0.12679021060466766, + "learning_rate": 0.00033732591030365117, + "loss": 2.6282, + "step": 20553 + }, + { + "epoch": 0.6094950034101355, + "grad_norm": 0.12304452061653137, + "learning_rate": 0.000337281419904063, + "loss": 2.6593, + "step": 20554 + }, + { + "epoch": 0.609524656762447, + "grad_norm": 0.11419977247714996, + "learning_rate": 0.00033723693094537573, + "loss": 2.5929, + "step": 20555 + }, + { + "epoch": 0.6095543101147585, + "grad_norm": 0.10529978573322296, + "learning_rate": 0.0003371924434279834, + "loss": 2.6228, + "step": 20556 + }, + { + "epoch": 0.60958396346707, + "grad_norm": 0.10191374272108078, + "learning_rate": 0.00033714795735228, + "loss": 2.6061, + "step": 20557 + }, + { + "epoch": 0.6096136168193814, + "grad_norm": 0.10755550861358643, + "learning_rate": 0.00033710347271865936, + "loss": 2.6457, + "step": 20558 + }, + { + "epoch": 0.609643270171693, + "grad_norm": 0.1084049642086029, + "learning_rate": 0.0003370589895275155, + "loss": 2.6129, + "step": 20559 + }, + { + "epoch": 0.6096729235240044, + "grad_norm": 0.10321484506130219, + "learning_rate": 0.00033701450777924217, + "loss": 2.6891, + "step": 20560 + }, + { + "epoch": 0.6097025768763159, + "grad_norm": 0.11015359312295914, + "learning_rate": 0.0003369700274742333, + "loss": 2.6404, + "step": 20561 + }, + { + "epoch": 0.6097322302286273, + "grad_norm": 0.11799045652151108, + "learning_rate": 0.00033692554861288306, + "loss": 2.6557, + "step": 20562 + }, + { + "epoch": 0.6097618835809389, + "grad_norm": 0.11136683821678162, + "learning_rate": 0.0003368810711955849, + "loss": 2.6198, + "step": 20563 + }, + { + "epoch": 0.6097915369332503, + "grad_norm": 0.10105511546134949, + "learning_rate": 0.00033683659522273284, + "loss": 2.6466, + "step": 20564 + }, + { + "epoch": 0.6098211902855618, + "grad_norm": 0.11245152354240417, + "learning_rate": 0.00033679212069472075, + "loss": 2.6263, + "step": 20565 + }, + { + "epoch": 0.6098508436378732, + "grad_norm": 0.09953474998474121, + "learning_rate": 0.0003367476476119424, + "loss": 2.6326, + "step": 20566 + }, + { + "epoch": 0.6098804969901848, + "grad_norm": 0.09396333247423172, + "learning_rate": 0.00033670317597479175, + "loss": 2.5996, + "step": 20567 + }, + { + "epoch": 0.6099101503424962, + "grad_norm": 0.10168413817882538, + "learning_rate": 0.0003366587057836624, + "loss": 2.6468, + "step": 20568 + }, + { + "epoch": 0.6099398036948077, + "grad_norm": 0.10269578546285629, + "learning_rate": 0.0003366142370389483, + "loss": 2.6348, + "step": 20569 + }, + { + "epoch": 0.6099694570471191, + "grad_norm": 0.11789838969707489, + "learning_rate": 0.00033656976974104314, + "loss": 2.6245, + "step": 20570 + }, + { + "epoch": 0.6099991103994307, + "grad_norm": 0.12858350574970245, + "learning_rate": 0.0003365253038903408, + "loss": 2.6198, + "step": 20571 + }, + { + "epoch": 0.6100287637517421, + "grad_norm": 0.13878369331359863, + "learning_rate": 0.00033648083948723505, + "loss": 2.622, + "step": 20572 + }, + { + "epoch": 0.6100584171040536, + "grad_norm": 0.11297226697206497, + "learning_rate": 0.0003364363765321194, + "loss": 2.6548, + "step": 20573 + }, + { + "epoch": 0.610088070456365, + "grad_norm": 0.10376956313848495, + "learning_rate": 0.00033639191502538767, + "loss": 2.6577, + "step": 20574 + }, + { + "epoch": 0.6101177238086766, + "grad_norm": 0.11994825303554535, + "learning_rate": 0.0003363474549674337, + "loss": 2.6216, + "step": 20575 + }, + { + "epoch": 0.6101473771609881, + "grad_norm": 0.1341419667005539, + "learning_rate": 0.0003363029963586511, + "loss": 2.6361, + "step": 20576 + }, + { + "epoch": 0.6101770305132995, + "grad_norm": 0.12294262647628784, + "learning_rate": 0.0003362585391994335, + "loss": 2.5998, + "step": 20577 + }, + { + "epoch": 0.610206683865611, + "grad_norm": 0.10388141870498657, + "learning_rate": 0.00033621408349017467, + "loss": 2.6248, + "step": 20578 + }, + { + "epoch": 0.6102363372179225, + "grad_norm": 0.12961910665035248, + "learning_rate": 0.00033616962923126837, + "loss": 2.6561, + "step": 20579 + }, + { + "epoch": 0.610265990570234, + "grad_norm": 0.12017864733934402, + "learning_rate": 0.000336125176423108, + "loss": 2.6387, + "step": 20580 + }, + { + "epoch": 0.6102956439225454, + "grad_norm": 0.09745600819587708, + "learning_rate": 0.00033608072506608713, + "loss": 2.5866, + "step": 20581 + }, + { + "epoch": 0.610325297274857, + "grad_norm": 0.12127390503883362, + "learning_rate": 0.00033603627516059977, + "loss": 2.6767, + "step": 20582 + }, + { + "epoch": 0.6103549506271684, + "grad_norm": 0.11576389521360397, + "learning_rate": 0.0003359918267070394, + "loss": 2.6683, + "step": 20583 + }, + { + "epoch": 0.6103846039794799, + "grad_norm": 0.11505350470542908, + "learning_rate": 0.00033594737970579953, + "loss": 2.657, + "step": 20584 + }, + { + "epoch": 0.6104142573317913, + "grad_norm": 0.10488945990800858, + "learning_rate": 0.00033590293415727374, + "loss": 2.6272, + "step": 20585 + }, + { + "epoch": 0.6104439106841029, + "grad_norm": 0.10963989794254303, + "learning_rate": 0.00033585849006185565, + "loss": 2.6374, + "step": 20586 + }, + { + "epoch": 0.6104735640364143, + "grad_norm": 0.11591120064258575, + "learning_rate": 0.00033581404741993883, + "loss": 2.6558, + "step": 20587 + }, + { + "epoch": 0.6105032173887258, + "grad_norm": 0.1101439967751503, + "learning_rate": 0.0003357696062319167, + "loss": 2.6578, + "step": 20588 + }, + { + "epoch": 0.6105328707410372, + "grad_norm": 0.11759205162525177, + "learning_rate": 0.00033572516649818307, + "loss": 2.6581, + "step": 20589 + }, + { + "epoch": 0.6105625240933488, + "grad_norm": 0.11061690747737885, + "learning_rate": 0.0003356807282191312, + "loss": 2.5951, + "step": 20590 + }, + { + "epoch": 0.6105921774456602, + "grad_norm": 0.11310068517923355, + "learning_rate": 0.00033563629139515464, + "loss": 2.6061, + "step": 20591 + }, + { + "epoch": 0.6106218307979717, + "grad_norm": 0.10927505046129227, + "learning_rate": 0.00033559185602664697, + "loss": 2.6378, + "step": 20592 + }, + { + "epoch": 0.6106514841502831, + "grad_norm": 0.1150236502289772, + "learning_rate": 0.0003355474221140016, + "loss": 2.6322, + "step": 20593 + }, + { + "epoch": 0.6106811375025947, + "grad_norm": 0.1245458796620369, + "learning_rate": 0.00033550298965761194, + "loss": 2.6182, + "step": 20594 + }, + { + "epoch": 0.6107107908549061, + "grad_norm": 0.12325138598680496, + "learning_rate": 0.00033545855865787166, + "loss": 2.6374, + "step": 20595 + }, + { + "epoch": 0.6107404442072176, + "grad_norm": 0.10948367416858673, + "learning_rate": 0.0003354141291151741, + "loss": 2.6688, + "step": 20596 + }, + { + "epoch": 0.6107700975595292, + "grad_norm": 0.10935927927494049, + "learning_rate": 0.00033536970102991263, + "loss": 2.6338, + "step": 20597 + }, + { + "epoch": 0.6107997509118406, + "grad_norm": 0.12467191368341446, + "learning_rate": 0.00033532527440248075, + "loss": 2.6249, + "step": 20598 + }, + { + "epoch": 0.6108294042641521, + "grad_norm": 0.12532559037208557, + "learning_rate": 0.00033528084923327184, + "loss": 2.6466, + "step": 20599 + }, + { + "epoch": 0.6108590576164635, + "grad_norm": 0.10520553588867188, + "learning_rate": 0.0003352364255226794, + "loss": 2.6458, + "step": 20600 + }, + { + "epoch": 0.6108887109687751, + "grad_norm": 0.126685231924057, + "learning_rate": 0.00033519200327109655, + "loss": 2.6305, + "step": 20601 + }, + { + "epoch": 0.6109183643210865, + "grad_norm": 0.12069761008024216, + "learning_rate": 0.0003351475824789168, + "loss": 2.6276, + "step": 20602 + }, + { + "epoch": 0.610948017673398, + "grad_norm": 0.10188644379377365, + "learning_rate": 0.00033510316314653356, + "loss": 2.6387, + "step": 20603 + }, + { + "epoch": 0.6109776710257094, + "grad_norm": 0.096359983086586, + "learning_rate": 0.0003350587452743401, + "loss": 2.6331, + "step": 20604 + }, + { + "epoch": 0.611007324378021, + "grad_norm": 0.10366588830947876, + "learning_rate": 0.00033501432886272965, + "loss": 2.6605, + "step": 20605 + }, + { + "epoch": 0.6110369777303324, + "grad_norm": 0.10415655374526978, + "learning_rate": 0.00033496991391209575, + "loss": 2.6604, + "step": 20606 + }, + { + "epoch": 0.6110666310826439, + "grad_norm": 0.11780981719493866, + "learning_rate": 0.00033492550042283145, + "loss": 2.6601, + "step": 20607 + }, + { + "epoch": 0.6110962844349553, + "grad_norm": 0.10855839401483536, + "learning_rate": 0.00033488108839533026, + "loss": 2.6643, + "step": 20608 + }, + { + "epoch": 0.6111259377872669, + "grad_norm": 0.12129455804824829, + "learning_rate": 0.0003348366778299854, + "loss": 2.6575, + "step": 20609 + }, + { + "epoch": 0.6111555911395783, + "grad_norm": 0.12246143817901611, + "learning_rate": 0.0003347922687271903, + "loss": 2.6114, + "step": 20610 + }, + { + "epoch": 0.6111852444918898, + "grad_norm": 0.09225647151470184, + "learning_rate": 0.0003347478610873378, + "loss": 2.5962, + "step": 20611 + }, + { + "epoch": 0.6112148978442012, + "grad_norm": 0.12498798221349716, + "learning_rate": 0.00033470345491082136, + "loss": 2.6316, + "step": 20612 + }, + { + "epoch": 0.6112445511965128, + "grad_norm": 0.11124490946531296, + "learning_rate": 0.0003346590501980342, + "loss": 2.6327, + "step": 20613 + }, + { + "epoch": 0.6112742045488242, + "grad_norm": 0.12574069201946259, + "learning_rate": 0.0003346146469493696, + "loss": 2.6139, + "step": 20614 + }, + { + "epoch": 0.6113038579011357, + "grad_norm": 0.12894804775714874, + "learning_rate": 0.0003345702451652206, + "loss": 2.6566, + "step": 20615 + }, + { + "epoch": 0.6113335112534471, + "grad_norm": 0.12439608573913574, + "learning_rate": 0.00033452584484598057, + "loss": 2.6198, + "step": 20616 + }, + { + "epoch": 0.6113631646057587, + "grad_norm": 0.15404652059078217, + "learning_rate": 0.00033448144599204264, + "loss": 2.6379, + "step": 20617 + }, + { + "epoch": 0.6113928179580702, + "grad_norm": 0.13954603672027588, + "learning_rate": 0.0003344370486037998, + "loss": 2.6332, + "step": 20618 + }, + { + "epoch": 0.6114224713103816, + "grad_norm": 0.11070293933153152, + "learning_rate": 0.0003343926526816453, + "loss": 2.6393, + "step": 20619 + }, + { + "epoch": 0.6114521246626932, + "grad_norm": 0.15077681839466095, + "learning_rate": 0.00033434825822597217, + "loss": 2.6281, + "step": 20620 + }, + { + "epoch": 0.6114817780150046, + "grad_norm": 0.14487974345684052, + "learning_rate": 0.00033430386523717383, + "loss": 2.669, + "step": 20621 + }, + { + "epoch": 0.6115114313673161, + "grad_norm": 0.1407385766506195, + "learning_rate": 0.0003342594737156432, + "loss": 2.6263, + "step": 20622 + }, + { + "epoch": 0.6115410847196275, + "grad_norm": 0.14099204540252686, + "learning_rate": 0.0003342150836617733, + "loss": 2.6631, + "step": 20623 + }, + { + "epoch": 0.6115707380719391, + "grad_norm": 0.13499872386455536, + "learning_rate": 0.00033417069507595736, + "loss": 2.6516, + "step": 20624 + }, + { + "epoch": 0.6116003914242505, + "grad_norm": 0.10943233221769333, + "learning_rate": 0.00033412630795858834, + "loss": 2.6278, + "step": 20625 + }, + { + "epoch": 0.611630044776562, + "grad_norm": 0.11607808619737625, + "learning_rate": 0.0003340819223100594, + "loss": 2.6367, + "step": 20626 + }, + { + "epoch": 0.6116596981288734, + "grad_norm": 0.11539965867996216, + "learning_rate": 0.00033403753813076356, + "loss": 2.6655, + "step": 20627 + }, + { + "epoch": 0.611689351481185, + "grad_norm": 0.11492778360843658, + "learning_rate": 0.00033399315542109373, + "loss": 2.6174, + "step": 20628 + }, + { + "epoch": 0.6117190048334964, + "grad_norm": 0.12016333639621735, + "learning_rate": 0.000333948774181443, + "loss": 2.6339, + "step": 20629 + }, + { + "epoch": 0.6117486581858079, + "grad_norm": 0.10836254060268402, + "learning_rate": 0.00033390439441220433, + "loss": 2.6404, + "step": 20630 + }, + { + "epoch": 0.6117783115381193, + "grad_norm": 0.1198287233710289, + "learning_rate": 0.0003338600161137708, + "loss": 2.6418, + "step": 20631 + }, + { + "epoch": 0.6118079648904309, + "grad_norm": 0.1262788027524948, + "learning_rate": 0.00033381563928653533, + "loss": 2.6604, + "step": 20632 + }, + { + "epoch": 0.6118376182427423, + "grad_norm": 0.1291319876909256, + "learning_rate": 0.0003337712639308908, + "loss": 2.6514, + "step": 20633 + }, + { + "epoch": 0.6118672715950538, + "grad_norm": 0.11692267656326294, + "learning_rate": 0.0003337268900472303, + "loss": 2.6494, + "step": 20634 + }, + { + "epoch": 0.6118969249473653, + "grad_norm": 0.11900477111339569, + "learning_rate": 0.0003336825176359468, + "loss": 2.6476, + "step": 20635 + }, + { + "epoch": 0.6119265782996768, + "grad_norm": 0.10669872909784317, + "learning_rate": 0.0003336381466974332, + "loss": 2.6443, + "step": 20636 + }, + { + "epoch": 0.6119562316519882, + "grad_norm": 0.10981453210115433, + "learning_rate": 0.00033359377723208227, + "loss": 2.6403, + "step": 20637 + }, + { + "epoch": 0.6119858850042997, + "grad_norm": 0.10906155407428741, + "learning_rate": 0.0003335494092402872, + "loss": 2.6162, + "step": 20638 + }, + { + "epoch": 0.6120155383566113, + "grad_norm": 0.10486062616109848, + "learning_rate": 0.00033350504272244055, + "loss": 2.6259, + "step": 20639 + }, + { + "epoch": 0.6120451917089227, + "grad_norm": 0.10741270333528519, + "learning_rate": 0.00033346067767893526, + "loss": 2.6325, + "step": 20640 + }, + { + "epoch": 0.6120748450612342, + "grad_norm": 0.12035929411649704, + "learning_rate": 0.0003334163141101644, + "loss": 2.643, + "step": 20641 + }, + { + "epoch": 0.6121044984135456, + "grad_norm": 0.10047297179698944, + "learning_rate": 0.00033337195201652047, + "loss": 2.6281, + "step": 20642 + }, + { + "epoch": 0.6121341517658572, + "grad_norm": 0.10349589586257935, + "learning_rate": 0.00033332759139839663, + "loss": 2.6499, + "step": 20643 + }, + { + "epoch": 0.6121638051181686, + "grad_norm": 0.11137337982654572, + "learning_rate": 0.0003332832322561856, + "loss": 2.6471, + "step": 20644 + }, + { + "epoch": 0.6121934584704801, + "grad_norm": 0.09511694312095642, + "learning_rate": 0.0003332388745902802, + "loss": 2.642, + "step": 20645 + }, + { + "epoch": 0.6122231118227915, + "grad_norm": 0.11323977261781693, + "learning_rate": 0.00033319451840107297, + "loss": 2.6143, + "step": 20646 + }, + { + "epoch": 0.6122527651751031, + "grad_norm": 0.10727754980325699, + "learning_rate": 0.000333150163688957, + "loss": 2.6674, + "step": 20647 + }, + { + "epoch": 0.6122824185274145, + "grad_norm": 0.11672265082597733, + "learning_rate": 0.00033310581045432517, + "loss": 2.6467, + "step": 20648 + }, + { + "epoch": 0.612312071879726, + "grad_norm": 0.11652252823114395, + "learning_rate": 0.0003330614586975699, + "loss": 2.6408, + "step": 20649 + }, + { + "epoch": 0.6123417252320374, + "grad_norm": 0.12216302752494812, + "learning_rate": 0.00033301710841908405, + "loss": 2.6474, + "step": 20650 + }, + { + "epoch": 0.612371378584349, + "grad_norm": 0.11487564444541931, + "learning_rate": 0.0003329727596192603, + "loss": 2.6509, + "step": 20651 + }, + { + "epoch": 0.6124010319366604, + "grad_norm": 0.12702788412570953, + "learning_rate": 0.0003329284122984916, + "loss": 2.6194, + "step": 20652 + }, + { + "epoch": 0.6124306852889719, + "grad_norm": 0.12531812489032745, + "learning_rate": 0.0003328840664571704, + "loss": 2.6399, + "step": 20653 + }, + { + "epoch": 0.6124603386412834, + "grad_norm": 0.12366128712892532, + "learning_rate": 0.0003328397220956895, + "loss": 2.6416, + "step": 20654 + }, + { + "epoch": 0.6124899919935949, + "grad_norm": 0.11047951877117157, + "learning_rate": 0.0003327953792144416, + "loss": 2.6086, + "step": 20655 + }, + { + "epoch": 0.6125196453459063, + "grad_norm": 0.10425551980733871, + "learning_rate": 0.0003327510378138192, + "loss": 2.6035, + "step": 20656 + }, + { + "epoch": 0.6125492986982178, + "grad_norm": 0.10821207612752914, + "learning_rate": 0.0003327066978942151, + "loss": 2.63, + "step": 20657 + }, + { + "epoch": 0.6125789520505293, + "grad_norm": 0.11006387323141098, + "learning_rate": 0.000332662359456022, + "loss": 2.6079, + "step": 20658 + }, + { + "epoch": 0.6126086054028408, + "grad_norm": 0.11194406449794769, + "learning_rate": 0.00033261802249963217, + "loss": 2.6474, + "step": 20659 + }, + { + "epoch": 0.6126382587551523, + "grad_norm": 0.12289546430110931, + "learning_rate": 0.00033257368702543867, + "loss": 2.66, + "step": 20660 + }, + { + "epoch": 0.6126679121074637, + "grad_norm": 0.11580200493335724, + "learning_rate": 0.0003325293530338339, + "loss": 2.6582, + "step": 20661 + }, + { + "epoch": 0.6126975654597753, + "grad_norm": 0.11690989136695862, + "learning_rate": 0.0003324850205252105, + "loss": 2.6377, + "step": 20662 + }, + { + "epoch": 0.6127272188120867, + "grad_norm": 0.1259392350912094, + "learning_rate": 0.0003324406894999609, + "loss": 2.6387, + "step": 20663 + }, + { + "epoch": 0.6127568721643982, + "grad_norm": 0.11117120087146759, + "learning_rate": 0.00033239635995847784, + "loss": 2.6284, + "step": 20664 + }, + { + "epoch": 0.6127865255167096, + "grad_norm": 0.12344469875097275, + "learning_rate": 0.0003323520319011538, + "loss": 2.6616, + "step": 20665 + }, + { + "epoch": 0.6128161788690212, + "grad_norm": 0.1175430417060852, + "learning_rate": 0.00033230770532838126, + "loss": 2.6681, + "step": 20666 + }, + { + "epoch": 0.6128458322213326, + "grad_norm": 0.1259947270154953, + "learning_rate": 0.0003322633802405528, + "loss": 2.636, + "step": 20667 + }, + { + "epoch": 0.6128754855736441, + "grad_norm": 0.11453323066234589, + "learning_rate": 0.00033221905663806086, + "loss": 2.6528, + "step": 20668 + }, + { + "epoch": 0.6129051389259556, + "grad_norm": 0.12843020260334015, + "learning_rate": 0.0003321747345212979, + "loss": 2.6583, + "step": 20669 + }, + { + "epoch": 0.6129347922782671, + "grad_norm": 0.13484424352645874, + "learning_rate": 0.0003321304138906566, + "loss": 2.6153, + "step": 20670 + }, + { + "epoch": 0.6129644456305785, + "grad_norm": 0.11231331527233124, + "learning_rate": 0.0003320860947465292, + "loss": 2.6463, + "step": 20671 + }, + { + "epoch": 0.61299409898289, + "grad_norm": 0.11834380030632019, + "learning_rate": 0.0003320417770893082, + "loss": 2.6923, + "step": 20672 + }, + { + "epoch": 0.6130237523352015, + "grad_norm": 0.13932639360427856, + "learning_rate": 0.0003319974609193862, + "loss": 2.6043, + "step": 20673 + }, + { + "epoch": 0.613053405687513, + "grad_norm": 0.11915510892868042, + "learning_rate": 0.0003319531462371555, + "loss": 2.6295, + "step": 20674 + }, + { + "epoch": 0.6130830590398244, + "grad_norm": 0.10982999950647354, + "learning_rate": 0.00033190883304300855, + "loss": 2.6187, + "step": 20675 + }, + { + "epoch": 0.6131127123921359, + "grad_norm": 0.12590236961841583, + "learning_rate": 0.0003318645213373378, + "loss": 2.6707, + "step": 20676 + }, + { + "epoch": 0.6131423657444474, + "grad_norm": 0.11661428213119507, + "learning_rate": 0.00033182021112053553, + "loss": 2.6262, + "step": 20677 + }, + { + "epoch": 0.6131720190967589, + "grad_norm": 0.131738543510437, + "learning_rate": 0.0003317759023929942, + "loss": 2.6677, + "step": 20678 + }, + { + "epoch": 0.6132016724490703, + "grad_norm": 0.11310528218746185, + "learning_rate": 0.00033173159515510596, + "loss": 2.6102, + "step": 20679 + }, + { + "epoch": 0.6132313258013818, + "grad_norm": 0.10284557938575745, + "learning_rate": 0.00033168728940726345, + "loss": 2.6499, + "step": 20680 + }, + { + "epoch": 0.6132609791536934, + "grad_norm": 0.12355668097734451, + "learning_rate": 0.0003316429851498589, + "loss": 2.6285, + "step": 20681 + }, + { + "epoch": 0.6132906325060048, + "grad_norm": 0.10257375240325928, + "learning_rate": 0.0003315986823832845, + "loss": 2.5898, + "step": 20682 + }, + { + "epoch": 0.6133202858583163, + "grad_norm": 0.10901562869548798, + "learning_rate": 0.0003315543811079329, + "loss": 2.618, + "step": 20683 + }, + { + "epoch": 0.6133499392106277, + "grad_norm": 0.1173044741153717, + "learning_rate": 0.000331510081324196, + "loss": 2.6376, + "step": 20684 + }, + { + "epoch": 0.6133795925629393, + "grad_norm": 0.10288331657648087, + "learning_rate": 0.00033146578303246604, + "loss": 2.6526, + "step": 20685 + }, + { + "epoch": 0.6134092459152507, + "grad_norm": 0.10877660661935806, + "learning_rate": 0.0003314214862331358, + "loss": 2.66, + "step": 20686 + }, + { + "epoch": 0.6134388992675622, + "grad_norm": 0.10202287137508392, + "learning_rate": 0.00033137719092659723, + "loss": 2.6099, + "step": 20687 + }, + { + "epoch": 0.6134685526198737, + "grad_norm": 0.09947815537452698, + "learning_rate": 0.0003313328971132425, + "loss": 2.627, + "step": 20688 + }, + { + "epoch": 0.6134982059721852, + "grad_norm": 0.11091803014278412, + "learning_rate": 0.0003312886047934639, + "loss": 2.6234, + "step": 20689 + }, + { + "epoch": 0.6135278593244966, + "grad_norm": 0.10736631602048874, + "learning_rate": 0.00033124431396765376, + "loss": 2.6673, + "step": 20690 + }, + { + "epoch": 0.6135575126768081, + "grad_norm": 0.11229949444532394, + "learning_rate": 0.0003312000246362041, + "loss": 2.6471, + "step": 20691 + }, + { + "epoch": 0.6135871660291196, + "grad_norm": 0.11540169268846512, + "learning_rate": 0.00033115573679950717, + "loss": 2.6477, + "step": 20692 + }, + { + "epoch": 0.6136168193814311, + "grad_norm": 0.11310827732086182, + "learning_rate": 0.0003311114504579553, + "loss": 2.6059, + "step": 20693 + }, + { + "epoch": 0.6136464727337425, + "grad_norm": 0.11473535746335983, + "learning_rate": 0.0003310671656119405, + "loss": 2.6615, + "step": 20694 + }, + { + "epoch": 0.613676126086054, + "grad_norm": 0.12497247755527496, + "learning_rate": 0.0003310228822618548, + "loss": 2.6373, + "step": 20695 + }, + { + "epoch": 0.6137057794383655, + "grad_norm": 0.10311509668827057, + "learning_rate": 0.00033097860040809065, + "loss": 2.659, + "step": 20696 + }, + { + "epoch": 0.613735432790677, + "grad_norm": 0.11300238221883774, + "learning_rate": 0.0003309343200510399, + "loss": 2.6275, + "step": 20697 + }, + { + "epoch": 0.6137650861429884, + "grad_norm": 0.11773726344108582, + "learning_rate": 0.00033089004119109467, + "loss": 2.6684, + "step": 20698 + }, + { + "epoch": 0.6137947394953, + "grad_norm": 0.11728999018669128, + "learning_rate": 0.00033084576382864727, + "loss": 2.6408, + "step": 20699 + }, + { + "epoch": 0.6138243928476115, + "grad_norm": 0.10753632336854935, + "learning_rate": 0.0003308014879640897, + "loss": 2.6187, + "step": 20700 + }, + { + "epoch": 0.6138540461999229, + "grad_norm": 0.09485457092523575, + "learning_rate": 0.00033075721359781396, + "loss": 2.6174, + "step": 20701 + }, + { + "epoch": 0.6138836995522344, + "grad_norm": 0.11992570757865906, + "learning_rate": 0.0003307129407302122, + "loss": 2.6227, + "step": 20702 + }, + { + "epoch": 0.6139133529045459, + "grad_norm": 0.10688678920269012, + "learning_rate": 0.0003306686693616764, + "loss": 2.6674, + "step": 20703 + }, + { + "epoch": 0.6139430062568574, + "grad_norm": 0.11111629009246826, + "learning_rate": 0.00033062439949259857, + "loss": 2.6459, + "step": 20704 + }, + { + "epoch": 0.6139726596091688, + "grad_norm": 0.11101084202528, + "learning_rate": 0.0003305801311233707, + "loss": 2.6129, + "step": 20705 + }, + { + "epoch": 0.6140023129614803, + "grad_norm": 0.10662811249494553, + "learning_rate": 0.000330535864254385, + "loss": 2.647, + "step": 20706 + }, + { + "epoch": 0.6140319663137918, + "grad_norm": 0.11325684934854507, + "learning_rate": 0.00033049159888603314, + "loss": 2.6446, + "step": 20707 + }, + { + "epoch": 0.6140616196661033, + "grad_norm": 0.12047215551137924, + "learning_rate": 0.0003304473350187073, + "loss": 2.6379, + "step": 20708 + }, + { + "epoch": 0.6140912730184147, + "grad_norm": 0.13850440084934235, + "learning_rate": 0.00033040307265279947, + "loss": 2.6059, + "step": 20709 + }, + { + "epoch": 0.6141209263707262, + "grad_norm": 0.141055628657341, + "learning_rate": 0.00033035881178870163, + "loss": 2.6357, + "step": 20710 + }, + { + "epoch": 0.6141505797230377, + "grad_norm": 0.10819605737924576, + "learning_rate": 0.0003303145524268053, + "loss": 2.6051, + "step": 20711 + }, + { + "epoch": 0.6141802330753492, + "grad_norm": 0.12187080830335617, + "learning_rate": 0.0003302702945675029, + "loss": 2.6516, + "step": 20712 + }, + { + "epoch": 0.6142098864276606, + "grad_norm": 0.11932724714279175, + "learning_rate": 0.00033022603821118626, + "loss": 2.6494, + "step": 20713 + }, + { + "epoch": 0.6142395397799721, + "grad_norm": 0.13333240151405334, + "learning_rate": 0.0003301817833582472, + "loss": 2.6488, + "step": 20714 + }, + { + "epoch": 0.6142691931322836, + "grad_norm": 0.10939841717481613, + "learning_rate": 0.00033013753000907763, + "loss": 2.6435, + "step": 20715 + }, + { + "epoch": 0.6142988464845951, + "grad_norm": 0.11156657338142395, + "learning_rate": 0.0003300932781640693, + "loss": 2.6417, + "step": 20716 + }, + { + "epoch": 0.6143284998369065, + "grad_norm": 0.11309535056352615, + "learning_rate": 0.00033004902782361414, + "loss": 2.6578, + "step": 20717 + }, + { + "epoch": 0.614358153189218, + "grad_norm": 0.12493089586496353, + "learning_rate": 0.00033000477898810406, + "loss": 2.6309, + "step": 20718 + }, + { + "epoch": 0.6143878065415295, + "grad_norm": 0.1027098223567009, + "learning_rate": 0.0003299605316579308, + "loss": 2.653, + "step": 20719 + }, + { + "epoch": 0.614417459893841, + "grad_norm": 0.10983584076166153, + "learning_rate": 0.00032991628583348625, + "loss": 2.644, + "step": 20720 + }, + { + "epoch": 0.6144471132461525, + "grad_norm": 0.10211317241191864, + "learning_rate": 0.0003298720415151623, + "loss": 2.6497, + "step": 20721 + }, + { + "epoch": 0.614476766598464, + "grad_norm": 0.10823702812194824, + "learning_rate": 0.00032982779870335046, + "loss": 2.6618, + "step": 20722 + }, + { + "epoch": 0.6145064199507755, + "grad_norm": 0.12186039984226227, + "learning_rate": 0.0003297835573984425, + "loss": 2.6192, + "step": 20723 + }, + { + "epoch": 0.6145360733030869, + "grad_norm": 0.09927443414926529, + "learning_rate": 0.0003297393176008306, + "loss": 2.659, + "step": 20724 + }, + { + "epoch": 0.6145657266553984, + "grad_norm": 0.13980473577976227, + "learning_rate": 0.00032969507931090633, + "loss": 2.6648, + "step": 20725 + }, + { + "epoch": 0.6145953800077099, + "grad_norm": 0.11370017379522324, + "learning_rate": 0.00032965084252906124, + "loss": 2.6145, + "step": 20726 + }, + { + "epoch": 0.6146250333600214, + "grad_norm": 0.11852936446666718, + "learning_rate": 0.0003296066072556873, + "loss": 2.6456, + "step": 20727 + }, + { + "epoch": 0.6146546867123328, + "grad_norm": 0.11341600865125656, + "learning_rate": 0.0003295623734911759, + "loss": 2.6464, + "step": 20728 + }, + { + "epoch": 0.6146843400646443, + "grad_norm": 0.09894723445177078, + "learning_rate": 0.00032951814123591906, + "loss": 2.6688, + "step": 20729 + }, + { + "epoch": 0.6147139934169558, + "grad_norm": 0.11302255094051361, + "learning_rate": 0.00032947391049030836, + "loss": 2.6374, + "step": 20730 + }, + { + "epoch": 0.6147436467692673, + "grad_norm": 0.10429531335830688, + "learning_rate": 0.00032942968125473546, + "loss": 2.6357, + "step": 20731 + }, + { + "epoch": 0.6147733001215787, + "grad_norm": 0.11834028363227844, + "learning_rate": 0.00032938545352959205, + "loss": 2.6311, + "step": 20732 + }, + { + "epoch": 0.6148029534738902, + "grad_norm": 0.12842434644699097, + "learning_rate": 0.0003293412273152696, + "loss": 2.616, + "step": 20733 + }, + { + "epoch": 0.6148326068262017, + "grad_norm": 0.10157699882984161, + "learning_rate": 0.00032929700261215993, + "loss": 2.6174, + "step": 20734 + }, + { + "epoch": 0.6148622601785132, + "grad_norm": 0.12405218929052353, + "learning_rate": 0.00032925277942065463, + "loss": 2.6189, + "step": 20735 + }, + { + "epoch": 0.6148919135308246, + "grad_norm": 0.10997001826763153, + "learning_rate": 0.0003292085577411451, + "loss": 2.615, + "step": 20736 + }, + { + "epoch": 0.6149215668831362, + "grad_norm": 0.11099886149168015, + "learning_rate": 0.0003291643375740233, + "loss": 2.6216, + "step": 20737 + }, + { + "epoch": 0.6149512202354476, + "grad_norm": 0.12791790068149567, + "learning_rate": 0.00032912011891968055, + "loss": 2.618, + "step": 20738 + }, + { + "epoch": 0.6149808735877591, + "grad_norm": 0.12207519263029099, + "learning_rate": 0.00032907590177850855, + "loss": 2.6258, + "step": 20739 + }, + { + "epoch": 0.6150105269400705, + "grad_norm": 0.10545239597558975, + "learning_rate": 0.00032903168615089866, + "loss": 2.6366, + "step": 20740 + }, + { + "epoch": 0.615040180292382, + "grad_norm": 0.11538476496934891, + "learning_rate": 0.0003289874720372427, + "loss": 2.6266, + "step": 20741 + }, + { + "epoch": 0.6150698336446936, + "grad_norm": 0.12335773557424545, + "learning_rate": 0.00032894325943793196, + "loss": 2.6465, + "step": 20742 + }, + { + "epoch": 0.615099486997005, + "grad_norm": 0.1278361976146698, + "learning_rate": 0.000328899048353358, + "loss": 2.6643, + "step": 20743 + }, + { + "epoch": 0.6151291403493165, + "grad_norm": 0.1165129765868187, + "learning_rate": 0.0003288548387839123, + "loss": 2.6081, + "step": 20744 + }, + { + "epoch": 0.615158793701628, + "grad_norm": 0.10772102326154709, + "learning_rate": 0.0003288106307299864, + "loss": 2.607, + "step": 20745 + }, + { + "epoch": 0.6151884470539395, + "grad_norm": 0.11484463512897491, + "learning_rate": 0.0003287664241919718, + "loss": 2.6195, + "step": 20746 + }, + { + "epoch": 0.6152181004062509, + "grad_norm": 0.11089376360177994, + "learning_rate": 0.00032872221917025984, + "loss": 2.6564, + "step": 20747 + }, + { + "epoch": 0.6152477537585624, + "grad_norm": 0.10683014988899231, + "learning_rate": 0.00032867801566524216, + "loss": 2.6519, + "step": 20748 + }, + { + "epoch": 0.6152774071108739, + "grad_norm": 0.11504357308149338, + "learning_rate": 0.0003286338136773097, + "loss": 2.6322, + "step": 20749 + }, + { + "epoch": 0.6153070604631854, + "grad_norm": 0.11245816200971603, + "learning_rate": 0.00032858961320685455, + "loss": 2.6284, + "step": 20750 + }, + { + "epoch": 0.6153367138154968, + "grad_norm": 0.10428116470575333, + "learning_rate": 0.0003285454142542677, + "loss": 2.6477, + "step": 20751 + }, + { + "epoch": 0.6153663671678083, + "grad_norm": 0.1310860961675644, + "learning_rate": 0.0003285012168199408, + "loss": 2.628, + "step": 20752 + }, + { + "epoch": 0.6153960205201198, + "grad_norm": 0.1661541610956192, + "learning_rate": 0.00032845702090426485, + "loss": 2.6543, + "step": 20753 + }, + { + "epoch": 0.6154256738724313, + "grad_norm": 0.130134716629982, + "learning_rate": 0.0003284128265076315, + "loss": 2.6708, + "step": 20754 + }, + { + "epoch": 0.6154553272247427, + "grad_norm": 0.1313878297805786, + "learning_rate": 0.00032836863363043214, + "loss": 2.6487, + "step": 20755 + }, + { + "epoch": 0.6154849805770543, + "grad_norm": 0.12672454118728638, + "learning_rate": 0.0003283244422730578, + "loss": 2.6461, + "step": 20756 + }, + { + "epoch": 0.6155146339293657, + "grad_norm": 0.1113191768527031, + "learning_rate": 0.0003282802524359001, + "loss": 2.6326, + "step": 20757 + }, + { + "epoch": 0.6155442872816772, + "grad_norm": 0.15550778806209564, + "learning_rate": 0.0003282360641193502, + "loss": 2.6387, + "step": 20758 + }, + { + "epoch": 0.6155739406339886, + "grad_norm": 0.13150155544281006, + "learning_rate": 0.0003281918773237995, + "loss": 2.6636, + "step": 20759 + }, + { + "epoch": 0.6156035939863002, + "grad_norm": 0.12606577575206757, + "learning_rate": 0.00032814769204963926, + "loss": 2.6441, + "step": 20760 + }, + { + "epoch": 0.6156332473386116, + "grad_norm": 0.13508236408233643, + "learning_rate": 0.00032810350829726056, + "loss": 2.6383, + "step": 20761 + }, + { + "epoch": 0.6156629006909231, + "grad_norm": 0.13108009099960327, + "learning_rate": 0.0003280593260670547, + "loss": 2.6438, + "step": 20762 + }, + { + "epoch": 0.6156925540432346, + "grad_norm": 0.12949766218662262, + "learning_rate": 0.0003280151453594132, + "loss": 2.6301, + "step": 20763 + }, + { + "epoch": 0.6157222073955461, + "grad_norm": 0.11894889175891876, + "learning_rate": 0.0003279709661747271, + "loss": 2.6194, + "step": 20764 + }, + { + "epoch": 0.6157518607478576, + "grad_norm": 0.12867291271686554, + "learning_rate": 0.0003279267885133876, + "loss": 2.6691, + "step": 20765 + }, + { + "epoch": 0.615781514100169, + "grad_norm": 0.13110321760177612, + "learning_rate": 0.00032788261237578587, + "loss": 2.6759, + "step": 20766 + }, + { + "epoch": 0.6158111674524805, + "grad_norm": 0.11315558105707169, + "learning_rate": 0.00032783843776231327, + "loss": 2.6335, + "step": 20767 + }, + { + "epoch": 0.615840820804792, + "grad_norm": 0.1277410089969635, + "learning_rate": 0.0003277942646733607, + "loss": 2.6403, + "step": 20768 + }, + { + "epoch": 0.6158704741571035, + "grad_norm": 0.13498349487781525, + "learning_rate": 0.0003277500931093197, + "loss": 2.6045, + "step": 20769 + }, + { + "epoch": 0.6159001275094149, + "grad_norm": 0.12808731198310852, + "learning_rate": 0.00032770592307058104, + "loss": 2.6432, + "step": 20770 + }, + { + "epoch": 0.6159297808617265, + "grad_norm": 0.11764650791883469, + "learning_rate": 0.00032766175455753606, + "loss": 2.6635, + "step": 20771 + }, + { + "epoch": 0.6159594342140379, + "grad_norm": 0.1340622454881668, + "learning_rate": 0.0003276175875705758, + "loss": 2.6224, + "step": 20772 + }, + { + "epoch": 0.6159890875663494, + "grad_norm": 0.12297642230987549, + "learning_rate": 0.00032757342211009135, + "loss": 2.6253, + "step": 20773 + }, + { + "epoch": 0.6160187409186608, + "grad_norm": 0.10591540485620499, + "learning_rate": 0.0003275292581764738, + "loss": 2.6075, + "step": 20774 + }, + { + "epoch": 0.6160483942709724, + "grad_norm": 0.10534653067588806, + "learning_rate": 0.0003274850957701143, + "loss": 2.6179, + "step": 20775 + }, + { + "epoch": 0.6160780476232838, + "grad_norm": 0.11098814010620117, + "learning_rate": 0.0003274409348914039, + "loss": 2.6255, + "step": 20776 + }, + { + "epoch": 0.6161077009755953, + "grad_norm": 0.11154629290103912, + "learning_rate": 0.0003273967755407337, + "loss": 2.6117, + "step": 20777 + }, + { + "epoch": 0.6161373543279067, + "grad_norm": 0.0987575575709343, + "learning_rate": 0.0003273526177184947, + "loss": 2.6245, + "step": 20778 + }, + { + "epoch": 0.6161670076802183, + "grad_norm": 0.10996447503566742, + "learning_rate": 0.0003273084614250778, + "loss": 2.6255, + "step": 20779 + }, + { + "epoch": 0.6161966610325297, + "grad_norm": 0.1161201074719429, + "learning_rate": 0.0003272643066608743, + "loss": 2.6579, + "step": 20780 + }, + { + "epoch": 0.6162263143848412, + "grad_norm": 0.10156355798244476, + "learning_rate": 0.00032722015342627486, + "loss": 2.6353, + "step": 20781 + }, + { + "epoch": 0.6162559677371526, + "grad_norm": 0.10830792784690857, + "learning_rate": 0.00032717600172167064, + "loss": 2.6664, + "step": 20782 + }, + { + "epoch": 0.6162856210894642, + "grad_norm": 0.11178693920373917, + "learning_rate": 0.0003271318515474526, + "loss": 2.6339, + "step": 20783 + }, + { + "epoch": 0.6163152744417757, + "grad_norm": 0.11054843664169312, + "learning_rate": 0.0003270877029040117, + "loss": 2.6449, + "step": 20784 + }, + { + "epoch": 0.6163449277940871, + "grad_norm": 0.10128573328256607, + "learning_rate": 0.0003270435557917388, + "loss": 2.6369, + "step": 20785 + }, + { + "epoch": 0.6163745811463986, + "grad_norm": 0.11257755011320114, + "learning_rate": 0.0003269994102110251, + "loss": 2.6454, + "step": 20786 + }, + { + "epoch": 0.6164042344987101, + "grad_norm": 0.09706149995326996, + "learning_rate": 0.00032695526616226115, + "loss": 2.6201, + "step": 20787 + }, + { + "epoch": 0.6164338878510216, + "grad_norm": 0.11133934557437897, + "learning_rate": 0.00032691112364583786, + "loss": 2.6449, + "step": 20788 + }, + { + "epoch": 0.616463541203333, + "grad_norm": 0.11039027571678162, + "learning_rate": 0.0003268669826621464, + "loss": 2.6298, + "step": 20789 + }, + { + "epoch": 0.6164931945556446, + "grad_norm": 0.11670731008052826, + "learning_rate": 0.00032682284321157776, + "loss": 2.6564, + "step": 20790 + }, + { + "epoch": 0.616522847907956, + "grad_norm": 0.10596849769353867, + "learning_rate": 0.00032677870529452234, + "loss": 2.6446, + "step": 20791 + }, + { + "epoch": 0.6165525012602675, + "grad_norm": 0.10848764330148697, + "learning_rate": 0.0003267345689113713, + "loss": 2.6415, + "step": 20792 + }, + { + "epoch": 0.6165821546125789, + "grad_norm": 0.10671214759349823, + "learning_rate": 0.0003266904340625154, + "loss": 2.6513, + "step": 20793 + }, + { + "epoch": 0.6166118079648905, + "grad_norm": 0.10945043712854385, + "learning_rate": 0.00032664630074834543, + "loss": 2.6011, + "step": 20794 + }, + { + "epoch": 0.6166414613172019, + "grad_norm": 0.1152147427201271, + "learning_rate": 0.0003266021689692522, + "loss": 2.634, + "step": 20795 + }, + { + "epoch": 0.6166711146695134, + "grad_norm": 0.11434614658355713, + "learning_rate": 0.0003265580387256265, + "loss": 2.6234, + "step": 20796 + }, + { + "epoch": 0.6167007680218248, + "grad_norm": 0.10784940421581268, + "learning_rate": 0.00032651391001785936, + "loss": 2.6767, + "step": 20797 + }, + { + "epoch": 0.6167304213741364, + "grad_norm": 0.11355850100517273, + "learning_rate": 0.0003264697828463411, + "loss": 2.6531, + "step": 20798 + }, + { + "epoch": 0.6167600747264478, + "grad_norm": 0.12072676420211792, + "learning_rate": 0.0003264256572114628, + "loss": 2.6622, + "step": 20799 + }, + { + "epoch": 0.6167897280787593, + "grad_norm": 0.12289215624332428, + "learning_rate": 0.00032638153311361504, + "loss": 2.628, + "step": 20800 + }, + { + "epoch": 0.6168193814310707, + "grad_norm": 0.11377588659524918, + "learning_rate": 0.00032633741055318845, + "loss": 2.6403, + "step": 20801 + }, + { + "epoch": 0.6168490347833823, + "grad_norm": 0.10830322653055191, + "learning_rate": 0.00032629328953057406, + "loss": 2.628, + "step": 20802 + }, + { + "epoch": 0.6168786881356937, + "grad_norm": 0.10221505910158157, + "learning_rate": 0.0003262491700461624, + "loss": 2.6465, + "step": 20803 + }, + { + "epoch": 0.6169083414880052, + "grad_norm": 0.12934096157550812, + "learning_rate": 0.0003262050521003442, + "loss": 2.6613, + "step": 20804 + }, + { + "epoch": 0.6169379948403168, + "grad_norm": 0.12904109060764313, + "learning_rate": 0.00032616093569351, + "loss": 2.6463, + "step": 20805 + }, + { + "epoch": 0.6169676481926282, + "grad_norm": 0.111246258020401, + "learning_rate": 0.0003261168208260507, + "loss": 2.6393, + "step": 20806 + }, + { + "epoch": 0.6169973015449397, + "grad_norm": 0.115772545337677, + "learning_rate": 0.00032607270749835683, + "loss": 2.6011, + "step": 20807 + }, + { + "epoch": 0.6170269548972511, + "grad_norm": 0.12944146990776062, + "learning_rate": 0.00032602859571081885, + "loss": 2.6053, + "step": 20808 + }, + { + "epoch": 0.6170566082495627, + "grad_norm": 0.11301252990961075, + "learning_rate": 0.00032598448546382753, + "loss": 2.6452, + "step": 20809 + }, + { + "epoch": 0.6170862616018741, + "grad_norm": 0.13550004363059998, + "learning_rate": 0.00032594037675777346, + "loss": 2.625, + "step": 20810 + }, + { + "epoch": 0.6171159149541856, + "grad_norm": 0.10892129689455032, + "learning_rate": 0.00032589626959304727, + "loss": 2.6116, + "step": 20811 + }, + { + "epoch": 0.617145568306497, + "grad_norm": 0.1222035214304924, + "learning_rate": 0.0003258521639700395, + "loss": 2.6045, + "step": 20812 + }, + { + "epoch": 0.6171752216588086, + "grad_norm": 0.136841282248497, + "learning_rate": 0.00032580805988914065, + "loss": 2.6156, + "step": 20813 + }, + { + "epoch": 0.61720487501112, + "grad_norm": 0.14054033160209656, + "learning_rate": 0.0003257639573507413, + "loss": 2.6475, + "step": 20814 + }, + { + "epoch": 0.6172345283634315, + "grad_norm": 0.13806185126304626, + "learning_rate": 0.00032571985635523204, + "loss": 2.6584, + "step": 20815 + }, + { + "epoch": 0.6172641817157429, + "grad_norm": 0.14542131125926971, + "learning_rate": 0.00032567575690300337, + "loss": 2.5937, + "step": 20816 + }, + { + "epoch": 0.6172938350680545, + "grad_norm": 0.12264104932546616, + "learning_rate": 0.0003256316589944458, + "loss": 2.5844, + "step": 20817 + }, + { + "epoch": 0.6173234884203659, + "grad_norm": 0.14772242307662964, + "learning_rate": 0.00032558756262994997, + "loss": 2.6051, + "step": 20818 + }, + { + "epoch": 0.6173531417726774, + "grad_norm": 0.11997762322425842, + "learning_rate": 0.00032554346780990607, + "loss": 2.6573, + "step": 20819 + }, + { + "epoch": 0.6173827951249888, + "grad_norm": 0.11745576560497284, + "learning_rate": 0.0003254993745347047, + "loss": 2.6307, + "step": 20820 + }, + { + "epoch": 0.6174124484773004, + "grad_norm": 0.1259080320596695, + "learning_rate": 0.0003254552828047364, + "loss": 2.6697, + "step": 20821 + }, + { + "epoch": 0.6174421018296118, + "grad_norm": 0.11148478835821152, + "learning_rate": 0.0003254111926203915, + "loss": 2.6145, + "step": 20822 + }, + { + "epoch": 0.6174717551819233, + "grad_norm": 0.1041918396949768, + "learning_rate": 0.0003253671039820604, + "loss": 2.6132, + "step": 20823 + }, + { + "epoch": 0.6175014085342347, + "grad_norm": 0.11052165925502777, + "learning_rate": 0.0003253230168901337, + "loss": 2.6336, + "step": 20824 + }, + { + "epoch": 0.6175310618865463, + "grad_norm": 0.11134955286979675, + "learning_rate": 0.0003252789313450016, + "loss": 2.6369, + "step": 20825 + }, + { + "epoch": 0.6175607152388578, + "grad_norm": 0.11132687330245972, + "learning_rate": 0.0003252348473470545, + "loss": 2.6171, + "step": 20826 + }, + { + "epoch": 0.6175903685911692, + "grad_norm": 0.10751403123140335, + "learning_rate": 0.00032519076489668266, + "loss": 2.6583, + "step": 20827 + }, + { + "epoch": 0.6176200219434808, + "grad_norm": 0.10939173400402069, + "learning_rate": 0.00032514668399427686, + "loss": 2.6142, + "step": 20828 + }, + { + "epoch": 0.6176496752957922, + "grad_norm": 0.11337452381849289, + "learning_rate": 0.0003251026046402271, + "loss": 2.6359, + "step": 20829 + }, + { + "epoch": 0.6176793286481037, + "grad_norm": 0.11087416857481003, + "learning_rate": 0.00032505852683492386, + "loss": 2.6112, + "step": 20830 + }, + { + "epoch": 0.6177089820004151, + "grad_norm": 0.1129227727651596, + "learning_rate": 0.00032501445057875734, + "loss": 2.6048, + "step": 20831 + }, + { + "epoch": 0.6177386353527267, + "grad_norm": 0.10742000490427017, + "learning_rate": 0.00032497037587211794, + "loss": 2.6393, + "step": 20832 + }, + { + "epoch": 0.6177682887050381, + "grad_norm": 0.13020025193691254, + "learning_rate": 0.0003249263027153958, + "loss": 2.6684, + "step": 20833 + }, + { + "epoch": 0.6177979420573496, + "grad_norm": 0.11778157204389572, + "learning_rate": 0.0003248822311089814, + "loss": 2.6105, + "step": 20834 + }, + { + "epoch": 0.617827595409661, + "grad_norm": 0.09812808781862259, + "learning_rate": 0.000324838161053265, + "loss": 2.629, + "step": 20835 + }, + { + "epoch": 0.6178572487619726, + "grad_norm": 0.12996354699134827, + "learning_rate": 0.00032479409254863655, + "loss": 2.6583, + "step": 20836 + }, + { + "epoch": 0.617886902114284, + "grad_norm": 0.13631901144981384, + "learning_rate": 0.0003247500255954866, + "loss": 2.642, + "step": 20837 + }, + { + "epoch": 0.6179165554665955, + "grad_norm": 0.12365827709436417, + "learning_rate": 0.00032470596019420524, + "loss": 2.6262, + "step": 20838 + }, + { + "epoch": 0.6179462088189069, + "grad_norm": 0.12102431803941727, + "learning_rate": 0.0003246618963451826, + "loss": 2.6202, + "step": 20839 + }, + { + "epoch": 0.6179758621712185, + "grad_norm": 0.1334386169910431, + "learning_rate": 0.00032461783404880894, + "loss": 2.62, + "step": 20840 + }, + { + "epoch": 0.6180055155235299, + "grad_norm": 0.11022397130727768, + "learning_rate": 0.00032457377330547454, + "loss": 2.6368, + "step": 20841 + }, + { + "epoch": 0.6180351688758414, + "grad_norm": 0.1181471049785614, + "learning_rate": 0.0003245297141155695, + "loss": 2.6623, + "step": 20842 + }, + { + "epoch": 0.6180648222281528, + "grad_norm": 0.12909311056137085, + "learning_rate": 0.000324485656479484, + "loss": 2.6316, + "step": 20843 + }, + { + "epoch": 0.6180944755804644, + "grad_norm": 0.10712800920009613, + "learning_rate": 0.00032444160039760806, + "loss": 2.6332, + "step": 20844 + }, + { + "epoch": 0.6181241289327758, + "grad_norm": 0.1431119740009308, + "learning_rate": 0.00032439754587033204, + "loss": 2.6385, + "step": 20845 + }, + { + "epoch": 0.6181537822850873, + "grad_norm": 0.12158521264791489, + "learning_rate": 0.0003243534928980458, + "loss": 2.6133, + "step": 20846 + }, + { + "epoch": 0.6181834356373989, + "grad_norm": 0.11007620394229889, + "learning_rate": 0.0003243094414811395, + "loss": 2.5862, + "step": 20847 + }, + { + "epoch": 0.6182130889897103, + "grad_norm": 0.10674756020307541, + "learning_rate": 0.00032426539162000326, + "loss": 2.6267, + "step": 20848 + }, + { + "epoch": 0.6182427423420218, + "grad_norm": 0.12699352204799652, + "learning_rate": 0.0003242213433150271, + "loss": 2.615, + "step": 20849 + }, + { + "epoch": 0.6182723956943332, + "grad_norm": 0.11450223624706268, + "learning_rate": 0.00032417729656660123, + "loss": 2.6461, + "step": 20850 + }, + { + "epoch": 0.6183020490466448, + "grad_norm": 0.11559164524078369, + "learning_rate": 0.00032413325137511543, + "loss": 2.619, + "step": 20851 + }, + { + "epoch": 0.6183317023989562, + "grad_norm": 0.11436299979686737, + "learning_rate": 0.0003240892077409601, + "loss": 2.633, + "step": 20852 + }, + { + "epoch": 0.6183613557512677, + "grad_norm": 0.11469656974077225, + "learning_rate": 0.0003240451656645247, + "loss": 2.6313, + "step": 20853 + }, + { + "epoch": 0.6183910091035791, + "grad_norm": 0.11545895040035248, + "learning_rate": 0.00032400112514619975, + "loss": 2.6463, + "step": 20854 + }, + { + "epoch": 0.6184206624558907, + "grad_norm": 0.09799302369356155, + "learning_rate": 0.000323957086186375, + "loss": 2.6641, + "step": 20855 + }, + { + "epoch": 0.6184503158082021, + "grad_norm": 0.11314231902360916, + "learning_rate": 0.0003239130487854406, + "loss": 2.6317, + "step": 20856 + }, + { + "epoch": 0.6184799691605136, + "grad_norm": 0.10545554012060165, + "learning_rate": 0.00032386901294378625, + "loss": 2.6297, + "step": 20857 + }, + { + "epoch": 0.618509622512825, + "grad_norm": 0.10679154098033905, + "learning_rate": 0.00032382497866180205, + "loss": 2.5974, + "step": 20858 + }, + { + "epoch": 0.6185392758651366, + "grad_norm": 0.10391275584697723, + "learning_rate": 0.00032378094593987793, + "loss": 2.6531, + "step": 20859 + }, + { + "epoch": 0.618568929217448, + "grad_norm": 0.11773812770843506, + "learning_rate": 0.0003237369147784037, + "loss": 2.6181, + "step": 20860 + }, + { + "epoch": 0.6185985825697595, + "grad_norm": 0.1028226688504219, + "learning_rate": 0.00032369288517776945, + "loss": 2.6335, + "step": 20861 + }, + { + "epoch": 0.618628235922071, + "grad_norm": 0.11447261273860931, + "learning_rate": 0.000323648857138365, + "loss": 2.6296, + "step": 20862 + }, + { + "epoch": 0.6186578892743825, + "grad_norm": 0.09758199006319046, + "learning_rate": 0.00032360483066058016, + "loss": 2.6337, + "step": 20863 + }, + { + "epoch": 0.6186875426266939, + "grad_norm": 0.11372588574886322, + "learning_rate": 0.0003235608057448047, + "loss": 2.6685, + "step": 20864 + }, + { + "epoch": 0.6187171959790054, + "grad_norm": 0.11499785631895065, + "learning_rate": 0.00032351678239142875, + "loss": 2.6457, + "step": 20865 + }, + { + "epoch": 0.6187468493313169, + "grad_norm": 0.11420978605747223, + "learning_rate": 0.00032347276060084175, + "loss": 2.643, + "step": 20866 + }, + { + "epoch": 0.6187765026836284, + "grad_norm": 0.11915170401334763, + "learning_rate": 0.00032342874037343395, + "loss": 2.6471, + "step": 20867 + }, + { + "epoch": 0.6188061560359399, + "grad_norm": 0.10373729467391968, + "learning_rate": 0.0003233847217095949, + "loss": 2.6024, + "step": 20868 + }, + { + "epoch": 0.6188358093882513, + "grad_norm": 0.11278592795133591, + "learning_rate": 0.0003233407046097144, + "loss": 2.6283, + "step": 20869 + }, + { + "epoch": 0.6188654627405629, + "grad_norm": 0.0984707623720169, + "learning_rate": 0.0003232966890741824, + "loss": 2.6301, + "step": 20870 + }, + { + "epoch": 0.6188951160928743, + "grad_norm": 0.11611822247505188, + "learning_rate": 0.0003232526751033885, + "loss": 2.6615, + "step": 20871 + }, + { + "epoch": 0.6189247694451858, + "grad_norm": 0.0979638397693634, + "learning_rate": 0.00032320866269772256, + "loss": 2.6932, + "step": 20872 + }, + { + "epoch": 0.6189544227974972, + "grad_norm": 0.10698915272951126, + "learning_rate": 0.0003231646518575743, + "loss": 2.6505, + "step": 20873 + }, + { + "epoch": 0.6189840761498088, + "grad_norm": 0.09669168293476105, + "learning_rate": 0.0003231206425833333, + "loss": 2.5833, + "step": 20874 + }, + { + "epoch": 0.6190137295021202, + "grad_norm": 0.10663309693336487, + "learning_rate": 0.00032307663487538934, + "loss": 2.6368, + "step": 20875 + }, + { + "epoch": 0.6190433828544317, + "grad_norm": 0.1180511862039566, + "learning_rate": 0.0003230326287341322, + "loss": 2.6387, + "step": 20876 + }, + { + "epoch": 0.6190730362067431, + "grad_norm": 0.10076964646577835, + "learning_rate": 0.00032298862415995144, + "loss": 2.6505, + "step": 20877 + }, + { + "epoch": 0.6191026895590547, + "grad_norm": 0.10824981331825256, + "learning_rate": 0.00032294462115323684, + "loss": 2.6575, + "step": 20878 + }, + { + "epoch": 0.6191323429113661, + "grad_norm": 0.1197732537984848, + "learning_rate": 0.00032290061971437795, + "loss": 2.6438, + "step": 20879 + }, + { + "epoch": 0.6191619962636776, + "grad_norm": 0.11832979321479797, + "learning_rate": 0.0003228566198437646, + "loss": 2.6346, + "step": 20880 + }, + { + "epoch": 0.619191649615989, + "grad_norm": 0.10361883044242859, + "learning_rate": 0.00032281262154178615, + "loss": 2.6488, + "step": 20881 + }, + { + "epoch": 0.6192213029683006, + "grad_norm": 0.11666225641965866, + "learning_rate": 0.00032276862480883246, + "loss": 2.6147, + "step": 20882 + }, + { + "epoch": 0.619250956320612, + "grad_norm": 0.11823893338441849, + "learning_rate": 0.0003227246296452931, + "loss": 2.6406, + "step": 20883 + }, + { + "epoch": 0.6192806096729235, + "grad_norm": 0.12390301376581192, + "learning_rate": 0.0003226806360515574, + "loss": 2.6202, + "step": 20884 + }, + { + "epoch": 0.619310263025235, + "grad_norm": 0.10666754841804504, + "learning_rate": 0.00032263664402801517, + "loss": 2.6427, + "step": 20885 + }, + { + "epoch": 0.6193399163775465, + "grad_norm": 0.10652060061693192, + "learning_rate": 0.0003225926535750559, + "loss": 2.6208, + "step": 20886 + }, + { + "epoch": 0.6193695697298579, + "grad_norm": 0.10772541165351868, + "learning_rate": 0.00032254866469306917, + "loss": 2.6051, + "step": 20887 + }, + { + "epoch": 0.6193992230821694, + "grad_norm": 0.10077480226755142, + "learning_rate": 0.0003225046773824444, + "loss": 2.6365, + "step": 20888 + }, + { + "epoch": 0.619428876434481, + "grad_norm": 0.12539857625961304, + "learning_rate": 0.00032246069164357125, + "loss": 2.5862, + "step": 20889 + }, + { + "epoch": 0.6194585297867924, + "grad_norm": 0.11909990012645721, + "learning_rate": 0.00032241670747683917, + "loss": 2.6365, + "step": 20890 + }, + { + "epoch": 0.6194881831391039, + "grad_norm": 0.11954722553491592, + "learning_rate": 0.00032237272488263755, + "loss": 2.652, + "step": 20891 + }, + { + "epoch": 0.6195178364914153, + "grad_norm": 0.11428221315145493, + "learning_rate": 0.0003223287438613558, + "loss": 2.609, + "step": 20892 + }, + { + "epoch": 0.6195474898437269, + "grad_norm": 0.10794169455766678, + "learning_rate": 0.0003222847644133836, + "loss": 2.6164, + "step": 20893 + }, + { + "epoch": 0.6195771431960383, + "grad_norm": 0.12853649258613586, + "learning_rate": 0.00032224078653911054, + "loss": 2.6287, + "step": 20894 + }, + { + "epoch": 0.6196067965483498, + "grad_norm": 0.12013480067253113, + "learning_rate": 0.00032219681023892567, + "loss": 2.6511, + "step": 20895 + }, + { + "epoch": 0.6196364499006612, + "grad_norm": 0.1099371388554573, + "learning_rate": 0.0003221528355132186, + "loss": 2.6271, + "step": 20896 + }, + { + "epoch": 0.6196661032529728, + "grad_norm": 0.11711028963327408, + "learning_rate": 0.0003221088623623787, + "loss": 2.6272, + "step": 20897 + }, + { + "epoch": 0.6196957566052842, + "grad_norm": 0.10031626373529434, + "learning_rate": 0.0003220648907867953, + "loss": 2.6148, + "step": 20898 + }, + { + "epoch": 0.6197254099575957, + "grad_norm": 0.11658412218093872, + "learning_rate": 0.00032202092078685785, + "loss": 2.6257, + "step": 20899 + }, + { + "epoch": 0.6197550633099072, + "grad_norm": 0.10088774561882019, + "learning_rate": 0.0003219769523629558, + "loss": 2.6105, + "step": 20900 + }, + { + "epoch": 0.6197847166622187, + "grad_norm": 0.11229779571294785, + "learning_rate": 0.00032193298551547834, + "loss": 2.6395, + "step": 20901 + }, + { + "epoch": 0.6198143700145301, + "grad_norm": 0.10769766569137573, + "learning_rate": 0.00032188902024481484, + "loss": 2.6592, + "step": 20902 + }, + { + "epoch": 0.6198440233668416, + "grad_norm": 0.098167784512043, + "learning_rate": 0.00032184505655135455, + "loss": 2.6324, + "step": 20903 + }, + { + "epoch": 0.619873676719153, + "grad_norm": 0.10724135488271713, + "learning_rate": 0.000321801094435487, + "loss": 2.5986, + "step": 20904 + }, + { + "epoch": 0.6199033300714646, + "grad_norm": 0.10402119904756546, + "learning_rate": 0.00032175713389760117, + "loss": 2.6356, + "step": 20905 + }, + { + "epoch": 0.619932983423776, + "grad_norm": 0.1086416244506836, + "learning_rate": 0.0003217131749380866, + "loss": 2.6559, + "step": 20906 + }, + { + "epoch": 0.6199626367760875, + "grad_norm": 0.11581888049840927, + "learning_rate": 0.00032166921755733246, + "loss": 2.6201, + "step": 20907 + }, + { + "epoch": 0.6199922901283991, + "grad_norm": 0.10483339428901672, + "learning_rate": 0.0003216252617557281, + "loss": 2.6295, + "step": 20908 + }, + { + "epoch": 0.6200219434807105, + "grad_norm": 0.11129432916641235, + "learning_rate": 0.0003215813075336625, + "loss": 2.6594, + "step": 20909 + }, + { + "epoch": 0.620051596833022, + "grad_norm": 0.09901795536279678, + "learning_rate": 0.00032153735489152523, + "loss": 2.6381, + "step": 20910 + }, + { + "epoch": 0.6200812501853334, + "grad_norm": 0.10478437691926956, + "learning_rate": 0.00032149340382970536, + "loss": 2.6145, + "step": 20911 + }, + { + "epoch": 0.620110903537645, + "grad_norm": 0.11746252328157425, + "learning_rate": 0.0003214494543485919, + "loss": 2.627, + "step": 20912 + }, + { + "epoch": 0.6201405568899564, + "grad_norm": 0.126661017537117, + "learning_rate": 0.0003214055064485742, + "loss": 2.6254, + "step": 20913 + }, + { + "epoch": 0.6201702102422679, + "grad_norm": 0.1416163146495819, + "learning_rate": 0.00032136156013004135, + "loss": 2.6763, + "step": 20914 + }, + { + "epoch": 0.6201998635945793, + "grad_norm": 0.12136154621839523, + "learning_rate": 0.00032131761539338263, + "loss": 2.6277, + "step": 20915 + }, + { + "epoch": 0.6202295169468909, + "grad_norm": 0.11264915019273758, + "learning_rate": 0.0003212736722389871, + "loss": 2.6079, + "step": 20916 + }, + { + "epoch": 0.6202591702992023, + "grad_norm": 0.12034939974546432, + "learning_rate": 0.0003212297306672437, + "loss": 2.6081, + "step": 20917 + }, + { + "epoch": 0.6202888236515138, + "grad_norm": 0.14782744646072388, + "learning_rate": 0.00032118579067854183, + "loss": 2.6341, + "step": 20918 + }, + { + "epoch": 0.6203184770038253, + "grad_norm": 0.12730664014816284, + "learning_rate": 0.00032114185227327045, + "loss": 2.6285, + "step": 20919 + }, + { + "epoch": 0.6203481303561368, + "grad_norm": 0.10783283412456512, + "learning_rate": 0.0003210979154518188, + "loss": 2.6581, + "step": 20920 + }, + { + "epoch": 0.6203777837084482, + "grad_norm": 0.11965364962816238, + "learning_rate": 0.00032105398021457576, + "loss": 2.6469, + "step": 20921 + }, + { + "epoch": 0.6204074370607597, + "grad_norm": 0.11189016699790955, + "learning_rate": 0.00032101004656193046, + "loss": 2.6218, + "step": 20922 + }, + { + "epoch": 0.6204370904130712, + "grad_norm": 0.09695324301719666, + "learning_rate": 0.0003209661144942718, + "loss": 2.6444, + "step": 20923 + }, + { + "epoch": 0.6204667437653827, + "grad_norm": 0.12224820256233215, + "learning_rate": 0.00032092218401198904, + "loss": 2.6292, + "step": 20924 + }, + { + "epoch": 0.6204963971176941, + "grad_norm": 0.10920495539903641, + "learning_rate": 0.0003208782551154711, + "loss": 2.6028, + "step": 20925 + }, + { + "epoch": 0.6205260504700056, + "grad_norm": 0.11662470549345016, + "learning_rate": 0.00032083432780510683, + "loss": 2.6281, + "step": 20926 + }, + { + "epoch": 0.6205557038223171, + "grad_norm": 0.10217953473329544, + "learning_rate": 0.0003207904020812854, + "loss": 2.6192, + "step": 20927 + }, + { + "epoch": 0.6205853571746286, + "grad_norm": 0.12114495038986206, + "learning_rate": 0.0003207464779443958, + "loss": 2.6247, + "step": 20928 + }, + { + "epoch": 0.6206150105269401, + "grad_norm": 0.12180367112159729, + "learning_rate": 0.00032070255539482683, + "loss": 2.6371, + "step": 20929 + }, + { + "epoch": 0.6206446638792515, + "grad_norm": 0.11722706258296967, + "learning_rate": 0.0003206586344329674, + "loss": 2.6441, + "step": 20930 + }, + { + "epoch": 0.6206743172315631, + "grad_norm": 0.10745847970247269, + "learning_rate": 0.0003206147150592066, + "loss": 2.6227, + "step": 20931 + }, + { + "epoch": 0.6207039705838745, + "grad_norm": 0.10515718162059784, + "learning_rate": 0.00032057079727393345, + "loss": 2.5932, + "step": 20932 + }, + { + "epoch": 0.620733623936186, + "grad_norm": 0.1109580397605896, + "learning_rate": 0.0003205268810775366, + "loss": 2.632, + "step": 20933 + }, + { + "epoch": 0.6207632772884975, + "grad_norm": 0.09835297614336014, + "learning_rate": 0.000320482966470405, + "loss": 2.6578, + "step": 20934 + }, + { + "epoch": 0.620792930640809, + "grad_norm": 0.12035878002643585, + "learning_rate": 0.0003204390534529276, + "loss": 2.6484, + "step": 20935 + }, + { + "epoch": 0.6208225839931204, + "grad_norm": 0.11742962151765823, + "learning_rate": 0.00032039514202549316, + "loss": 2.6381, + "step": 20936 + }, + { + "epoch": 0.6208522373454319, + "grad_norm": 0.09223505109548569, + "learning_rate": 0.0003203512321884905, + "loss": 2.6446, + "step": 20937 + }, + { + "epoch": 0.6208818906977434, + "grad_norm": 0.11354994028806686, + "learning_rate": 0.0003203073239423087, + "loss": 2.6634, + "step": 20938 + }, + { + "epoch": 0.6209115440500549, + "grad_norm": 0.12535427510738373, + "learning_rate": 0.00032026341728733624, + "loss": 2.6166, + "step": 20939 + }, + { + "epoch": 0.6209411974023663, + "grad_norm": 0.10564722865819931, + "learning_rate": 0.0003202195122239622, + "loss": 2.6573, + "step": 20940 + }, + { + "epoch": 0.6209708507546778, + "grad_norm": 0.0981503501534462, + "learning_rate": 0.00032017560875257506, + "loss": 2.6414, + "step": 20941 + }, + { + "epoch": 0.6210005041069893, + "grad_norm": 0.10904113948345184, + "learning_rate": 0.00032013170687356383, + "loss": 2.6458, + "step": 20942 + }, + { + "epoch": 0.6210301574593008, + "grad_norm": 0.10571597516536713, + "learning_rate": 0.00032008780658731716, + "loss": 2.5975, + "step": 20943 + }, + { + "epoch": 0.6210598108116122, + "grad_norm": 0.10061835497617722, + "learning_rate": 0.0003200439078942239, + "loss": 2.6261, + "step": 20944 + }, + { + "epoch": 0.6210894641639237, + "grad_norm": 0.10429568588733673, + "learning_rate": 0.0003200000107946728, + "loss": 2.612, + "step": 20945 + }, + { + "epoch": 0.6211191175162352, + "grad_norm": 0.10281025618314743, + "learning_rate": 0.0003199561152890524, + "loss": 2.6548, + "step": 20946 + }, + { + "epoch": 0.6211487708685467, + "grad_norm": 0.10299179702997208, + "learning_rate": 0.0003199122213777516, + "loss": 2.6207, + "step": 20947 + }, + { + "epoch": 0.6211784242208581, + "grad_norm": 0.11245273798704147, + "learning_rate": 0.00031986832906115886, + "loss": 2.6106, + "step": 20948 + }, + { + "epoch": 0.6212080775731696, + "grad_norm": 0.10357317328453064, + "learning_rate": 0.00031982443833966314, + "loss": 2.6588, + "step": 20949 + }, + { + "epoch": 0.6212377309254812, + "grad_norm": 0.1174035593867302, + "learning_rate": 0.0003197805492136529, + "loss": 2.6235, + "step": 20950 + }, + { + "epoch": 0.6212673842777926, + "grad_norm": 0.11969736963510513, + "learning_rate": 0.0003197366616835168, + "loss": 2.6163, + "step": 20951 + }, + { + "epoch": 0.6212970376301041, + "grad_norm": 0.09680696576833725, + "learning_rate": 0.0003196927757496435, + "loss": 2.6301, + "step": 20952 + }, + { + "epoch": 0.6213266909824156, + "grad_norm": 0.11008625477552414, + "learning_rate": 0.0003196488914124216, + "loss": 2.6107, + "step": 20953 + }, + { + "epoch": 0.6213563443347271, + "grad_norm": 0.09533771872520447, + "learning_rate": 0.0003196050086722397, + "loss": 2.6138, + "step": 20954 + }, + { + "epoch": 0.6213859976870385, + "grad_norm": 0.11290746182203293, + "learning_rate": 0.00031956112752948664, + "loss": 2.6106, + "step": 20955 + }, + { + "epoch": 0.62141565103935, + "grad_norm": 0.09950768202543259, + "learning_rate": 0.00031951724798455033, + "loss": 2.6569, + "step": 20956 + }, + { + "epoch": 0.6214453043916615, + "grad_norm": 0.12385091185569763, + "learning_rate": 0.00031947337003782007, + "loss": 2.6486, + "step": 20957 + }, + { + "epoch": 0.621474957743973, + "grad_norm": 0.12159328162670135, + "learning_rate": 0.000319429493689684, + "loss": 2.6295, + "step": 20958 + }, + { + "epoch": 0.6215046110962844, + "grad_norm": 0.12047483026981354, + "learning_rate": 0.0003193856189405309, + "loss": 2.6434, + "step": 20959 + }, + { + "epoch": 0.6215342644485959, + "grad_norm": 0.10792935639619827, + "learning_rate": 0.0003193417457907491, + "loss": 2.654, + "step": 20960 + }, + { + "epoch": 0.6215639178009074, + "grad_norm": 0.10888496041297913, + "learning_rate": 0.0003192978742407271, + "loss": 2.6274, + "step": 20961 + }, + { + "epoch": 0.6215935711532189, + "grad_norm": 0.10315300524234772, + "learning_rate": 0.0003192540042908534, + "loss": 2.6528, + "step": 20962 + }, + { + "epoch": 0.6216232245055303, + "grad_norm": 0.1030624508857727, + "learning_rate": 0.0003192101359415166, + "loss": 2.6285, + "step": 20963 + }, + { + "epoch": 0.6216528778578418, + "grad_norm": 0.10639873892068863, + "learning_rate": 0.0003191662691931051, + "loss": 2.6259, + "step": 20964 + }, + { + "epoch": 0.6216825312101533, + "grad_norm": 0.10734270513057709, + "learning_rate": 0.00031912240404600724, + "loss": 2.6503, + "step": 20965 + }, + { + "epoch": 0.6217121845624648, + "grad_norm": 0.11371998488903046, + "learning_rate": 0.0003190785405006117, + "loss": 2.5991, + "step": 20966 + }, + { + "epoch": 0.6217418379147762, + "grad_norm": 0.10933182388544083, + "learning_rate": 0.00031903467855730664, + "loss": 2.6185, + "step": 20967 + }, + { + "epoch": 0.6217714912670878, + "grad_norm": 0.12280973047018051, + "learning_rate": 0.00031899081821648047, + "loss": 2.6309, + "step": 20968 + }, + { + "epoch": 0.6218011446193992, + "grad_norm": 0.1083536371588707, + "learning_rate": 0.00031894695947852156, + "loss": 2.6433, + "step": 20969 + }, + { + "epoch": 0.6218307979717107, + "grad_norm": 0.11210627853870392, + "learning_rate": 0.0003189031023438187, + "loss": 2.6348, + "step": 20970 + }, + { + "epoch": 0.6218604513240222, + "grad_norm": 0.11850232630968094, + "learning_rate": 0.0003188592468127598, + "loss": 2.6466, + "step": 20971 + }, + { + "epoch": 0.6218901046763337, + "grad_norm": 0.12242510914802551, + "learning_rate": 0.00031881539288573335, + "loss": 2.6264, + "step": 20972 + }, + { + "epoch": 0.6219197580286452, + "grad_norm": 0.10872266441583633, + "learning_rate": 0.0003187715405631278, + "loss": 2.6723, + "step": 20973 + }, + { + "epoch": 0.6219494113809566, + "grad_norm": 0.1280432492494583, + "learning_rate": 0.0003187276898453313, + "loss": 2.6837, + "step": 20974 + }, + { + "epoch": 0.6219790647332681, + "grad_norm": 0.13021208345890045, + "learning_rate": 0.00031868384073273224, + "loss": 2.6417, + "step": 20975 + }, + { + "epoch": 0.6220087180855796, + "grad_norm": 0.12103666365146637, + "learning_rate": 0.00031863999322571897, + "loss": 2.6045, + "step": 20976 + }, + { + "epoch": 0.6220383714378911, + "grad_norm": 0.12408773601055145, + "learning_rate": 0.00031859614732467957, + "loss": 2.6478, + "step": 20977 + }, + { + "epoch": 0.6220680247902025, + "grad_norm": 0.12379555404186249, + "learning_rate": 0.0003185523030300025, + "loss": 2.6327, + "step": 20978 + }, + { + "epoch": 0.622097678142514, + "grad_norm": 0.11011034995317459, + "learning_rate": 0.0003185084603420759, + "loss": 2.6561, + "step": 20979 + }, + { + "epoch": 0.6221273314948255, + "grad_norm": 0.11017968505620956, + "learning_rate": 0.000318464619261288, + "loss": 2.6159, + "step": 20980 + }, + { + "epoch": 0.622156984847137, + "grad_norm": 0.1278398036956787, + "learning_rate": 0.0003184207797880271, + "loss": 2.6296, + "step": 20981 + }, + { + "epoch": 0.6221866381994484, + "grad_norm": 0.1253087967634201, + "learning_rate": 0.00031837694192268117, + "loss": 2.6579, + "step": 20982 + }, + { + "epoch": 0.62221629155176, + "grad_norm": 0.10079918801784515, + "learning_rate": 0.0003183331056656388, + "loss": 2.6299, + "step": 20983 + }, + { + "epoch": 0.6222459449040714, + "grad_norm": 0.13752029836177826, + "learning_rate": 0.00031828927101728796, + "loss": 2.6496, + "step": 20984 + }, + { + "epoch": 0.6222755982563829, + "grad_norm": 0.12221498042345047, + "learning_rate": 0.00031824543797801674, + "loss": 2.6073, + "step": 20985 + }, + { + "epoch": 0.6223052516086943, + "grad_norm": 0.10576559603214264, + "learning_rate": 0.0003182016065482134, + "loss": 2.6062, + "step": 20986 + }, + { + "epoch": 0.6223349049610059, + "grad_norm": 0.1347026824951172, + "learning_rate": 0.0003181577767282662, + "loss": 2.638, + "step": 20987 + }, + { + "epoch": 0.6223645583133173, + "grad_norm": 0.11502570658922195, + "learning_rate": 0.0003181139485185629, + "loss": 2.6128, + "step": 20988 + }, + { + "epoch": 0.6223942116656288, + "grad_norm": 0.11715319752693176, + "learning_rate": 0.00031807012191949186, + "loss": 2.6653, + "step": 20989 + }, + { + "epoch": 0.6224238650179402, + "grad_norm": 0.11430484801530838, + "learning_rate": 0.00031802629693144114, + "loss": 2.6409, + "step": 20990 + }, + { + "epoch": 0.6224535183702518, + "grad_norm": 0.12033027410507202, + "learning_rate": 0.00031798247355479875, + "loss": 2.6545, + "step": 20991 + }, + { + "epoch": 0.6224831717225633, + "grad_norm": 0.11484316736459732, + "learning_rate": 0.0003179386517899528, + "loss": 2.6601, + "step": 20992 + }, + { + "epoch": 0.6225128250748747, + "grad_norm": 0.11186017841100693, + "learning_rate": 0.0003178948316372915, + "loss": 2.6438, + "step": 20993 + }, + { + "epoch": 0.6225424784271862, + "grad_norm": 0.11785659193992615, + "learning_rate": 0.00031785101309720254, + "loss": 2.6343, + "step": 20994 + }, + { + "epoch": 0.6225721317794977, + "grad_norm": 0.11717693507671356, + "learning_rate": 0.000317807196170074, + "loss": 2.6449, + "step": 20995 + }, + { + "epoch": 0.6226017851318092, + "grad_norm": 0.11665138602256775, + "learning_rate": 0.00031776338085629413, + "loss": 2.628, + "step": 20996 + }, + { + "epoch": 0.6226314384841206, + "grad_norm": 0.10827341675758362, + "learning_rate": 0.00031771956715625094, + "loss": 2.6409, + "step": 20997 + }, + { + "epoch": 0.6226610918364321, + "grad_norm": 0.12615610659122467, + "learning_rate": 0.00031767575507033217, + "loss": 2.6217, + "step": 20998 + }, + { + "epoch": 0.6226907451887436, + "grad_norm": 0.10436141490936279, + "learning_rate": 0.00031763194459892596, + "loss": 2.6258, + "step": 20999 + }, + { + "epoch": 0.6227203985410551, + "grad_norm": 0.12785205245018005, + "learning_rate": 0.00031758813574242007, + "loss": 2.66, + "step": 21000 + }, + { + "epoch": 0.6227500518933665, + "grad_norm": 0.12672902643680573, + "learning_rate": 0.00031754432850120265, + "loss": 2.6125, + "step": 21001 + }, + { + "epoch": 0.622779705245678, + "grad_norm": 0.1280031055212021, + "learning_rate": 0.00031750052287566146, + "loss": 2.6207, + "step": 21002 + }, + { + "epoch": 0.6228093585979895, + "grad_norm": 0.13823312520980835, + "learning_rate": 0.00031745671886618444, + "loss": 2.6111, + "step": 21003 + }, + { + "epoch": 0.622839011950301, + "grad_norm": 0.1351819783449173, + "learning_rate": 0.0003174129164731596, + "loss": 2.6523, + "step": 21004 + }, + { + "epoch": 0.6228686653026124, + "grad_norm": 0.11180330812931061, + "learning_rate": 0.0003173691156969747, + "loss": 2.6049, + "step": 21005 + }, + { + "epoch": 0.622898318654924, + "grad_norm": 0.11057370901107788, + "learning_rate": 0.0003173253165380176, + "loss": 2.6466, + "step": 21006 + }, + { + "epoch": 0.6229279720072354, + "grad_norm": 0.13600841164588928, + "learning_rate": 0.0003172815189966762, + "loss": 2.6519, + "step": 21007 + }, + { + "epoch": 0.6229576253595469, + "grad_norm": 0.12321488559246063, + "learning_rate": 0.00031723772307333813, + "loss": 2.616, + "step": 21008 + }, + { + "epoch": 0.6229872787118583, + "grad_norm": 0.10609354823827744, + "learning_rate": 0.0003171939287683916, + "loss": 2.622, + "step": 21009 + }, + { + "epoch": 0.6230169320641699, + "grad_norm": 0.12034282088279724, + "learning_rate": 0.0003171501360822241, + "loss": 2.6088, + "step": 21010 + }, + { + "epoch": 0.6230465854164813, + "grad_norm": 0.11847560852766037, + "learning_rate": 0.00031710634501522363, + "loss": 2.618, + "step": 21011 + }, + { + "epoch": 0.6230762387687928, + "grad_norm": 0.12341517955064774, + "learning_rate": 0.0003170625555677778, + "loss": 2.6211, + "step": 21012 + }, + { + "epoch": 0.6231058921211043, + "grad_norm": 0.1137450635433197, + "learning_rate": 0.0003170187677402745, + "loss": 2.6373, + "step": 21013 + }, + { + "epoch": 0.6231355454734158, + "grad_norm": 0.12005278468132019, + "learning_rate": 0.00031697498153310146, + "loss": 2.6136, + "step": 21014 + }, + { + "epoch": 0.6231651988257273, + "grad_norm": 0.10947871208190918, + "learning_rate": 0.0003169311969466463, + "loss": 2.6259, + "step": 21015 + }, + { + "epoch": 0.6231948521780387, + "grad_norm": 0.10205212980508804, + "learning_rate": 0.00031688741398129685, + "loss": 2.6369, + "step": 21016 + }, + { + "epoch": 0.6232245055303502, + "grad_norm": 0.09961450845003128, + "learning_rate": 0.00031684363263744067, + "loss": 2.6296, + "step": 21017 + }, + { + "epoch": 0.6232541588826617, + "grad_norm": 0.10927985608577728, + "learning_rate": 0.00031679985291546567, + "loss": 2.6455, + "step": 21018 + }, + { + "epoch": 0.6232838122349732, + "grad_norm": 0.11341237276792526, + "learning_rate": 0.00031675607481575934, + "loss": 2.6486, + "step": 21019 + }, + { + "epoch": 0.6233134655872846, + "grad_norm": 0.11147923767566681, + "learning_rate": 0.0003167122983387095, + "loss": 2.6331, + "step": 21020 + }, + { + "epoch": 0.6233431189395962, + "grad_norm": 0.1026613637804985, + "learning_rate": 0.00031666852348470354, + "loss": 2.6706, + "step": 21021 + }, + { + "epoch": 0.6233727722919076, + "grad_norm": 0.1312500685453415, + "learning_rate": 0.00031662475025412943, + "loss": 2.6349, + "step": 21022 + }, + { + "epoch": 0.6234024256442191, + "grad_norm": 0.10481099784374237, + "learning_rate": 0.0003165809786473747, + "loss": 2.6459, + "step": 21023 + }, + { + "epoch": 0.6234320789965305, + "grad_norm": 0.11346852034330368, + "learning_rate": 0.0003165372086648267, + "loss": 2.6094, + "step": 21024 + }, + { + "epoch": 0.6234617323488421, + "grad_norm": 0.1368270069360733, + "learning_rate": 0.0003164934403068734, + "loss": 2.639, + "step": 21025 + }, + { + "epoch": 0.6234913857011535, + "grad_norm": 0.11780919879674911, + "learning_rate": 0.0003164496735739021, + "loss": 2.6161, + "step": 21026 + }, + { + "epoch": 0.623521039053465, + "grad_norm": 0.11883359402418137, + "learning_rate": 0.00031640590846630047, + "loss": 2.6232, + "step": 21027 + }, + { + "epoch": 0.6235506924057764, + "grad_norm": 0.13017325103282928, + "learning_rate": 0.000316362144984456, + "loss": 2.635, + "step": 21028 + }, + { + "epoch": 0.623580345758088, + "grad_norm": 0.12866052985191345, + "learning_rate": 0.00031631838312875626, + "loss": 2.6268, + "step": 21029 + }, + { + "epoch": 0.6236099991103994, + "grad_norm": 0.1283605694770813, + "learning_rate": 0.00031627462289958886, + "loss": 2.6333, + "step": 21030 + }, + { + "epoch": 0.6236396524627109, + "grad_norm": 0.139495387673378, + "learning_rate": 0.00031623086429734113, + "loss": 2.6388, + "step": 21031 + }, + { + "epoch": 0.6236693058150223, + "grad_norm": 0.11699289828538895, + "learning_rate": 0.0003161871073224007, + "loss": 2.6309, + "step": 21032 + }, + { + "epoch": 0.6236989591673339, + "grad_norm": 0.1307706981897354, + "learning_rate": 0.00031614335197515487, + "loss": 2.6225, + "step": 21033 + }, + { + "epoch": 0.6237286125196454, + "grad_norm": 0.12104522436857224, + "learning_rate": 0.0003160995982559911, + "loss": 2.6067, + "step": 21034 + }, + { + "epoch": 0.6237582658719568, + "grad_norm": 0.12085483968257904, + "learning_rate": 0.00031605584616529726, + "loss": 2.6261, + "step": 21035 + }, + { + "epoch": 0.6237879192242684, + "grad_norm": 0.11508067697286606, + "learning_rate": 0.0003160120957034603, + "loss": 2.6382, + "step": 21036 + }, + { + "epoch": 0.6238175725765798, + "grad_norm": 0.11420220881700516, + "learning_rate": 0.0003159683468708678, + "loss": 2.6225, + "step": 21037 + }, + { + "epoch": 0.6238472259288913, + "grad_norm": 0.11220155656337738, + "learning_rate": 0.00031592459966790727, + "loss": 2.6209, + "step": 21038 + }, + { + "epoch": 0.6238768792812027, + "grad_norm": 0.11738713830709457, + "learning_rate": 0.0003158808540949659, + "loss": 2.6193, + "step": 21039 + }, + { + "epoch": 0.6239065326335143, + "grad_norm": 0.10963144153356552, + "learning_rate": 0.0003158371101524313, + "loss": 2.6233, + "step": 21040 + }, + { + "epoch": 0.6239361859858257, + "grad_norm": 0.11327895522117615, + "learning_rate": 0.0003157933678406906, + "loss": 2.6462, + "step": 21041 + }, + { + "epoch": 0.6239658393381372, + "grad_norm": 0.10772188007831573, + "learning_rate": 0.00031574962716013143, + "loss": 2.6159, + "step": 21042 + }, + { + "epoch": 0.6239954926904486, + "grad_norm": 0.10976256430149078, + "learning_rate": 0.0003157058881111408, + "loss": 2.6332, + "step": 21043 + }, + { + "epoch": 0.6240251460427602, + "grad_norm": 0.10493506491184235, + "learning_rate": 0.0003156621506941061, + "loss": 2.6223, + "step": 21044 + }, + { + "epoch": 0.6240547993950716, + "grad_norm": 0.10648465156555176, + "learning_rate": 0.0003156184149094148, + "loss": 2.6302, + "step": 21045 + }, + { + "epoch": 0.6240844527473831, + "grad_norm": 0.10703233629465103, + "learning_rate": 0.000315574680757454, + "loss": 2.6319, + "step": 21046 + }, + { + "epoch": 0.6241141060996945, + "grad_norm": 0.10205525159835815, + "learning_rate": 0.00031553094823861096, + "loss": 2.6286, + "step": 21047 + }, + { + "epoch": 0.6241437594520061, + "grad_norm": 0.09574995189905167, + "learning_rate": 0.00031548721735327317, + "loss": 2.633, + "step": 21048 + }, + { + "epoch": 0.6241734128043175, + "grad_norm": 0.09901615977287292, + "learning_rate": 0.0003154434881018278, + "loss": 2.6342, + "step": 21049 + }, + { + "epoch": 0.624203066156629, + "grad_norm": 0.09109007567167282, + "learning_rate": 0.0003153997604846619, + "loss": 2.6117, + "step": 21050 + }, + { + "epoch": 0.6242327195089404, + "grad_norm": 0.09780381619930267, + "learning_rate": 0.0003153560345021629, + "loss": 2.6375, + "step": 21051 + }, + { + "epoch": 0.624262372861252, + "grad_norm": 0.10768657177686691, + "learning_rate": 0.00031531231015471793, + "loss": 2.5884, + "step": 21052 + }, + { + "epoch": 0.6242920262135634, + "grad_norm": 0.10642726719379425, + "learning_rate": 0.00031526858744271416, + "loss": 2.6121, + "step": 21053 + }, + { + "epoch": 0.6243216795658749, + "grad_norm": 0.09399189800024033, + "learning_rate": 0.0003152248663665387, + "loss": 2.6131, + "step": 21054 + }, + { + "epoch": 0.6243513329181865, + "grad_norm": 0.10139089822769165, + "learning_rate": 0.00031518114692657875, + "loss": 2.6571, + "step": 21055 + }, + { + "epoch": 0.6243809862704979, + "grad_norm": 0.11539500951766968, + "learning_rate": 0.00031513742912322144, + "loss": 2.6502, + "step": 21056 + }, + { + "epoch": 0.6244106396228094, + "grad_norm": 0.12199725955724716, + "learning_rate": 0.00031509371295685394, + "loss": 2.6478, + "step": 21057 + }, + { + "epoch": 0.6244402929751208, + "grad_norm": 0.1313750445842743, + "learning_rate": 0.0003150499984278634, + "loss": 2.6332, + "step": 21058 + }, + { + "epoch": 0.6244699463274324, + "grad_norm": 0.10242941230535507, + "learning_rate": 0.00031500628553663703, + "loss": 2.6429, + "step": 21059 + }, + { + "epoch": 0.6244995996797438, + "grad_norm": 0.12467624247074127, + "learning_rate": 0.00031496257428356136, + "loss": 2.6072, + "step": 21060 + }, + { + "epoch": 0.6245292530320553, + "grad_norm": 0.11351308971643448, + "learning_rate": 0.000314918864669024, + "loss": 2.6349, + "step": 21061 + }, + { + "epoch": 0.6245589063843667, + "grad_norm": 0.12462832033634186, + "learning_rate": 0.00031487515669341193, + "loss": 2.6231, + "step": 21062 + }, + { + "epoch": 0.6245885597366783, + "grad_norm": 0.12480482459068298, + "learning_rate": 0.00031483145035711223, + "loss": 2.6631, + "step": 21063 + }, + { + "epoch": 0.6246182130889897, + "grad_norm": 0.11057393997907639, + "learning_rate": 0.0003147877456605117, + "loss": 2.6636, + "step": 21064 + }, + { + "epoch": 0.6246478664413012, + "grad_norm": 0.11244776099920273, + "learning_rate": 0.0003147440426039975, + "loss": 2.6362, + "step": 21065 + }, + { + "epoch": 0.6246775197936126, + "grad_norm": 0.09974449127912521, + "learning_rate": 0.00031470034118795664, + "loss": 2.6427, + "step": 21066 + }, + { + "epoch": 0.6247071731459242, + "grad_norm": 0.10470641404390335, + "learning_rate": 0.000314656641412776, + "loss": 2.6412, + "step": 21067 + }, + { + "epoch": 0.6247368264982356, + "grad_norm": 0.11157554388046265, + "learning_rate": 0.0003146129432788426, + "loss": 2.6031, + "step": 21068 + }, + { + "epoch": 0.6247664798505471, + "grad_norm": 0.1084650307893753, + "learning_rate": 0.00031456924678654357, + "loss": 2.6609, + "step": 21069 + }, + { + "epoch": 0.6247961332028585, + "grad_norm": 0.10742403566837311, + "learning_rate": 0.00031452555193626553, + "loss": 2.6314, + "step": 21070 + }, + { + "epoch": 0.6248257865551701, + "grad_norm": 0.09463849663734436, + "learning_rate": 0.00031448185872839566, + "loss": 2.6291, + "step": 21071 + }, + { + "epoch": 0.6248554399074815, + "grad_norm": 0.1195271834731102, + "learning_rate": 0.0003144381671633206, + "loss": 2.6234, + "step": 21072 + }, + { + "epoch": 0.624885093259793, + "grad_norm": 0.11723050475120544, + "learning_rate": 0.0003143944772414275, + "loss": 2.6471, + "step": 21073 + }, + { + "epoch": 0.6249147466121044, + "grad_norm": 0.11032339930534363, + "learning_rate": 0.0003143507889631032, + "loss": 2.6299, + "step": 21074 + }, + { + "epoch": 0.624944399964416, + "grad_norm": 0.10364770144224167, + "learning_rate": 0.0003143071023287345, + "loss": 2.6691, + "step": 21075 + }, + { + "epoch": 0.6249740533167275, + "grad_norm": 0.11455920338630676, + "learning_rate": 0.00031426341733870836, + "loss": 2.6228, + "step": 21076 + }, + { + "epoch": 0.6250037066690389, + "grad_norm": 0.11451619118452072, + "learning_rate": 0.00031421973399341143, + "loss": 2.6632, + "step": 21077 + }, + { + "epoch": 0.6250333600213505, + "grad_norm": 0.1009335070848465, + "learning_rate": 0.00031417605229323067, + "loss": 2.6238, + "step": 21078 + }, + { + "epoch": 0.6250630133736619, + "grad_norm": 0.11186210066080093, + "learning_rate": 0.0003141323722385529, + "loss": 2.6433, + "step": 21079 + }, + { + "epoch": 0.6250926667259734, + "grad_norm": 0.10331113636493683, + "learning_rate": 0.00031408869382976504, + "loss": 2.6282, + "step": 21080 + }, + { + "epoch": 0.6251223200782848, + "grad_norm": 0.11021695286035538, + "learning_rate": 0.0003140450170672535, + "loss": 2.632, + "step": 21081 + }, + { + "epoch": 0.6251519734305964, + "grad_norm": 0.12147833406925201, + "learning_rate": 0.0003140013419514053, + "loss": 2.6207, + "step": 21082 + }, + { + "epoch": 0.6251816267829078, + "grad_norm": 0.10444552451372147, + "learning_rate": 0.0003139576684826071, + "loss": 2.6272, + "step": 21083 + }, + { + "epoch": 0.6252112801352193, + "grad_norm": 0.1082010567188263, + "learning_rate": 0.0003139139966612457, + "loss": 2.6133, + "step": 21084 + }, + { + "epoch": 0.6252409334875307, + "grad_norm": 0.12275167554616928, + "learning_rate": 0.00031387032648770786, + "loss": 2.6315, + "step": 21085 + }, + { + "epoch": 0.6252705868398423, + "grad_norm": 0.10702871531248093, + "learning_rate": 0.00031382665796238, + "loss": 2.6134, + "step": 21086 + }, + { + "epoch": 0.6253002401921537, + "grad_norm": 0.09973654896020889, + "learning_rate": 0.00031378299108564923, + "loss": 2.6382, + "step": 21087 + }, + { + "epoch": 0.6253298935444652, + "grad_norm": 0.1153068095445633, + "learning_rate": 0.000313739325857902, + "loss": 2.6689, + "step": 21088 + }, + { + "epoch": 0.6253595468967766, + "grad_norm": 0.10773742198944092, + "learning_rate": 0.00031369566227952507, + "loss": 2.6485, + "step": 21089 + }, + { + "epoch": 0.6253892002490882, + "grad_norm": 0.10720290243625641, + "learning_rate": 0.0003136520003509051, + "loss": 2.6179, + "step": 21090 + }, + { + "epoch": 0.6254188536013996, + "grad_norm": 0.10452435910701752, + "learning_rate": 0.00031360834007242855, + "loss": 2.6412, + "step": 21091 + }, + { + "epoch": 0.6254485069537111, + "grad_norm": 0.11862603574991226, + "learning_rate": 0.0003135646814444821, + "loss": 2.6116, + "step": 21092 + }, + { + "epoch": 0.6254781603060225, + "grad_norm": 0.09631577134132385, + "learning_rate": 0.0003135210244674525, + "loss": 2.6032, + "step": 21093 + }, + { + "epoch": 0.6255078136583341, + "grad_norm": 0.11573555320501328, + "learning_rate": 0.0003134773691417262, + "loss": 2.6659, + "step": 21094 + }, + { + "epoch": 0.6255374670106455, + "grad_norm": 0.11300353705883026, + "learning_rate": 0.0003134337154676897, + "loss": 2.6385, + "step": 21095 + }, + { + "epoch": 0.625567120362957, + "grad_norm": 0.10138695687055588, + "learning_rate": 0.0003133900634457298, + "loss": 2.6406, + "step": 21096 + }, + { + "epoch": 0.6255967737152686, + "grad_norm": 0.10198713093996048, + "learning_rate": 0.00031334641307623304, + "loss": 2.6328, + "step": 21097 + }, + { + "epoch": 0.62562642706758, + "grad_norm": 0.11456536501646042, + "learning_rate": 0.00031330276435958545, + "loss": 2.6649, + "step": 21098 + }, + { + "epoch": 0.6256560804198915, + "grad_norm": 0.09527949243783951, + "learning_rate": 0.0003132591172961741, + "loss": 2.6457, + "step": 21099 + }, + { + "epoch": 0.6256857337722029, + "grad_norm": 0.09917362034320831, + "learning_rate": 0.0003132154718863854, + "loss": 2.6316, + "step": 21100 + }, + { + "epoch": 0.6257153871245145, + "grad_norm": 0.11306553333997726, + "learning_rate": 0.00031317182813060587, + "loss": 2.6239, + "step": 21101 + }, + { + "epoch": 0.6257450404768259, + "grad_norm": 0.10182969272136688, + "learning_rate": 0.0003131281860292217, + "loss": 2.6401, + "step": 21102 + }, + { + "epoch": 0.6257746938291374, + "grad_norm": 0.12193849682807922, + "learning_rate": 0.00031308454558261954, + "loss": 2.614, + "step": 21103 + }, + { + "epoch": 0.6258043471814488, + "grad_norm": 0.12565845251083374, + "learning_rate": 0.0003130409067911858, + "loss": 2.6512, + "step": 21104 + }, + { + "epoch": 0.6258340005337604, + "grad_norm": 0.1211739256978035, + "learning_rate": 0.00031299726965530695, + "loss": 2.6434, + "step": 21105 + }, + { + "epoch": 0.6258636538860718, + "grad_norm": 0.11326494812965393, + "learning_rate": 0.0003129536341753694, + "loss": 2.6384, + "step": 21106 + }, + { + "epoch": 0.6258933072383833, + "grad_norm": 0.10658901929855347, + "learning_rate": 0.00031291000035175954, + "loss": 2.6468, + "step": 21107 + }, + { + "epoch": 0.6259229605906947, + "grad_norm": 0.09804339706897736, + "learning_rate": 0.0003128663681848637, + "loss": 2.635, + "step": 21108 + }, + { + "epoch": 0.6259526139430063, + "grad_norm": 0.1119675487279892, + "learning_rate": 0.0003128227376750683, + "loss": 2.6283, + "step": 21109 + }, + { + "epoch": 0.6259822672953177, + "grad_norm": 0.13260406255722046, + "learning_rate": 0.0003127791088227597, + "loss": 2.6357, + "step": 21110 + }, + { + "epoch": 0.6260119206476292, + "grad_norm": 0.11277195066213608, + "learning_rate": 0.0003127354816283241, + "loss": 2.6414, + "step": 21111 + }, + { + "epoch": 0.6260415739999406, + "grad_norm": 0.09936138242483139, + "learning_rate": 0.000312691856092148, + "loss": 2.6564, + "step": 21112 + }, + { + "epoch": 0.6260712273522522, + "grad_norm": 0.1287941187620163, + "learning_rate": 0.00031264823221461777, + "loss": 2.6452, + "step": 21113 + }, + { + "epoch": 0.6261008807045636, + "grad_norm": 0.11332973837852478, + "learning_rate": 0.0003126046099961195, + "loss": 2.6106, + "step": 21114 + }, + { + "epoch": 0.6261305340568751, + "grad_norm": 0.10622506588697433, + "learning_rate": 0.00031256098943703965, + "loss": 2.6095, + "step": 21115 + }, + { + "epoch": 0.6261601874091867, + "grad_norm": 0.11421438306570053, + "learning_rate": 0.00031251737053776443, + "loss": 2.6164, + "step": 21116 + }, + { + "epoch": 0.6261898407614981, + "grad_norm": 0.11865890771150589, + "learning_rate": 0.00031247375329868, + "loss": 2.6242, + "step": 21117 + }, + { + "epoch": 0.6262194941138096, + "grad_norm": 0.11531040072441101, + "learning_rate": 0.0003124301377201728, + "loss": 2.6497, + "step": 21118 + }, + { + "epoch": 0.626249147466121, + "grad_norm": 0.11859767138957977, + "learning_rate": 0.00031238652380262877, + "loss": 2.6252, + "step": 21119 + }, + { + "epoch": 0.6262788008184326, + "grad_norm": 0.11957290023565292, + "learning_rate": 0.0003123429115464344, + "loss": 2.6096, + "step": 21120 + }, + { + "epoch": 0.626308454170744, + "grad_norm": 0.12663400173187256, + "learning_rate": 0.0003122993009519757, + "loss": 2.6318, + "step": 21121 + }, + { + "epoch": 0.6263381075230555, + "grad_norm": 0.11433059722185135, + "learning_rate": 0.00031225569201963886, + "loss": 2.6163, + "step": 21122 + }, + { + "epoch": 0.6263677608753669, + "grad_norm": 0.12371112406253815, + "learning_rate": 0.0003122120847498101, + "loss": 2.6468, + "step": 21123 + }, + { + "epoch": 0.6263974142276785, + "grad_norm": 0.10921228677034378, + "learning_rate": 0.0003121684791428755, + "loss": 2.6264, + "step": 21124 + }, + { + "epoch": 0.6264270675799899, + "grad_norm": 0.10767363011837006, + "learning_rate": 0.0003121248751992214, + "loss": 2.6384, + "step": 21125 + }, + { + "epoch": 0.6264567209323014, + "grad_norm": 0.1032361090183258, + "learning_rate": 0.00031208127291923373, + "loss": 2.6242, + "step": 21126 + }, + { + "epoch": 0.6264863742846128, + "grad_norm": 0.10922233760356903, + "learning_rate": 0.0003120376723032986, + "loss": 2.6319, + "step": 21127 + }, + { + "epoch": 0.6265160276369244, + "grad_norm": 0.11508823186159134, + "learning_rate": 0.0003119940733518023, + "loss": 2.6589, + "step": 21128 + }, + { + "epoch": 0.6265456809892358, + "grad_norm": 0.09936714172363281, + "learning_rate": 0.0003119504760651307, + "loss": 2.6451, + "step": 21129 + }, + { + "epoch": 0.6265753343415473, + "grad_norm": 0.10141021013259888, + "learning_rate": 0.0003119068804436699, + "loss": 2.6126, + "step": 21130 + }, + { + "epoch": 0.6266049876938588, + "grad_norm": 0.11295554041862488, + "learning_rate": 0.0003118632864878058, + "loss": 2.6548, + "step": 21131 + }, + { + "epoch": 0.6266346410461703, + "grad_norm": 0.1035095751285553, + "learning_rate": 0.00031181969419792475, + "loss": 2.6574, + "step": 21132 + }, + { + "epoch": 0.6266642943984817, + "grad_norm": 0.11879580467939377, + "learning_rate": 0.00031177610357441255, + "loss": 2.6306, + "step": 21133 + }, + { + "epoch": 0.6266939477507932, + "grad_norm": 0.10102012753486633, + "learning_rate": 0.0003117325146176553, + "loss": 2.6626, + "step": 21134 + }, + { + "epoch": 0.6267236011031047, + "grad_norm": 0.1206340566277504, + "learning_rate": 0.000311688927328039, + "loss": 2.627, + "step": 21135 + }, + { + "epoch": 0.6267532544554162, + "grad_norm": 0.09655296057462692, + "learning_rate": 0.00031164534170594955, + "loss": 2.6128, + "step": 21136 + }, + { + "epoch": 0.6267829078077277, + "grad_norm": 0.11741408705711365, + "learning_rate": 0.0003116017577517727, + "loss": 2.6107, + "step": 21137 + }, + { + "epoch": 0.6268125611600391, + "grad_norm": 0.10348419100046158, + "learning_rate": 0.00031155817546589477, + "loss": 2.6163, + "step": 21138 + }, + { + "epoch": 0.6268422145123507, + "grad_norm": 0.10418926179409027, + "learning_rate": 0.0003115145948487017, + "loss": 2.6303, + "step": 21139 + }, + { + "epoch": 0.6268718678646621, + "grad_norm": 0.10837670415639877, + "learning_rate": 0.0003114710159005791, + "loss": 2.627, + "step": 21140 + }, + { + "epoch": 0.6269015212169736, + "grad_norm": 0.10184543579816818, + "learning_rate": 0.000311427438621913, + "loss": 2.6463, + "step": 21141 + }, + { + "epoch": 0.626931174569285, + "grad_norm": 0.10566616803407669, + "learning_rate": 0.00031138386301308936, + "loss": 2.6254, + "step": 21142 + }, + { + "epoch": 0.6269608279215966, + "grad_norm": 0.09722405672073364, + "learning_rate": 0.00031134028907449395, + "loss": 2.6094, + "step": 21143 + }, + { + "epoch": 0.626990481273908, + "grad_norm": 0.10648713260889053, + "learning_rate": 0.0003112967168065127, + "loss": 2.6528, + "step": 21144 + }, + { + "epoch": 0.6270201346262195, + "grad_norm": 0.10912492126226425, + "learning_rate": 0.0003112531462095315, + "loss": 2.644, + "step": 21145 + }, + { + "epoch": 0.627049787978531, + "grad_norm": 0.11317239701747894, + "learning_rate": 0.00031120957728393594, + "loss": 2.6317, + "step": 21146 + }, + { + "epoch": 0.6270794413308425, + "grad_norm": 0.10837910324335098, + "learning_rate": 0.00031116601003011203, + "loss": 2.6529, + "step": 21147 + }, + { + "epoch": 0.6271090946831539, + "grad_norm": 0.10198786109685898, + "learning_rate": 0.0003111224444484455, + "loss": 2.6422, + "step": 21148 + }, + { + "epoch": 0.6271387480354654, + "grad_norm": 0.1214202269911766, + "learning_rate": 0.0003110788805393221, + "loss": 2.69, + "step": 21149 + }, + { + "epoch": 0.6271684013877769, + "grad_norm": 0.11038054525852203, + "learning_rate": 0.0003110353183031276, + "loss": 2.6297, + "step": 21150 + }, + { + "epoch": 0.6271980547400884, + "grad_norm": 0.11532575637102127, + "learning_rate": 0.0003109917577402479, + "loss": 2.6447, + "step": 21151 + }, + { + "epoch": 0.6272277080923998, + "grad_norm": 0.11398272216320038, + "learning_rate": 0.0003109481988510686, + "loss": 2.6196, + "step": 21152 + }, + { + "epoch": 0.6272573614447113, + "grad_norm": 0.10618604719638824, + "learning_rate": 0.00031090464163597545, + "loss": 2.6249, + "step": 21153 + }, + { + "epoch": 0.6272870147970228, + "grad_norm": 0.10213527828454971, + "learning_rate": 0.0003108610860953541, + "loss": 2.6302, + "step": 21154 + }, + { + "epoch": 0.6273166681493343, + "grad_norm": 0.11741109192371368, + "learning_rate": 0.00031081753222959044, + "loss": 2.6197, + "step": 21155 + }, + { + "epoch": 0.6273463215016457, + "grad_norm": 0.09850768744945526, + "learning_rate": 0.00031077398003907, + "loss": 2.6094, + "step": 21156 + }, + { + "epoch": 0.6273759748539572, + "grad_norm": 0.09783341735601425, + "learning_rate": 0.0003107304295241784, + "loss": 2.6296, + "step": 21157 + }, + { + "epoch": 0.6274056282062688, + "grad_norm": 0.1035677045583725, + "learning_rate": 0.0003106868806853013, + "loss": 2.6221, + "step": 21158 + }, + { + "epoch": 0.6274352815585802, + "grad_norm": 0.1043754518032074, + "learning_rate": 0.00031064333352282436, + "loss": 2.6268, + "step": 21159 + }, + { + "epoch": 0.6274649349108917, + "grad_norm": 0.10005412250757217, + "learning_rate": 0.00031059978803713316, + "loss": 2.6502, + "step": 21160 + }, + { + "epoch": 0.6274945882632031, + "grad_norm": 0.10210637003183365, + "learning_rate": 0.00031055624422861343, + "loss": 2.6046, + "step": 21161 + }, + { + "epoch": 0.6275242416155147, + "grad_norm": 0.10877320915460587, + "learning_rate": 0.0003105127020976507, + "loss": 2.6414, + "step": 21162 + }, + { + "epoch": 0.6275538949678261, + "grad_norm": 0.11882177740335464, + "learning_rate": 0.00031046916164463036, + "loss": 2.6025, + "step": 21163 + }, + { + "epoch": 0.6275835483201376, + "grad_norm": 0.10832477360963821, + "learning_rate": 0.0003104256228699382, + "loss": 2.5984, + "step": 21164 + }, + { + "epoch": 0.627613201672449, + "grad_norm": 0.1094026118516922, + "learning_rate": 0.00031038208577395976, + "loss": 2.6568, + "step": 21165 + }, + { + "epoch": 0.6276428550247606, + "grad_norm": 0.13419649004936218, + "learning_rate": 0.00031033855035708056, + "loss": 2.6299, + "step": 21166 + }, + { + "epoch": 0.627672508377072, + "grad_norm": 0.1506813019514084, + "learning_rate": 0.00031029501661968597, + "loss": 2.6356, + "step": 21167 + }, + { + "epoch": 0.6277021617293835, + "grad_norm": 0.13414828479290009, + "learning_rate": 0.00031025148456216144, + "loss": 2.6669, + "step": 21168 + }, + { + "epoch": 0.627731815081695, + "grad_norm": 0.10990294814109802, + "learning_rate": 0.0003102079541848928, + "loss": 2.6528, + "step": 21169 + }, + { + "epoch": 0.6277614684340065, + "grad_norm": 0.11808919161558151, + "learning_rate": 0.00031016442548826515, + "loss": 2.6298, + "step": 21170 + }, + { + "epoch": 0.6277911217863179, + "grad_norm": 0.1212191954255104, + "learning_rate": 0.0003101208984726641, + "loss": 2.6383, + "step": 21171 + }, + { + "epoch": 0.6278207751386294, + "grad_norm": 0.10330117493867874, + "learning_rate": 0.00031007737313847516, + "loss": 2.5871, + "step": 21172 + }, + { + "epoch": 0.6278504284909409, + "grad_norm": 0.11790324747562408, + "learning_rate": 0.0003100338494860838, + "loss": 2.6386, + "step": 21173 + }, + { + "epoch": 0.6278800818432524, + "grad_norm": 0.11038929224014282, + "learning_rate": 0.00030999032751587516, + "loss": 2.6091, + "step": 21174 + }, + { + "epoch": 0.6279097351955638, + "grad_norm": 0.10677368938922882, + "learning_rate": 0.00030994680722823474, + "loss": 2.6387, + "step": 21175 + }, + { + "epoch": 0.6279393885478753, + "grad_norm": 0.10387221723794937, + "learning_rate": 0.00030990328862354785, + "loss": 2.654, + "step": 21176 + }, + { + "epoch": 0.6279690419001868, + "grad_norm": 0.11317255347967148, + "learning_rate": 0.0003098597717022002, + "loss": 2.6326, + "step": 21177 + }, + { + "epoch": 0.6279986952524983, + "grad_norm": 0.1119980439543724, + "learning_rate": 0.0003098162564645768, + "loss": 2.6304, + "step": 21178 + }, + { + "epoch": 0.6280283486048098, + "grad_norm": 0.13938657939434052, + "learning_rate": 0.00030977274291106317, + "loss": 2.6355, + "step": 21179 + }, + { + "epoch": 0.6280580019571212, + "grad_norm": 0.12585222721099854, + "learning_rate": 0.00030972923104204456, + "loss": 2.628, + "step": 21180 + }, + { + "epoch": 0.6280876553094328, + "grad_norm": 0.10805767774581909, + "learning_rate": 0.0003096857208579062, + "loss": 2.6372, + "step": 21181 + }, + { + "epoch": 0.6281173086617442, + "grad_norm": 0.11184418201446533, + "learning_rate": 0.00030964221235903346, + "loss": 2.6264, + "step": 21182 + }, + { + "epoch": 0.6281469620140557, + "grad_norm": 0.1224990040063858, + "learning_rate": 0.0003095987055458116, + "loss": 2.6497, + "step": 21183 + }, + { + "epoch": 0.6281766153663672, + "grad_norm": 0.11188183724880219, + "learning_rate": 0.00030955520041862607, + "loss": 2.607, + "step": 21184 + }, + { + "epoch": 0.6282062687186787, + "grad_norm": 0.10589560121297836, + "learning_rate": 0.00030951169697786176, + "loss": 2.6217, + "step": 21185 + }, + { + "epoch": 0.6282359220709901, + "grad_norm": 0.10630301386117935, + "learning_rate": 0.000309468195223904, + "loss": 2.633, + "step": 21186 + }, + { + "epoch": 0.6282655754233016, + "grad_norm": 0.10871944576501846, + "learning_rate": 0.00030942469515713816, + "loss": 2.6419, + "step": 21187 + }, + { + "epoch": 0.6282952287756131, + "grad_norm": 0.11809048801660538, + "learning_rate": 0.00030938119677794936, + "loss": 2.6202, + "step": 21188 + }, + { + "epoch": 0.6283248821279246, + "grad_norm": 0.1092938557267189, + "learning_rate": 0.00030933770008672267, + "loss": 2.6077, + "step": 21189 + }, + { + "epoch": 0.628354535480236, + "grad_norm": 0.10375066101551056, + "learning_rate": 0.0003092942050838434, + "loss": 2.6308, + "step": 21190 + }, + { + "epoch": 0.6283841888325475, + "grad_norm": 0.11259086430072784, + "learning_rate": 0.0003092507117696968, + "loss": 2.6478, + "step": 21191 + }, + { + "epoch": 0.628413842184859, + "grad_norm": 0.11259124428033829, + "learning_rate": 0.00030920722014466783, + "loss": 2.6485, + "step": 21192 + }, + { + "epoch": 0.6284434955371705, + "grad_norm": 0.11004938185214996, + "learning_rate": 0.0003091637302091417, + "loss": 2.6284, + "step": 21193 + }, + { + "epoch": 0.6284731488894819, + "grad_norm": 0.11851426959037781, + "learning_rate": 0.0003091202419635035, + "loss": 2.6077, + "step": 21194 + }, + { + "epoch": 0.6285028022417934, + "grad_norm": 0.12957768142223358, + "learning_rate": 0.0003090767554081384, + "loss": 2.609, + "step": 21195 + }, + { + "epoch": 0.6285324555941049, + "grad_norm": 0.11206761002540588, + "learning_rate": 0.0003090332705434312, + "loss": 2.6503, + "step": 21196 + }, + { + "epoch": 0.6285621089464164, + "grad_norm": 0.11283320933580399, + "learning_rate": 0.00030898978736976733, + "loss": 2.6198, + "step": 21197 + }, + { + "epoch": 0.6285917622987278, + "grad_norm": 0.11727556586265564, + "learning_rate": 0.0003089463058875316, + "loss": 2.6231, + "step": 21198 + }, + { + "epoch": 0.6286214156510394, + "grad_norm": 0.13288836181163788, + "learning_rate": 0.0003089028260971092, + "loss": 2.6398, + "step": 21199 + }, + { + "epoch": 0.6286510690033509, + "grad_norm": 0.12038449198007584, + "learning_rate": 0.00030885934799888495, + "loss": 2.6295, + "step": 21200 + }, + { + "epoch": 0.6286807223556623, + "grad_norm": 0.1047220230102539, + "learning_rate": 0.0003088158715932442, + "loss": 2.6205, + "step": 21201 + }, + { + "epoch": 0.6287103757079738, + "grad_norm": 0.12407144159078598, + "learning_rate": 0.00030877239688057134, + "loss": 2.6305, + "step": 21202 + }, + { + "epoch": 0.6287400290602853, + "grad_norm": 0.13338270783424377, + "learning_rate": 0.00030872892386125196, + "loss": 2.6336, + "step": 21203 + }, + { + "epoch": 0.6287696824125968, + "grad_norm": 0.10945610702037811, + "learning_rate": 0.00030868545253567094, + "loss": 2.6303, + "step": 21204 + }, + { + "epoch": 0.6287993357649082, + "grad_norm": 0.11554217338562012, + "learning_rate": 0.00030864198290421284, + "loss": 2.6086, + "step": 21205 + }, + { + "epoch": 0.6288289891172197, + "grad_norm": 0.13644467294216156, + "learning_rate": 0.000308598514967263, + "loss": 2.6546, + "step": 21206 + }, + { + "epoch": 0.6288586424695312, + "grad_norm": 0.1135009378194809, + "learning_rate": 0.00030855504872520607, + "loss": 2.6277, + "step": 21207 + }, + { + "epoch": 0.6288882958218427, + "grad_norm": 0.11255814880132675, + "learning_rate": 0.00030851158417842707, + "loss": 2.6343, + "step": 21208 + }, + { + "epoch": 0.6289179491741541, + "grad_norm": 0.1138257384300232, + "learning_rate": 0.00030846812132731083, + "loss": 2.633, + "step": 21209 + }, + { + "epoch": 0.6289476025264656, + "grad_norm": 0.1225389763712883, + "learning_rate": 0.00030842466017224224, + "loss": 2.628, + "step": 21210 + }, + { + "epoch": 0.6289772558787771, + "grad_norm": 0.11075279116630554, + "learning_rate": 0.0003083812007136063, + "loss": 2.6374, + "step": 21211 + }, + { + "epoch": 0.6290069092310886, + "grad_norm": 0.12297599762678146, + "learning_rate": 0.0003083377429517876, + "loss": 2.6364, + "step": 21212 + }, + { + "epoch": 0.6290365625834, + "grad_norm": 0.13033023476600647, + "learning_rate": 0.0003082942868871711, + "loss": 2.6593, + "step": 21213 + }, + { + "epoch": 0.6290662159357115, + "grad_norm": 0.10278081893920898, + "learning_rate": 0.0003082508325201416, + "loss": 2.5941, + "step": 21214 + }, + { + "epoch": 0.629095869288023, + "grad_norm": 0.11115662008523941, + "learning_rate": 0.0003082073798510838, + "loss": 2.6255, + "step": 21215 + }, + { + "epoch": 0.6291255226403345, + "grad_norm": 0.14453300833702087, + "learning_rate": 0.0003081639288803827, + "loss": 2.619, + "step": 21216 + }, + { + "epoch": 0.6291551759926459, + "grad_norm": 0.10769607126712799, + "learning_rate": 0.0003081204796084228, + "loss": 2.6237, + "step": 21217 + }, + { + "epoch": 0.6291848293449575, + "grad_norm": 0.12680213153362274, + "learning_rate": 0.0003080770320355891, + "loss": 2.618, + "step": 21218 + }, + { + "epoch": 0.6292144826972689, + "grad_norm": 0.1274290680885315, + "learning_rate": 0.0003080335861622663, + "loss": 2.6286, + "step": 21219 + }, + { + "epoch": 0.6292441360495804, + "grad_norm": 0.11004975438117981, + "learning_rate": 0.0003079901419888389, + "loss": 2.5756, + "step": 21220 + }, + { + "epoch": 0.6292737894018919, + "grad_norm": 0.1083269789814949, + "learning_rate": 0.0003079466995156918, + "loss": 2.6336, + "step": 21221 + }, + { + "epoch": 0.6293034427542034, + "grad_norm": 0.10525459051132202, + "learning_rate": 0.0003079032587432098, + "loss": 2.6254, + "step": 21222 + }, + { + "epoch": 0.6293330961065149, + "grad_norm": 0.11824662238359451, + "learning_rate": 0.00030785981967177724, + "loss": 2.6311, + "step": 21223 + }, + { + "epoch": 0.6293627494588263, + "grad_norm": 0.10978936403989792, + "learning_rate": 0.000307816382301779, + "loss": 2.6367, + "step": 21224 + }, + { + "epoch": 0.6293924028111378, + "grad_norm": 0.10216108709573746, + "learning_rate": 0.0003077729466335997, + "loss": 2.6123, + "step": 21225 + }, + { + "epoch": 0.6294220561634493, + "grad_norm": 0.10672031342983246, + "learning_rate": 0.0003077295126676238, + "loss": 2.623, + "step": 21226 + }, + { + "epoch": 0.6294517095157608, + "grad_norm": 0.10849152505397797, + "learning_rate": 0.0003076860804042362, + "loss": 2.6405, + "step": 21227 + }, + { + "epoch": 0.6294813628680722, + "grad_norm": 0.12335705012083054, + "learning_rate": 0.0003076426498438213, + "loss": 2.6269, + "step": 21228 + }, + { + "epoch": 0.6295110162203837, + "grad_norm": 0.10569241642951965, + "learning_rate": 0.0003075992209867638, + "loss": 2.6366, + "step": 21229 + }, + { + "epoch": 0.6295406695726952, + "grad_norm": 0.11393750458955765, + "learning_rate": 0.00030755579383344824, + "loss": 2.6523, + "step": 21230 + }, + { + "epoch": 0.6295703229250067, + "grad_norm": 0.12574543058872223, + "learning_rate": 0.0003075123683842591, + "loss": 2.6382, + "step": 21231 + }, + { + "epoch": 0.6295999762773181, + "grad_norm": 0.11962950229644775, + "learning_rate": 0.0003074689446395812, + "loss": 2.6297, + "step": 21232 + }, + { + "epoch": 0.6296296296296297, + "grad_norm": 0.1142755076289177, + "learning_rate": 0.0003074255225997986, + "loss": 2.6416, + "step": 21233 + }, + { + "epoch": 0.6296592829819411, + "grad_norm": 0.10543303936719894, + "learning_rate": 0.00030738210226529614, + "loss": 2.6384, + "step": 21234 + }, + { + "epoch": 0.6296889363342526, + "grad_norm": 0.13823233544826508, + "learning_rate": 0.00030733868363645825, + "loss": 2.6319, + "step": 21235 + }, + { + "epoch": 0.629718589686564, + "grad_norm": 0.11596380174160004, + "learning_rate": 0.0003072952667136693, + "loss": 2.6201, + "step": 21236 + }, + { + "epoch": 0.6297482430388756, + "grad_norm": 0.11550919711589813, + "learning_rate": 0.0003072518514973139, + "loss": 2.6616, + "step": 21237 + }, + { + "epoch": 0.629777896391187, + "grad_norm": 0.11824019253253937, + "learning_rate": 0.0003072084379877764, + "loss": 2.6524, + "step": 21238 + }, + { + "epoch": 0.6298075497434985, + "grad_norm": 0.12546822428703308, + "learning_rate": 0.0003071650261854414, + "loss": 2.6469, + "step": 21239 + }, + { + "epoch": 0.6298372030958099, + "grad_norm": 0.0920218974351883, + "learning_rate": 0.0003071216160906931, + "loss": 2.6351, + "step": 21240 + }, + { + "epoch": 0.6298668564481215, + "grad_norm": 0.1071673110127449, + "learning_rate": 0.0003070782077039158, + "loss": 2.6565, + "step": 21241 + }, + { + "epoch": 0.629896509800433, + "grad_norm": 0.0990329161286354, + "learning_rate": 0.00030703480102549434, + "loss": 2.6263, + "step": 21242 + }, + { + "epoch": 0.6299261631527444, + "grad_norm": 0.10082588344812393, + "learning_rate": 0.0003069913960558128, + "loss": 2.57, + "step": 21243 + }, + { + "epoch": 0.6299558165050559, + "grad_norm": 0.1027805507183075, + "learning_rate": 0.0003069479927952555, + "loss": 2.6121, + "step": 21244 + }, + { + "epoch": 0.6299854698573674, + "grad_norm": 0.11318304389715195, + "learning_rate": 0.000306904591244207, + "loss": 2.6123, + "step": 21245 + }, + { + "epoch": 0.6300151232096789, + "grad_norm": 0.09955506026744843, + "learning_rate": 0.00030686119140305136, + "loss": 2.6215, + "step": 21246 + }, + { + "epoch": 0.6300447765619903, + "grad_norm": 0.10156363248825073, + "learning_rate": 0.0003068177932721731, + "loss": 2.5946, + "step": 21247 + }, + { + "epoch": 0.6300744299143018, + "grad_norm": 0.11070676892995834, + "learning_rate": 0.00030677439685195635, + "loss": 2.6336, + "step": 21248 + }, + { + "epoch": 0.6301040832666133, + "grad_norm": 0.10592617839574814, + "learning_rate": 0.00030673100214278565, + "loss": 2.5915, + "step": 21249 + }, + { + "epoch": 0.6301337366189248, + "grad_norm": 0.1057695522904396, + "learning_rate": 0.000306687609145045, + "loss": 2.6329, + "step": 21250 + }, + { + "epoch": 0.6301633899712362, + "grad_norm": 0.09574329107999802, + "learning_rate": 0.0003066442178591187, + "loss": 2.631, + "step": 21251 + }, + { + "epoch": 0.6301930433235478, + "grad_norm": 0.10564836114645004, + "learning_rate": 0.0003066008282853911, + "loss": 2.6373, + "step": 21252 + }, + { + "epoch": 0.6302226966758592, + "grad_norm": 0.10972537100315094, + "learning_rate": 0.00030655744042424633, + "loss": 2.5974, + "step": 21253 + }, + { + "epoch": 0.6302523500281707, + "grad_norm": 0.10553780943155289, + "learning_rate": 0.00030651405427606853, + "loss": 2.6238, + "step": 21254 + }, + { + "epoch": 0.6302820033804821, + "grad_norm": 0.11574021726846695, + "learning_rate": 0.00030647066984124214, + "loss": 2.657, + "step": 21255 + }, + { + "epoch": 0.6303116567327937, + "grad_norm": 0.10536313056945801, + "learning_rate": 0.0003064272871201511, + "loss": 2.617, + "step": 21256 + }, + { + "epoch": 0.6303413100851051, + "grad_norm": 0.09566619247198105, + "learning_rate": 0.0003063839061131797, + "loss": 2.6125, + "step": 21257 + }, + { + "epoch": 0.6303709634374166, + "grad_norm": 0.12537236511707306, + "learning_rate": 0.00030634052682071205, + "loss": 2.629, + "step": 21258 + }, + { + "epoch": 0.630400616789728, + "grad_norm": 0.10248327255249023, + "learning_rate": 0.0003062971492431323, + "loss": 2.6423, + "step": 21259 + }, + { + "epoch": 0.6304302701420396, + "grad_norm": 0.12179569900035858, + "learning_rate": 0.0003062537733808246, + "loss": 2.6434, + "step": 21260 + }, + { + "epoch": 0.630459923494351, + "grad_norm": 0.11120223253965378, + "learning_rate": 0.0003062103992341729, + "loss": 2.63, + "step": 21261 + }, + { + "epoch": 0.6304895768466625, + "grad_norm": 0.11228777468204498, + "learning_rate": 0.0003061670268035614, + "loss": 2.664, + "step": 21262 + }, + { + "epoch": 0.630519230198974, + "grad_norm": 0.12210078537464142, + "learning_rate": 0.0003061236560893741, + "loss": 2.6394, + "step": 21263 + }, + { + "epoch": 0.6305488835512855, + "grad_norm": 0.11007581651210785, + "learning_rate": 0.0003060802870919952, + "loss": 2.5891, + "step": 21264 + }, + { + "epoch": 0.630578536903597, + "grad_norm": 0.11924880743026733, + "learning_rate": 0.00030603691981180857, + "loss": 2.6294, + "step": 21265 + }, + { + "epoch": 0.6306081902559084, + "grad_norm": 0.10196955502033234, + "learning_rate": 0.0003059935542491984, + "loss": 2.6231, + "step": 21266 + }, + { + "epoch": 0.63063784360822, + "grad_norm": 0.11319732666015625, + "learning_rate": 0.00030595019040454834, + "loss": 2.6277, + "step": 21267 + }, + { + "epoch": 0.6306674969605314, + "grad_norm": 0.11694502830505371, + "learning_rate": 0.0003059068282782429, + "loss": 2.6475, + "step": 21268 + }, + { + "epoch": 0.6306971503128429, + "grad_norm": 0.11978409439325333, + "learning_rate": 0.00030586346787066574, + "loss": 2.6238, + "step": 21269 + }, + { + "epoch": 0.6307268036651543, + "grad_norm": 0.10292518883943558, + "learning_rate": 0.000305820109182201, + "loss": 2.6461, + "step": 21270 + }, + { + "epoch": 0.6307564570174659, + "grad_norm": 0.10767079889774323, + "learning_rate": 0.0003057767522132324, + "loss": 2.6537, + "step": 21271 + }, + { + "epoch": 0.6307861103697773, + "grad_norm": 0.10896550118923187, + "learning_rate": 0.00030573339696414405, + "loss": 2.6533, + "step": 21272 + }, + { + "epoch": 0.6308157637220888, + "grad_norm": 0.10528018325567245, + "learning_rate": 0.0003056900434353198, + "loss": 2.62, + "step": 21273 + }, + { + "epoch": 0.6308454170744002, + "grad_norm": 0.11232980340719223, + "learning_rate": 0.00030564669162714354, + "loss": 2.6274, + "step": 21274 + }, + { + "epoch": 0.6308750704267118, + "grad_norm": 0.12209717184305191, + "learning_rate": 0.00030560334153999924, + "loss": 2.6499, + "step": 21275 + }, + { + "epoch": 0.6309047237790232, + "grad_norm": 0.10291210561990738, + "learning_rate": 0.0003055599931742707, + "loss": 2.6546, + "step": 21276 + }, + { + "epoch": 0.6309343771313347, + "grad_norm": 0.12260226160287857, + "learning_rate": 0.00030551664653034183, + "loss": 2.6152, + "step": 21277 + }, + { + "epoch": 0.6309640304836461, + "grad_norm": 0.10491548478603363, + "learning_rate": 0.00030547330160859645, + "loss": 2.6131, + "step": 21278 + }, + { + "epoch": 0.6309936838359577, + "grad_norm": 0.1137620285153389, + "learning_rate": 0.00030542995840941815, + "loss": 2.6217, + "step": 21279 + }, + { + "epoch": 0.6310233371882691, + "grad_norm": 0.11732755601406097, + "learning_rate": 0.00030538661693319116, + "loss": 2.6189, + "step": 21280 + }, + { + "epoch": 0.6310529905405806, + "grad_norm": 0.12426172941923141, + "learning_rate": 0.0003053432771802991, + "loss": 2.6598, + "step": 21281 + }, + { + "epoch": 0.631082643892892, + "grad_norm": 0.11908858269453049, + "learning_rate": 0.00030529993915112566, + "loss": 2.6102, + "step": 21282 + }, + { + "epoch": 0.6311122972452036, + "grad_norm": 0.10472346842288971, + "learning_rate": 0.0003052566028460547, + "loss": 2.6046, + "step": 21283 + }, + { + "epoch": 0.6311419505975151, + "grad_norm": 0.12057464569807053, + "learning_rate": 0.00030521326826547, + "loss": 2.6469, + "step": 21284 + }, + { + "epoch": 0.6311716039498265, + "grad_norm": 0.1414462774991989, + "learning_rate": 0.0003051699354097551, + "loss": 2.621, + "step": 21285 + }, + { + "epoch": 0.631201257302138, + "grad_norm": 0.10018017888069153, + "learning_rate": 0.00030512660427929405, + "loss": 2.6197, + "step": 21286 + }, + { + "epoch": 0.6312309106544495, + "grad_norm": 0.11194871366024017, + "learning_rate": 0.00030508327487447035, + "loss": 2.6338, + "step": 21287 + }, + { + "epoch": 0.631260564006761, + "grad_norm": 0.12935832142829895, + "learning_rate": 0.0003050399471956676, + "loss": 2.6175, + "step": 21288 + }, + { + "epoch": 0.6312902173590724, + "grad_norm": 0.09190104156732559, + "learning_rate": 0.00030499662124326964, + "loss": 2.6005, + "step": 21289 + }, + { + "epoch": 0.631319870711384, + "grad_norm": 0.11716359108686447, + "learning_rate": 0.00030495329701766004, + "loss": 2.628, + "step": 21290 + }, + { + "epoch": 0.6313495240636954, + "grad_norm": 0.10432090610265732, + "learning_rate": 0.00030490997451922255, + "loss": 2.6104, + "step": 21291 + }, + { + "epoch": 0.6313791774160069, + "grad_norm": 0.10823102295398712, + "learning_rate": 0.00030486665374834056, + "loss": 2.6101, + "step": 21292 + }, + { + "epoch": 0.6314088307683183, + "grad_norm": 0.11430280655622482, + "learning_rate": 0.0003048233347053979, + "loss": 2.6451, + "step": 21293 + }, + { + "epoch": 0.6314384841206299, + "grad_norm": 0.09665936231613159, + "learning_rate": 0.0003047800173907782, + "loss": 2.5951, + "step": 21294 + }, + { + "epoch": 0.6314681374729413, + "grad_norm": 0.11129180341959, + "learning_rate": 0.0003047367018048649, + "loss": 2.6081, + "step": 21295 + }, + { + "epoch": 0.6314977908252528, + "grad_norm": 0.10933874547481537, + "learning_rate": 0.0003046933879480416, + "loss": 2.6182, + "step": 21296 + }, + { + "epoch": 0.6315274441775642, + "grad_norm": 0.09709633886814117, + "learning_rate": 0.0003046500758206919, + "loss": 2.6191, + "step": 21297 + }, + { + "epoch": 0.6315570975298758, + "grad_norm": 0.1192125529050827, + "learning_rate": 0.00030460676542319945, + "loss": 2.6065, + "step": 21298 + }, + { + "epoch": 0.6315867508821872, + "grad_norm": 0.1073712632060051, + "learning_rate": 0.00030456345675594756, + "loss": 2.6078, + "step": 21299 + }, + { + "epoch": 0.6316164042344987, + "grad_norm": 0.11356473714113235, + "learning_rate": 0.0003045201498193197, + "loss": 2.6538, + "step": 21300 + }, + { + "epoch": 0.6316460575868101, + "grad_norm": 0.11847084760665894, + "learning_rate": 0.0003044768446136995, + "loss": 2.6351, + "step": 21301 + }, + { + "epoch": 0.6316757109391217, + "grad_norm": 0.12328494340181351, + "learning_rate": 0.0003044335411394704, + "loss": 2.6424, + "step": 21302 + }, + { + "epoch": 0.6317053642914331, + "grad_norm": 0.11161867529153824, + "learning_rate": 0.0003043902393970159, + "loss": 2.6138, + "step": 21303 + }, + { + "epoch": 0.6317350176437446, + "grad_norm": 0.1109674945473671, + "learning_rate": 0.0003043469393867195, + "loss": 2.6175, + "step": 21304 + }, + { + "epoch": 0.6317646709960562, + "grad_norm": 0.11726494878530502, + "learning_rate": 0.00030430364110896417, + "loss": 2.587, + "step": 21305 + }, + { + "epoch": 0.6317943243483676, + "grad_norm": 0.12230613082647324, + "learning_rate": 0.000304260344564134, + "loss": 2.6364, + "step": 21306 + }, + { + "epoch": 0.6318239777006791, + "grad_norm": 0.13246165215969086, + "learning_rate": 0.000304217049752612, + "loss": 2.6548, + "step": 21307 + }, + { + "epoch": 0.6318536310529905, + "grad_norm": 0.12199077755212784, + "learning_rate": 0.00030417375667478173, + "loss": 2.6244, + "step": 21308 + }, + { + "epoch": 0.6318832844053021, + "grad_norm": 0.12042819708585739, + "learning_rate": 0.0003041304653310264, + "loss": 2.6582, + "step": 21309 + }, + { + "epoch": 0.6319129377576135, + "grad_norm": 0.13178154826164246, + "learning_rate": 0.0003040871757217294, + "loss": 2.6443, + "step": 21310 + }, + { + "epoch": 0.631942591109925, + "grad_norm": 0.11063782125711441, + "learning_rate": 0.00030404388784727404, + "loss": 2.6263, + "step": 21311 + }, + { + "epoch": 0.6319722444622364, + "grad_norm": 0.11957399547100067, + "learning_rate": 0.0003040006017080437, + "loss": 2.6063, + "step": 21312 + }, + { + "epoch": 0.632001897814548, + "grad_norm": 0.11709196865558624, + "learning_rate": 0.0003039573173044217, + "loss": 2.6225, + "step": 21313 + }, + { + "epoch": 0.6320315511668594, + "grad_norm": 0.11384814977645874, + "learning_rate": 0.00030391403463679134, + "loss": 2.6359, + "step": 21314 + }, + { + "epoch": 0.6320612045191709, + "grad_norm": 0.13537515699863434, + "learning_rate": 0.00030387075370553595, + "loss": 2.6185, + "step": 21315 + }, + { + "epoch": 0.6320908578714823, + "grad_norm": 0.11978671699762344, + "learning_rate": 0.00030382747451103854, + "loss": 2.6156, + "step": 21316 + }, + { + "epoch": 0.6321205112237939, + "grad_norm": 0.12227743864059448, + "learning_rate": 0.00030378419705368254, + "loss": 2.6304, + "step": 21317 + }, + { + "epoch": 0.6321501645761053, + "grad_norm": 0.09968529641628265, + "learning_rate": 0.0003037409213338511, + "loss": 2.6061, + "step": 21318 + }, + { + "epoch": 0.6321798179284168, + "grad_norm": 0.1269494891166687, + "learning_rate": 0.00030369764735192756, + "loss": 2.6249, + "step": 21319 + }, + { + "epoch": 0.6322094712807282, + "grad_norm": 0.10392209887504578, + "learning_rate": 0.00030365437510829506, + "loss": 2.6462, + "step": 21320 + }, + { + "epoch": 0.6322391246330398, + "grad_norm": 0.10659649968147278, + "learning_rate": 0.00030361110460333675, + "loss": 2.6326, + "step": 21321 + }, + { + "epoch": 0.6322687779853512, + "grad_norm": 0.13184021413326263, + "learning_rate": 0.00030356783583743586, + "loss": 2.6241, + "step": 21322 + }, + { + "epoch": 0.6322984313376627, + "grad_norm": 0.11188255995512009, + "learning_rate": 0.00030352456881097543, + "loss": 2.6284, + "step": 21323 + }, + { + "epoch": 0.6323280846899743, + "grad_norm": 0.09941627085208893, + "learning_rate": 0.00030348130352433867, + "loss": 2.5985, + "step": 21324 + }, + { + "epoch": 0.6323577380422857, + "grad_norm": 0.10878846794366837, + "learning_rate": 0.00030343803997790885, + "loss": 2.6653, + "step": 21325 + }, + { + "epoch": 0.6323873913945972, + "grad_norm": 0.1019478365778923, + "learning_rate": 0.00030339477817206885, + "loss": 2.6005, + "step": 21326 + }, + { + "epoch": 0.6324170447469086, + "grad_norm": 0.11358146369457245, + "learning_rate": 0.0003033515181072017, + "loss": 2.6172, + "step": 21327 + }, + { + "epoch": 0.6324466980992202, + "grad_norm": 0.14609526097774506, + "learning_rate": 0.00030330825978369083, + "loss": 2.6653, + "step": 21328 + }, + { + "epoch": 0.6324763514515316, + "grad_norm": 0.14542169868946075, + "learning_rate": 0.0003032650032019189, + "loss": 2.6045, + "step": 21329 + }, + { + "epoch": 0.6325060048038431, + "grad_norm": 0.12097252160310745, + "learning_rate": 0.00030322174836226924, + "loss": 2.6243, + "step": 21330 + }, + { + "epoch": 0.6325356581561545, + "grad_norm": 0.11993114650249481, + "learning_rate": 0.00030317849526512457, + "loss": 2.6329, + "step": 21331 + }, + { + "epoch": 0.6325653115084661, + "grad_norm": 0.1444864124059677, + "learning_rate": 0.00030313524391086834, + "loss": 2.5982, + "step": 21332 + }, + { + "epoch": 0.6325949648607775, + "grad_norm": 0.12468554824590683, + "learning_rate": 0.0003030919942998832, + "loss": 2.6502, + "step": 21333 + }, + { + "epoch": 0.632624618213089, + "grad_norm": 0.13170646131038666, + "learning_rate": 0.0003030487464325523, + "loss": 2.6548, + "step": 21334 + }, + { + "epoch": 0.6326542715654004, + "grad_norm": 0.1388893723487854, + "learning_rate": 0.00030300550030925856, + "loss": 2.6109, + "step": 21335 + }, + { + "epoch": 0.632683924917712, + "grad_norm": 0.11506593972444534, + "learning_rate": 0.000302962255930385, + "loss": 2.6249, + "step": 21336 + }, + { + "epoch": 0.6327135782700234, + "grad_norm": 0.11928263306617737, + "learning_rate": 0.0003029190132963144, + "loss": 2.6478, + "step": 21337 + }, + { + "epoch": 0.6327432316223349, + "grad_norm": 0.12780937552452087, + "learning_rate": 0.0003028757724074298, + "loss": 2.6581, + "step": 21338 + }, + { + "epoch": 0.6327728849746463, + "grad_norm": 0.12605954706668854, + "learning_rate": 0.000302832533264114, + "loss": 2.6319, + "step": 21339 + }, + { + "epoch": 0.6328025383269579, + "grad_norm": 0.10364982485771179, + "learning_rate": 0.00030278929586675, + "loss": 2.654, + "step": 21340 + }, + { + "epoch": 0.6328321916792693, + "grad_norm": 0.13600750267505646, + "learning_rate": 0.00030274606021572065, + "loss": 2.6212, + "step": 21341 + }, + { + "epoch": 0.6328618450315808, + "grad_norm": 0.11979295313358307, + "learning_rate": 0.0003027028263114089, + "loss": 2.6279, + "step": 21342 + }, + { + "epoch": 0.6328914983838922, + "grad_norm": 0.12144124507904053, + "learning_rate": 0.00030265959415419735, + "loss": 2.6408, + "step": 21343 + }, + { + "epoch": 0.6329211517362038, + "grad_norm": 0.13406290113925934, + "learning_rate": 0.00030261636374446875, + "loss": 2.6275, + "step": 21344 + }, + { + "epoch": 0.6329508050885153, + "grad_norm": 0.13003018498420715, + "learning_rate": 0.00030257313508260644, + "loss": 2.616, + "step": 21345 + }, + { + "epoch": 0.6329804584408267, + "grad_norm": 0.11556896567344666, + "learning_rate": 0.000302529908168993, + "loss": 2.599, + "step": 21346 + }, + { + "epoch": 0.6330101117931383, + "grad_norm": 0.12947997450828552, + "learning_rate": 0.00030248668300401086, + "loss": 2.6462, + "step": 21347 + }, + { + "epoch": 0.6330397651454497, + "grad_norm": 0.11673565208911896, + "learning_rate": 0.00030244345958804314, + "loss": 2.6331, + "step": 21348 + }, + { + "epoch": 0.6330694184977612, + "grad_norm": 0.10865096002817154, + "learning_rate": 0.00030240023792147254, + "loss": 2.6631, + "step": 21349 + }, + { + "epoch": 0.6330990718500726, + "grad_norm": 0.12523968517780304, + "learning_rate": 0.0003023570180046816, + "loss": 2.6072, + "step": 21350 + }, + { + "epoch": 0.6331287252023842, + "grad_norm": 0.09993692487478256, + "learning_rate": 0.00030231379983805334, + "loss": 2.6082, + "step": 21351 + }, + { + "epoch": 0.6331583785546956, + "grad_norm": 0.10544416308403015, + "learning_rate": 0.0003022705834219703, + "loss": 2.6507, + "step": 21352 + }, + { + "epoch": 0.6331880319070071, + "grad_norm": 0.10578937083482742, + "learning_rate": 0.00030222736875681525, + "loss": 2.6284, + "step": 21353 + }, + { + "epoch": 0.6332176852593185, + "grad_norm": 0.11432304978370667, + "learning_rate": 0.00030218415584297063, + "loss": 2.5758, + "step": 21354 + }, + { + "epoch": 0.6332473386116301, + "grad_norm": 0.11538252234458923, + "learning_rate": 0.0003021409446808194, + "loss": 2.6543, + "step": 21355 + }, + { + "epoch": 0.6332769919639415, + "grad_norm": 0.1241447851061821, + "learning_rate": 0.000302097735270744, + "loss": 2.6602, + "step": 21356 + }, + { + "epoch": 0.633306645316253, + "grad_norm": 0.10229016095399857, + "learning_rate": 0.000302054527613127, + "loss": 2.6214, + "step": 21357 + }, + { + "epoch": 0.6333362986685644, + "grad_norm": 0.11313041299581528, + "learning_rate": 0.0003020113217083513, + "loss": 2.6008, + "step": 21358 + }, + { + "epoch": 0.633365952020876, + "grad_norm": 0.10442081838846207, + "learning_rate": 0.00030196811755679926, + "loss": 2.6229, + "step": 21359 + }, + { + "epoch": 0.6333956053731874, + "grad_norm": 0.11387839913368225, + "learning_rate": 0.00030192491515885356, + "loss": 2.6538, + "step": 21360 + }, + { + "epoch": 0.6334252587254989, + "grad_norm": 0.10633043199777603, + "learning_rate": 0.0003018817145148968, + "loss": 2.6285, + "step": 21361 + }, + { + "epoch": 0.6334549120778104, + "grad_norm": 0.11507032811641693, + "learning_rate": 0.00030183851562531135, + "loss": 2.6106, + "step": 21362 + }, + { + "epoch": 0.6334845654301219, + "grad_norm": 0.1005316823720932, + "learning_rate": 0.00030179531849048, + "loss": 2.6465, + "step": 21363 + }, + { + "epoch": 0.6335142187824333, + "grad_norm": 0.11278020590543747, + "learning_rate": 0.00030175212311078504, + "loss": 2.6563, + "step": 21364 + }, + { + "epoch": 0.6335438721347448, + "grad_norm": 0.1090734452009201, + "learning_rate": 0.0003017089294866091, + "loss": 2.5973, + "step": 21365 + }, + { + "epoch": 0.6335735254870564, + "grad_norm": 0.11448927968740463, + "learning_rate": 0.00030166573761833453, + "loss": 2.6351, + "step": 21366 + }, + { + "epoch": 0.6336031788393678, + "grad_norm": 0.11001143604516983, + "learning_rate": 0.000301622547506344, + "loss": 2.638, + "step": 21367 + }, + { + "epoch": 0.6336328321916793, + "grad_norm": 0.1253342628479004, + "learning_rate": 0.00030157935915101975, + "loss": 2.6412, + "step": 21368 + }, + { + "epoch": 0.6336624855439907, + "grad_norm": 0.1031159982085228, + "learning_rate": 0.0003015361725527444, + "loss": 2.6009, + "step": 21369 + }, + { + "epoch": 0.6336921388963023, + "grad_norm": 0.10886553674936295, + "learning_rate": 0.0003014929877119002, + "loss": 2.6298, + "step": 21370 + }, + { + "epoch": 0.6337217922486137, + "grad_norm": 0.12027119100093842, + "learning_rate": 0.00030144980462886974, + "loss": 2.6486, + "step": 21371 + }, + { + "epoch": 0.6337514456009252, + "grad_norm": 0.10446707904338837, + "learning_rate": 0.0003014066233040354, + "loss": 2.6227, + "step": 21372 + }, + { + "epoch": 0.6337810989532366, + "grad_norm": 0.12173687666654587, + "learning_rate": 0.00030136344373777945, + "loss": 2.6356, + "step": 21373 + }, + { + "epoch": 0.6338107523055482, + "grad_norm": 0.10307420045137405, + "learning_rate": 0.00030132026593048444, + "loss": 2.6533, + "step": 21374 + }, + { + "epoch": 0.6338404056578596, + "grad_norm": 0.10943562537431717, + "learning_rate": 0.00030127708988253243, + "loss": 2.6348, + "step": 21375 + }, + { + "epoch": 0.6338700590101711, + "grad_norm": 0.12281947582960129, + "learning_rate": 0.0003012339155943059, + "loss": 2.6026, + "step": 21376 + }, + { + "epoch": 0.6338997123624825, + "grad_norm": 0.1028374433517456, + "learning_rate": 0.0003011907430661872, + "loss": 2.6235, + "step": 21377 + }, + { + "epoch": 0.6339293657147941, + "grad_norm": 0.10405243188142776, + "learning_rate": 0.0003011475722985586, + "loss": 2.6613, + "step": 21378 + }, + { + "epoch": 0.6339590190671055, + "grad_norm": 0.09499291330575943, + "learning_rate": 0.00030110440329180236, + "loss": 2.6144, + "step": 21379 + }, + { + "epoch": 0.633988672419417, + "grad_norm": 0.11050461232662201, + "learning_rate": 0.00030106123604630087, + "loss": 2.6064, + "step": 21380 + }, + { + "epoch": 0.6340183257717285, + "grad_norm": 0.10079121589660645, + "learning_rate": 0.0003010180705624362, + "loss": 2.6438, + "step": 21381 + }, + { + "epoch": 0.63404797912404, + "grad_norm": 0.12825176119804382, + "learning_rate": 0.0003009749068405907, + "loss": 2.6225, + "step": 21382 + }, + { + "epoch": 0.6340776324763514, + "grad_norm": 0.1344660371541977, + "learning_rate": 0.0003009317448811463, + "loss": 2.6442, + "step": 21383 + }, + { + "epoch": 0.6341072858286629, + "grad_norm": 0.12220368534326553, + "learning_rate": 0.00030088858468448575, + "loss": 2.614, + "step": 21384 + }, + { + "epoch": 0.6341369391809744, + "grad_norm": 0.11475064605474472, + "learning_rate": 0.00030084542625099094, + "loss": 2.6322, + "step": 21385 + }, + { + "epoch": 0.6341665925332859, + "grad_norm": 0.11132776737213135, + "learning_rate": 0.000300802269581044, + "loss": 2.6544, + "step": 21386 + }, + { + "epoch": 0.6341962458855974, + "grad_norm": 0.12197969108819962, + "learning_rate": 0.0003007591146750272, + "loss": 2.6397, + "step": 21387 + }, + { + "epoch": 0.6342258992379088, + "grad_norm": 0.11352347582578659, + "learning_rate": 0.0003007159615333226, + "loss": 2.6582, + "step": 21388 + }, + { + "epoch": 0.6342555525902204, + "grad_norm": 0.1126101166009903, + "learning_rate": 0.0003006728101563124, + "loss": 2.6371, + "step": 21389 + }, + { + "epoch": 0.6342852059425318, + "grad_norm": 0.1033986359834671, + "learning_rate": 0.0003006296605443787, + "loss": 2.6111, + "step": 21390 + }, + { + "epoch": 0.6343148592948433, + "grad_norm": 0.1086931973695755, + "learning_rate": 0.0003005865126979036, + "loss": 2.6521, + "step": 21391 + }, + { + "epoch": 0.6343445126471547, + "grad_norm": 0.10177138447761536, + "learning_rate": 0.0003005433666172691, + "loss": 2.6524, + "step": 21392 + }, + { + "epoch": 0.6343741659994663, + "grad_norm": 0.11580880731344223, + "learning_rate": 0.0003005002223028573, + "loss": 2.6112, + "step": 21393 + }, + { + "epoch": 0.6344038193517777, + "grad_norm": 0.10711319744586945, + "learning_rate": 0.00030045707975505034, + "loss": 2.6032, + "step": 21394 + }, + { + "epoch": 0.6344334727040892, + "grad_norm": 0.10851242393255234, + "learning_rate": 0.00030041393897423015, + "loss": 2.6852, + "step": 21395 + }, + { + "epoch": 0.6344631260564007, + "grad_norm": 0.1110820472240448, + "learning_rate": 0.00030037079996077866, + "loss": 2.6439, + "step": 21396 + }, + { + "epoch": 0.6344927794087122, + "grad_norm": 0.12967513501644135, + "learning_rate": 0.00030032766271507815, + "loss": 2.6389, + "step": 21397 + }, + { + "epoch": 0.6345224327610236, + "grad_norm": 0.12230239808559418, + "learning_rate": 0.00030028452723751043, + "loss": 2.6494, + "step": 21398 + }, + { + "epoch": 0.6345520861133351, + "grad_norm": 0.09660890698432922, + "learning_rate": 0.0003002413935284575, + "loss": 2.6188, + "step": 21399 + }, + { + "epoch": 0.6345817394656466, + "grad_norm": 0.11671130359172821, + "learning_rate": 0.0003001982615883013, + "loss": 2.6515, + "step": 21400 + }, + { + "epoch": 0.6346113928179581, + "grad_norm": 0.1065838411450386, + "learning_rate": 0.000300155131417424, + "loss": 2.6197, + "step": 21401 + }, + { + "epoch": 0.6346410461702695, + "grad_norm": 0.11924248933792114, + "learning_rate": 0.00030011200301620713, + "loss": 2.6034, + "step": 21402 + }, + { + "epoch": 0.634670699522581, + "grad_norm": 0.1186409443616867, + "learning_rate": 0.00030006887638503276, + "loss": 2.5951, + "step": 21403 + }, + { + "epoch": 0.6347003528748925, + "grad_norm": 0.10744799673557281, + "learning_rate": 0.00030002575152428284, + "loss": 2.6483, + "step": 21404 + }, + { + "epoch": 0.634730006227204, + "grad_norm": 0.13870245218276978, + "learning_rate": 0.0002999826284343392, + "loss": 2.6579, + "step": 21405 + }, + { + "epoch": 0.6347596595795154, + "grad_norm": 0.13315477967262268, + "learning_rate": 0.0002999395071155837, + "loss": 2.6133, + "step": 21406 + }, + { + "epoch": 0.6347893129318269, + "grad_norm": 0.1151905357837677, + "learning_rate": 0.0002998963875683983, + "loss": 2.6104, + "step": 21407 + }, + { + "epoch": 0.6348189662841385, + "grad_norm": 0.11052407324314117, + "learning_rate": 0.00029985326979316474, + "loss": 2.6291, + "step": 21408 + }, + { + "epoch": 0.6348486196364499, + "grad_norm": 0.11835820227861404, + "learning_rate": 0.00029981015379026456, + "loss": 2.6057, + "step": 21409 + }, + { + "epoch": 0.6348782729887614, + "grad_norm": 0.13265833258628845, + "learning_rate": 0.00029976703956008, + "loss": 2.6176, + "step": 21410 + }, + { + "epoch": 0.6349079263410728, + "grad_norm": 0.10952972620725632, + "learning_rate": 0.00029972392710299274, + "loss": 2.6118, + "step": 21411 + }, + { + "epoch": 0.6349375796933844, + "grad_norm": 0.10493919998407364, + "learning_rate": 0.00029968081641938455, + "loss": 2.6246, + "step": 21412 + }, + { + "epoch": 0.6349672330456958, + "grad_norm": 0.11705964803695679, + "learning_rate": 0.00029963770750963704, + "loss": 2.6238, + "step": 21413 + }, + { + "epoch": 0.6349968863980073, + "grad_norm": 0.1109558641910553, + "learning_rate": 0.000299594600374132, + "loss": 2.6441, + "step": 21414 + }, + { + "epoch": 0.6350265397503188, + "grad_norm": 0.11299600452184677, + "learning_rate": 0.00029955149501325115, + "loss": 2.6344, + "step": 21415 + }, + { + "epoch": 0.6350561931026303, + "grad_norm": 0.115199014544487, + "learning_rate": 0.00029950839142737617, + "loss": 2.6237, + "step": 21416 + }, + { + "epoch": 0.6350858464549417, + "grad_norm": 0.11961167305707932, + "learning_rate": 0.0002994652896168889, + "loss": 2.6083, + "step": 21417 + }, + { + "epoch": 0.6351154998072532, + "grad_norm": 0.11751297861337662, + "learning_rate": 0.000299422189582171, + "loss": 2.6546, + "step": 21418 + }, + { + "epoch": 0.6351451531595647, + "grad_norm": 0.1123989075422287, + "learning_rate": 0.00029937909132360385, + "loss": 2.642, + "step": 21419 + }, + { + "epoch": 0.6351748065118762, + "grad_norm": 0.12222924083471298, + "learning_rate": 0.0002993359948415694, + "loss": 2.6453, + "step": 21420 + }, + { + "epoch": 0.6352044598641876, + "grad_norm": 0.111336350440979, + "learning_rate": 0.00029929290013644904, + "loss": 2.6119, + "step": 21421 + }, + { + "epoch": 0.6352341132164991, + "grad_norm": 0.11737661808729172, + "learning_rate": 0.0002992498072086245, + "loss": 2.6412, + "step": 21422 + }, + { + "epoch": 0.6352637665688106, + "grad_norm": 0.10940252244472504, + "learning_rate": 0.0002992067160584775, + "loss": 2.6205, + "step": 21423 + }, + { + "epoch": 0.6352934199211221, + "grad_norm": 0.12169855833053589, + "learning_rate": 0.00029916362668638944, + "loss": 2.6468, + "step": 21424 + }, + { + "epoch": 0.6353230732734335, + "grad_norm": 0.11047079414129257, + "learning_rate": 0.0002991205390927419, + "loss": 2.6364, + "step": 21425 + }, + { + "epoch": 0.635352726625745, + "grad_norm": 0.1169329509139061, + "learning_rate": 0.00029907745327791647, + "loss": 2.6089, + "step": 21426 + }, + { + "epoch": 0.6353823799780565, + "grad_norm": 0.12672226130962372, + "learning_rate": 0.0002990343692422948, + "loss": 2.6506, + "step": 21427 + }, + { + "epoch": 0.635412033330368, + "grad_norm": 0.11933930218219757, + "learning_rate": 0.0002989912869862581, + "loss": 2.6371, + "step": 21428 + }, + { + "epoch": 0.6354416866826795, + "grad_norm": 0.11942343413829803, + "learning_rate": 0.0002989482065101883, + "loss": 2.6226, + "step": 21429 + }, + { + "epoch": 0.635471340034991, + "grad_norm": 0.1107952818274498, + "learning_rate": 0.0002989051278144665, + "loss": 2.6532, + "step": 21430 + }, + { + "epoch": 0.6355009933873025, + "grad_norm": 0.10366839170455933, + "learning_rate": 0.00029886205089947425, + "loss": 2.6271, + "step": 21431 + }, + { + "epoch": 0.6355306467396139, + "grad_norm": 0.1068354994058609, + "learning_rate": 0.0002988189757655931, + "loss": 2.6451, + "step": 21432 + }, + { + "epoch": 0.6355603000919254, + "grad_norm": 0.09978936612606049, + "learning_rate": 0.00029877590241320453, + "loss": 2.6268, + "step": 21433 + }, + { + "epoch": 0.6355899534442369, + "grad_norm": 0.10797614604234695, + "learning_rate": 0.0002987328308426898, + "loss": 2.6309, + "step": 21434 + }, + { + "epoch": 0.6356196067965484, + "grad_norm": 0.12203769385814667, + "learning_rate": 0.00029868976105443035, + "loss": 2.6362, + "step": 21435 + }, + { + "epoch": 0.6356492601488598, + "grad_norm": 0.10866428911685944, + "learning_rate": 0.00029864669304880765, + "loss": 2.642, + "step": 21436 + }, + { + "epoch": 0.6356789135011713, + "grad_norm": 0.1205417811870575, + "learning_rate": 0.0002986036268262031, + "loss": 2.6506, + "step": 21437 + }, + { + "epoch": 0.6357085668534828, + "grad_norm": 0.11649047583341599, + "learning_rate": 0.00029856056238699804, + "loss": 2.6458, + "step": 21438 + }, + { + "epoch": 0.6357382202057943, + "grad_norm": 0.11194106191396713, + "learning_rate": 0.0002985174997315738, + "loss": 2.5985, + "step": 21439 + }, + { + "epoch": 0.6357678735581057, + "grad_norm": 0.11728476732969284, + "learning_rate": 0.00029847443886031166, + "loss": 2.5832, + "step": 21440 + }, + { + "epoch": 0.6357975269104172, + "grad_norm": 0.11984290927648544, + "learning_rate": 0.00029843137977359293, + "loss": 2.6425, + "step": 21441 + }, + { + "epoch": 0.6358271802627287, + "grad_norm": 0.11953859776258469, + "learning_rate": 0.00029838832247179885, + "loss": 2.6101, + "step": 21442 + }, + { + "epoch": 0.6358568336150402, + "grad_norm": 0.11584343761205673, + "learning_rate": 0.0002983452669553108, + "loss": 2.5986, + "step": 21443 + }, + { + "epoch": 0.6358864869673516, + "grad_norm": 0.11010874807834625, + "learning_rate": 0.0002983022132245101, + "loss": 2.6313, + "step": 21444 + }, + { + "epoch": 0.6359161403196631, + "grad_norm": 0.12029418349266052, + "learning_rate": 0.00029825916127977794, + "loss": 2.5981, + "step": 21445 + }, + { + "epoch": 0.6359457936719746, + "grad_norm": 0.12681154906749725, + "learning_rate": 0.00029821611112149554, + "loss": 2.6233, + "step": 21446 + }, + { + "epoch": 0.6359754470242861, + "grad_norm": 0.11303555220365524, + "learning_rate": 0.0002981730627500441, + "loss": 2.6405, + "step": 21447 + }, + { + "epoch": 0.6360051003765975, + "grad_norm": 0.11866267025470734, + "learning_rate": 0.0002981300161658046, + "loss": 2.6192, + "step": 21448 + }, + { + "epoch": 0.636034753728909, + "grad_norm": 0.10755357146263123, + "learning_rate": 0.00029808697136915864, + "loss": 2.6303, + "step": 21449 + }, + { + "epoch": 0.6360644070812206, + "grad_norm": 0.1244053915143013, + "learning_rate": 0.0002980439283604873, + "loss": 2.6302, + "step": 21450 + }, + { + "epoch": 0.636094060433532, + "grad_norm": 0.1143156960606575, + "learning_rate": 0.0002980008871401715, + "loss": 2.6438, + "step": 21451 + }, + { + "epoch": 0.6361237137858435, + "grad_norm": 0.12469346821308136, + "learning_rate": 0.00029795784770859256, + "loss": 2.6518, + "step": 21452 + }, + { + "epoch": 0.636153367138155, + "grad_norm": 0.1173376813530922, + "learning_rate": 0.0002979148100661316, + "loss": 2.6397, + "step": 21453 + }, + { + "epoch": 0.6361830204904665, + "grad_norm": 0.1223602443933487, + "learning_rate": 0.0002978717742131697, + "loss": 2.6186, + "step": 21454 + }, + { + "epoch": 0.6362126738427779, + "grad_norm": 0.11217749118804932, + "learning_rate": 0.00029782874015008785, + "loss": 2.6273, + "step": 21455 + }, + { + "epoch": 0.6362423271950894, + "grad_norm": 0.13742174208164215, + "learning_rate": 0.00029778570787726734, + "loss": 2.6499, + "step": 21456 + }, + { + "epoch": 0.6362719805474009, + "grad_norm": 0.12014574557542801, + "learning_rate": 0.00029774267739508895, + "loss": 2.6263, + "step": 21457 + }, + { + "epoch": 0.6363016338997124, + "grad_norm": 0.10624987632036209, + "learning_rate": 0.00029769964870393395, + "loss": 2.6237, + "step": 21458 + }, + { + "epoch": 0.6363312872520238, + "grad_norm": 0.12021645158529282, + "learning_rate": 0.00029765662180418327, + "loss": 2.6455, + "step": 21459 + }, + { + "epoch": 0.6363609406043353, + "grad_norm": 0.10939820855855942, + "learning_rate": 0.0002976135966962178, + "loss": 2.6411, + "step": 21460 + }, + { + "epoch": 0.6363905939566468, + "grad_norm": 0.11923220008611679, + "learning_rate": 0.0002975705733804188, + "loss": 2.6298, + "step": 21461 + }, + { + "epoch": 0.6364202473089583, + "grad_norm": 0.11316706985235214, + "learning_rate": 0.00029752755185716713, + "loss": 2.6086, + "step": 21462 + }, + { + "epoch": 0.6364499006612697, + "grad_norm": 0.1214277371764183, + "learning_rate": 0.00029748453212684366, + "loss": 2.6696, + "step": 21463 + }, + { + "epoch": 0.6364795540135813, + "grad_norm": 0.11651728302240372, + "learning_rate": 0.00029744151418982956, + "loss": 2.6184, + "step": 21464 + }, + { + "epoch": 0.6365092073658927, + "grad_norm": 0.12147206813097, + "learning_rate": 0.00029739849804650555, + "loss": 2.6461, + "step": 21465 + }, + { + "epoch": 0.6365388607182042, + "grad_norm": 0.12084506452083588, + "learning_rate": 0.00029735548369725257, + "loss": 2.6329, + "step": 21466 + }, + { + "epoch": 0.6365685140705156, + "grad_norm": 0.11565188318490982, + "learning_rate": 0.00029731247114245176, + "loss": 2.6035, + "step": 21467 + }, + { + "epoch": 0.6365981674228272, + "grad_norm": 0.13118238747119904, + "learning_rate": 0.0002972694603824836, + "loss": 2.6175, + "step": 21468 + }, + { + "epoch": 0.6366278207751386, + "grad_norm": 0.11192654073238373, + "learning_rate": 0.0002972264514177292, + "loss": 2.6125, + "step": 21469 + }, + { + "epoch": 0.6366574741274501, + "grad_norm": 0.12283960729837418, + "learning_rate": 0.0002971834442485694, + "loss": 2.6563, + "step": 21470 + }, + { + "epoch": 0.6366871274797616, + "grad_norm": 0.11371836811304092, + "learning_rate": 0.000297140438875385, + "loss": 2.635, + "step": 21471 + }, + { + "epoch": 0.6367167808320731, + "grad_norm": 0.11747333407402039, + "learning_rate": 0.0002970974352985569, + "loss": 2.618, + "step": 21472 + }, + { + "epoch": 0.6367464341843846, + "grad_norm": 0.1003967896103859, + "learning_rate": 0.0002970544335184657, + "loss": 2.61, + "step": 21473 + }, + { + "epoch": 0.636776087536696, + "grad_norm": 0.11717004328966141, + "learning_rate": 0.00029701143353549236, + "loss": 2.6192, + "step": 21474 + }, + { + "epoch": 0.6368057408890075, + "grad_norm": 0.10336250066757202, + "learning_rate": 0.0002969684353500177, + "loss": 2.5938, + "step": 21475 + }, + { + "epoch": 0.636835394241319, + "grad_norm": 0.12673303484916687, + "learning_rate": 0.00029692543896242237, + "loss": 2.6383, + "step": 21476 + }, + { + "epoch": 0.6368650475936305, + "grad_norm": 0.10491533577442169, + "learning_rate": 0.0002968824443730872, + "loss": 2.6338, + "step": 21477 + }, + { + "epoch": 0.6368947009459419, + "grad_norm": 0.10326693952083588, + "learning_rate": 0.00029683945158239276, + "loss": 2.6309, + "step": 21478 + }, + { + "epoch": 0.6369243542982534, + "grad_norm": 0.1170579046010971, + "learning_rate": 0.00029679646059071986, + "loss": 2.6505, + "step": 21479 + }, + { + "epoch": 0.6369540076505649, + "grad_norm": 0.13153332471847534, + "learning_rate": 0.0002967534713984492, + "loss": 2.6082, + "step": 21480 + }, + { + "epoch": 0.6369836610028764, + "grad_norm": 0.11748093366622925, + "learning_rate": 0.00029671048400596145, + "loss": 2.6243, + "step": 21481 + }, + { + "epoch": 0.6370133143551878, + "grad_norm": 0.11327680200338364, + "learning_rate": 0.0002966674984136372, + "loss": 2.6287, + "step": 21482 + }, + { + "epoch": 0.6370429677074994, + "grad_norm": 0.14816781878471375, + "learning_rate": 0.0002966245146218572, + "loss": 2.6297, + "step": 21483 + }, + { + "epoch": 0.6370726210598108, + "grad_norm": 0.10586755722761154, + "learning_rate": 0.0002965815326310021, + "loss": 2.633, + "step": 21484 + }, + { + "epoch": 0.6371022744121223, + "grad_norm": 0.11051735281944275, + "learning_rate": 0.0002965385524414524, + "loss": 2.621, + "step": 21485 + }, + { + "epoch": 0.6371319277644337, + "grad_norm": 0.10509686172008514, + "learning_rate": 0.0002964955740535885, + "loss": 2.6498, + "step": 21486 + }, + { + "epoch": 0.6371615811167453, + "grad_norm": 0.11857884377241135, + "learning_rate": 0.0002964525974677914, + "loss": 2.6391, + "step": 21487 + }, + { + "epoch": 0.6371912344690567, + "grad_norm": 0.10564466565847397, + "learning_rate": 0.00029640962268444163, + "loss": 2.6076, + "step": 21488 + }, + { + "epoch": 0.6372208878213682, + "grad_norm": 0.11345037817955017, + "learning_rate": 0.00029636664970391946, + "loss": 2.6663, + "step": 21489 + }, + { + "epoch": 0.6372505411736796, + "grad_norm": 0.11840321868658066, + "learning_rate": 0.00029632367852660565, + "loss": 2.6615, + "step": 21490 + }, + { + "epoch": 0.6372801945259912, + "grad_norm": 0.10743486136198044, + "learning_rate": 0.0002962807091528805, + "loss": 2.6412, + "step": 21491 + }, + { + "epoch": 0.6373098478783027, + "grad_norm": 0.11653397977352142, + "learning_rate": 0.00029623774158312475, + "loss": 2.6288, + "step": 21492 + }, + { + "epoch": 0.6373395012306141, + "grad_norm": 0.11791063845157623, + "learning_rate": 0.0002961947758177187, + "loss": 2.6526, + "step": 21493 + }, + { + "epoch": 0.6373691545829256, + "grad_norm": 0.11753548681735992, + "learning_rate": 0.000296151811857043, + "loss": 2.6513, + "step": 21494 + }, + { + "epoch": 0.6373988079352371, + "grad_norm": 0.10867327451705933, + "learning_rate": 0.00029610884970147797, + "loss": 2.6316, + "step": 21495 + }, + { + "epoch": 0.6374284612875486, + "grad_norm": 0.11604168266057968, + "learning_rate": 0.00029606588935140397, + "loss": 2.614, + "step": 21496 + }, + { + "epoch": 0.63745811463986, + "grad_norm": 0.11057351529598236, + "learning_rate": 0.00029602293080720156, + "loss": 2.6075, + "step": 21497 + }, + { + "epoch": 0.6374877679921716, + "grad_norm": 0.10105057060718536, + "learning_rate": 0.00029597997406925113, + "loss": 2.6054, + "step": 21498 + }, + { + "epoch": 0.637517421344483, + "grad_norm": 0.11327141523361206, + "learning_rate": 0.0002959370191379329, + "loss": 2.64, + "step": 21499 + }, + { + "epoch": 0.6375470746967945, + "grad_norm": 0.1015344187617302, + "learning_rate": 0.00029589406601362756, + "loss": 2.6325, + "step": 21500 + }, + { + "epoch": 0.6375767280491059, + "grad_norm": 0.11329557001590729, + "learning_rate": 0.0002958511146967153, + "loss": 2.6253, + "step": 21501 + }, + { + "epoch": 0.6376063814014175, + "grad_norm": 0.1035439595580101, + "learning_rate": 0.0002958081651875764, + "loss": 2.6419, + "step": 21502 + }, + { + "epoch": 0.6376360347537289, + "grad_norm": 0.10183558613061905, + "learning_rate": 0.0002957652174865913, + "loss": 2.6413, + "step": 21503 + }, + { + "epoch": 0.6376656881060404, + "grad_norm": 0.10219832509756088, + "learning_rate": 0.00029572227159414024, + "loss": 2.5989, + "step": 21504 + }, + { + "epoch": 0.6376953414583518, + "grad_norm": 0.09595753252506256, + "learning_rate": 0.0002956793275106036, + "loss": 2.6355, + "step": 21505 + }, + { + "epoch": 0.6377249948106634, + "grad_norm": 0.11323873698711395, + "learning_rate": 0.0002956363852363615, + "loss": 2.6338, + "step": 21506 + }, + { + "epoch": 0.6377546481629748, + "grad_norm": 0.11079082638025284, + "learning_rate": 0.0002955934447717943, + "loss": 2.6508, + "step": 21507 + }, + { + "epoch": 0.6377843015152863, + "grad_norm": 0.09941721707582474, + "learning_rate": 0.00029555050611728225, + "loss": 2.6178, + "step": 21508 + }, + { + "epoch": 0.6378139548675977, + "grad_norm": 0.1114099770784378, + "learning_rate": 0.00029550756927320554, + "loss": 2.617, + "step": 21509 + }, + { + "epoch": 0.6378436082199093, + "grad_norm": 0.10673771053552628, + "learning_rate": 0.0002954646342399444, + "loss": 2.6147, + "step": 21510 + }, + { + "epoch": 0.6378732615722207, + "grad_norm": 0.11803868412971497, + "learning_rate": 0.00029542170101787914, + "loss": 2.6337, + "step": 21511 + }, + { + "epoch": 0.6379029149245322, + "grad_norm": 0.09806633740663528, + "learning_rate": 0.00029537876960738954, + "loss": 2.6202, + "step": 21512 + }, + { + "epoch": 0.6379325682768437, + "grad_norm": 0.10685022920370102, + "learning_rate": 0.0002953358400088563, + "loss": 2.6106, + "step": 21513 + }, + { + "epoch": 0.6379622216291552, + "grad_norm": 0.11177876591682434, + "learning_rate": 0.0002952929122226592, + "loss": 2.6196, + "step": 21514 + }, + { + "epoch": 0.6379918749814667, + "grad_norm": 0.11590774357318878, + "learning_rate": 0.00029524998624917866, + "loss": 2.6222, + "step": 21515 + }, + { + "epoch": 0.6380215283337781, + "grad_norm": 0.12492892891168594, + "learning_rate": 0.0002952070620887946, + "loss": 2.6601, + "step": 21516 + }, + { + "epoch": 0.6380511816860897, + "grad_norm": 0.13411694765090942, + "learning_rate": 0.00029516413974188706, + "loss": 2.6265, + "step": 21517 + }, + { + "epoch": 0.6380808350384011, + "grad_norm": 0.11431200057268143, + "learning_rate": 0.00029512121920883627, + "loss": 2.6095, + "step": 21518 + }, + { + "epoch": 0.6381104883907126, + "grad_norm": 0.1319098025560379, + "learning_rate": 0.0002950783004900223, + "loss": 2.6659, + "step": 21519 + }, + { + "epoch": 0.638140141743024, + "grad_norm": 0.11431166529655457, + "learning_rate": 0.000295035383585825, + "loss": 2.6328, + "step": 21520 + }, + { + "epoch": 0.6381697950953356, + "grad_norm": 0.1161835789680481, + "learning_rate": 0.00029499246849662474, + "loss": 2.6245, + "step": 21521 + }, + { + "epoch": 0.638199448447647, + "grad_norm": 0.11357295513153076, + "learning_rate": 0.0002949495552228014, + "loss": 2.6284, + "step": 21522 + }, + { + "epoch": 0.6382291017999585, + "grad_norm": 0.11677927523851395, + "learning_rate": 0.0002949066437647349, + "loss": 2.6411, + "step": 21523 + }, + { + "epoch": 0.6382587551522699, + "grad_norm": 0.1121065616607666, + "learning_rate": 0.0002948637341228051, + "loss": 2.6506, + "step": 21524 + }, + { + "epoch": 0.6382884085045815, + "grad_norm": 0.09406103193759918, + "learning_rate": 0.00029482082629739205, + "loss": 2.627, + "step": 21525 + }, + { + "epoch": 0.6383180618568929, + "grad_norm": 0.11467457562685013, + "learning_rate": 0.0002947779202888761, + "loss": 2.6176, + "step": 21526 + }, + { + "epoch": 0.6383477152092044, + "grad_norm": 0.09885910153388977, + "learning_rate": 0.0002947350160976368, + "loss": 2.6191, + "step": 21527 + }, + { + "epoch": 0.6383773685615158, + "grad_norm": 0.11718007177114487, + "learning_rate": 0.0002946921137240542, + "loss": 2.6219, + "step": 21528 + }, + { + "epoch": 0.6384070219138274, + "grad_norm": 0.11323191225528717, + "learning_rate": 0.00029464921316850815, + "loss": 2.6432, + "step": 21529 + }, + { + "epoch": 0.6384366752661388, + "grad_norm": 0.11643805354833603, + "learning_rate": 0.00029460631443137855, + "loss": 2.6057, + "step": 21530 + }, + { + "epoch": 0.6384663286184503, + "grad_norm": 0.12451039999723434, + "learning_rate": 0.00029456341751304526, + "loss": 2.6618, + "step": 21531 + }, + { + "epoch": 0.6384959819707619, + "grad_norm": 0.11273685097694397, + "learning_rate": 0.0002945205224138883, + "loss": 2.6437, + "step": 21532 + }, + { + "epoch": 0.6385256353230733, + "grad_norm": 0.11049409210681915, + "learning_rate": 0.0002944776291342873, + "loss": 2.6322, + "step": 21533 + }, + { + "epoch": 0.6385552886753848, + "grad_norm": 0.12157101929187775, + "learning_rate": 0.0002944347376746221, + "loss": 2.6411, + "step": 21534 + }, + { + "epoch": 0.6385849420276962, + "grad_norm": 0.12276315689086914, + "learning_rate": 0.0002943918480352726, + "loss": 2.631, + "step": 21535 + }, + { + "epoch": 0.6386145953800078, + "grad_norm": 0.11535321176052094, + "learning_rate": 0.00029434896021661867, + "loss": 2.6372, + "step": 21536 + }, + { + "epoch": 0.6386442487323192, + "grad_norm": 0.12062744051218033, + "learning_rate": 0.00029430607421903997, + "loss": 2.6459, + "step": 21537 + }, + { + "epoch": 0.6386739020846307, + "grad_norm": 0.10802178084850311, + "learning_rate": 0.00029426319004291615, + "loss": 2.606, + "step": 21538 + }, + { + "epoch": 0.6387035554369421, + "grad_norm": 0.11885247379541397, + "learning_rate": 0.00029422030768862717, + "loss": 2.6145, + "step": 21539 + }, + { + "epoch": 0.6387332087892537, + "grad_norm": 0.12726353108882904, + "learning_rate": 0.00029417742715655276, + "loss": 2.6562, + "step": 21540 + }, + { + "epoch": 0.6387628621415651, + "grad_norm": 0.105763740837574, + "learning_rate": 0.0002941345484470726, + "loss": 2.642, + "step": 21541 + }, + { + "epoch": 0.6387925154938766, + "grad_norm": 0.11254321038722992, + "learning_rate": 0.00029409167156056623, + "loss": 2.6724, + "step": 21542 + }, + { + "epoch": 0.638822168846188, + "grad_norm": 0.12201635539531708, + "learning_rate": 0.0002940487964974136, + "loss": 2.6204, + "step": 21543 + }, + { + "epoch": 0.6388518221984996, + "grad_norm": 0.10930614173412323, + "learning_rate": 0.00029400592325799416, + "loss": 2.5959, + "step": 21544 + }, + { + "epoch": 0.638881475550811, + "grad_norm": 0.1232961118221283, + "learning_rate": 0.0002939630518426876, + "loss": 2.6422, + "step": 21545 + }, + { + "epoch": 0.6389111289031225, + "grad_norm": 0.11139379441738129, + "learning_rate": 0.0002939201822518735, + "loss": 2.6407, + "step": 21546 + }, + { + "epoch": 0.6389407822554339, + "grad_norm": 0.1235276386141777, + "learning_rate": 0.00029387731448593166, + "loss": 2.6318, + "step": 21547 + }, + { + "epoch": 0.6389704356077455, + "grad_norm": 0.09768082946538925, + "learning_rate": 0.00029383444854524156, + "loss": 2.5971, + "step": 21548 + }, + { + "epoch": 0.6390000889600569, + "grad_norm": 0.10221194475889206, + "learning_rate": 0.0002937915844301829, + "loss": 2.6106, + "step": 21549 + }, + { + "epoch": 0.6390297423123684, + "grad_norm": 0.11599717289209366, + "learning_rate": 0.000293748722141135, + "loss": 2.6724, + "step": 21550 + }, + { + "epoch": 0.6390593956646798, + "grad_norm": 0.10789836943149567, + "learning_rate": 0.00029370586167847744, + "loss": 2.6089, + "step": 21551 + }, + { + "epoch": 0.6390890490169914, + "grad_norm": 0.10394017398357391, + "learning_rate": 0.0002936630030425901, + "loss": 2.6528, + "step": 21552 + }, + { + "epoch": 0.6391187023693029, + "grad_norm": 0.10521775484085083, + "learning_rate": 0.00029362014623385236, + "loss": 2.649, + "step": 21553 + }, + { + "epoch": 0.6391483557216143, + "grad_norm": 0.09437426179647446, + "learning_rate": 0.00029357729125264353, + "loss": 2.6338, + "step": 21554 + }, + { + "epoch": 0.6391780090739259, + "grad_norm": 0.11086152493953705, + "learning_rate": 0.00029353443809934323, + "loss": 2.6357, + "step": 21555 + }, + { + "epoch": 0.6392076624262373, + "grad_norm": 0.0922669917345047, + "learning_rate": 0.00029349158677433095, + "loss": 2.6148, + "step": 21556 + }, + { + "epoch": 0.6392373157785488, + "grad_norm": 0.10041981935501099, + "learning_rate": 0.00029344873727798603, + "loss": 2.6357, + "step": 21557 + }, + { + "epoch": 0.6392669691308602, + "grad_norm": 0.09718377143144608, + "learning_rate": 0.00029340588961068807, + "loss": 2.6428, + "step": 21558 + }, + { + "epoch": 0.6392966224831718, + "grad_norm": 0.10893376916646957, + "learning_rate": 0.0002933630437728164, + "loss": 2.6232, + "step": 21559 + }, + { + "epoch": 0.6393262758354832, + "grad_norm": 0.09543191641569138, + "learning_rate": 0.00029332019976475055, + "loss": 2.6015, + "step": 21560 + }, + { + "epoch": 0.6393559291877947, + "grad_norm": 0.10888099670410156, + "learning_rate": 0.0002932773575868697, + "loss": 2.6335, + "step": 21561 + }, + { + "epoch": 0.6393855825401061, + "grad_norm": 0.1096888929605484, + "learning_rate": 0.0002932345172395534, + "loss": 2.6102, + "step": 21562 + }, + { + "epoch": 0.6394152358924177, + "grad_norm": 0.09271618723869324, + "learning_rate": 0.00029319167872318084, + "loss": 2.6223, + "step": 21563 + }, + { + "epoch": 0.6394448892447291, + "grad_norm": 0.11020161211490631, + "learning_rate": 0.0002931488420381314, + "loss": 2.6038, + "step": 21564 + }, + { + "epoch": 0.6394745425970406, + "grad_norm": 0.10869757831096649, + "learning_rate": 0.00029310600718478457, + "loss": 2.6355, + "step": 21565 + }, + { + "epoch": 0.639504195949352, + "grad_norm": 0.10020102560520172, + "learning_rate": 0.0002930631741635196, + "loss": 2.6114, + "step": 21566 + }, + { + "epoch": 0.6395338493016636, + "grad_norm": 0.10110291093587875, + "learning_rate": 0.0002930203429747157, + "loss": 2.6304, + "step": 21567 + }, + { + "epoch": 0.639563502653975, + "grad_norm": 0.08924811333417892, + "learning_rate": 0.00029297751361875217, + "loss": 2.6183, + "step": 21568 + }, + { + "epoch": 0.6395931560062865, + "grad_norm": 0.11136224120855331, + "learning_rate": 0.0002929346860960083, + "loss": 2.6064, + "step": 21569 + }, + { + "epoch": 0.6396228093585979, + "grad_norm": 0.11273397505283356, + "learning_rate": 0.0002928918604068634, + "loss": 2.6384, + "step": 21570 + }, + { + "epoch": 0.6396524627109095, + "grad_norm": 0.1061040535569191, + "learning_rate": 0.00029284903655169665, + "loss": 2.622, + "step": 21571 + }, + { + "epoch": 0.6396821160632209, + "grad_norm": 0.10148187726736069, + "learning_rate": 0.0002928062145308872, + "loss": 2.6318, + "step": 21572 + }, + { + "epoch": 0.6397117694155324, + "grad_norm": 0.10919253528118134, + "learning_rate": 0.0002927633943448142, + "loss": 2.6128, + "step": 21573 + }, + { + "epoch": 0.639741422767844, + "grad_norm": 0.11288660019636154, + "learning_rate": 0.00029272057599385694, + "loss": 2.5848, + "step": 21574 + }, + { + "epoch": 0.6397710761201554, + "grad_norm": 0.12211386114358902, + "learning_rate": 0.0002926777594783946, + "loss": 2.6278, + "step": 21575 + }, + { + "epoch": 0.6398007294724669, + "grad_norm": 0.10647637397050858, + "learning_rate": 0.00029263494479880625, + "loss": 2.6226, + "step": 21576 + }, + { + "epoch": 0.6398303828247783, + "grad_norm": 0.1398073434829712, + "learning_rate": 0.00029259213195547096, + "loss": 2.6099, + "step": 21577 + }, + { + "epoch": 0.6398600361770899, + "grad_norm": 0.13736677169799805, + "learning_rate": 0.00029254932094876807, + "loss": 2.6095, + "step": 21578 + }, + { + "epoch": 0.6398896895294013, + "grad_norm": 0.13558275997638702, + "learning_rate": 0.0002925065117790765, + "loss": 2.6107, + "step": 21579 + }, + { + "epoch": 0.6399193428817128, + "grad_norm": 0.12077953666448593, + "learning_rate": 0.00029246370444677546, + "loss": 2.6153, + "step": 21580 + }, + { + "epoch": 0.6399489962340242, + "grad_norm": 0.12318658083677292, + "learning_rate": 0.000292420898952244, + "loss": 2.6392, + "step": 21581 + }, + { + "epoch": 0.6399786495863358, + "grad_norm": 0.11578129976987839, + "learning_rate": 0.00029237809529586103, + "loss": 2.636, + "step": 21582 + }, + { + "epoch": 0.6400083029386472, + "grad_norm": 0.11482573300600052, + "learning_rate": 0.0002923352934780056, + "loss": 2.6352, + "step": 21583 + }, + { + "epoch": 0.6400379562909587, + "grad_norm": 0.11384999752044678, + "learning_rate": 0.0002922924934990568, + "loss": 2.6397, + "step": 21584 + }, + { + "epoch": 0.6400676096432701, + "grad_norm": 0.11577451229095459, + "learning_rate": 0.0002922496953593937, + "loss": 2.6445, + "step": 21585 + }, + { + "epoch": 0.6400972629955817, + "grad_norm": 0.10177838057279587, + "learning_rate": 0.0002922068990593951, + "loss": 2.6387, + "step": 21586 + }, + { + "epoch": 0.6401269163478931, + "grad_norm": 0.10710009187459946, + "learning_rate": 0.0002921641045994403, + "loss": 2.6325, + "step": 21587 + }, + { + "epoch": 0.6401565697002046, + "grad_norm": 0.10687372088432312, + "learning_rate": 0.0002921213119799079, + "loss": 2.6323, + "step": 21588 + }, + { + "epoch": 0.640186223052516, + "grad_norm": 0.09972970932722092, + "learning_rate": 0.0002920785212011769, + "loss": 2.6342, + "step": 21589 + }, + { + "epoch": 0.6402158764048276, + "grad_norm": 0.11271067708730698, + "learning_rate": 0.0002920357322636261, + "loss": 2.5971, + "step": 21590 + }, + { + "epoch": 0.640245529757139, + "grad_norm": 0.10253337770700455, + "learning_rate": 0.0002919929451676349, + "loss": 2.6406, + "step": 21591 + }, + { + "epoch": 0.6402751831094505, + "grad_norm": 0.10373403131961823, + "learning_rate": 0.00029195015991358175, + "loss": 2.6421, + "step": 21592 + }, + { + "epoch": 0.640304836461762, + "grad_norm": 0.10652682185173035, + "learning_rate": 0.00029190737650184565, + "loss": 2.6575, + "step": 21593 + }, + { + "epoch": 0.6403344898140735, + "grad_norm": 0.10999488830566406, + "learning_rate": 0.00029186459493280546, + "loss": 2.6431, + "step": 21594 + }, + { + "epoch": 0.640364143166385, + "grad_norm": 0.11380398273468018, + "learning_rate": 0.0002918218152068402, + "loss": 2.6221, + "step": 21595 + }, + { + "epoch": 0.6403937965186964, + "grad_norm": 0.11665306985378265, + "learning_rate": 0.0002917790373243283, + "loss": 2.6142, + "step": 21596 + }, + { + "epoch": 0.640423449871008, + "grad_norm": 0.1101033166050911, + "learning_rate": 0.0002917362612856488, + "loss": 2.6387, + "step": 21597 + }, + { + "epoch": 0.6404531032233194, + "grad_norm": 0.10607527196407318, + "learning_rate": 0.0002916934870911805, + "loss": 2.6257, + "step": 21598 + }, + { + "epoch": 0.6404827565756309, + "grad_norm": 0.10096629709005356, + "learning_rate": 0.000291650714741302, + "loss": 2.5947, + "step": 21599 + }, + { + "epoch": 0.6405124099279423, + "grad_norm": 0.1124323159456253, + "learning_rate": 0.00029160794423639224, + "loss": 2.6091, + "step": 21600 + }, + { + "epoch": 0.6405420632802539, + "grad_norm": 0.09986075013875961, + "learning_rate": 0.0002915651755768299, + "loss": 2.6324, + "step": 21601 + }, + { + "epoch": 0.6405717166325653, + "grad_norm": 0.09884002059698105, + "learning_rate": 0.00029152240876299373, + "loss": 2.5806, + "step": 21602 + }, + { + "epoch": 0.6406013699848768, + "grad_norm": 0.11694755405187607, + "learning_rate": 0.00029147964379526225, + "loss": 2.6352, + "step": 21603 + }, + { + "epoch": 0.6406310233371882, + "grad_norm": 0.10721497237682343, + "learning_rate": 0.00029143688067401446, + "loss": 2.6421, + "step": 21604 + }, + { + "epoch": 0.6406606766894998, + "grad_norm": 0.11418092250823975, + "learning_rate": 0.0002913941193996291, + "loss": 2.601, + "step": 21605 + }, + { + "epoch": 0.6406903300418112, + "grad_norm": 0.10288304090499878, + "learning_rate": 0.0002913513599724844, + "loss": 2.6656, + "step": 21606 + }, + { + "epoch": 0.6407199833941227, + "grad_norm": 0.10954512655735016, + "learning_rate": 0.0002913086023929593, + "loss": 2.6533, + "step": 21607 + }, + { + "epoch": 0.6407496367464341, + "grad_norm": 0.12966430187225342, + "learning_rate": 0.0002912658466614323, + "loss": 2.616, + "step": 21608 + }, + { + "epoch": 0.6407792900987457, + "grad_norm": 0.10405098646879196, + "learning_rate": 0.0002912230927782821, + "loss": 2.5866, + "step": 21609 + }, + { + "epoch": 0.6408089434510571, + "grad_norm": 0.10791611671447754, + "learning_rate": 0.0002911803407438871, + "loss": 2.6196, + "step": 21610 + }, + { + "epoch": 0.6408385968033686, + "grad_norm": 0.11131469905376434, + "learning_rate": 0.00029113759055862616, + "loss": 2.6551, + "step": 21611 + }, + { + "epoch": 0.64086825015568, + "grad_norm": 0.09758046269416809, + "learning_rate": 0.00029109484222287764, + "loss": 2.5847, + "step": 21612 + }, + { + "epoch": 0.6408979035079916, + "grad_norm": 0.10555706173181534, + "learning_rate": 0.0002910520957370202, + "loss": 2.6493, + "step": 21613 + }, + { + "epoch": 0.640927556860303, + "grad_norm": 0.10345178842544556, + "learning_rate": 0.00029100935110143233, + "loss": 2.6209, + "step": 21614 + }, + { + "epoch": 0.6409572102126145, + "grad_norm": 0.10370075702667236, + "learning_rate": 0.0002909666083164925, + "loss": 2.6053, + "step": 21615 + }, + { + "epoch": 0.6409868635649261, + "grad_norm": 0.11343692988157272, + "learning_rate": 0.0002909238673825792, + "loss": 2.6522, + "step": 21616 + }, + { + "epoch": 0.6410165169172375, + "grad_norm": 0.09959875047206879, + "learning_rate": 0.000290881128300071, + "loss": 2.6117, + "step": 21617 + }, + { + "epoch": 0.641046170269549, + "grad_norm": 0.10969781875610352, + "learning_rate": 0.0002908383910693463, + "loss": 2.642, + "step": 21618 + }, + { + "epoch": 0.6410758236218604, + "grad_norm": 0.11826518923044205, + "learning_rate": 0.00029079565569078346, + "loss": 2.6277, + "step": 21619 + }, + { + "epoch": 0.641105476974172, + "grad_norm": 0.1139800027012825, + "learning_rate": 0.00029075292216476113, + "loss": 2.6203, + "step": 21620 + }, + { + "epoch": 0.6411351303264834, + "grad_norm": 0.0915461927652359, + "learning_rate": 0.00029071019049165746, + "loss": 2.606, + "step": 21621 + }, + { + "epoch": 0.6411647836787949, + "grad_norm": 0.10166957229375839, + "learning_rate": 0.0002906674606718511, + "loss": 2.6029, + "step": 21622 + }, + { + "epoch": 0.6411944370311063, + "grad_norm": 0.11838633567094803, + "learning_rate": 0.0002906247327057202, + "loss": 2.6161, + "step": 21623 + }, + { + "epoch": 0.6412240903834179, + "grad_norm": 0.1078619509935379, + "learning_rate": 0.00029058200659364333, + "loss": 2.6643, + "step": 21624 + }, + { + "epoch": 0.6412537437357293, + "grad_norm": 0.11106562614440918, + "learning_rate": 0.0002905392823359987, + "loss": 2.6093, + "step": 21625 + }, + { + "epoch": 0.6412833970880408, + "grad_norm": 0.13927951455116272, + "learning_rate": 0.0002904965599331646, + "loss": 2.6432, + "step": 21626 + }, + { + "epoch": 0.6413130504403523, + "grad_norm": 0.12094416469335556, + "learning_rate": 0.0002904538393855195, + "loss": 2.6451, + "step": 21627 + }, + { + "epoch": 0.6413427037926638, + "grad_norm": 0.12211748957633972, + "learning_rate": 0.00029041112069344165, + "loss": 2.6231, + "step": 21628 + }, + { + "epoch": 0.6413723571449752, + "grad_norm": 0.10613319277763367, + "learning_rate": 0.00029036840385730924, + "loss": 2.6167, + "step": 21629 + }, + { + "epoch": 0.6414020104972867, + "grad_norm": 0.13352368772029877, + "learning_rate": 0.00029032568887750064, + "loss": 2.6137, + "step": 21630 + }, + { + "epoch": 0.6414316638495982, + "grad_norm": 0.12579859793186188, + "learning_rate": 0.000290282975754394, + "loss": 2.6386, + "step": 21631 + }, + { + "epoch": 0.6414613172019097, + "grad_norm": 0.12342558056116104, + "learning_rate": 0.00029024026448836766, + "loss": 2.6306, + "step": 21632 + }, + { + "epoch": 0.6414909705542211, + "grad_norm": 0.1203732043504715, + "learning_rate": 0.00029019755507979974, + "loss": 2.6029, + "step": 21633 + }, + { + "epoch": 0.6415206239065326, + "grad_norm": 0.1118827760219574, + "learning_rate": 0.0002901548475290685, + "loss": 2.6194, + "step": 21634 + }, + { + "epoch": 0.6415502772588441, + "grad_norm": 0.11818943172693253, + "learning_rate": 0.0002901121418365521, + "loss": 2.5966, + "step": 21635 + }, + { + "epoch": 0.6415799306111556, + "grad_norm": 0.13340029120445251, + "learning_rate": 0.00029006943800262875, + "loss": 2.627, + "step": 21636 + }, + { + "epoch": 0.6416095839634671, + "grad_norm": 0.12345022708177567, + "learning_rate": 0.00029002673602767656, + "loss": 2.6464, + "step": 21637 + }, + { + "epoch": 0.6416392373157785, + "grad_norm": 0.1279849410057068, + "learning_rate": 0.00028998403591207363, + "loss": 2.6201, + "step": 21638 + }, + { + "epoch": 0.6416688906680901, + "grad_norm": 0.11957289278507233, + "learning_rate": 0.0002899413376561983, + "loss": 2.6144, + "step": 21639 + }, + { + "epoch": 0.6416985440204015, + "grad_norm": 0.11286614090204239, + "learning_rate": 0.0002898986412604283, + "loss": 2.6181, + "step": 21640 + }, + { + "epoch": 0.641728197372713, + "grad_norm": 0.12472104281187057, + "learning_rate": 0.0002898559467251419, + "loss": 2.6527, + "step": 21641 + }, + { + "epoch": 0.6417578507250244, + "grad_norm": 0.10325825214385986, + "learning_rate": 0.00028981325405071697, + "loss": 2.6206, + "step": 21642 + }, + { + "epoch": 0.641787504077336, + "grad_norm": 0.13210956752300262, + "learning_rate": 0.00028977056323753193, + "loss": 2.6332, + "step": 21643 + }, + { + "epoch": 0.6418171574296474, + "grad_norm": 0.1141628697514534, + "learning_rate": 0.00028972787428596463, + "loss": 2.6537, + "step": 21644 + }, + { + "epoch": 0.6418468107819589, + "grad_norm": 0.12545253336429596, + "learning_rate": 0.00028968518719639313, + "loss": 2.6362, + "step": 21645 + }, + { + "epoch": 0.6418764641342704, + "grad_norm": 0.1031174436211586, + "learning_rate": 0.0002896425019691954, + "loss": 2.6052, + "step": 21646 + }, + { + "epoch": 0.6419061174865819, + "grad_norm": 0.10312909632921219, + "learning_rate": 0.0002895998186047494, + "loss": 2.6286, + "step": 21647 + }, + { + "epoch": 0.6419357708388933, + "grad_norm": 0.12155114114284515, + "learning_rate": 0.0002895571371034331, + "loss": 2.6487, + "step": 21648 + }, + { + "epoch": 0.6419654241912048, + "grad_norm": 0.10823433101177216, + "learning_rate": 0.0002895144574656245, + "loss": 2.6199, + "step": 21649 + }, + { + "epoch": 0.6419950775435163, + "grad_norm": 0.12283679097890854, + "learning_rate": 0.0002894717796917017, + "loss": 2.5978, + "step": 21650 + }, + { + "epoch": 0.6420247308958278, + "grad_norm": 0.11398850381374359, + "learning_rate": 0.0002894291037820422, + "loss": 2.618, + "step": 21651 + }, + { + "epoch": 0.6420543842481392, + "grad_norm": 0.11330198496580124, + "learning_rate": 0.0002893864297370242, + "loss": 2.6103, + "step": 21652 + }, + { + "epoch": 0.6420840376004507, + "grad_norm": 0.11919688433408737, + "learning_rate": 0.0002893437575570255, + "loss": 2.6162, + "step": 21653 + }, + { + "epoch": 0.6421136909527622, + "grad_norm": 0.10741044580936432, + "learning_rate": 0.0002893010872424238, + "loss": 2.6176, + "step": 21654 + }, + { + "epoch": 0.6421433443050737, + "grad_norm": 0.12545254826545715, + "learning_rate": 0.00028925841879359727, + "loss": 2.612, + "step": 21655 + }, + { + "epoch": 0.6421729976573851, + "grad_norm": 0.12601539492607117, + "learning_rate": 0.0002892157522109236, + "loss": 2.635, + "step": 21656 + }, + { + "epoch": 0.6422026510096966, + "grad_norm": 0.13688594102859497, + "learning_rate": 0.00028917308749478067, + "loss": 2.6617, + "step": 21657 + }, + { + "epoch": 0.6422323043620082, + "grad_norm": 0.11854597181081772, + "learning_rate": 0.0002891304246455463, + "loss": 2.6497, + "step": 21658 + }, + { + "epoch": 0.6422619577143196, + "grad_norm": 0.09831434488296509, + "learning_rate": 0.0002890877636635981, + "loss": 2.6322, + "step": 21659 + }, + { + "epoch": 0.6422916110666311, + "grad_norm": 0.11177043616771698, + "learning_rate": 0.00028904510454931423, + "loss": 2.6579, + "step": 21660 + }, + { + "epoch": 0.6423212644189426, + "grad_norm": 0.09860965609550476, + "learning_rate": 0.0002890024473030719, + "loss": 2.614, + "step": 21661 + }, + { + "epoch": 0.6423509177712541, + "grad_norm": 0.09886176884174347, + "learning_rate": 0.00028895979192524926, + "loss": 2.6535, + "step": 21662 + }, + { + "epoch": 0.6423805711235655, + "grad_norm": 0.10052353888750076, + "learning_rate": 0.00028891713841622377, + "loss": 2.6059, + "step": 21663 + }, + { + "epoch": 0.642410224475877, + "grad_norm": 0.10295400768518448, + "learning_rate": 0.0002888744867763732, + "loss": 2.616, + "step": 21664 + }, + { + "epoch": 0.6424398778281885, + "grad_norm": 0.10495513677597046, + "learning_rate": 0.0002888318370060754, + "loss": 2.624, + "step": 21665 + }, + { + "epoch": 0.6424695311805, + "grad_norm": 0.10577443987131119, + "learning_rate": 0.0002887891891057079, + "loss": 2.6595, + "step": 21666 + }, + { + "epoch": 0.6424991845328114, + "grad_norm": 0.1216028481721878, + "learning_rate": 0.0002887465430756482, + "loss": 2.6187, + "step": 21667 + }, + { + "epoch": 0.6425288378851229, + "grad_norm": 0.11270096153020859, + "learning_rate": 0.0002887038989162742, + "loss": 2.5984, + "step": 21668 + }, + { + "epoch": 0.6425584912374344, + "grad_norm": 0.11782851815223694, + "learning_rate": 0.0002886612566279636, + "loss": 2.6229, + "step": 21669 + }, + { + "epoch": 0.6425881445897459, + "grad_norm": 0.10833241045475006, + "learning_rate": 0.0002886186162110938, + "loss": 2.6276, + "step": 21670 + }, + { + "epoch": 0.6426177979420573, + "grad_norm": 0.12061094492673874, + "learning_rate": 0.0002885759776660426, + "loss": 2.6482, + "step": 21671 + }, + { + "epoch": 0.6426474512943688, + "grad_norm": 0.0955268070101738, + "learning_rate": 0.0002885333409931873, + "loss": 2.6124, + "step": 21672 + }, + { + "epoch": 0.6426771046466803, + "grad_norm": 0.11695720255374908, + "learning_rate": 0.00028849070619290545, + "loss": 2.6286, + "step": 21673 + }, + { + "epoch": 0.6427067579989918, + "grad_norm": 0.10114980489015579, + "learning_rate": 0.0002884480732655748, + "loss": 2.6443, + "step": 21674 + }, + { + "epoch": 0.6427364113513032, + "grad_norm": 0.12646038830280304, + "learning_rate": 0.00028840544221157274, + "loss": 2.6118, + "step": 21675 + }, + { + "epoch": 0.6427660647036147, + "grad_norm": 0.11607594043016434, + "learning_rate": 0.00028836281303127676, + "loss": 2.6012, + "step": 21676 + }, + { + "epoch": 0.6427957180559262, + "grad_norm": 0.1128387600183487, + "learning_rate": 0.00028832018572506446, + "loss": 2.6566, + "step": 21677 + }, + { + "epoch": 0.6428253714082377, + "grad_norm": 0.11726129800081253, + "learning_rate": 0.0002882775602933132, + "loss": 2.6476, + "step": 21678 + }, + { + "epoch": 0.6428550247605492, + "grad_norm": 0.15051014721393585, + "learning_rate": 0.0002882349367364005, + "loss": 2.6386, + "step": 21679 + }, + { + "epoch": 0.6428846781128607, + "grad_norm": 0.12176375836133957, + "learning_rate": 0.00028819231505470357, + "loss": 2.6392, + "step": 21680 + }, + { + "epoch": 0.6429143314651722, + "grad_norm": 0.11124120652675629, + "learning_rate": 0.00028814969524860047, + "loss": 2.618, + "step": 21681 + }, + { + "epoch": 0.6429439848174836, + "grad_norm": 0.10229216516017914, + "learning_rate": 0.000288107077318468, + "loss": 2.6107, + "step": 21682 + }, + { + "epoch": 0.6429736381697951, + "grad_norm": 0.1154698058962822, + "learning_rate": 0.00028806446126468366, + "loss": 2.6681, + "step": 21683 + }, + { + "epoch": 0.6430032915221066, + "grad_norm": 0.10402411222457886, + "learning_rate": 0.00028802184708762505, + "loss": 2.6126, + "step": 21684 + }, + { + "epoch": 0.6430329448744181, + "grad_norm": 0.11147867143154144, + "learning_rate": 0.0002879792347876692, + "loss": 2.6484, + "step": 21685 + }, + { + "epoch": 0.6430625982267295, + "grad_norm": 0.10445239394903183, + "learning_rate": 0.0002879366243651937, + "loss": 2.613, + "step": 21686 + }, + { + "epoch": 0.643092251579041, + "grad_norm": 0.11084368079900742, + "learning_rate": 0.0002878940158205757, + "loss": 2.6228, + "step": 21687 + }, + { + "epoch": 0.6431219049313525, + "grad_norm": 0.09351669251918793, + "learning_rate": 0.0002878514091541927, + "loss": 2.6153, + "step": 21688 + }, + { + "epoch": 0.643151558283664, + "grad_norm": 0.11633813381195068, + "learning_rate": 0.00028780880436642185, + "loss": 2.6315, + "step": 21689 + }, + { + "epoch": 0.6431812116359754, + "grad_norm": 0.10643773525953293, + "learning_rate": 0.00028776620145764056, + "loss": 2.6272, + "step": 21690 + }, + { + "epoch": 0.643210864988287, + "grad_norm": 0.09792517870664597, + "learning_rate": 0.0002877236004282259, + "loss": 2.6246, + "step": 21691 + }, + { + "epoch": 0.6432405183405984, + "grad_norm": 0.09661649167537689, + "learning_rate": 0.00028768100127855514, + "loss": 2.6253, + "step": 21692 + }, + { + "epoch": 0.6432701716929099, + "grad_norm": 0.08980818837881088, + "learning_rate": 0.0002876384040090056, + "loss": 2.6364, + "step": 21693 + }, + { + "epoch": 0.6432998250452213, + "grad_norm": 0.09797698259353638, + "learning_rate": 0.0002875958086199545, + "loss": 2.627, + "step": 21694 + }, + { + "epoch": 0.6433294783975329, + "grad_norm": 0.09818895161151886, + "learning_rate": 0.0002875532151117789, + "loss": 2.6269, + "step": 21695 + }, + { + "epoch": 0.6433591317498443, + "grad_norm": 0.10322540998458862, + "learning_rate": 0.000287510623484856, + "loss": 2.633, + "step": 21696 + }, + { + "epoch": 0.6433887851021558, + "grad_norm": 0.09551876038312912, + "learning_rate": 0.0002874680337395631, + "loss": 2.6441, + "step": 21697 + }, + { + "epoch": 0.6434184384544672, + "grad_norm": 0.09905403852462769, + "learning_rate": 0.0002874254458762772, + "loss": 2.6226, + "step": 21698 + }, + { + "epoch": 0.6434480918067788, + "grad_norm": 0.0967528223991394, + "learning_rate": 0.0002873828598953755, + "loss": 2.6047, + "step": 21699 + }, + { + "epoch": 0.6434777451590903, + "grad_norm": 0.10391148179769516, + "learning_rate": 0.0002873402757972351, + "loss": 2.6234, + "step": 21700 + }, + { + "epoch": 0.6435073985114017, + "grad_norm": 0.1018296331167221, + "learning_rate": 0.000287297693582233, + "loss": 2.6123, + "step": 21701 + }, + { + "epoch": 0.6435370518637132, + "grad_norm": 0.10540783405303955, + "learning_rate": 0.00028725511325074633, + "loss": 2.6032, + "step": 21702 + }, + { + "epoch": 0.6435667052160247, + "grad_norm": 0.10739781707525253, + "learning_rate": 0.00028721253480315214, + "loss": 2.6232, + "step": 21703 + }, + { + "epoch": 0.6435963585683362, + "grad_norm": 0.09878551959991455, + "learning_rate": 0.0002871699582398275, + "loss": 2.6015, + "step": 21704 + }, + { + "epoch": 0.6436260119206476, + "grad_norm": 0.1055942177772522, + "learning_rate": 0.00028712738356114967, + "loss": 2.6126, + "step": 21705 + }, + { + "epoch": 0.6436556652729591, + "grad_norm": 0.10656784474849701, + "learning_rate": 0.0002870848107674949, + "loss": 2.6203, + "step": 21706 + }, + { + "epoch": 0.6436853186252706, + "grad_norm": 0.10647747665643692, + "learning_rate": 0.0002870422398592409, + "loss": 2.6756, + "step": 21707 + }, + { + "epoch": 0.6437149719775821, + "grad_norm": 0.11371499300003052, + "learning_rate": 0.00028699967083676447, + "loss": 2.6326, + "step": 21708 + }, + { + "epoch": 0.6437446253298935, + "grad_norm": 0.12628217041492462, + "learning_rate": 0.0002869571037004425, + "loss": 2.6232, + "step": 21709 + }, + { + "epoch": 0.643774278682205, + "grad_norm": 0.10617125034332275, + "learning_rate": 0.0002869145384506519, + "loss": 2.6576, + "step": 21710 + }, + { + "epoch": 0.6438039320345165, + "grad_norm": 0.11565770208835602, + "learning_rate": 0.00028687197508776965, + "loss": 2.6429, + "step": 21711 + }, + { + "epoch": 0.643833585386828, + "grad_norm": 0.11799998581409454, + "learning_rate": 0.00028682941361217267, + "loss": 2.657, + "step": 21712 + }, + { + "epoch": 0.6438632387391394, + "grad_norm": 0.14619652926921844, + "learning_rate": 0.0002867868540242378, + "loss": 2.6516, + "step": 21713 + }, + { + "epoch": 0.643892892091451, + "grad_norm": 0.1340242326259613, + "learning_rate": 0.000286744296324342, + "loss": 2.6578, + "step": 21714 + }, + { + "epoch": 0.6439225454437624, + "grad_norm": 0.12123499810695648, + "learning_rate": 0.0002867017405128622, + "loss": 2.6231, + "step": 21715 + }, + { + "epoch": 0.6439521987960739, + "grad_norm": 0.11060654371976852, + "learning_rate": 0.0002866591865901749, + "loss": 2.6486, + "step": 21716 + }, + { + "epoch": 0.6439818521483853, + "grad_norm": 0.11853693425655365, + "learning_rate": 0.0002866166345566571, + "loss": 2.6522, + "step": 21717 + }, + { + "epoch": 0.6440115055006969, + "grad_norm": 0.10397320985794067, + "learning_rate": 0.00028657408441268566, + "loss": 2.6428, + "step": 21718 + }, + { + "epoch": 0.6440411588530083, + "grad_norm": 0.1274043619632721, + "learning_rate": 0.00028653153615863715, + "loss": 2.6195, + "step": 21719 + }, + { + "epoch": 0.6440708122053198, + "grad_norm": 0.10996291041374207, + "learning_rate": 0.0002864889897948887, + "loss": 2.6373, + "step": 21720 + }, + { + "epoch": 0.6441004655576313, + "grad_norm": 0.10206180810928345, + "learning_rate": 0.0002864464453218169, + "loss": 2.5926, + "step": 21721 + }, + { + "epoch": 0.6441301189099428, + "grad_norm": 0.10517461597919464, + "learning_rate": 0.00028640390273979857, + "loss": 2.6385, + "step": 21722 + }, + { + "epoch": 0.6441597722622543, + "grad_norm": 0.10876494646072388, + "learning_rate": 0.0002863613620492102, + "loss": 2.6128, + "step": 21723 + }, + { + "epoch": 0.6441894256145657, + "grad_norm": 0.10323277860879898, + "learning_rate": 0.0002863188232504287, + "loss": 2.6268, + "step": 21724 + }, + { + "epoch": 0.6442190789668772, + "grad_norm": 0.11434199661016464, + "learning_rate": 0.0002862762863438307, + "loss": 2.6326, + "step": 21725 + }, + { + "epoch": 0.6442487323191887, + "grad_norm": 0.10741066932678223, + "learning_rate": 0.00028623375132979313, + "loss": 2.6181, + "step": 21726 + }, + { + "epoch": 0.6442783856715002, + "grad_norm": 0.11534087359905243, + "learning_rate": 0.0002861912182086921, + "loss": 2.6532, + "step": 21727 + }, + { + "epoch": 0.6443080390238116, + "grad_norm": 0.11366825550794601, + "learning_rate": 0.0002861486869809046, + "loss": 2.6069, + "step": 21728 + }, + { + "epoch": 0.6443376923761232, + "grad_norm": 0.10004974156618118, + "learning_rate": 0.00028610615764680707, + "loss": 2.6489, + "step": 21729 + }, + { + "epoch": 0.6443673457284346, + "grad_norm": 0.1134122982621193, + "learning_rate": 0.0002860636302067763, + "loss": 2.5948, + "step": 21730 + }, + { + "epoch": 0.6443969990807461, + "grad_norm": 0.10161204636096954, + "learning_rate": 0.0002860211046611888, + "loss": 2.6214, + "step": 21731 + }, + { + "epoch": 0.6444266524330575, + "grad_norm": 0.10559100657701492, + "learning_rate": 0.000285978581010421, + "loss": 2.6312, + "step": 21732 + }, + { + "epoch": 0.644456305785369, + "grad_norm": 0.11341841518878937, + "learning_rate": 0.00028593605925484976, + "loss": 2.646, + "step": 21733 + }, + { + "epoch": 0.6444859591376805, + "grad_norm": 0.10904448479413986, + "learning_rate": 0.0002858935393948514, + "loss": 2.6354, + "step": 21734 + }, + { + "epoch": 0.644515612489992, + "grad_norm": 0.10522547364234924, + "learning_rate": 0.00028585102143080255, + "loss": 2.6309, + "step": 21735 + }, + { + "epoch": 0.6445452658423034, + "grad_norm": 0.09850950539112091, + "learning_rate": 0.0002858085053630799, + "loss": 2.6337, + "step": 21736 + }, + { + "epoch": 0.644574919194615, + "grad_norm": 0.11836546659469604, + "learning_rate": 0.00028576599119205947, + "loss": 2.6153, + "step": 21737 + }, + { + "epoch": 0.6446045725469264, + "grad_norm": 0.10097327828407288, + "learning_rate": 0.000285723478918118, + "loss": 2.6105, + "step": 21738 + }, + { + "epoch": 0.6446342258992379, + "grad_norm": 0.11405139416456223, + "learning_rate": 0.00028568096854163184, + "loss": 2.637, + "step": 21739 + }, + { + "epoch": 0.6446638792515494, + "grad_norm": 0.09808817505836487, + "learning_rate": 0.0002856384600629775, + "loss": 2.644, + "step": 21740 + }, + { + "epoch": 0.6446935326038609, + "grad_norm": 0.11293499171733856, + "learning_rate": 0.0002855959534825314, + "loss": 2.6263, + "step": 21741 + }, + { + "epoch": 0.6447231859561724, + "grad_norm": 0.10269691050052643, + "learning_rate": 0.00028555344880067, + "loss": 2.6933, + "step": 21742 + }, + { + "epoch": 0.6447528393084838, + "grad_norm": 0.11326435953378677, + "learning_rate": 0.0002855109460177695, + "loss": 2.6186, + "step": 21743 + }, + { + "epoch": 0.6447824926607953, + "grad_norm": 0.10341937839984894, + "learning_rate": 0.00028546844513420644, + "loss": 2.5685, + "step": 21744 + }, + { + "epoch": 0.6448121460131068, + "grad_norm": 0.11959049850702286, + "learning_rate": 0.000285425946150357, + "loss": 2.615, + "step": 21745 + }, + { + "epoch": 0.6448417993654183, + "grad_norm": 0.12103047221899033, + "learning_rate": 0.00028538344906659777, + "loss": 2.6303, + "step": 21746 + }, + { + "epoch": 0.6448714527177297, + "grad_norm": 0.10489081591367722, + "learning_rate": 0.00028534095388330506, + "loss": 2.6277, + "step": 21747 + }, + { + "epoch": 0.6449011060700413, + "grad_norm": 0.11369411647319794, + "learning_rate": 0.00028529846060085495, + "loss": 2.6297, + "step": 21748 + }, + { + "epoch": 0.6449307594223527, + "grad_norm": 0.10662080347537994, + "learning_rate": 0.0002852559692196238, + "loss": 2.6096, + "step": 21749 + }, + { + "epoch": 0.6449604127746642, + "grad_norm": 0.11074643582105637, + "learning_rate": 0.0002852134797399879, + "loss": 2.5956, + "step": 21750 + }, + { + "epoch": 0.6449900661269756, + "grad_norm": 0.11457806825637817, + "learning_rate": 0.00028517099216232355, + "loss": 2.6223, + "step": 21751 + }, + { + "epoch": 0.6450197194792872, + "grad_norm": 0.11572924256324768, + "learning_rate": 0.00028512850648700685, + "loss": 2.649, + "step": 21752 + }, + { + "epoch": 0.6450493728315986, + "grad_norm": 0.1220962256193161, + "learning_rate": 0.0002850860227144142, + "loss": 2.6175, + "step": 21753 + }, + { + "epoch": 0.6450790261839101, + "grad_norm": 0.12357122451066971, + "learning_rate": 0.0002850435408449216, + "loss": 2.6361, + "step": 21754 + }, + { + "epoch": 0.6451086795362215, + "grad_norm": 0.10961172729730606, + "learning_rate": 0.00028500106087890544, + "loss": 2.6546, + "step": 21755 + }, + { + "epoch": 0.6451383328885331, + "grad_norm": 0.11476446688175201, + "learning_rate": 0.00028495858281674173, + "loss": 2.6234, + "step": 21756 + }, + { + "epoch": 0.6451679862408445, + "grad_norm": 0.12911385297775269, + "learning_rate": 0.00028491610665880675, + "loss": 2.6371, + "step": 21757 + }, + { + "epoch": 0.645197639593156, + "grad_norm": 0.106878861784935, + "learning_rate": 0.0002848736324054765, + "loss": 2.6441, + "step": 21758 + }, + { + "epoch": 0.6452272929454674, + "grad_norm": 0.1019650399684906, + "learning_rate": 0.00028483116005712715, + "loss": 2.6231, + "step": 21759 + }, + { + "epoch": 0.645256946297779, + "grad_norm": 0.11008822917938232, + "learning_rate": 0.00028478868961413485, + "loss": 2.6434, + "step": 21760 + }, + { + "epoch": 0.6452865996500905, + "grad_norm": 0.11587056517601013, + "learning_rate": 0.00028474622107687567, + "loss": 2.6685, + "step": 21761 + }, + { + "epoch": 0.6453162530024019, + "grad_norm": 0.10563363879919052, + "learning_rate": 0.0002847037544457256, + "loss": 2.6479, + "step": 21762 + }, + { + "epoch": 0.6453459063547134, + "grad_norm": 0.11365126818418503, + "learning_rate": 0.00028466128972106075, + "loss": 2.6221, + "step": 21763 + }, + { + "epoch": 0.6453755597070249, + "grad_norm": 0.10581882297992706, + "learning_rate": 0.0002846188269032571, + "loss": 2.6233, + "step": 21764 + }, + { + "epoch": 0.6454052130593364, + "grad_norm": 0.10659067332744598, + "learning_rate": 0.00028457636599269083, + "loss": 2.6394, + "step": 21765 + }, + { + "epoch": 0.6454348664116478, + "grad_norm": 0.11251431703567505, + "learning_rate": 0.00028453390698973766, + "loss": 2.6323, + "step": 21766 + }, + { + "epoch": 0.6454645197639594, + "grad_norm": 0.10228336602449417, + "learning_rate": 0.0002844914498947739, + "loss": 2.6414, + "step": 21767 + }, + { + "epoch": 0.6454941731162708, + "grad_norm": 0.10675349086523056, + "learning_rate": 0.00028444899470817523, + "loss": 2.6081, + "step": 21768 + }, + { + "epoch": 0.6455238264685823, + "grad_norm": 0.10690370947122574, + "learning_rate": 0.00028440654143031775, + "loss": 2.6194, + "step": 21769 + }, + { + "epoch": 0.6455534798208937, + "grad_norm": 0.10924656689167023, + "learning_rate": 0.0002843640900615775, + "loss": 2.6376, + "step": 21770 + }, + { + "epoch": 0.6455831331732053, + "grad_norm": 0.09754204750061035, + "learning_rate": 0.0002843216406023299, + "loss": 2.6354, + "step": 21771 + }, + { + "epoch": 0.6456127865255167, + "grad_norm": 0.11593006551265717, + "learning_rate": 0.00028427919305295136, + "loss": 2.6054, + "step": 21772 + }, + { + "epoch": 0.6456424398778282, + "grad_norm": 0.10278681665658951, + "learning_rate": 0.0002842367474138176, + "loss": 2.6314, + "step": 21773 + }, + { + "epoch": 0.6456720932301396, + "grad_norm": 0.1115659847855568, + "learning_rate": 0.00028419430368530453, + "loss": 2.6694, + "step": 21774 + }, + { + "epoch": 0.6457017465824512, + "grad_norm": 0.10686910897493362, + "learning_rate": 0.00028415186186778787, + "loss": 2.6261, + "step": 21775 + }, + { + "epoch": 0.6457313999347626, + "grad_norm": 0.09910380095243454, + "learning_rate": 0.0002841094219616436, + "loss": 2.6183, + "step": 21776 + }, + { + "epoch": 0.6457610532870741, + "grad_norm": 0.1158166453242302, + "learning_rate": 0.0002840669839672474, + "loss": 2.6164, + "step": 21777 + }, + { + "epoch": 0.6457907066393855, + "grad_norm": 0.10797027498483658, + "learning_rate": 0.0002840245478849751, + "loss": 2.6128, + "step": 21778 + }, + { + "epoch": 0.6458203599916971, + "grad_norm": 0.26345059275627136, + "learning_rate": 0.00028398211371520257, + "loss": 2.6347, + "step": 21779 + }, + { + "epoch": 0.6458500133440085, + "grad_norm": 0.09462115168571472, + "learning_rate": 0.0002839396814583054, + "loss": 2.5906, + "step": 21780 + }, + { + "epoch": 0.64587966669632, + "grad_norm": 0.0981881245970726, + "learning_rate": 0.0002838972511146597, + "loss": 2.6617, + "step": 21781 + }, + { + "epoch": 0.6459093200486316, + "grad_norm": 0.11241548508405685, + "learning_rate": 0.0002838548226846407, + "loss": 2.6415, + "step": 21782 + }, + { + "epoch": 0.645938973400943, + "grad_norm": 0.10125112533569336, + "learning_rate": 0.0002838123961686244, + "loss": 2.5935, + "step": 21783 + }, + { + "epoch": 0.6459686267532545, + "grad_norm": 0.11073155701160431, + "learning_rate": 0.0002837699715669863, + "loss": 2.6182, + "step": 21784 + }, + { + "epoch": 0.6459982801055659, + "grad_norm": 0.09844481945037842, + "learning_rate": 0.00028372754888010223, + "loss": 2.618, + "step": 21785 + }, + { + "epoch": 0.6460279334578775, + "grad_norm": 0.12186141312122345, + "learning_rate": 0.000283685128108348, + "loss": 2.628, + "step": 21786 + }, + { + "epoch": 0.6460575868101889, + "grad_norm": 0.1087428405880928, + "learning_rate": 0.000283642709252099, + "loss": 2.6322, + "step": 21787 + }, + { + "epoch": 0.6460872401625004, + "grad_norm": 0.10114426165819168, + "learning_rate": 0.00028360029231173104, + "loss": 2.6431, + "step": 21788 + }, + { + "epoch": 0.6461168935148118, + "grad_norm": 0.11841584742069244, + "learning_rate": 0.0002835578772876195, + "loss": 2.6473, + "step": 21789 + }, + { + "epoch": 0.6461465468671234, + "grad_norm": 0.1124659851193428, + "learning_rate": 0.0002835154641801402, + "loss": 2.624, + "step": 21790 + }, + { + "epoch": 0.6461762002194348, + "grad_norm": 0.1066618412733078, + "learning_rate": 0.0002834730529896688, + "loss": 2.6556, + "step": 21791 + }, + { + "epoch": 0.6462058535717463, + "grad_norm": 0.11114969104528427, + "learning_rate": 0.0002834306437165805, + "loss": 2.6287, + "step": 21792 + }, + { + "epoch": 0.6462355069240577, + "grad_norm": 0.10352766513824463, + "learning_rate": 0.00028338823636125097, + "loss": 2.6547, + "step": 21793 + }, + { + "epoch": 0.6462651602763693, + "grad_norm": 0.10731726884841919, + "learning_rate": 0.00028334583092405586, + "loss": 2.616, + "step": 21794 + }, + { + "epoch": 0.6462948136286807, + "grad_norm": 0.11210747808218002, + "learning_rate": 0.00028330342740537064, + "loss": 2.6337, + "step": 21795 + }, + { + "epoch": 0.6463244669809922, + "grad_norm": 0.10621435940265656, + "learning_rate": 0.00028326102580557066, + "loss": 2.6562, + "step": 21796 + }, + { + "epoch": 0.6463541203333036, + "grad_norm": 0.11310777068138123, + "learning_rate": 0.00028321862612503134, + "loss": 2.6485, + "step": 21797 + }, + { + "epoch": 0.6463837736856152, + "grad_norm": 0.12050595134496689, + "learning_rate": 0.0002831762283641285, + "loss": 2.6488, + "step": 21798 + }, + { + "epoch": 0.6464134270379266, + "grad_norm": 0.11226014047861099, + "learning_rate": 0.0002831338325232374, + "loss": 2.6349, + "step": 21799 + }, + { + "epoch": 0.6464430803902381, + "grad_norm": 0.10864359885454178, + "learning_rate": 0.00028309143860273336, + "loss": 2.6155, + "step": 21800 + }, + { + "epoch": 0.6464727337425495, + "grad_norm": 0.11920111626386642, + "learning_rate": 0.0002830490466029919, + "loss": 2.6182, + "step": 21801 + }, + { + "epoch": 0.6465023870948611, + "grad_norm": 0.1079922690987587, + "learning_rate": 0.0002830066565243886, + "loss": 2.6058, + "step": 21802 + }, + { + "epoch": 0.6465320404471726, + "grad_norm": 0.1035652607679367, + "learning_rate": 0.0002829642683672984, + "loss": 2.6651, + "step": 21803 + }, + { + "epoch": 0.646561693799484, + "grad_norm": 0.11936552077531815, + "learning_rate": 0.00028292188213209686, + "loss": 2.6301, + "step": 21804 + }, + { + "epoch": 0.6465913471517956, + "grad_norm": 0.11571666598320007, + "learning_rate": 0.0002828794978191593, + "loss": 2.6122, + "step": 21805 + }, + { + "epoch": 0.646621000504107, + "grad_norm": 0.14276602864265442, + "learning_rate": 0.00028283711542886105, + "loss": 2.6536, + "step": 21806 + }, + { + "epoch": 0.6466506538564185, + "grad_norm": 0.12673093378543854, + "learning_rate": 0.00028279473496157744, + "loss": 2.6551, + "step": 21807 + }, + { + "epoch": 0.6466803072087299, + "grad_norm": 0.11533138900995255, + "learning_rate": 0.0002827523564176837, + "loss": 2.647, + "step": 21808 + }, + { + "epoch": 0.6467099605610415, + "grad_norm": 0.11942082643508911, + "learning_rate": 0.0002827099797975551, + "loss": 2.6167, + "step": 21809 + }, + { + "epoch": 0.6467396139133529, + "grad_norm": 0.11613532900810242, + "learning_rate": 0.00028266760510156684, + "loss": 2.6468, + "step": 21810 + }, + { + "epoch": 0.6467692672656644, + "grad_norm": 0.10237013548612595, + "learning_rate": 0.00028262523233009434, + "loss": 2.6388, + "step": 21811 + }, + { + "epoch": 0.6467989206179758, + "grad_norm": 0.11127255856990814, + "learning_rate": 0.00028258286148351297, + "loss": 2.634, + "step": 21812 + }, + { + "epoch": 0.6468285739702874, + "grad_norm": 0.12530399858951569, + "learning_rate": 0.0002825404925621975, + "loss": 2.6675, + "step": 21813 + }, + { + "epoch": 0.6468582273225988, + "grad_norm": 0.10972747206687927, + "learning_rate": 0.0002824981255665232, + "loss": 2.6441, + "step": 21814 + }, + { + "epoch": 0.6468878806749103, + "grad_norm": 0.1172630712389946, + "learning_rate": 0.00028245576049686544, + "loss": 2.6248, + "step": 21815 + }, + { + "epoch": 0.6469175340272217, + "grad_norm": 0.11943323165178299, + "learning_rate": 0.0002824133973535992, + "loss": 2.6383, + "step": 21816 + }, + { + "epoch": 0.6469471873795333, + "grad_norm": 0.11359713971614838, + "learning_rate": 0.00028237103613709967, + "loss": 2.6395, + "step": 21817 + }, + { + "epoch": 0.6469768407318447, + "grad_norm": 0.12341202795505524, + "learning_rate": 0.00028232867684774206, + "loss": 2.6463, + "step": 21818 + }, + { + "epoch": 0.6470064940841562, + "grad_norm": 0.0941399410367012, + "learning_rate": 0.0002822863194859013, + "loss": 2.6324, + "step": 21819 + }, + { + "epoch": 0.6470361474364676, + "grad_norm": 0.10691376775503159, + "learning_rate": 0.0002822439640519525, + "loss": 2.6531, + "step": 21820 + }, + { + "epoch": 0.6470658007887792, + "grad_norm": 0.11922217905521393, + "learning_rate": 0.0002822016105462709, + "loss": 2.6429, + "step": 21821 + }, + { + "epoch": 0.6470954541410906, + "grad_norm": 0.10553941875696182, + "learning_rate": 0.00028215925896923135, + "loss": 2.6536, + "step": 21822 + }, + { + "epoch": 0.6471251074934021, + "grad_norm": 0.11526338011026382, + "learning_rate": 0.000282116909321209, + "loss": 2.5982, + "step": 21823 + }, + { + "epoch": 0.6471547608457137, + "grad_norm": 0.12899141013622284, + "learning_rate": 0.00028207456160257873, + "loss": 2.6255, + "step": 21824 + }, + { + "epoch": 0.6471844141980251, + "grad_norm": 0.1072189211845398, + "learning_rate": 0.00028203221581371573, + "loss": 2.6322, + "step": 21825 + }, + { + "epoch": 0.6472140675503366, + "grad_norm": 0.1087678074836731, + "learning_rate": 0.0002819898719549948, + "loss": 2.6188, + "step": 21826 + }, + { + "epoch": 0.647243720902648, + "grad_norm": 0.10660551488399506, + "learning_rate": 0.00028194753002679096, + "loss": 2.6307, + "step": 21827 + }, + { + "epoch": 0.6472733742549596, + "grad_norm": 0.1211928203701973, + "learning_rate": 0.00028190519002947923, + "loss": 2.6663, + "step": 21828 + }, + { + "epoch": 0.647303027607271, + "grad_norm": 0.11152777075767517, + "learning_rate": 0.00028186285196343444, + "loss": 2.6353, + "step": 21829 + }, + { + "epoch": 0.6473326809595825, + "grad_norm": 0.10267218202352524, + "learning_rate": 0.00028182051582903157, + "loss": 2.6495, + "step": 21830 + }, + { + "epoch": 0.6473623343118939, + "grad_norm": 0.10598800331354141, + "learning_rate": 0.00028177818162664547, + "loss": 2.6428, + "step": 21831 + }, + { + "epoch": 0.6473919876642055, + "grad_norm": 0.11232806742191315, + "learning_rate": 0.00028173584935665096, + "loss": 2.6117, + "step": 21832 + }, + { + "epoch": 0.6474216410165169, + "grad_norm": 0.10727750509977341, + "learning_rate": 0.0002816935190194231, + "loss": 2.6137, + "step": 21833 + }, + { + "epoch": 0.6474512943688284, + "grad_norm": 0.09957412630319595, + "learning_rate": 0.0002816511906153365, + "loss": 2.5817, + "step": 21834 + }, + { + "epoch": 0.6474809477211398, + "grad_norm": 0.11317924410104752, + "learning_rate": 0.00028160886414476614, + "loss": 2.6103, + "step": 21835 + }, + { + "epoch": 0.6475106010734514, + "grad_norm": 0.10320964455604553, + "learning_rate": 0.00028156653960808675, + "loss": 2.6423, + "step": 21836 + }, + { + "epoch": 0.6475402544257628, + "grad_norm": 0.10033617913722992, + "learning_rate": 0.0002815242170056731, + "loss": 2.6385, + "step": 21837 + }, + { + "epoch": 0.6475699077780743, + "grad_norm": 0.10212157666683197, + "learning_rate": 0.0002814818963379001, + "loss": 2.6468, + "step": 21838 + }, + { + "epoch": 0.6475995611303857, + "grad_norm": 0.10762922465801239, + "learning_rate": 0.0002814395776051424, + "loss": 2.5994, + "step": 21839 + }, + { + "epoch": 0.6476292144826973, + "grad_norm": 0.10779178887605667, + "learning_rate": 0.0002813972608077747, + "loss": 2.5976, + "step": 21840 + }, + { + "epoch": 0.6476588678350087, + "grad_norm": 0.10110412538051605, + "learning_rate": 0.00028135494594617186, + "loss": 2.6507, + "step": 21841 + }, + { + "epoch": 0.6476885211873202, + "grad_norm": 0.10550492256879807, + "learning_rate": 0.0002813126330207084, + "loss": 2.598, + "step": 21842 + }, + { + "epoch": 0.6477181745396317, + "grad_norm": 0.11737817525863647, + "learning_rate": 0.0002812703220317592, + "loss": 2.639, + "step": 21843 + }, + { + "epoch": 0.6477478278919432, + "grad_norm": 0.1086505576968193, + "learning_rate": 0.0002812280129796988, + "loss": 2.6468, + "step": 21844 + }, + { + "epoch": 0.6477774812442547, + "grad_norm": 0.09835057705640793, + "learning_rate": 0.0002811857058649019, + "loss": 2.5874, + "step": 21845 + }, + { + "epoch": 0.6478071345965661, + "grad_norm": 0.10713229328393936, + "learning_rate": 0.00028114340068774316, + "loss": 2.6187, + "step": 21846 + }, + { + "epoch": 0.6478367879488777, + "grad_norm": 0.11674968153238297, + "learning_rate": 0.0002811010974485973, + "loss": 2.6003, + "step": 21847 + }, + { + "epoch": 0.6478664413011891, + "grad_norm": 0.09898138046264648, + "learning_rate": 0.00028105879614783846, + "loss": 2.6304, + "step": 21848 + }, + { + "epoch": 0.6478960946535006, + "grad_norm": 0.11988871544599533, + "learning_rate": 0.0002810164967858417, + "loss": 2.6068, + "step": 21849 + }, + { + "epoch": 0.647925748005812, + "grad_norm": 0.10413096845149994, + "learning_rate": 0.0002809741993629815, + "loss": 2.6505, + "step": 21850 + }, + { + "epoch": 0.6479554013581236, + "grad_norm": 0.10497291386127472, + "learning_rate": 0.0002809319038796324, + "loss": 2.6117, + "step": 21851 + }, + { + "epoch": 0.647985054710435, + "grad_norm": 0.10696188360452652, + "learning_rate": 0.0002808896103361688, + "loss": 2.647, + "step": 21852 + }, + { + "epoch": 0.6480147080627465, + "grad_norm": 0.09442870318889618, + "learning_rate": 0.0002808473187329654, + "loss": 2.6247, + "step": 21853 + }, + { + "epoch": 0.648044361415058, + "grad_norm": 0.10328851640224457, + "learning_rate": 0.0002808050290703965, + "loss": 2.6235, + "step": 21854 + }, + { + "epoch": 0.6480740147673695, + "grad_norm": 0.09834962338209152, + "learning_rate": 0.0002807627413488368, + "loss": 2.6349, + "step": 21855 + }, + { + "epoch": 0.6481036681196809, + "grad_norm": 0.09521468728780746, + "learning_rate": 0.0002807204555686606, + "loss": 2.6053, + "step": 21856 + }, + { + "epoch": 0.6481333214719924, + "grad_norm": 0.10959196835756302, + "learning_rate": 0.00028067817173024263, + "loss": 2.5999, + "step": 21857 + }, + { + "epoch": 0.6481629748243039, + "grad_norm": 0.11310440301895142, + "learning_rate": 0.0002806358898339569, + "loss": 2.5912, + "step": 21858 + }, + { + "epoch": 0.6481926281766154, + "grad_norm": 0.0967981219291687, + "learning_rate": 0.00028059360988017803, + "loss": 2.6242, + "step": 21859 + }, + { + "epoch": 0.6482222815289268, + "grad_norm": 0.10813767462968826, + "learning_rate": 0.00028055133186928035, + "loss": 2.615, + "step": 21860 + }, + { + "epoch": 0.6482519348812383, + "grad_norm": 0.11159328371286392, + "learning_rate": 0.0002805090558016381, + "loss": 2.633, + "step": 21861 + }, + { + "epoch": 0.6482815882335498, + "grad_norm": 0.11581621319055557, + "learning_rate": 0.00028046678167762605, + "loss": 2.6136, + "step": 21862 + }, + { + "epoch": 0.6483112415858613, + "grad_norm": 0.10772974789142609, + "learning_rate": 0.00028042450949761834, + "loss": 2.6335, + "step": 21863 + }, + { + "epoch": 0.6483408949381727, + "grad_norm": 0.10066988319158554, + "learning_rate": 0.00028038223926198937, + "loss": 2.6167, + "step": 21864 + }, + { + "epoch": 0.6483705482904842, + "grad_norm": 0.12151797115802765, + "learning_rate": 0.00028033997097111325, + "loss": 2.6378, + "step": 21865 + }, + { + "epoch": 0.6484002016427958, + "grad_norm": 0.10794135928153992, + "learning_rate": 0.0002802977046253644, + "loss": 2.6224, + "step": 21866 + }, + { + "epoch": 0.6484298549951072, + "grad_norm": 0.1198606938123703, + "learning_rate": 0.00028025544022511735, + "loss": 2.6098, + "step": 21867 + }, + { + "epoch": 0.6484595083474187, + "grad_norm": 0.11390335857868195, + "learning_rate": 0.00028021317777074585, + "loss": 2.6091, + "step": 21868 + }, + { + "epoch": 0.6484891616997301, + "grad_norm": 0.1405869573354721, + "learning_rate": 0.0002801709172626244, + "loss": 2.6356, + "step": 21869 + }, + { + "epoch": 0.6485188150520417, + "grad_norm": 0.11874620616436005, + "learning_rate": 0.0002801286587011272, + "loss": 2.6467, + "step": 21870 + }, + { + "epoch": 0.6485484684043531, + "grad_norm": 0.10152580589056015, + "learning_rate": 0.0002800864020866285, + "loss": 2.6106, + "step": 21871 + }, + { + "epoch": 0.6485781217566646, + "grad_norm": 0.10925827920436859, + "learning_rate": 0.0002800441474195024, + "loss": 2.634, + "step": 21872 + }, + { + "epoch": 0.648607775108976, + "grad_norm": 0.11950099468231201, + "learning_rate": 0.00028000189470012317, + "loss": 2.6046, + "step": 21873 + }, + { + "epoch": 0.6486374284612876, + "grad_norm": 0.11175152659416199, + "learning_rate": 0.00027995964392886475, + "loss": 2.6205, + "step": 21874 + }, + { + "epoch": 0.648667081813599, + "grad_norm": 0.10349087417125702, + "learning_rate": 0.0002799173951061016, + "loss": 2.5898, + "step": 21875 + }, + { + "epoch": 0.6486967351659105, + "grad_norm": 0.13219565153121948, + "learning_rate": 0.0002798751482322077, + "loss": 2.6029, + "step": 21876 + }, + { + "epoch": 0.648726388518222, + "grad_norm": 0.11864083260297775, + "learning_rate": 0.00027983290330755716, + "loss": 2.628, + "step": 21877 + }, + { + "epoch": 0.6487560418705335, + "grad_norm": 0.11026310175657272, + "learning_rate": 0.00027979066033252426, + "loss": 2.6376, + "step": 21878 + }, + { + "epoch": 0.6487856952228449, + "grad_norm": 0.1092960461974144, + "learning_rate": 0.00027974841930748263, + "loss": 2.635, + "step": 21879 + }, + { + "epoch": 0.6488153485751564, + "grad_norm": 0.11260266602039337, + "learning_rate": 0.0002797061802328066, + "loss": 2.6416, + "step": 21880 + }, + { + "epoch": 0.6488450019274679, + "grad_norm": 0.10504364222288132, + "learning_rate": 0.0002796639431088701, + "loss": 2.6209, + "step": 21881 + }, + { + "epoch": 0.6488746552797794, + "grad_norm": 0.11649896949529648, + "learning_rate": 0.00027962170793604727, + "loss": 2.6341, + "step": 21882 + }, + { + "epoch": 0.6489043086320908, + "grad_norm": 0.09263014793395996, + "learning_rate": 0.000279579474714712, + "loss": 2.6033, + "step": 21883 + }, + { + "epoch": 0.6489339619844023, + "grad_norm": 0.13086166977882385, + "learning_rate": 0.00027953724344523836, + "loss": 2.6159, + "step": 21884 + }, + { + "epoch": 0.6489636153367138, + "grad_norm": 0.12648168206214905, + "learning_rate": 0.0002794950141280002, + "loss": 2.6502, + "step": 21885 + }, + { + "epoch": 0.6489932686890253, + "grad_norm": 0.10961397737264633, + "learning_rate": 0.0002794527867633716, + "loss": 2.6218, + "step": 21886 + }, + { + "epoch": 0.6490229220413368, + "grad_norm": 0.11537455022335052, + "learning_rate": 0.0002794105613517262, + "loss": 2.654, + "step": 21887 + }, + { + "epoch": 0.6490525753936482, + "grad_norm": 0.09996268898248672, + "learning_rate": 0.00027936833789343854, + "loss": 2.6139, + "step": 21888 + }, + { + "epoch": 0.6490822287459598, + "grad_norm": 0.11218487471342087, + "learning_rate": 0.0002793261163888819, + "loss": 2.6169, + "step": 21889 + }, + { + "epoch": 0.6491118820982712, + "grad_norm": 0.10747197270393372, + "learning_rate": 0.00027928389683843047, + "loss": 2.635, + "step": 21890 + }, + { + "epoch": 0.6491415354505827, + "grad_norm": 0.11014463007450104, + "learning_rate": 0.0002792416792424579, + "loss": 2.6161, + "step": 21891 + }, + { + "epoch": 0.6491711888028942, + "grad_norm": 0.10751365125179291, + "learning_rate": 0.0002791994636013382, + "loss": 2.6282, + "step": 21892 + }, + { + "epoch": 0.6492008421552057, + "grad_norm": 0.10253164172172546, + "learning_rate": 0.0002791572499154452, + "loss": 2.652, + "step": 21893 + }, + { + "epoch": 0.6492304955075171, + "grad_norm": 0.10391119122505188, + "learning_rate": 0.0002791150381851526, + "loss": 2.6461, + "step": 21894 + }, + { + "epoch": 0.6492601488598286, + "grad_norm": 0.09821870923042297, + "learning_rate": 0.00027907282841083423, + "loss": 2.6163, + "step": 21895 + }, + { + "epoch": 0.64928980221214, + "grad_norm": 0.09996780008077621, + "learning_rate": 0.000279030620592864, + "loss": 2.6407, + "step": 21896 + }, + { + "epoch": 0.6493194555644516, + "grad_norm": 0.09607227146625519, + "learning_rate": 0.0002789884147316154, + "loss": 2.6482, + "step": 21897 + }, + { + "epoch": 0.649349108916763, + "grad_norm": 0.10616890341043472, + "learning_rate": 0.0002789462108274624, + "loss": 2.6424, + "step": 21898 + }, + { + "epoch": 0.6493787622690745, + "grad_norm": 0.10597572475671768, + "learning_rate": 0.0002789040088807787, + "loss": 2.6199, + "step": 21899 + }, + { + "epoch": 0.649408415621386, + "grad_norm": 0.09943751245737076, + "learning_rate": 0.0002788618088919379, + "loss": 2.6192, + "step": 21900 + }, + { + "epoch": 0.6494380689736975, + "grad_norm": 0.1020234003663063, + "learning_rate": 0.00027881961086131376, + "loss": 2.6425, + "step": 21901 + }, + { + "epoch": 0.6494677223260089, + "grad_norm": 0.10495056211948395, + "learning_rate": 0.00027877741478927987, + "loss": 2.6164, + "step": 21902 + }, + { + "epoch": 0.6494973756783204, + "grad_norm": 0.1040581613779068, + "learning_rate": 0.00027873522067621, + "loss": 2.6034, + "step": 21903 + }, + { + "epoch": 0.6495270290306319, + "grad_norm": 0.10796327143907547, + "learning_rate": 0.00027869302852247767, + "loss": 2.6376, + "step": 21904 + }, + { + "epoch": 0.6495566823829434, + "grad_norm": 0.11264589428901672, + "learning_rate": 0.0002786508383284566, + "loss": 2.6073, + "step": 21905 + }, + { + "epoch": 0.6495863357352548, + "grad_norm": 0.11891188472509384, + "learning_rate": 0.0002786086500945204, + "loss": 2.6347, + "step": 21906 + }, + { + "epoch": 0.6496159890875663, + "grad_norm": 0.11494255810976028, + "learning_rate": 0.00027856646382104256, + "loss": 2.6339, + "step": 21907 + }, + { + "epoch": 0.6496456424398779, + "grad_norm": 0.13058237731456757, + "learning_rate": 0.0002785242795083967, + "loss": 2.6555, + "step": 21908 + }, + { + "epoch": 0.6496752957921893, + "grad_norm": 0.10165627300739288, + "learning_rate": 0.0002784820971569564, + "loss": 2.6177, + "step": 21909 + }, + { + "epoch": 0.6497049491445008, + "grad_norm": 0.10490110516548157, + "learning_rate": 0.0002784399167670951, + "loss": 2.6212, + "step": 21910 + }, + { + "epoch": 0.6497346024968123, + "grad_norm": 0.11033762991428375, + "learning_rate": 0.00027839773833918634, + "loss": 2.6012, + "step": 21911 + }, + { + "epoch": 0.6497642558491238, + "grad_norm": 0.117890864610672, + "learning_rate": 0.0002783555618736039, + "loss": 2.6478, + "step": 21912 + }, + { + "epoch": 0.6497939092014352, + "grad_norm": 0.1264106184244156, + "learning_rate": 0.00027831338737072055, + "loss": 2.6336, + "step": 21913 + }, + { + "epoch": 0.6498235625537467, + "grad_norm": 0.11664311587810516, + "learning_rate": 0.00027827121483091046, + "loss": 2.6013, + "step": 21914 + }, + { + "epoch": 0.6498532159060582, + "grad_norm": 0.09598451852798462, + "learning_rate": 0.00027822904425454676, + "loss": 2.6647, + "step": 21915 + }, + { + "epoch": 0.6498828692583697, + "grad_norm": 0.13748550415039062, + "learning_rate": 0.00027818687564200295, + "loss": 2.6677, + "step": 21916 + }, + { + "epoch": 0.6499125226106811, + "grad_norm": 0.1251915544271469, + "learning_rate": 0.0002781447089936525, + "loss": 2.6594, + "step": 21917 + }, + { + "epoch": 0.6499421759629926, + "grad_norm": 0.1012522354722023, + "learning_rate": 0.0002781025443098687, + "loss": 2.6309, + "step": 21918 + }, + { + "epoch": 0.6499718293153041, + "grad_norm": 0.13457538187503815, + "learning_rate": 0.000278060381591025, + "loss": 2.6606, + "step": 21919 + }, + { + "epoch": 0.6500014826676156, + "grad_norm": 0.12634187936782837, + "learning_rate": 0.0002780182208374946, + "loss": 2.6499, + "step": 21920 + }, + { + "epoch": 0.650031136019927, + "grad_norm": 0.11350177973508835, + "learning_rate": 0.00027797606204965104, + "loss": 2.6386, + "step": 21921 + }, + { + "epoch": 0.6500607893722385, + "grad_norm": 0.10763236880302429, + "learning_rate": 0.00027793390522786756, + "loss": 2.5898, + "step": 21922 + }, + { + "epoch": 0.65009044272455, + "grad_norm": 0.1136346235871315, + "learning_rate": 0.0002778917503725177, + "loss": 2.6376, + "step": 21923 + }, + { + "epoch": 0.6501200960768615, + "grad_norm": 0.11732622981071472, + "learning_rate": 0.0002778495974839742, + "loss": 2.6256, + "step": 21924 + }, + { + "epoch": 0.6501497494291729, + "grad_norm": 0.11170602589845657, + "learning_rate": 0.0002778074465626107, + "loss": 2.6399, + "step": 21925 + }, + { + "epoch": 0.6501794027814845, + "grad_norm": 0.09968357533216476, + "learning_rate": 0.0002777652976088002, + "loss": 2.6328, + "step": 21926 + }, + { + "epoch": 0.6502090561337959, + "grad_norm": 0.1040642112493515, + "learning_rate": 0.00027772315062291644, + "loss": 2.6157, + "step": 21927 + }, + { + "epoch": 0.6502387094861074, + "grad_norm": 0.11413182318210602, + "learning_rate": 0.00027768100560533217, + "loss": 2.6531, + "step": 21928 + }, + { + "epoch": 0.6502683628384189, + "grad_norm": 0.10083260387182236, + "learning_rate": 0.00027763886255642085, + "loss": 2.6117, + "step": 21929 + }, + { + "epoch": 0.6502980161907304, + "grad_norm": 0.10337553918361664, + "learning_rate": 0.00027759672147655554, + "loss": 2.6087, + "step": 21930 + }, + { + "epoch": 0.6503276695430419, + "grad_norm": 0.11189316213130951, + "learning_rate": 0.00027755458236610944, + "loss": 2.6299, + "step": 21931 + }, + { + "epoch": 0.6503573228953533, + "grad_norm": 0.09833136945962906, + "learning_rate": 0.00027751244522545573, + "loss": 2.624, + "step": 21932 + }, + { + "epoch": 0.6503869762476648, + "grad_norm": 0.10983458161354065, + "learning_rate": 0.00027747031005496763, + "loss": 2.6053, + "step": 21933 + }, + { + "epoch": 0.6504166295999763, + "grad_norm": 0.10359527915716171, + "learning_rate": 0.00027742817685501796, + "loss": 2.6231, + "step": 21934 + }, + { + "epoch": 0.6504462829522878, + "grad_norm": 0.09993386268615723, + "learning_rate": 0.00027738604562598, + "loss": 2.6397, + "step": 21935 + }, + { + "epoch": 0.6504759363045992, + "grad_norm": 0.0973360538482666, + "learning_rate": 0.00027734391636822684, + "loss": 2.6061, + "step": 21936 + }, + { + "epoch": 0.6505055896569107, + "grad_norm": 0.09631416946649551, + "learning_rate": 0.00027730178908213154, + "loss": 2.5983, + "step": 21937 + }, + { + "epoch": 0.6505352430092222, + "grad_norm": 0.10685750097036362, + "learning_rate": 0.0002772596637680671, + "loss": 2.6143, + "step": 21938 + }, + { + "epoch": 0.6505648963615337, + "grad_norm": 0.110801562666893, + "learning_rate": 0.00027721754042640633, + "loss": 2.629, + "step": 21939 + }, + { + "epoch": 0.6505945497138451, + "grad_norm": 0.10894159227609634, + "learning_rate": 0.0002771754190575227, + "loss": 2.6246, + "step": 21940 + }, + { + "epoch": 0.6506242030661566, + "grad_norm": 0.09300930798053741, + "learning_rate": 0.000277133299661789, + "loss": 2.5944, + "step": 21941 + }, + { + "epoch": 0.6506538564184681, + "grad_norm": 0.11321551352739334, + "learning_rate": 0.00027709118223957817, + "loss": 2.6214, + "step": 21942 + }, + { + "epoch": 0.6506835097707796, + "grad_norm": 0.11083293706178665, + "learning_rate": 0.0002770490667912634, + "loss": 2.5985, + "step": 21943 + }, + { + "epoch": 0.650713163123091, + "grad_norm": 0.11818330734968185, + "learning_rate": 0.00027700695331721716, + "loss": 2.6419, + "step": 21944 + }, + { + "epoch": 0.6507428164754026, + "grad_norm": 0.10136904567480087, + "learning_rate": 0.00027696484181781266, + "loss": 2.592, + "step": 21945 + }, + { + "epoch": 0.650772469827714, + "grad_norm": 0.12731465697288513, + "learning_rate": 0.0002769227322934228, + "loss": 2.6164, + "step": 21946 + }, + { + "epoch": 0.6508021231800255, + "grad_norm": 0.12361358106136322, + "learning_rate": 0.00027688062474442044, + "loss": 2.6464, + "step": 21947 + }, + { + "epoch": 0.6508317765323369, + "grad_norm": 0.110616035759449, + "learning_rate": 0.00027683851917117844, + "loss": 2.6328, + "step": 21948 + }, + { + "epoch": 0.6508614298846485, + "grad_norm": 0.10938653349876404, + "learning_rate": 0.0002767964155740695, + "loss": 2.5909, + "step": 21949 + }, + { + "epoch": 0.65089108323696, + "grad_norm": 0.12186741083860397, + "learning_rate": 0.00027675431395346683, + "loss": 2.6409, + "step": 21950 + }, + { + "epoch": 0.6509207365892714, + "grad_norm": 0.12169703096151352, + "learning_rate": 0.00027671221430974294, + "loss": 2.6194, + "step": 21951 + }, + { + "epoch": 0.6509503899415829, + "grad_norm": 0.10275018215179443, + "learning_rate": 0.00027667011664327053, + "loss": 2.5993, + "step": 21952 + }, + { + "epoch": 0.6509800432938944, + "grad_norm": 0.12353866547346115, + "learning_rate": 0.0002766280209544227, + "loss": 2.6072, + "step": 21953 + }, + { + "epoch": 0.6510096966462059, + "grad_norm": 0.12710262835025787, + "learning_rate": 0.0002765859272435724, + "loss": 2.6235, + "step": 21954 + }, + { + "epoch": 0.6510393499985173, + "grad_norm": 0.12263435125350952, + "learning_rate": 0.0002765438355110918, + "loss": 2.6284, + "step": 21955 + }, + { + "epoch": 0.6510690033508288, + "grad_norm": 0.09802756458520889, + "learning_rate": 0.00027650174575735394, + "loss": 2.6431, + "step": 21956 + }, + { + "epoch": 0.6510986567031403, + "grad_norm": 0.11195787787437439, + "learning_rate": 0.0002764596579827314, + "loss": 2.653, + "step": 21957 + }, + { + "epoch": 0.6511283100554518, + "grad_norm": 0.12206467241048813, + "learning_rate": 0.000276417572187597, + "loss": 2.6353, + "step": 21958 + }, + { + "epoch": 0.6511579634077632, + "grad_norm": 0.10340328514575958, + "learning_rate": 0.0002763754883723234, + "loss": 2.6254, + "step": 21959 + }, + { + "epoch": 0.6511876167600748, + "grad_norm": 0.1250738650560379, + "learning_rate": 0.00027633340653728325, + "loss": 2.6107, + "step": 21960 + }, + { + "epoch": 0.6512172701123862, + "grad_norm": 0.11080323904752731, + "learning_rate": 0.0002762913266828492, + "loss": 2.6537, + "step": 21961 + }, + { + "epoch": 0.6512469234646977, + "grad_norm": 0.10228324681520462, + "learning_rate": 0.0002762492488093937, + "loss": 2.5978, + "step": 21962 + }, + { + "epoch": 0.6512765768170091, + "grad_norm": 0.10287778079509735, + "learning_rate": 0.00027620717291728964, + "loss": 2.6197, + "step": 21963 + }, + { + "epoch": 0.6513062301693207, + "grad_norm": 0.10166912525892258, + "learning_rate": 0.0002761650990069094, + "loss": 2.6637, + "step": 21964 + }, + { + "epoch": 0.6513358835216321, + "grad_norm": 0.10529915988445282, + "learning_rate": 0.00027612302707862566, + "loss": 2.628, + "step": 21965 + }, + { + "epoch": 0.6513655368739436, + "grad_norm": 0.13231928646564484, + "learning_rate": 0.000276080957132811, + "loss": 2.6159, + "step": 21966 + }, + { + "epoch": 0.651395190226255, + "grad_norm": 0.10363534837961197, + "learning_rate": 0.0002760388891698379, + "loss": 2.6453, + "step": 21967 + }, + { + "epoch": 0.6514248435785666, + "grad_norm": 0.10745928436517715, + "learning_rate": 0.0002759968231900788, + "loss": 2.6369, + "step": 21968 + }, + { + "epoch": 0.6514544969308781, + "grad_norm": 0.13284048438072205, + "learning_rate": 0.00027595475919390633, + "loss": 2.6198, + "step": 21969 + }, + { + "epoch": 0.6514841502831895, + "grad_norm": 0.11439485102891922, + "learning_rate": 0.00027591269718169287, + "loss": 2.641, + "step": 21970 + }, + { + "epoch": 0.651513803635501, + "grad_norm": 0.11194977164268494, + "learning_rate": 0.000275870637153811, + "loss": 2.6225, + "step": 21971 + }, + { + "epoch": 0.6515434569878125, + "grad_norm": 0.11982165277004242, + "learning_rate": 0.0002758285791106331, + "loss": 2.6151, + "step": 21972 + }, + { + "epoch": 0.651573110340124, + "grad_norm": 0.10475873202085495, + "learning_rate": 0.0002757865230525316, + "loss": 2.6163, + "step": 21973 + }, + { + "epoch": 0.6516027636924354, + "grad_norm": 0.11469976603984833, + "learning_rate": 0.00027574446897987893, + "loss": 2.654, + "step": 21974 + }, + { + "epoch": 0.651632417044747, + "grad_norm": 0.13122503459453583, + "learning_rate": 0.00027570241689304744, + "loss": 2.629, + "step": 21975 + }, + { + "epoch": 0.6516620703970584, + "grad_norm": 0.1150355339050293, + "learning_rate": 0.0002756603667924096, + "loss": 2.6472, + "step": 21976 + }, + { + "epoch": 0.6516917237493699, + "grad_norm": 0.11453446745872498, + "learning_rate": 0.0002756183186783377, + "loss": 2.6279, + "step": 21977 + }, + { + "epoch": 0.6517213771016813, + "grad_norm": 0.12010162323713303, + "learning_rate": 0.0002755762725512041, + "loss": 2.6548, + "step": 21978 + }, + { + "epoch": 0.6517510304539929, + "grad_norm": 0.11152809858322144, + "learning_rate": 0.0002755342284113811, + "loss": 2.6206, + "step": 21979 + }, + { + "epoch": 0.6517806838063043, + "grad_norm": 0.1283208727836609, + "learning_rate": 0.0002754921862592411, + "loss": 2.635, + "step": 21980 + }, + { + "epoch": 0.6518103371586158, + "grad_norm": 0.10619436949491501, + "learning_rate": 0.00027545014609515626, + "loss": 2.6176, + "step": 21981 + }, + { + "epoch": 0.6518399905109272, + "grad_norm": 0.10640670359134674, + "learning_rate": 0.000275408107919499, + "loss": 2.6445, + "step": 21982 + }, + { + "epoch": 0.6518696438632388, + "grad_norm": 0.12400789558887482, + "learning_rate": 0.00027536607173264145, + "loss": 2.6016, + "step": 21983 + }, + { + "epoch": 0.6518992972155502, + "grad_norm": 0.11713606119155884, + "learning_rate": 0.0002753240375349559, + "loss": 2.6125, + "step": 21984 + }, + { + "epoch": 0.6519289505678617, + "grad_norm": 0.10938511043787003, + "learning_rate": 0.0002752820053268146, + "loss": 2.6345, + "step": 21985 + }, + { + "epoch": 0.6519586039201731, + "grad_norm": 0.10989486426115036, + "learning_rate": 0.0002752399751085896, + "loss": 2.6056, + "step": 21986 + }, + { + "epoch": 0.6519882572724847, + "grad_norm": 0.10882523655891418, + "learning_rate": 0.00027519794688065323, + "loss": 2.5989, + "step": 21987 + }, + { + "epoch": 0.6520179106247961, + "grad_norm": 0.10043122619390488, + "learning_rate": 0.0002751559206433779, + "loss": 2.6123, + "step": 21988 + }, + { + "epoch": 0.6520475639771076, + "grad_norm": 0.10417226701974869, + "learning_rate": 0.00027511389639713524, + "loss": 2.636, + "step": 21989 + }, + { + "epoch": 0.6520772173294191, + "grad_norm": 0.09995636343955994, + "learning_rate": 0.00027507187414229766, + "loss": 2.6091, + "step": 21990 + }, + { + "epoch": 0.6521068706817306, + "grad_norm": 0.127594992518425, + "learning_rate": 0.00027502985387923705, + "loss": 2.6376, + "step": 21991 + }, + { + "epoch": 0.6521365240340421, + "grad_norm": 0.10691934823989868, + "learning_rate": 0.00027498783560832583, + "loss": 2.6433, + "step": 21992 + }, + { + "epoch": 0.6521661773863535, + "grad_norm": 0.11811104416847229, + "learning_rate": 0.000274945819329936, + "loss": 2.5981, + "step": 21993 + }, + { + "epoch": 0.652195830738665, + "grad_norm": 0.11206603050231934, + "learning_rate": 0.0002749038050444396, + "loss": 2.6399, + "step": 21994 + }, + { + "epoch": 0.6522254840909765, + "grad_norm": 0.11315354704856873, + "learning_rate": 0.0002748617927522086, + "loss": 2.5928, + "step": 21995 + }, + { + "epoch": 0.652255137443288, + "grad_norm": 0.10074178874492645, + "learning_rate": 0.00027481978245361507, + "loss": 2.5843, + "step": 21996 + }, + { + "epoch": 0.6522847907955994, + "grad_norm": 0.10171212255954742, + "learning_rate": 0.00027477777414903104, + "loss": 2.6294, + "step": 21997 + }, + { + "epoch": 0.652314444147911, + "grad_norm": 0.10610182583332062, + "learning_rate": 0.00027473576783882845, + "loss": 2.6219, + "step": 21998 + }, + { + "epoch": 0.6523440975002224, + "grad_norm": 0.1089850440621376, + "learning_rate": 0.00027469376352337957, + "loss": 2.6201, + "step": 21999 + }, + { + "epoch": 0.6523737508525339, + "grad_norm": 0.10149037837982178, + "learning_rate": 0.00027465176120305577, + "loss": 2.6153, + "step": 22000 + }, + { + "epoch": 0.6524034042048453, + "grad_norm": 0.10171201825141907, + "learning_rate": 0.00027460976087822944, + "loss": 2.6038, + "step": 22001 + }, + { + "epoch": 0.6524330575571569, + "grad_norm": 0.10291566699743271, + "learning_rate": 0.0002745677625492723, + "loss": 2.6314, + "step": 22002 + }, + { + "epoch": 0.6524627109094683, + "grad_norm": 0.1033715009689331, + "learning_rate": 0.0002745257662165562, + "loss": 2.6392, + "step": 22003 + }, + { + "epoch": 0.6524923642617798, + "grad_norm": 0.10702785849571228, + "learning_rate": 0.00027448377188045317, + "loss": 2.6204, + "step": 22004 + }, + { + "epoch": 0.6525220176140912, + "grad_norm": 0.11230239272117615, + "learning_rate": 0.00027444177954133507, + "loss": 2.6255, + "step": 22005 + }, + { + "epoch": 0.6525516709664028, + "grad_norm": 0.1063595861196518, + "learning_rate": 0.0002743997891995738, + "loss": 2.6301, + "step": 22006 + }, + { + "epoch": 0.6525813243187142, + "grad_norm": 0.11722122877836227, + "learning_rate": 0.00027435780085554115, + "loss": 2.6031, + "step": 22007 + }, + { + "epoch": 0.6526109776710257, + "grad_norm": 0.09496976435184479, + "learning_rate": 0.00027431581450960887, + "loss": 2.6126, + "step": 22008 + }, + { + "epoch": 0.6526406310233371, + "grad_norm": 0.11832419037818909, + "learning_rate": 0.00027427383016214894, + "loss": 2.6111, + "step": 22009 + }, + { + "epoch": 0.6526702843756487, + "grad_norm": 0.11062512546777725, + "learning_rate": 0.0002742318478135328, + "loss": 2.6623, + "step": 22010 + }, + { + "epoch": 0.6526999377279602, + "grad_norm": 0.12081477046012878, + "learning_rate": 0.00027418986746413247, + "loss": 2.6565, + "step": 22011 + }, + { + "epoch": 0.6527295910802716, + "grad_norm": 0.11194369196891785, + "learning_rate": 0.00027414788911431963, + "loss": 2.6467, + "step": 22012 + }, + { + "epoch": 0.6527592444325832, + "grad_norm": 0.13559578359127045, + "learning_rate": 0.00027410591276446597, + "loss": 2.6339, + "step": 22013 + }, + { + "epoch": 0.6527888977848946, + "grad_norm": 0.14846543967723846, + "learning_rate": 0.00027406393841494315, + "loss": 2.6549, + "step": 22014 + }, + { + "epoch": 0.6528185511372061, + "grad_norm": 0.1216813325881958, + "learning_rate": 0.00027402196606612304, + "loss": 2.6165, + "step": 22015 + }, + { + "epoch": 0.6528482044895175, + "grad_norm": 0.12450318783521652, + "learning_rate": 0.00027397999571837693, + "loss": 2.6478, + "step": 22016 + }, + { + "epoch": 0.6528778578418291, + "grad_norm": 0.12673832476139069, + "learning_rate": 0.0002739380273720771, + "loss": 2.6146, + "step": 22017 + }, + { + "epoch": 0.6529075111941405, + "grad_norm": 0.11969130486249924, + "learning_rate": 0.0002738960610275947, + "loss": 2.6023, + "step": 22018 + }, + { + "epoch": 0.652937164546452, + "grad_norm": 0.14028224349021912, + "learning_rate": 0.0002738540966853015, + "loss": 2.6184, + "step": 22019 + }, + { + "epoch": 0.6529668178987634, + "grad_norm": 0.10055314004421234, + "learning_rate": 0.0002738121343455693, + "loss": 2.6311, + "step": 22020 + }, + { + "epoch": 0.652996471251075, + "grad_norm": 0.11582019925117493, + "learning_rate": 0.0002737701740087693, + "loss": 2.612, + "step": 22021 + }, + { + "epoch": 0.6530261246033864, + "grad_norm": 0.110912024974823, + "learning_rate": 0.0002737282156752732, + "loss": 2.6623, + "step": 22022 + }, + { + "epoch": 0.6530557779556979, + "grad_norm": 0.10779306292533875, + "learning_rate": 0.00027368625934545265, + "loss": 2.658, + "step": 22023 + }, + { + "epoch": 0.6530854313080093, + "grad_norm": 0.11114336550235748, + "learning_rate": 0.0002736443050196791, + "loss": 2.6317, + "step": 22024 + }, + { + "epoch": 0.6531150846603209, + "grad_norm": 0.11236165463924408, + "learning_rate": 0.0002736023526983241, + "loss": 2.6256, + "step": 22025 + }, + { + "epoch": 0.6531447380126323, + "grad_norm": 0.10783319175243378, + "learning_rate": 0.00027356040238175905, + "loss": 2.6075, + "step": 22026 + }, + { + "epoch": 0.6531743913649438, + "grad_norm": 0.11380670964717865, + "learning_rate": 0.0002735184540703556, + "loss": 2.6185, + "step": 22027 + }, + { + "epoch": 0.6532040447172552, + "grad_norm": 0.11473844200372696, + "learning_rate": 0.00027347650776448506, + "loss": 2.6303, + "step": 22028 + }, + { + "epoch": 0.6532336980695668, + "grad_norm": 0.10826170444488525, + "learning_rate": 0.0002734345634645187, + "loss": 2.6491, + "step": 22029 + }, + { + "epoch": 0.6532633514218782, + "grad_norm": 0.1041724681854248, + "learning_rate": 0.00027339262117082866, + "loss": 2.6148, + "step": 22030 + }, + { + "epoch": 0.6532930047741897, + "grad_norm": 0.1161738708615303, + "learning_rate": 0.00027335068088378555, + "loss": 2.6387, + "step": 22031 + }, + { + "epoch": 0.6533226581265013, + "grad_norm": 0.10443700850009918, + "learning_rate": 0.00027330874260376114, + "loss": 2.6527, + "step": 22032 + }, + { + "epoch": 0.6533523114788127, + "grad_norm": 0.1081656664609909, + "learning_rate": 0.00027326680633112674, + "loss": 2.6097, + "step": 22033 + }, + { + "epoch": 0.6533819648311242, + "grad_norm": 0.09851142019033432, + "learning_rate": 0.00027322487206625365, + "loss": 2.6691, + "step": 22034 + }, + { + "epoch": 0.6534116181834356, + "grad_norm": 0.11549687385559082, + "learning_rate": 0.00027318293980951327, + "loss": 2.6504, + "step": 22035 + }, + { + "epoch": 0.6534412715357472, + "grad_norm": 0.10969405621290207, + "learning_rate": 0.00027314100956127687, + "loss": 2.6276, + "step": 22036 + }, + { + "epoch": 0.6534709248880586, + "grad_norm": 0.1269683986902237, + "learning_rate": 0.00027309908132191574, + "loss": 2.6119, + "step": 22037 + }, + { + "epoch": 0.6535005782403701, + "grad_norm": 0.11312439292669296, + "learning_rate": 0.00027305715509180116, + "loss": 2.643, + "step": 22038 + }, + { + "epoch": 0.6535302315926815, + "grad_norm": 0.1035367101430893, + "learning_rate": 0.00027301523087130454, + "loss": 2.6072, + "step": 22039 + }, + { + "epoch": 0.6535598849449931, + "grad_norm": 0.12084726989269257, + "learning_rate": 0.00027297330866079693, + "loss": 2.6244, + "step": 22040 + }, + { + "epoch": 0.6535895382973045, + "grad_norm": 0.10293497145175934, + "learning_rate": 0.0002729313884606497, + "loss": 2.63, + "step": 22041 + }, + { + "epoch": 0.653619191649616, + "grad_norm": 0.11261221766471863, + "learning_rate": 0.00027288947027123396, + "loss": 2.609, + "step": 22042 + }, + { + "epoch": 0.6536488450019274, + "grad_norm": 0.10921590030193329, + "learning_rate": 0.00027284755409292096, + "loss": 2.6094, + "step": 22043 + }, + { + "epoch": 0.653678498354239, + "grad_norm": 0.11198779940605164, + "learning_rate": 0.00027280563992608184, + "loss": 2.6322, + "step": 22044 + }, + { + "epoch": 0.6537081517065504, + "grad_norm": 0.10334030538797379, + "learning_rate": 0.0002727637277710878, + "loss": 2.638, + "step": 22045 + }, + { + "epoch": 0.6537378050588619, + "grad_norm": 0.10405991971492767, + "learning_rate": 0.00027272181762830997, + "loss": 2.5996, + "step": 22046 + }, + { + "epoch": 0.6537674584111733, + "grad_norm": 0.1028539165854454, + "learning_rate": 0.0002726799094981194, + "loss": 2.6474, + "step": 22047 + }, + { + "epoch": 0.6537971117634849, + "grad_norm": 0.10869968682527542, + "learning_rate": 0.0002726380033808872, + "loss": 2.6532, + "step": 22048 + }, + { + "epoch": 0.6538267651157963, + "grad_norm": 0.11151257157325745, + "learning_rate": 0.0002725960992769846, + "loss": 2.6594, + "step": 22049 + }, + { + "epoch": 0.6538564184681078, + "grad_norm": 0.12011388689279556, + "learning_rate": 0.0002725541971867826, + "loss": 2.62, + "step": 22050 + }, + { + "epoch": 0.6538860718204192, + "grad_norm": 0.10643409192562103, + "learning_rate": 0.0002725122971106522, + "loss": 2.6064, + "step": 22051 + }, + { + "epoch": 0.6539157251727308, + "grad_norm": 0.11670120060443878, + "learning_rate": 0.0002724703990489644, + "loss": 2.6241, + "step": 22052 + }, + { + "epoch": 0.6539453785250423, + "grad_norm": 0.10459653288125992, + "learning_rate": 0.0002724285030020903, + "loss": 2.6236, + "step": 22053 + }, + { + "epoch": 0.6539750318773537, + "grad_norm": 0.11387563496828079, + "learning_rate": 0.00027238660897040113, + "loss": 2.5882, + "step": 22054 + }, + { + "epoch": 0.6540046852296653, + "grad_norm": 0.1072736456990242, + "learning_rate": 0.0002723447169542671, + "loss": 2.6267, + "step": 22055 + }, + { + "epoch": 0.6540343385819767, + "grad_norm": 0.10838521271944046, + "learning_rate": 0.00027230282695405995, + "loss": 2.6179, + "step": 22056 + }, + { + "epoch": 0.6540639919342882, + "grad_norm": 0.11729815602302551, + "learning_rate": 0.00027226093897015036, + "loss": 2.6423, + "step": 22057 + }, + { + "epoch": 0.6540936452865996, + "grad_norm": 0.11504543572664261, + "learning_rate": 0.00027221905300290917, + "loss": 2.6163, + "step": 22058 + }, + { + "epoch": 0.6541232986389112, + "grad_norm": 0.09080350399017334, + "learning_rate": 0.00027217716905270745, + "loss": 2.6152, + "step": 22059 + }, + { + "epoch": 0.6541529519912226, + "grad_norm": 0.11193763464689255, + "learning_rate": 0.00027213528711991596, + "loss": 2.6363, + "step": 22060 + }, + { + "epoch": 0.6541826053435341, + "grad_norm": 0.09937908500432968, + "learning_rate": 0.0002720934072049056, + "loss": 2.6206, + "step": 22061 + }, + { + "epoch": 0.6542122586958455, + "grad_norm": 0.10639733821153641, + "learning_rate": 0.0002720515293080473, + "loss": 2.5922, + "step": 22062 + }, + { + "epoch": 0.6542419120481571, + "grad_norm": 0.09392943233251572, + "learning_rate": 0.0002720096534297118, + "loss": 2.6326, + "step": 22063 + }, + { + "epoch": 0.6542715654004685, + "grad_norm": 0.10546108335256577, + "learning_rate": 0.0002719677795702701, + "loss": 2.659, + "step": 22064 + }, + { + "epoch": 0.65430121875278, + "grad_norm": 0.10165924578905106, + "learning_rate": 0.00027192590773009276, + "loss": 2.6153, + "step": 22065 + }, + { + "epoch": 0.6543308721050914, + "grad_norm": 0.10233938694000244, + "learning_rate": 0.00027188403790955057, + "loss": 2.6149, + "step": 22066 + }, + { + "epoch": 0.654360525457403, + "grad_norm": 0.10665484517812729, + "learning_rate": 0.0002718421701090144, + "loss": 2.6196, + "step": 22067 + }, + { + "epoch": 0.6543901788097144, + "grad_norm": 0.10167817026376724, + "learning_rate": 0.0002718003043288548, + "loss": 2.624, + "step": 22068 + }, + { + "epoch": 0.6544198321620259, + "grad_norm": 0.11083526164293289, + "learning_rate": 0.0002717584405694429, + "loss": 2.633, + "step": 22069 + }, + { + "epoch": 0.6544494855143373, + "grad_norm": 0.11053906381130219, + "learning_rate": 0.0002717165788311491, + "loss": 2.6115, + "step": 22070 + }, + { + "epoch": 0.6544791388666489, + "grad_norm": 0.10998749732971191, + "learning_rate": 0.00027167471911434426, + "loss": 2.6294, + "step": 22071 + }, + { + "epoch": 0.6545087922189603, + "grad_norm": 0.11381562054157257, + "learning_rate": 0.0002716328614193989, + "loss": 2.6235, + "step": 22072 + }, + { + "epoch": 0.6545384455712718, + "grad_norm": 0.11300606280565262, + "learning_rate": 0.00027159100574668385, + "loss": 2.6249, + "step": 22073 + }, + { + "epoch": 0.6545680989235834, + "grad_norm": 0.11496837437152863, + "learning_rate": 0.00027154915209656955, + "loss": 2.6035, + "step": 22074 + }, + { + "epoch": 0.6545977522758948, + "grad_norm": 0.1180671751499176, + "learning_rate": 0.00027150730046942694, + "loss": 2.6196, + "step": 22075 + }, + { + "epoch": 0.6546274056282063, + "grad_norm": 0.10877608507871628, + "learning_rate": 0.0002714654508656262, + "loss": 2.6093, + "step": 22076 + }, + { + "epoch": 0.6546570589805177, + "grad_norm": 0.11185813695192337, + "learning_rate": 0.0002714236032855382, + "loss": 2.6041, + "step": 22077 + }, + { + "epoch": 0.6546867123328293, + "grad_norm": 0.113745778799057, + "learning_rate": 0.0002713817577295333, + "loss": 2.6662, + "step": 22078 + }, + { + "epoch": 0.6547163656851407, + "grad_norm": 0.10574718564748764, + "learning_rate": 0.00027133991419798234, + "loss": 2.6053, + "step": 22079 + }, + { + "epoch": 0.6547460190374522, + "grad_norm": 0.09527081996202469, + "learning_rate": 0.0002712980726912556, + "loss": 2.6101, + "step": 22080 + }, + { + "epoch": 0.6547756723897636, + "grad_norm": 0.10575336217880249, + "learning_rate": 0.0002712562332097235, + "loss": 2.6391, + "step": 22081 + }, + { + "epoch": 0.6548053257420752, + "grad_norm": 0.12413159012794495, + "learning_rate": 0.00027121439575375684, + "loss": 2.6483, + "step": 22082 + }, + { + "epoch": 0.6548349790943866, + "grad_norm": 0.1057036742568016, + "learning_rate": 0.0002711725603237261, + "loss": 2.6123, + "step": 22083 + }, + { + "epoch": 0.6548646324466981, + "grad_norm": 0.10835651308298111, + "learning_rate": 0.00027113072692000153, + "loss": 2.5984, + "step": 22084 + }, + { + "epoch": 0.6548942857990095, + "grad_norm": 0.11329469829797745, + "learning_rate": 0.0002710888955429538, + "loss": 2.6114, + "step": 22085 + }, + { + "epoch": 0.6549239391513211, + "grad_norm": 0.10844442993402481, + "learning_rate": 0.0002710470661929531, + "loss": 2.5806, + "step": 22086 + }, + { + "epoch": 0.6549535925036325, + "grad_norm": 0.10750316828489304, + "learning_rate": 0.0002710052388703699, + "loss": 2.6413, + "step": 22087 + }, + { + "epoch": 0.654983245855944, + "grad_norm": 0.10834560543298721, + "learning_rate": 0.00027096341357557465, + "loss": 2.6215, + "step": 22088 + }, + { + "epoch": 0.6550128992082555, + "grad_norm": 0.10491738468408585, + "learning_rate": 0.0002709215903089376, + "loss": 2.6478, + "step": 22089 + }, + { + "epoch": 0.655042552560567, + "grad_norm": 0.11276024580001831, + "learning_rate": 0.00027087976907082923, + "loss": 2.6645, + "step": 22090 + }, + { + "epoch": 0.6550722059128784, + "grad_norm": 0.1285669356584549, + "learning_rate": 0.0002708379498616199, + "loss": 2.6143, + "step": 22091 + }, + { + "epoch": 0.6551018592651899, + "grad_norm": 0.10565681010484695, + "learning_rate": 0.0002707961326816798, + "loss": 2.6274, + "step": 22092 + }, + { + "epoch": 0.6551315126175014, + "grad_norm": 0.10867144912481308, + "learning_rate": 0.00027075431753137923, + "loss": 2.6287, + "step": 22093 + }, + { + "epoch": 0.6551611659698129, + "grad_norm": 0.11209367960691452, + "learning_rate": 0.0002707125044110884, + "loss": 2.6271, + "step": 22094 + }, + { + "epoch": 0.6551908193221244, + "grad_norm": 0.11576274037361145, + "learning_rate": 0.0002706706933211779, + "loss": 2.6644, + "step": 22095 + }, + { + "epoch": 0.6552204726744358, + "grad_norm": 0.09794475138187408, + "learning_rate": 0.00027062888426201796, + "loss": 2.6197, + "step": 22096 + }, + { + "epoch": 0.6552501260267474, + "grad_norm": 0.10370314121246338, + "learning_rate": 0.00027058707723397847, + "loss": 2.6252, + "step": 22097 + }, + { + "epoch": 0.6552797793790588, + "grad_norm": 0.10526195913553238, + "learning_rate": 0.0002705452722374298, + "loss": 2.6434, + "step": 22098 + }, + { + "epoch": 0.6553094327313703, + "grad_norm": 0.09186375141143799, + "learning_rate": 0.000270503469272742, + "loss": 2.6295, + "step": 22099 + }, + { + "epoch": 0.6553390860836817, + "grad_norm": 0.11134002357721329, + "learning_rate": 0.0002704616683402854, + "loss": 2.6331, + "step": 22100 + }, + { + "epoch": 0.6553687394359933, + "grad_norm": 0.11199449002742767, + "learning_rate": 0.0002704198694404302, + "loss": 2.6244, + "step": 22101 + }, + { + "epoch": 0.6553983927883047, + "grad_norm": 0.09997500479221344, + "learning_rate": 0.0002703780725735464, + "loss": 2.6255, + "step": 22102 + }, + { + "epoch": 0.6554280461406162, + "grad_norm": 0.1028311625123024, + "learning_rate": 0.00027033627774000426, + "loss": 2.5751, + "step": 22103 + }, + { + "epoch": 0.6554576994929276, + "grad_norm": 0.11069740355014801, + "learning_rate": 0.0002702944849401737, + "loss": 2.6272, + "step": 22104 + }, + { + "epoch": 0.6554873528452392, + "grad_norm": 0.11269491165876389, + "learning_rate": 0.000270252694174425, + "loss": 2.6716, + "step": 22105 + }, + { + "epoch": 0.6555170061975506, + "grad_norm": 0.10472260415554047, + "learning_rate": 0.0002702109054431281, + "loss": 2.6161, + "step": 22106 + }, + { + "epoch": 0.6555466595498621, + "grad_norm": 0.10015827417373657, + "learning_rate": 0.0002701691187466531, + "loss": 2.6176, + "step": 22107 + }, + { + "epoch": 0.6555763129021736, + "grad_norm": 0.1020219549536705, + "learning_rate": 0.00027012733408536993, + "loss": 2.6559, + "step": 22108 + }, + { + "epoch": 0.6556059662544851, + "grad_norm": 0.10987404733896255, + "learning_rate": 0.00027008555145964877, + "loss": 2.6338, + "step": 22109 + }, + { + "epoch": 0.6556356196067965, + "grad_norm": 0.1103290244936943, + "learning_rate": 0.0002700437708698594, + "loss": 2.6189, + "step": 22110 + }, + { + "epoch": 0.655665272959108, + "grad_norm": 0.09981238842010498, + "learning_rate": 0.000270001992316372, + "loss": 2.6579, + "step": 22111 + }, + { + "epoch": 0.6556949263114195, + "grad_norm": 0.12019221484661102, + "learning_rate": 0.0002699602157995564, + "loss": 2.5921, + "step": 22112 + }, + { + "epoch": 0.655724579663731, + "grad_norm": 0.09922609478235245, + "learning_rate": 0.0002699184413197826, + "loss": 2.6057, + "step": 22113 + }, + { + "epoch": 0.6557542330160424, + "grad_norm": 0.10791190713644028, + "learning_rate": 0.00026987666887742047, + "loss": 2.5868, + "step": 22114 + }, + { + "epoch": 0.6557838863683539, + "grad_norm": 0.09500209987163544, + "learning_rate": 0.00026983489847283996, + "loss": 2.5933, + "step": 22115 + }, + { + "epoch": 0.6558135397206655, + "grad_norm": 0.10467850416898727, + "learning_rate": 0.0002697931301064109, + "loss": 2.6202, + "step": 22116 + }, + { + "epoch": 0.6558431930729769, + "grad_norm": 0.1038801446557045, + "learning_rate": 0.0002697513637785032, + "loss": 2.6062, + "step": 22117 + }, + { + "epoch": 0.6558728464252884, + "grad_norm": 0.11130020767450333, + "learning_rate": 0.00026970959948948673, + "loss": 2.6362, + "step": 22118 + }, + { + "epoch": 0.6559024997775998, + "grad_norm": 0.1147698387503624, + "learning_rate": 0.00026966783723973145, + "loss": 2.638, + "step": 22119 + }, + { + "epoch": 0.6559321531299114, + "grad_norm": 0.10411960631608963, + "learning_rate": 0.00026962607702960657, + "loss": 2.6408, + "step": 22120 + }, + { + "epoch": 0.6559618064822228, + "grad_norm": 0.10996218770742416, + "learning_rate": 0.00026958431885948256, + "loss": 2.6212, + "step": 22121 + }, + { + "epoch": 0.6559914598345343, + "grad_norm": 0.11389603465795517, + "learning_rate": 0.000269542562729729, + "loss": 2.6162, + "step": 22122 + }, + { + "epoch": 0.6560211131868458, + "grad_norm": 0.11987021565437317, + "learning_rate": 0.0002695008086407155, + "loss": 2.6084, + "step": 22123 + }, + { + "epoch": 0.6560507665391573, + "grad_norm": 0.1131303459405899, + "learning_rate": 0.000269459056592812, + "loss": 2.6427, + "step": 22124 + }, + { + "epoch": 0.6560804198914687, + "grad_norm": 0.10201197117567062, + "learning_rate": 0.0002694173065863881, + "loss": 2.6189, + "step": 22125 + }, + { + "epoch": 0.6561100732437802, + "grad_norm": 0.10661990195512772, + "learning_rate": 0.0002693755586218135, + "loss": 2.6326, + "step": 22126 + }, + { + "epoch": 0.6561397265960917, + "grad_norm": 0.08914860337972641, + "learning_rate": 0.00026933381269945793, + "loss": 2.6133, + "step": 22127 + }, + { + "epoch": 0.6561693799484032, + "grad_norm": 0.1023755669593811, + "learning_rate": 0.000269292068819691, + "loss": 2.6017, + "step": 22128 + }, + { + "epoch": 0.6561990333007146, + "grad_norm": 0.08936808258295059, + "learning_rate": 0.00026925032698288257, + "loss": 2.6073, + "step": 22129 + }, + { + "epoch": 0.6562286866530261, + "grad_norm": 0.10876423865556717, + "learning_rate": 0.0002692085871894021, + "loss": 2.5647, + "step": 22130 + }, + { + "epoch": 0.6562583400053376, + "grad_norm": 0.1049111932516098, + "learning_rate": 0.0002691668494396191, + "loss": 2.6314, + "step": 22131 + }, + { + "epoch": 0.6562879933576491, + "grad_norm": 0.09765699505805969, + "learning_rate": 0.00026912511373390326, + "loss": 2.6622, + "step": 22132 + }, + { + "epoch": 0.6563176467099605, + "grad_norm": 0.10690554976463318, + "learning_rate": 0.00026908338007262397, + "loss": 2.658, + "step": 22133 + }, + { + "epoch": 0.656347300062272, + "grad_norm": 0.11184652149677277, + "learning_rate": 0.0002690416484561512, + "loss": 2.6073, + "step": 22134 + }, + { + "epoch": 0.6563769534145835, + "grad_norm": 0.11247655004262924, + "learning_rate": 0.0002689999188848542, + "loss": 2.635, + "step": 22135 + }, + { + "epoch": 0.656406606766895, + "grad_norm": 0.10710514336824417, + "learning_rate": 0.00026895819135910263, + "loss": 2.6588, + "step": 22136 + }, + { + "epoch": 0.6564362601192065, + "grad_norm": 0.12481982260942459, + "learning_rate": 0.000268916465879266, + "loss": 2.5964, + "step": 22137 + }, + { + "epoch": 0.656465913471518, + "grad_norm": 0.11990010738372803, + "learning_rate": 0.00026887474244571363, + "loss": 2.6026, + "step": 22138 + }, + { + "epoch": 0.6564955668238295, + "grad_norm": 0.12845949828624725, + "learning_rate": 0.0002688330210588151, + "loss": 2.6363, + "step": 22139 + }, + { + "epoch": 0.6565252201761409, + "grad_norm": 0.1100827232003212, + "learning_rate": 0.0002687913017189401, + "loss": 2.6119, + "step": 22140 + }, + { + "epoch": 0.6565548735284524, + "grad_norm": 0.10423733294010162, + "learning_rate": 0.0002687495844264575, + "loss": 2.587, + "step": 22141 + }, + { + "epoch": 0.6565845268807639, + "grad_norm": 0.11095292866230011, + "learning_rate": 0.00026870786918173714, + "loss": 2.6365, + "step": 22142 + }, + { + "epoch": 0.6566141802330754, + "grad_norm": 0.11305375397205353, + "learning_rate": 0.0002686661559851482, + "loss": 2.6379, + "step": 22143 + }, + { + "epoch": 0.6566438335853868, + "grad_norm": 0.11139050126075745, + "learning_rate": 0.0002686244448370603, + "loss": 2.611, + "step": 22144 + }, + { + "epoch": 0.6566734869376983, + "grad_norm": 0.1026563048362732, + "learning_rate": 0.0002685827357378425, + "loss": 2.6245, + "step": 22145 + }, + { + "epoch": 0.6567031402900098, + "grad_norm": 0.10393177717924118, + "learning_rate": 0.0002685410286878642, + "loss": 2.622, + "step": 22146 + }, + { + "epoch": 0.6567327936423213, + "grad_norm": 0.10649120062589645, + "learning_rate": 0.00026849932368749494, + "loss": 2.6106, + "step": 22147 + }, + { + "epoch": 0.6567624469946327, + "grad_norm": 0.09403296560049057, + "learning_rate": 0.000268457620737104, + "loss": 2.6593, + "step": 22148 + }, + { + "epoch": 0.6567921003469442, + "grad_norm": 0.10571335256099701, + "learning_rate": 0.0002684159198370605, + "loss": 2.5918, + "step": 22149 + }, + { + "epoch": 0.6568217536992557, + "grad_norm": 0.1035638228058815, + "learning_rate": 0.0002683742209877338, + "loss": 2.6329, + "step": 22150 + }, + { + "epoch": 0.6568514070515672, + "grad_norm": 0.1029176414012909, + "learning_rate": 0.0002683325241894934, + "loss": 2.6303, + "step": 22151 + }, + { + "epoch": 0.6568810604038786, + "grad_norm": 0.10958132147789001, + "learning_rate": 0.000268290829442708, + "loss": 2.6048, + "step": 22152 + }, + { + "epoch": 0.6569107137561901, + "grad_norm": 0.13080866634845734, + "learning_rate": 0.00026824913674774705, + "loss": 2.6004, + "step": 22153 + }, + { + "epoch": 0.6569403671085016, + "grad_norm": 0.13414694368839264, + "learning_rate": 0.00026820744610497985, + "loss": 2.653, + "step": 22154 + }, + { + "epoch": 0.6569700204608131, + "grad_norm": 0.1280367225408554, + "learning_rate": 0.0002681657575147754, + "loss": 2.6219, + "step": 22155 + }, + { + "epoch": 0.6569996738131245, + "grad_norm": 0.09463212639093399, + "learning_rate": 0.000268124070977503, + "loss": 2.6339, + "step": 22156 + }, + { + "epoch": 0.657029327165436, + "grad_norm": 0.11586115509271622, + "learning_rate": 0.0002680823864935318, + "loss": 2.6332, + "step": 22157 + }, + { + "epoch": 0.6570589805177476, + "grad_norm": 0.11830196529626846, + "learning_rate": 0.0002680407040632308, + "loss": 2.6486, + "step": 22158 + }, + { + "epoch": 0.657088633870059, + "grad_norm": 0.11585898697376251, + "learning_rate": 0.00026799902368696905, + "loss": 2.6493, + "step": 22159 + }, + { + "epoch": 0.6571182872223705, + "grad_norm": 0.1013268381357193, + "learning_rate": 0.00026795734536511594, + "loss": 2.605, + "step": 22160 + }, + { + "epoch": 0.657147940574682, + "grad_norm": 0.12195268273353577, + "learning_rate": 0.00026791566909804056, + "loss": 2.6134, + "step": 22161 + }, + { + "epoch": 0.6571775939269935, + "grad_norm": 0.12060605734586716, + "learning_rate": 0.00026787399488611155, + "loss": 2.6242, + "step": 22162 + }, + { + "epoch": 0.6572072472793049, + "grad_norm": 0.11043421179056168, + "learning_rate": 0.00026783232272969813, + "loss": 2.6363, + "step": 22163 + }, + { + "epoch": 0.6572369006316164, + "grad_norm": 0.11616653203964233, + "learning_rate": 0.00026779065262916947, + "loss": 2.6309, + "step": 22164 + }, + { + "epoch": 0.6572665539839279, + "grad_norm": 0.1198243498802185, + "learning_rate": 0.0002677489845848944, + "loss": 2.6543, + "step": 22165 + }, + { + "epoch": 0.6572962073362394, + "grad_norm": 0.10643476992845535, + "learning_rate": 0.00026770731859724185, + "loss": 2.6286, + "step": 22166 + }, + { + "epoch": 0.6573258606885508, + "grad_norm": 0.12359734624624252, + "learning_rate": 0.0002676656546665809, + "loss": 2.6092, + "step": 22167 + }, + { + "epoch": 0.6573555140408623, + "grad_norm": 0.12452176213264465, + "learning_rate": 0.0002676239927932805, + "loss": 2.6263, + "step": 22168 + }, + { + "epoch": 0.6573851673931738, + "grad_norm": 0.10146981477737427, + "learning_rate": 0.0002675823329777095, + "loss": 2.6114, + "step": 22169 + }, + { + "epoch": 0.6574148207454853, + "grad_norm": 0.1285201758146286, + "learning_rate": 0.00026754067522023685, + "loss": 2.6696, + "step": 22170 + }, + { + "epoch": 0.6574444740977967, + "grad_norm": 0.12505659461021423, + "learning_rate": 0.0002674990195212314, + "loss": 2.6429, + "step": 22171 + }, + { + "epoch": 0.6574741274501082, + "grad_norm": 0.11627166718244553, + "learning_rate": 0.00026745736588106207, + "loss": 2.6305, + "step": 22172 + }, + { + "epoch": 0.6575037808024197, + "grad_norm": 0.124466173350811, + "learning_rate": 0.0002674157143000977, + "loss": 2.6261, + "step": 22173 + }, + { + "epoch": 0.6575334341547312, + "grad_norm": 0.11708934605121613, + "learning_rate": 0.000267374064778707, + "loss": 2.6351, + "step": 22174 + }, + { + "epoch": 0.6575630875070426, + "grad_norm": 0.11253351718187332, + "learning_rate": 0.000267332417317259, + "loss": 2.6294, + "step": 22175 + }, + { + "epoch": 0.6575927408593542, + "grad_norm": 0.11389948427677155, + "learning_rate": 0.0002672907719161223, + "loss": 2.6302, + "step": 22176 + }, + { + "epoch": 0.6576223942116657, + "grad_norm": 0.11609573662281036, + "learning_rate": 0.0002672491285756658, + "loss": 2.6127, + "step": 22177 + }, + { + "epoch": 0.6576520475639771, + "grad_norm": 0.11482199281454086, + "learning_rate": 0.0002672074872962582, + "loss": 2.6135, + "step": 22178 + }, + { + "epoch": 0.6576817009162886, + "grad_norm": 0.1152309775352478, + "learning_rate": 0.0002671658480782683, + "loss": 2.6522, + "step": 22179 + }, + { + "epoch": 0.6577113542686001, + "grad_norm": 0.10513604432344437, + "learning_rate": 0.00026712421092206474, + "loss": 2.6349, + "step": 22180 + }, + { + "epoch": 0.6577410076209116, + "grad_norm": 0.10988713055849075, + "learning_rate": 0.0002670825758280163, + "loss": 2.663, + "step": 22181 + }, + { + "epoch": 0.657770660973223, + "grad_norm": 0.11753545701503754, + "learning_rate": 0.0002670409427964916, + "loss": 2.6399, + "step": 22182 + }, + { + "epoch": 0.6578003143255345, + "grad_norm": 0.10666743665933609, + "learning_rate": 0.0002669993118278593, + "loss": 2.6051, + "step": 22183 + }, + { + "epoch": 0.657829967677846, + "grad_norm": 0.11684059351682663, + "learning_rate": 0.0002669576829224881, + "loss": 2.6321, + "step": 22184 + }, + { + "epoch": 0.6578596210301575, + "grad_norm": 0.10687768459320068, + "learning_rate": 0.0002669160560807467, + "loss": 2.6221, + "step": 22185 + }, + { + "epoch": 0.6578892743824689, + "grad_norm": 0.10843902826309204, + "learning_rate": 0.00026687443130300357, + "loss": 2.6003, + "step": 22186 + }, + { + "epoch": 0.6579189277347804, + "grad_norm": 0.10419216006994247, + "learning_rate": 0.00026683280858962743, + "loss": 2.6268, + "step": 22187 + }, + { + "epoch": 0.6579485810870919, + "grad_norm": 0.10651778429746628, + "learning_rate": 0.0002667911879409867, + "loss": 2.6524, + "step": 22188 + }, + { + "epoch": 0.6579782344394034, + "grad_norm": 0.10285639017820358, + "learning_rate": 0.0002667495693574501, + "loss": 2.5967, + "step": 22189 + }, + { + "epoch": 0.6580078877917148, + "grad_norm": 0.10917844623327255, + "learning_rate": 0.0002667079528393861, + "loss": 2.5958, + "step": 22190 + }, + { + "epoch": 0.6580375411440264, + "grad_norm": 0.10633944720029831, + "learning_rate": 0.00026666633838716316, + "loss": 2.6115, + "step": 22191 + }, + { + "epoch": 0.6580671944963378, + "grad_norm": 0.11746422201395035, + "learning_rate": 0.00026662472600114985, + "loss": 2.6255, + "step": 22192 + }, + { + "epoch": 0.6580968478486493, + "grad_norm": 0.10628687590360641, + "learning_rate": 0.0002665831156817147, + "loss": 2.6297, + "step": 22193 + }, + { + "epoch": 0.6581265012009607, + "grad_norm": 0.09847036749124527, + "learning_rate": 0.0002665415074292261, + "loss": 2.6318, + "step": 22194 + }, + { + "epoch": 0.6581561545532723, + "grad_norm": 0.11103420704603195, + "learning_rate": 0.00026649990124405276, + "loss": 2.6699, + "step": 22195 + }, + { + "epoch": 0.6581858079055837, + "grad_norm": 0.09389982372522354, + "learning_rate": 0.00026645829712656263, + "loss": 2.629, + "step": 22196 + }, + { + "epoch": 0.6582154612578952, + "grad_norm": 0.12184861302375793, + "learning_rate": 0.00026641669507712417, + "loss": 2.6147, + "step": 22197 + }, + { + "epoch": 0.6582451146102067, + "grad_norm": 0.1044323742389679, + "learning_rate": 0.0002663750950961062, + "loss": 2.6188, + "step": 22198 + }, + { + "epoch": 0.6582747679625182, + "grad_norm": 0.1168469712138176, + "learning_rate": 0.0002663334971838768, + "loss": 2.6301, + "step": 22199 + }, + { + "epoch": 0.6583044213148297, + "grad_norm": 0.09311066567897797, + "learning_rate": 0.00026629190134080445, + "loss": 2.6084, + "step": 22200 + }, + { + "epoch": 0.6583340746671411, + "grad_norm": 0.12311634421348572, + "learning_rate": 0.0002662503075672574, + "loss": 2.6155, + "step": 22201 + }, + { + "epoch": 0.6583637280194526, + "grad_norm": 0.1226312667131424, + "learning_rate": 0.00026620871586360405, + "loss": 2.5898, + "step": 22202 + }, + { + "epoch": 0.6583933813717641, + "grad_norm": 0.1066674068570137, + "learning_rate": 0.0002661671262302126, + "loss": 2.6543, + "step": 22203 + }, + { + "epoch": 0.6584230347240756, + "grad_norm": 0.12193352729082108, + "learning_rate": 0.0002661255386674514, + "loss": 2.5829, + "step": 22204 + }, + { + "epoch": 0.658452688076387, + "grad_norm": 0.10644975304603577, + "learning_rate": 0.0002660839531756887, + "loss": 2.6231, + "step": 22205 + }, + { + "epoch": 0.6584823414286985, + "grad_norm": 0.1076878011226654, + "learning_rate": 0.0002660423697552929, + "loss": 2.6422, + "step": 22206 + }, + { + "epoch": 0.65851199478101, + "grad_norm": 0.097933329641819, + "learning_rate": 0.00026600078840663193, + "loss": 2.595, + "step": 22207 + }, + { + "epoch": 0.6585416481333215, + "grad_norm": 0.12282291799783707, + "learning_rate": 0.0002659592091300741, + "loss": 2.6075, + "step": 22208 + }, + { + "epoch": 0.6585713014856329, + "grad_norm": 0.12470046430826187, + "learning_rate": 0.00026591763192598773, + "loss": 2.6374, + "step": 22209 + }, + { + "epoch": 0.6586009548379445, + "grad_norm": 0.11197525262832642, + "learning_rate": 0.00026587605679474064, + "loss": 2.6345, + "step": 22210 + }, + { + "epoch": 0.6586306081902559, + "grad_norm": 0.09654504805803299, + "learning_rate": 0.00026583448373670147, + "loss": 2.6497, + "step": 22211 + }, + { + "epoch": 0.6586602615425674, + "grad_norm": 0.12422943860292435, + "learning_rate": 0.00026579291275223815, + "loss": 2.5976, + "step": 22212 + }, + { + "epoch": 0.6586899148948788, + "grad_norm": 0.1056484654545784, + "learning_rate": 0.0002657513438417187, + "loss": 2.6368, + "step": 22213 + }, + { + "epoch": 0.6587195682471904, + "grad_norm": 0.09881594777107239, + "learning_rate": 0.00026570977700551146, + "loss": 2.601, + "step": 22214 + }, + { + "epoch": 0.6587492215995018, + "grad_norm": 0.10900338739156723, + "learning_rate": 0.0002656682122439843, + "loss": 2.629, + "step": 22215 + }, + { + "epoch": 0.6587788749518133, + "grad_norm": 0.1040244922041893, + "learning_rate": 0.0002656266495575055, + "loss": 2.6162, + "step": 22216 + }, + { + "epoch": 0.6588085283041247, + "grad_norm": 0.09697767347097397, + "learning_rate": 0.0002655850889464428, + "loss": 2.6371, + "step": 22217 + }, + { + "epoch": 0.6588381816564363, + "grad_norm": 0.12573997676372528, + "learning_rate": 0.0002655435304111643, + "loss": 2.6519, + "step": 22218 + }, + { + "epoch": 0.6588678350087478, + "grad_norm": 0.11746788024902344, + "learning_rate": 0.0002655019739520381, + "loss": 2.621, + "step": 22219 + }, + { + "epoch": 0.6588974883610592, + "grad_norm": 0.11092493683099747, + "learning_rate": 0.0002654604195694322, + "loss": 2.6126, + "step": 22220 + }, + { + "epoch": 0.6589271417133707, + "grad_norm": 0.10534010082483292, + "learning_rate": 0.00026541886726371463, + "loss": 2.6159, + "step": 22221 + }, + { + "epoch": 0.6589567950656822, + "grad_norm": 0.10408506542444229, + "learning_rate": 0.00026537731703525316, + "loss": 2.6207, + "step": 22222 + }, + { + "epoch": 0.6589864484179937, + "grad_norm": 0.09355878084897995, + "learning_rate": 0.0002653357688844156, + "loss": 2.5985, + "step": 22223 + }, + { + "epoch": 0.6590161017703051, + "grad_norm": 0.11243169009685516, + "learning_rate": 0.00026529422281157037, + "loss": 2.6483, + "step": 22224 + }, + { + "epoch": 0.6590457551226166, + "grad_norm": 0.11946718394756317, + "learning_rate": 0.00026525267881708506, + "loss": 2.6431, + "step": 22225 + }, + { + "epoch": 0.6590754084749281, + "grad_norm": 0.10090040415525436, + "learning_rate": 0.00026521113690132747, + "loss": 2.6286, + "step": 22226 + }, + { + "epoch": 0.6591050618272396, + "grad_norm": 0.11190304160118103, + "learning_rate": 0.0002651695970646659, + "loss": 2.6376, + "step": 22227 + }, + { + "epoch": 0.659134715179551, + "grad_norm": 0.10738474875688553, + "learning_rate": 0.0002651280593074676, + "loss": 2.5868, + "step": 22228 + }, + { + "epoch": 0.6591643685318626, + "grad_norm": 0.09714789688587189, + "learning_rate": 0.0002650865236301006, + "loss": 2.6023, + "step": 22229 + }, + { + "epoch": 0.659194021884174, + "grad_norm": 0.10253982245922089, + "learning_rate": 0.0002650449900329328, + "loss": 2.6023, + "step": 22230 + }, + { + "epoch": 0.6592236752364855, + "grad_norm": 0.09851614385843277, + "learning_rate": 0.00026500345851633193, + "loss": 2.6389, + "step": 22231 + }, + { + "epoch": 0.6592533285887969, + "grad_norm": 0.10536061972379684, + "learning_rate": 0.00026496192908066584, + "loss": 2.6561, + "step": 22232 + }, + { + "epoch": 0.6592829819411085, + "grad_norm": 0.1116710901260376, + "learning_rate": 0.00026492040172630216, + "loss": 2.6424, + "step": 22233 + }, + { + "epoch": 0.6593126352934199, + "grad_norm": 0.11848420649766922, + "learning_rate": 0.00026487887645360866, + "loss": 2.6061, + "step": 22234 + }, + { + "epoch": 0.6593422886457314, + "grad_norm": 0.10993584990501404, + "learning_rate": 0.0002648373532629531, + "loss": 2.6491, + "step": 22235 + }, + { + "epoch": 0.6593719419980428, + "grad_norm": 0.12121792137622833, + "learning_rate": 0.0002647958321547029, + "loss": 2.6565, + "step": 22236 + }, + { + "epoch": 0.6594015953503544, + "grad_norm": 0.11163762956857681, + "learning_rate": 0.0002647543131292264, + "loss": 2.5954, + "step": 22237 + }, + { + "epoch": 0.6594312487026658, + "grad_norm": 0.11642669141292572, + "learning_rate": 0.00026471279618689057, + "loss": 2.6059, + "step": 22238 + }, + { + "epoch": 0.6594609020549773, + "grad_norm": 0.10052569210529327, + "learning_rate": 0.0002646712813280634, + "loss": 2.6307, + "step": 22239 + }, + { + "epoch": 0.6594905554072888, + "grad_norm": 0.12337595224380493, + "learning_rate": 0.00026462976855311243, + "loss": 2.6176, + "step": 22240 + }, + { + "epoch": 0.6595202087596003, + "grad_norm": 0.12894956767559052, + "learning_rate": 0.00026458825786240527, + "loss": 2.6145, + "step": 22241 + }, + { + "epoch": 0.6595498621119118, + "grad_norm": 0.13840827345848083, + "learning_rate": 0.00026454674925630945, + "loss": 2.6234, + "step": 22242 + }, + { + "epoch": 0.6595795154642232, + "grad_norm": 0.12472712248563766, + "learning_rate": 0.0002645052427351926, + "loss": 2.6048, + "step": 22243 + }, + { + "epoch": 0.6596091688165348, + "grad_norm": 0.10446533560752869, + "learning_rate": 0.0002644637382994223, + "loss": 2.6388, + "step": 22244 + }, + { + "epoch": 0.6596388221688462, + "grad_norm": 0.13374580442905426, + "learning_rate": 0.0002644222359493659, + "loss": 2.608, + "step": 22245 + }, + { + "epoch": 0.6596684755211577, + "grad_norm": 0.1118316799402237, + "learning_rate": 0.0002643807356853911, + "loss": 2.6375, + "step": 22246 + }, + { + "epoch": 0.6596981288734691, + "grad_norm": 0.1154794991016388, + "learning_rate": 0.00026433923750786536, + "loss": 2.63, + "step": 22247 + }, + { + "epoch": 0.6597277822257807, + "grad_norm": 0.13279734551906586, + "learning_rate": 0.0002642977414171561, + "loss": 2.6172, + "step": 22248 + }, + { + "epoch": 0.6597574355780921, + "grad_norm": 0.10601210594177246, + "learning_rate": 0.00026425624741363075, + "loss": 2.5715, + "step": 22249 + }, + { + "epoch": 0.6597870889304036, + "grad_norm": 0.10717257857322693, + "learning_rate": 0.0002642147554976568, + "loss": 2.6095, + "step": 22250 + }, + { + "epoch": 0.659816742282715, + "grad_norm": 0.10003150254487991, + "learning_rate": 0.00026417326566960175, + "loss": 2.6471, + "step": 22251 + }, + { + "epoch": 0.6598463956350266, + "grad_norm": 0.10624661296606064, + "learning_rate": 0.0002641317779298329, + "loss": 2.6361, + "step": 22252 + }, + { + "epoch": 0.659876048987338, + "grad_norm": 0.1069331169128418, + "learning_rate": 0.00026409029227871764, + "loss": 2.6574, + "step": 22253 + }, + { + "epoch": 0.6599057023396495, + "grad_norm": 0.11374600976705551, + "learning_rate": 0.0002640488087166233, + "loss": 2.5956, + "step": 22254 + }, + { + "epoch": 0.6599353556919609, + "grad_norm": 0.10921172797679901, + "learning_rate": 0.0002640073272439172, + "loss": 2.6413, + "step": 22255 + }, + { + "epoch": 0.6599650090442725, + "grad_norm": 0.11433357000350952, + "learning_rate": 0.0002639658478609668, + "loss": 2.6052, + "step": 22256 + }, + { + "epoch": 0.6599946623965839, + "grad_norm": 0.11406324803829193, + "learning_rate": 0.00026392437056813934, + "loss": 2.6079, + "step": 22257 + }, + { + "epoch": 0.6600243157488954, + "grad_norm": 0.11700209975242615, + "learning_rate": 0.000263882895365802, + "loss": 2.6377, + "step": 22258 + }, + { + "epoch": 0.6600539691012068, + "grad_norm": 0.12641416490077972, + "learning_rate": 0.0002638414222543223, + "loss": 2.6404, + "step": 22259 + }, + { + "epoch": 0.6600836224535184, + "grad_norm": 0.12031634896993637, + "learning_rate": 0.00026379995123406726, + "loss": 2.6453, + "step": 22260 + }, + { + "epoch": 0.6601132758058299, + "grad_norm": 0.11050312221050262, + "learning_rate": 0.0002637584823054044, + "loss": 2.5873, + "step": 22261 + }, + { + "epoch": 0.6601429291581413, + "grad_norm": 0.11192787438631058, + "learning_rate": 0.00026371701546870033, + "loss": 2.6175, + "step": 22262 + }, + { + "epoch": 0.6601725825104529, + "grad_norm": 0.1120164766907692, + "learning_rate": 0.0002636755507243228, + "loss": 2.6264, + "step": 22263 + }, + { + "epoch": 0.6602022358627643, + "grad_norm": 0.10612834244966507, + "learning_rate": 0.0002636340880726389, + "loss": 2.6332, + "step": 22264 + }, + { + "epoch": 0.6602318892150758, + "grad_norm": 0.11040761321783066, + "learning_rate": 0.00026359262751401573, + "loss": 2.616, + "step": 22265 + }, + { + "epoch": 0.6602615425673872, + "grad_norm": 0.11863245069980621, + "learning_rate": 0.00026355116904882035, + "loss": 2.6091, + "step": 22266 + }, + { + "epoch": 0.6602911959196988, + "grad_norm": 0.10393321514129639, + "learning_rate": 0.0002635097126774201, + "loss": 2.6184, + "step": 22267 + }, + { + "epoch": 0.6603208492720102, + "grad_norm": 0.11802402138710022, + "learning_rate": 0.0002634682584001818, + "loss": 2.5935, + "step": 22268 + }, + { + "epoch": 0.6603505026243217, + "grad_norm": 0.10005193948745728, + "learning_rate": 0.0002634268062174727, + "loss": 2.6303, + "step": 22269 + }, + { + "epoch": 0.6603801559766331, + "grad_norm": 0.11391864717006683, + "learning_rate": 0.0002633853561296599, + "loss": 2.6015, + "step": 22270 + }, + { + "epoch": 0.6604098093289447, + "grad_norm": 0.1095435842871666, + "learning_rate": 0.0002633439081371105, + "loss": 2.6117, + "step": 22271 + }, + { + "epoch": 0.6604394626812561, + "grad_norm": 0.10167567431926727, + "learning_rate": 0.0002633024622401912, + "loss": 2.6352, + "step": 22272 + }, + { + "epoch": 0.6604691160335676, + "grad_norm": 0.10425544530153275, + "learning_rate": 0.0002632610184392693, + "loss": 2.6145, + "step": 22273 + }, + { + "epoch": 0.660498769385879, + "grad_norm": 0.10779891163110733, + "learning_rate": 0.0002632195767347117, + "loss": 2.6279, + "step": 22274 + }, + { + "epoch": 0.6605284227381906, + "grad_norm": 0.10846903175115585, + "learning_rate": 0.0002631781371268852, + "loss": 2.6424, + "step": 22275 + }, + { + "epoch": 0.660558076090502, + "grad_norm": 0.1019304171204567, + "learning_rate": 0.00026313669961615713, + "loss": 2.6108, + "step": 22276 + }, + { + "epoch": 0.6605877294428135, + "grad_norm": 0.11392951756715775, + "learning_rate": 0.0002630952642028942, + "loss": 2.6204, + "step": 22277 + }, + { + "epoch": 0.6606173827951249, + "grad_norm": 0.10562099516391754, + "learning_rate": 0.00026305383088746345, + "loss": 2.5967, + "step": 22278 + }, + { + "epoch": 0.6606470361474365, + "grad_norm": 0.11150159686803818, + "learning_rate": 0.0002630123996702316, + "loss": 2.627, + "step": 22279 + }, + { + "epoch": 0.6606766894997479, + "grad_norm": 0.11145395040512085, + "learning_rate": 0.0002629709705515657, + "loss": 2.6184, + "step": 22280 + }, + { + "epoch": 0.6607063428520594, + "grad_norm": 0.1081438735127449, + "learning_rate": 0.00026292954353183257, + "loss": 2.6122, + "step": 22281 + }, + { + "epoch": 0.660735996204371, + "grad_norm": 0.10234750062227249, + "learning_rate": 0.00026288811861139915, + "loss": 2.5671, + "step": 22282 + }, + { + "epoch": 0.6607656495566824, + "grad_norm": 0.08917365223169327, + "learning_rate": 0.00026284669579063204, + "loss": 2.6133, + "step": 22283 + }, + { + "epoch": 0.6607953029089939, + "grad_norm": 0.12293307483196259, + "learning_rate": 0.00026280527506989803, + "loss": 2.6167, + "step": 22284 + }, + { + "epoch": 0.6608249562613053, + "grad_norm": 0.10533967614173889, + "learning_rate": 0.00026276385644956405, + "loss": 2.5863, + "step": 22285 + }, + { + "epoch": 0.6608546096136169, + "grad_norm": 0.11291911453008652, + "learning_rate": 0.0002627224399299969, + "loss": 2.6179, + "step": 22286 + }, + { + "epoch": 0.6608842629659283, + "grad_norm": 0.10939971357584, + "learning_rate": 0.00026268102551156325, + "loss": 2.6224, + "step": 22287 + }, + { + "epoch": 0.6609139163182398, + "grad_norm": 0.10839579254388809, + "learning_rate": 0.00026263961319462957, + "loss": 2.6234, + "step": 22288 + }, + { + "epoch": 0.6609435696705512, + "grad_norm": 0.10588214546442032, + "learning_rate": 0.0002625982029795632, + "loss": 2.591, + "step": 22289 + }, + { + "epoch": 0.6609732230228628, + "grad_norm": 0.11348474025726318, + "learning_rate": 0.0002625567948667304, + "loss": 2.6549, + "step": 22290 + }, + { + "epoch": 0.6610028763751742, + "grad_norm": 0.10798152536153793, + "learning_rate": 0.00026251538885649795, + "loss": 2.6316, + "step": 22291 + }, + { + "epoch": 0.6610325297274857, + "grad_norm": 0.11377634108066559, + "learning_rate": 0.0002624739849492327, + "loss": 2.5745, + "step": 22292 + }, + { + "epoch": 0.6610621830797971, + "grad_norm": 0.1025548055768013, + "learning_rate": 0.0002624325831453009, + "loss": 2.5905, + "step": 22293 + }, + { + "epoch": 0.6610918364321087, + "grad_norm": 0.1114225909113884, + "learning_rate": 0.00026239118344506936, + "loss": 2.6226, + "step": 22294 + }, + { + "epoch": 0.6611214897844201, + "grad_norm": 0.0923381820321083, + "learning_rate": 0.00026234978584890466, + "loss": 2.5838, + "step": 22295 + }, + { + "epoch": 0.6611511431367316, + "grad_norm": 0.12192219495773315, + "learning_rate": 0.00026230839035717334, + "loss": 2.6642, + "step": 22296 + }, + { + "epoch": 0.661180796489043, + "grad_norm": 0.11737503111362457, + "learning_rate": 0.00026226699697024213, + "loss": 2.6087, + "step": 22297 + }, + { + "epoch": 0.6612104498413546, + "grad_norm": 0.12150230258703232, + "learning_rate": 0.00026222560568847745, + "loss": 2.6591, + "step": 22298 + }, + { + "epoch": 0.661240103193666, + "grad_norm": 0.09982438385486603, + "learning_rate": 0.0002621842165122458, + "loss": 2.6175, + "step": 22299 + }, + { + "epoch": 0.6612697565459775, + "grad_norm": 0.12186183035373688, + "learning_rate": 0.0002621428294419137, + "loss": 2.6014, + "step": 22300 + }, + { + "epoch": 0.661299409898289, + "grad_norm": 0.10517044365406036, + "learning_rate": 0.0002621014444778476, + "loss": 2.603, + "step": 22301 + }, + { + "epoch": 0.6613290632506005, + "grad_norm": 0.11091716587543488, + "learning_rate": 0.00026206006162041406, + "loss": 2.6603, + "step": 22302 + }, + { + "epoch": 0.661358716602912, + "grad_norm": 0.10298091173171997, + "learning_rate": 0.00026201868086997985, + "loss": 2.6449, + "step": 22303 + }, + { + "epoch": 0.6613883699552234, + "grad_norm": 0.1116664931178093, + "learning_rate": 0.00026197730222691086, + "loss": 2.6389, + "step": 22304 + }, + { + "epoch": 0.661418023307535, + "grad_norm": 0.10214363038539886, + "learning_rate": 0.00026193592569157367, + "loss": 2.6418, + "step": 22305 + }, + { + "epoch": 0.6614476766598464, + "grad_norm": 0.10475169867277145, + "learning_rate": 0.0002618945512643348, + "loss": 2.6172, + "step": 22306 + }, + { + "epoch": 0.6614773300121579, + "grad_norm": 0.10202667117118835, + "learning_rate": 0.0002618531789455605, + "loss": 2.6558, + "step": 22307 + }, + { + "epoch": 0.6615069833644693, + "grad_norm": 0.11706864088773727, + "learning_rate": 0.0002618118087356171, + "loss": 2.6445, + "step": 22308 + }, + { + "epoch": 0.6615366367167809, + "grad_norm": 0.10845436155796051, + "learning_rate": 0.0002617704406348711, + "loss": 2.6322, + "step": 22309 + }, + { + "epoch": 0.6615662900690923, + "grad_norm": 0.11847604066133499, + "learning_rate": 0.0002617290746436888, + "loss": 2.6331, + "step": 22310 + }, + { + "epoch": 0.6615959434214038, + "grad_norm": 0.11347862333059311, + "learning_rate": 0.0002616877107624363, + "loss": 2.6145, + "step": 22311 + }, + { + "epoch": 0.6616255967737152, + "grad_norm": 0.11499963700771332, + "learning_rate": 0.0002616463489914801, + "loss": 2.6054, + "step": 22312 + }, + { + "epoch": 0.6616552501260268, + "grad_norm": 0.09637509286403656, + "learning_rate": 0.0002616049893311864, + "loss": 2.617, + "step": 22313 + }, + { + "epoch": 0.6616849034783382, + "grad_norm": 0.12070321291685104, + "learning_rate": 0.00026156363178192146, + "loss": 2.6517, + "step": 22314 + }, + { + "epoch": 0.6617145568306497, + "grad_norm": 0.11271844059228897, + "learning_rate": 0.00026152227634405146, + "loss": 2.6114, + "step": 22315 + }, + { + "epoch": 0.6617442101829611, + "grad_norm": 0.11131057143211365, + "learning_rate": 0.0002614809230179426, + "loss": 2.6429, + "step": 22316 + }, + { + "epoch": 0.6617738635352727, + "grad_norm": 0.11962061375379562, + "learning_rate": 0.00026143957180396114, + "loss": 2.656, + "step": 22317 + }, + { + "epoch": 0.6618035168875841, + "grad_norm": 0.0950026884675026, + "learning_rate": 0.00026139822270247325, + "loss": 2.6194, + "step": 22318 + }, + { + "epoch": 0.6618331702398956, + "grad_norm": 0.11808927357196808, + "learning_rate": 0.00026135687571384505, + "loss": 2.6189, + "step": 22319 + }, + { + "epoch": 0.661862823592207, + "grad_norm": 0.11471544951200485, + "learning_rate": 0.0002613155308384426, + "loss": 2.5835, + "step": 22320 + }, + { + "epoch": 0.6618924769445186, + "grad_norm": 0.1002262756228447, + "learning_rate": 0.00026127418807663216, + "loss": 2.6386, + "step": 22321 + }, + { + "epoch": 0.66192213029683, + "grad_norm": 0.09347634017467499, + "learning_rate": 0.00026123284742877973, + "loss": 2.615, + "step": 22322 + }, + { + "epoch": 0.6619517836491415, + "grad_norm": 0.10430938750505447, + "learning_rate": 0.00026119150889525143, + "loss": 2.618, + "step": 22323 + }, + { + "epoch": 0.6619814370014531, + "grad_norm": 0.11231238394975662, + "learning_rate": 0.0002611501724764134, + "loss": 2.6159, + "step": 22324 + }, + { + "epoch": 0.6620110903537645, + "grad_norm": 0.0938657596707344, + "learning_rate": 0.0002611088381726315, + "loss": 2.61, + "step": 22325 + }, + { + "epoch": 0.662040743706076, + "grad_norm": 0.10383179038763046, + "learning_rate": 0.00026106750598427187, + "loss": 2.6327, + "step": 22326 + }, + { + "epoch": 0.6620703970583874, + "grad_norm": 0.09710520505905151, + "learning_rate": 0.00026102617591170044, + "loss": 2.6303, + "step": 22327 + }, + { + "epoch": 0.662100050410699, + "grad_norm": 0.10170305520296097, + "learning_rate": 0.00026098484795528327, + "loss": 2.6039, + "step": 22328 + }, + { + "epoch": 0.6621297037630104, + "grad_norm": 0.11241468787193298, + "learning_rate": 0.0002609435221153863, + "loss": 2.6582, + "step": 22329 + }, + { + "epoch": 0.6621593571153219, + "grad_norm": 0.09368862956762314, + "learning_rate": 0.0002609021983923755, + "loss": 2.6386, + "step": 22330 + }, + { + "epoch": 0.6621890104676333, + "grad_norm": 0.1022704616189003, + "learning_rate": 0.00026086087678661675, + "loss": 2.6445, + "step": 22331 + }, + { + "epoch": 0.6622186638199449, + "grad_norm": 0.1011161059141159, + "learning_rate": 0.00026081955729847595, + "loss": 2.6262, + "step": 22332 + }, + { + "epoch": 0.6622483171722563, + "grad_norm": 0.10273583233356476, + "learning_rate": 0.00026077823992831905, + "loss": 2.6322, + "step": 22333 + }, + { + "epoch": 0.6622779705245678, + "grad_norm": 0.10222788155078888, + "learning_rate": 0.00026073692467651187, + "loss": 2.6151, + "step": 22334 + }, + { + "epoch": 0.6623076238768792, + "grad_norm": 0.10760590434074402, + "learning_rate": 0.00026069561154342037, + "loss": 2.64, + "step": 22335 + }, + { + "epoch": 0.6623372772291908, + "grad_norm": 0.10647734254598618, + "learning_rate": 0.0002606543005294103, + "loss": 2.6294, + "step": 22336 + }, + { + "epoch": 0.6623669305815022, + "grad_norm": 0.09043344855308533, + "learning_rate": 0.00026061299163484766, + "loss": 2.5794, + "step": 22337 + }, + { + "epoch": 0.6623965839338137, + "grad_norm": 0.10520360618829727, + "learning_rate": 0.0002605716848600978, + "loss": 2.6534, + "step": 22338 + }, + { + "epoch": 0.6624262372861252, + "grad_norm": 0.1039215475320816, + "learning_rate": 0.0002605303802055268, + "loss": 2.5905, + "step": 22339 + }, + { + "epoch": 0.6624558906384367, + "grad_norm": 0.09886699914932251, + "learning_rate": 0.00026048907767150023, + "loss": 2.5876, + "step": 22340 + }, + { + "epoch": 0.6624855439907481, + "grad_norm": 0.10391674190759659, + "learning_rate": 0.0002604477772583842, + "loss": 2.6049, + "step": 22341 + }, + { + "epoch": 0.6625151973430596, + "grad_norm": 0.10846368223428726, + "learning_rate": 0.00026040647896654413, + "loss": 2.6417, + "step": 22342 + }, + { + "epoch": 0.6625448506953711, + "grad_norm": 0.10840923339128494, + "learning_rate": 0.0002603651827963459, + "loss": 2.6137, + "step": 22343 + }, + { + "epoch": 0.6625745040476826, + "grad_norm": 0.11075456440448761, + "learning_rate": 0.00026032388874815506, + "loss": 2.6151, + "step": 22344 + }, + { + "epoch": 0.6626041573999941, + "grad_norm": 0.09646633267402649, + "learning_rate": 0.00026028259682233735, + "loss": 2.6216, + "step": 22345 + }, + { + "epoch": 0.6626338107523055, + "grad_norm": 0.10870032757520676, + "learning_rate": 0.0002602413070192584, + "loss": 2.6148, + "step": 22346 + }, + { + "epoch": 0.6626634641046171, + "grad_norm": 0.10121766477823257, + "learning_rate": 0.00026020001933928406, + "loss": 2.6251, + "step": 22347 + }, + { + "epoch": 0.6626931174569285, + "grad_norm": 0.11147367209196091, + "learning_rate": 0.0002601587337827794, + "loss": 2.6278, + "step": 22348 + }, + { + "epoch": 0.66272277080924, + "grad_norm": 0.1015043631196022, + "learning_rate": 0.0002601174503501104, + "loss": 2.6142, + "step": 22349 + }, + { + "epoch": 0.6627524241615514, + "grad_norm": 0.11146991699934006, + "learning_rate": 0.00026007616904164254, + "loss": 2.6348, + "step": 22350 + }, + { + "epoch": 0.662782077513863, + "grad_norm": 0.10478717088699341, + "learning_rate": 0.00026003488985774145, + "loss": 2.6163, + "step": 22351 + }, + { + "epoch": 0.6628117308661744, + "grad_norm": 0.11903761327266693, + "learning_rate": 0.00025999361279877253, + "loss": 2.6132, + "step": 22352 + }, + { + "epoch": 0.6628413842184859, + "grad_norm": 0.10376105457544327, + "learning_rate": 0.0002599523378651012, + "loss": 2.6271, + "step": 22353 + }, + { + "epoch": 0.6628710375707974, + "grad_norm": 0.10813020914793015, + "learning_rate": 0.00025991106505709327, + "loss": 2.6133, + "step": 22354 + }, + { + "epoch": 0.6629006909231089, + "grad_norm": 0.10983601212501526, + "learning_rate": 0.00025986979437511406, + "loss": 2.6386, + "step": 22355 + }, + { + "epoch": 0.6629303442754203, + "grad_norm": 0.11622507125139236, + "learning_rate": 0.0002598285258195291, + "loss": 2.6695, + "step": 22356 + }, + { + "epoch": 0.6629599976277318, + "grad_norm": 0.1242053210735321, + "learning_rate": 0.0002597872593907037, + "loss": 2.6373, + "step": 22357 + }, + { + "epoch": 0.6629896509800433, + "grad_norm": 0.10007182508707047, + "learning_rate": 0.00025974599508900363, + "loss": 2.6832, + "step": 22358 + }, + { + "epoch": 0.6630193043323548, + "grad_norm": 0.11477240175008774, + "learning_rate": 0.0002597047329147938, + "loss": 2.62, + "step": 22359 + }, + { + "epoch": 0.6630489576846662, + "grad_norm": 0.10762074589729309, + "learning_rate": 0.0002596634728684397, + "loss": 2.5965, + "step": 22360 + }, + { + "epoch": 0.6630786110369777, + "grad_norm": 0.11937302350997925, + "learning_rate": 0.0002596222149503069, + "loss": 2.6455, + "step": 22361 + }, + { + "epoch": 0.6631082643892892, + "grad_norm": 0.11826815456151962, + "learning_rate": 0.0002595809591607606, + "loss": 2.6017, + "step": 22362 + }, + { + "epoch": 0.6631379177416007, + "grad_norm": 0.10895930975675583, + "learning_rate": 0.00025953970550016625, + "loss": 2.6418, + "step": 22363 + }, + { + "epoch": 0.6631675710939121, + "grad_norm": 0.10970553010702133, + "learning_rate": 0.00025949845396888905, + "loss": 2.6224, + "step": 22364 + }, + { + "epoch": 0.6631972244462236, + "grad_norm": 0.11587972939014435, + "learning_rate": 0.00025945720456729425, + "loss": 2.6014, + "step": 22365 + }, + { + "epoch": 0.6632268777985352, + "grad_norm": 0.11182382702827454, + "learning_rate": 0.00025941595729574705, + "loss": 2.6294, + "step": 22366 + }, + { + "epoch": 0.6632565311508466, + "grad_norm": 0.11652961373329163, + "learning_rate": 0.0002593747121546131, + "loss": 2.5908, + "step": 22367 + }, + { + "epoch": 0.6632861845031581, + "grad_norm": 0.1092594712972641, + "learning_rate": 0.0002593334691442575, + "loss": 2.6012, + "step": 22368 + }, + { + "epoch": 0.6633158378554695, + "grad_norm": 0.11349377781152725, + "learning_rate": 0.00025929222826504515, + "loss": 2.6361, + "step": 22369 + }, + { + "epoch": 0.6633454912077811, + "grad_norm": 0.1047976016998291, + "learning_rate": 0.0002592509895173415, + "loss": 2.604, + "step": 22370 + }, + { + "epoch": 0.6633751445600925, + "grad_norm": 0.11364058405160904, + "learning_rate": 0.00025920975290151163, + "loss": 2.6169, + "step": 22371 + }, + { + "epoch": 0.663404797912404, + "grad_norm": 0.10732755064964294, + "learning_rate": 0.0002591685184179207, + "loss": 2.6239, + "step": 22372 + }, + { + "epoch": 0.6634344512647155, + "grad_norm": 0.10728491097688675, + "learning_rate": 0.0002591272860669338, + "loss": 2.6406, + "step": 22373 + }, + { + "epoch": 0.663464104617027, + "grad_norm": 0.10115377604961395, + "learning_rate": 0.00025908605584891626, + "loss": 2.6171, + "step": 22374 + }, + { + "epoch": 0.6634937579693384, + "grad_norm": 0.10296467691659927, + "learning_rate": 0.00025904482776423297, + "loss": 2.6516, + "step": 22375 + }, + { + "epoch": 0.6635234113216499, + "grad_norm": 0.09252564609050751, + "learning_rate": 0.00025900360181324914, + "loss": 2.6287, + "step": 22376 + }, + { + "epoch": 0.6635530646739614, + "grad_norm": 0.11005863547325134, + "learning_rate": 0.00025896237799632977, + "loss": 2.6293, + "step": 22377 + }, + { + "epoch": 0.6635827180262729, + "grad_norm": 0.10833830386400223, + "learning_rate": 0.00025892115631383987, + "loss": 2.6067, + "step": 22378 + }, + { + "epoch": 0.6636123713785843, + "grad_norm": 0.10279304534196854, + "learning_rate": 0.0002588799367661446, + "loss": 2.6266, + "step": 22379 + }, + { + "epoch": 0.6636420247308958, + "grad_norm": 0.10559112578630447, + "learning_rate": 0.0002588387193536088, + "loss": 2.6245, + "step": 22380 + }, + { + "epoch": 0.6636716780832073, + "grad_norm": 0.10730601102113724, + "learning_rate": 0.0002587975040765976, + "loss": 2.6, + "step": 22381 + }, + { + "epoch": 0.6637013314355188, + "grad_norm": 0.10721217095851898, + "learning_rate": 0.0002587562909354758, + "loss": 2.6166, + "step": 22382 + }, + { + "epoch": 0.6637309847878302, + "grad_norm": 0.11037822812795639, + "learning_rate": 0.00025871507993060854, + "loss": 2.6268, + "step": 22383 + }, + { + "epoch": 0.6637606381401417, + "grad_norm": 0.11063236743211746, + "learning_rate": 0.0002586738710623606, + "loss": 2.6251, + "step": 22384 + }, + { + "epoch": 0.6637902914924533, + "grad_norm": 0.1183369979262352, + "learning_rate": 0.00025863266433109704, + "loss": 2.6029, + "step": 22385 + }, + { + "epoch": 0.6638199448447647, + "grad_norm": 0.10381865501403809, + "learning_rate": 0.00025859145973718264, + "loss": 2.6267, + "step": 22386 + }, + { + "epoch": 0.6638495981970762, + "grad_norm": 0.12108439952135086, + "learning_rate": 0.00025855025728098224, + "loss": 2.6212, + "step": 22387 + }, + { + "epoch": 0.6638792515493877, + "grad_norm": 0.12546297907829285, + "learning_rate": 0.0002585090569628609, + "loss": 2.6501, + "step": 22388 + }, + { + "epoch": 0.6639089049016992, + "grad_norm": 0.10393227636814117, + "learning_rate": 0.00025846785878318315, + "loss": 2.6449, + "step": 22389 + }, + { + "epoch": 0.6639385582540106, + "grad_norm": 0.12087499350309372, + "learning_rate": 0.000258426662742314, + "loss": 2.6077, + "step": 22390 + }, + { + "epoch": 0.6639682116063221, + "grad_norm": 0.11677592247724533, + "learning_rate": 0.0002583854688406183, + "loss": 2.6412, + "step": 22391 + }, + { + "epoch": 0.6639978649586336, + "grad_norm": 0.10696956515312195, + "learning_rate": 0.00025834427707846063, + "loss": 2.6281, + "step": 22392 + }, + { + "epoch": 0.6640275183109451, + "grad_norm": 0.1228206679224968, + "learning_rate": 0.000258303087456206, + "loss": 2.6508, + "step": 22393 + }, + { + "epoch": 0.6640571716632565, + "grad_norm": 0.11928810924291611, + "learning_rate": 0.0002582618999742189, + "loss": 2.6365, + "step": 22394 + }, + { + "epoch": 0.664086825015568, + "grad_norm": 0.11668863892555237, + "learning_rate": 0.00025822071463286426, + "loss": 2.6095, + "step": 22395 + }, + { + "epoch": 0.6641164783678795, + "grad_norm": 0.10253192484378815, + "learning_rate": 0.0002581795314325066, + "loss": 2.6123, + "step": 22396 + }, + { + "epoch": 0.664146131720191, + "grad_norm": 0.13462935388088226, + "learning_rate": 0.00025813835037351074, + "loss": 2.6153, + "step": 22397 + }, + { + "epoch": 0.6641757850725024, + "grad_norm": 0.11859564483165741, + "learning_rate": 0.00025809717145624134, + "loss": 2.6324, + "step": 22398 + }, + { + "epoch": 0.6642054384248139, + "grad_norm": 0.10181117057800293, + "learning_rate": 0.000258055994681063, + "loss": 2.651, + "step": 22399 + }, + { + "epoch": 0.6642350917771254, + "grad_norm": 0.13395477831363678, + "learning_rate": 0.0002580148200483403, + "loss": 2.6421, + "step": 22400 + }, + { + "epoch": 0.6642647451294369, + "grad_norm": 0.1201833188533783, + "learning_rate": 0.000257973647558438, + "loss": 2.5858, + "step": 22401 + }, + { + "epoch": 0.6642943984817483, + "grad_norm": 0.11950338631868362, + "learning_rate": 0.00025793247721172055, + "loss": 2.6069, + "step": 22402 + }, + { + "epoch": 0.6643240518340598, + "grad_norm": 0.12902268767356873, + "learning_rate": 0.00025789130900855274, + "loss": 2.6036, + "step": 22403 + }, + { + "epoch": 0.6643537051863713, + "grad_norm": 0.11816160380840302, + "learning_rate": 0.00025785014294929856, + "loss": 2.6053, + "step": 22404 + }, + { + "epoch": 0.6643833585386828, + "grad_norm": 0.10221114754676819, + "learning_rate": 0.0002578089790343231, + "loss": 2.6282, + "step": 22405 + }, + { + "epoch": 0.6644130118909943, + "grad_norm": 0.1157468855381012, + "learning_rate": 0.0002577678172639907, + "loss": 2.6747, + "step": 22406 + }, + { + "epoch": 0.6644426652433058, + "grad_norm": 0.10505346208810806, + "learning_rate": 0.00025772665763866586, + "loss": 2.6261, + "step": 22407 + }, + { + "epoch": 0.6644723185956173, + "grad_norm": 0.10210072994232178, + "learning_rate": 0.00025768550015871307, + "loss": 2.6205, + "step": 22408 + }, + { + "epoch": 0.6645019719479287, + "grad_norm": 0.0973227396607399, + "learning_rate": 0.0002576443448244968, + "loss": 2.6027, + "step": 22409 + }, + { + "epoch": 0.6645316253002402, + "grad_norm": 0.10916511714458466, + "learning_rate": 0.0002576031916363815, + "loss": 2.6183, + "step": 22410 + }, + { + "epoch": 0.6645612786525517, + "grad_norm": 0.10358160734176636, + "learning_rate": 0.00025756204059473143, + "loss": 2.64, + "step": 22411 + }, + { + "epoch": 0.6645909320048632, + "grad_norm": 0.10869524627923965, + "learning_rate": 0.0002575208916999111, + "loss": 2.6516, + "step": 22412 + }, + { + "epoch": 0.6646205853571746, + "grad_norm": 0.1123858317732811, + "learning_rate": 0.00025747974495228515, + "loss": 2.5948, + "step": 22413 + }, + { + "epoch": 0.6646502387094861, + "grad_norm": 0.0990864560008049, + "learning_rate": 0.0002574386003522175, + "loss": 2.6202, + "step": 22414 + }, + { + "epoch": 0.6646798920617976, + "grad_norm": 0.11464561522006989, + "learning_rate": 0.00025739745790007265, + "loss": 2.6144, + "step": 22415 + }, + { + "epoch": 0.6647095454141091, + "grad_norm": 0.10527736693620682, + "learning_rate": 0.000257356317596215, + "loss": 2.6246, + "step": 22416 + }, + { + "epoch": 0.6647391987664205, + "grad_norm": 0.09622597694396973, + "learning_rate": 0.0002573151794410086, + "loss": 2.6352, + "step": 22417 + }, + { + "epoch": 0.664768852118732, + "grad_norm": 0.13045604526996613, + "learning_rate": 0.00025727404343481807, + "loss": 2.6073, + "step": 22418 + }, + { + "epoch": 0.6647985054710435, + "grad_norm": 0.11724630743265152, + "learning_rate": 0.0002572329095780076, + "loss": 2.6858, + "step": 22419 + }, + { + "epoch": 0.664828158823355, + "grad_norm": 0.11254095286130905, + "learning_rate": 0.00025719177787094136, + "loss": 2.588, + "step": 22420 + }, + { + "epoch": 0.6648578121756664, + "grad_norm": 0.10971163213253021, + "learning_rate": 0.0002571506483139836, + "loss": 2.6381, + "step": 22421 + }, + { + "epoch": 0.664887465527978, + "grad_norm": 0.12691453099250793, + "learning_rate": 0.00025710952090749855, + "loss": 2.6448, + "step": 22422 + }, + { + "epoch": 0.6649171188802894, + "grad_norm": 0.10358108580112457, + "learning_rate": 0.0002570683956518506, + "loss": 2.6202, + "step": 22423 + }, + { + "epoch": 0.6649467722326009, + "grad_norm": 0.1215268075466156, + "learning_rate": 0.0002570272725474035, + "loss": 2.6018, + "step": 22424 + }, + { + "epoch": 0.6649764255849123, + "grad_norm": 0.12751831114292145, + "learning_rate": 0.0002569861515945216, + "loss": 2.6119, + "step": 22425 + }, + { + "epoch": 0.6650060789372239, + "grad_norm": 0.09844792634248734, + "learning_rate": 0.0002569450327935691, + "loss": 2.6193, + "step": 22426 + }, + { + "epoch": 0.6650357322895354, + "grad_norm": 0.13180673122406006, + "learning_rate": 0.00025690391614490994, + "loss": 2.6055, + "step": 22427 + }, + { + "epoch": 0.6650653856418468, + "grad_norm": 0.13878054916858673, + "learning_rate": 0.0002568628016489084, + "loss": 2.6521, + "step": 22428 + }, + { + "epoch": 0.6650950389941583, + "grad_norm": 0.09604370594024658, + "learning_rate": 0.00025682168930592843, + "loss": 2.6486, + "step": 22429 + }, + { + "epoch": 0.6651246923464698, + "grad_norm": 0.1145537793636322, + "learning_rate": 0.000256780579116334, + "loss": 2.6129, + "step": 22430 + }, + { + "epoch": 0.6651543456987813, + "grad_norm": 0.10873125493526459, + "learning_rate": 0.0002567394710804895, + "loss": 2.6181, + "step": 22431 + }, + { + "epoch": 0.6651839990510927, + "grad_norm": 0.10875293612480164, + "learning_rate": 0.0002566983651987587, + "loss": 2.6334, + "step": 22432 + }, + { + "epoch": 0.6652136524034042, + "grad_norm": 0.13043926656246185, + "learning_rate": 0.00025665726147150567, + "loss": 2.6381, + "step": 22433 + }, + { + "epoch": 0.6652433057557157, + "grad_norm": 0.10746541619300842, + "learning_rate": 0.0002566161598990945, + "loss": 2.6151, + "step": 22434 + }, + { + "epoch": 0.6652729591080272, + "grad_norm": 0.10800568014383316, + "learning_rate": 0.00025657506048188885, + "loss": 2.6332, + "step": 22435 + }, + { + "epoch": 0.6653026124603386, + "grad_norm": 0.10825568437576294, + "learning_rate": 0.0002565339632202528, + "loss": 2.6031, + "step": 22436 + }, + { + "epoch": 0.6653322658126501, + "grad_norm": 0.10369972139596939, + "learning_rate": 0.00025649286811455033, + "loss": 2.6119, + "step": 22437 + }, + { + "epoch": 0.6653619191649616, + "grad_norm": 0.11658517271280289, + "learning_rate": 0.0002564517751651453, + "loss": 2.6154, + "step": 22438 + }, + { + "epoch": 0.6653915725172731, + "grad_norm": 0.10953296720981598, + "learning_rate": 0.0002564106843724016, + "loss": 2.6417, + "step": 22439 + }, + { + "epoch": 0.6654212258695845, + "grad_norm": 0.10393161326646805, + "learning_rate": 0.0002563695957366831, + "loss": 2.6495, + "step": 22440 + }, + { + "epoch": 0.665450879221896, + "grad_norm": 0.10846609622240067, + "learning_rate": 0.0002563285092583537, + "loss": 2.6076, + "step": 22441 + }, + { + "epoch": 0.6654805325742075, + "grad_norm": 0.10621508210897446, + "learning_rate": 0.00025628742493777714, + "loss": 2.5972, + "step": 22442 + }, + { + "epoch": 0.665510185926519, + "grad_norm": 0.10740530490875244, + "learning_rate": 0.000256246342775317, + "loss": 2.6127, + "step": 22443 + }, + { + "epoch": 0.6655398392788304, + "grad_norm": 0.1101139560341835, + "learning_rate": 0.0002562052627713378, + "loss": 2.61, + "step": 22444 + }, + { + "epoch": 0.665569492631142, + "grad_norm": 0.09330954402685165, + "learning_rate": 0.00025616418492620263, + "loss": 2.6048, + "step": 22445 + }, + { + "epoch": 0.6655991459834534, + "grad_norm": 0.12891235947608948, + "learning_rate": 0.0002561231092402755, + "loss": 2.6269, + "step": 22446 + }, + { + "epoch": 0.6656287993357649, + "grad_norm": 0.09128525853157043, + "learning_rate": 0.00025608203571392, + "loss": 2.6202, + "step": 22447 + }, + { + "epoch": 0.6656584526880764, + "grad_norm": 0.10551571100950241, + "learning_rate": 0.00025604096434750004, + "loss": 2.6663, + "step": 22448 + }, + { + "epoch": 0.6656881060403879, + "grad_norm": 0.11267050355672836, + "learning_rate": 0.0002559998951413792, + "loss": 2.633, + "step": 22449 + }, + { + "epoch": 0.6657177593926994, + "grad_norm": 0.11302530765533447, + "learning_rate": 0.00025595882809592113, + "loss": 2.6197, + "step": 22450 + }, + { + "epoch": 0.6657474127450108, + "grad_norm": 0.11755642294883728, + "learning_rate": 0.00025591776321148954, + "loss": 2.6262, + "step": 22451 + }, + { + "epoch": 0.6657770660973223, + "grad_norm": 0.12902383506298065, + "learning_rate": 0.000255876700488448, + "loss": 2.6182, + "step": 22452 + }, + { + "epoch": 0.6658067194496338, + "grad_norm": 0.10748454928398132, + "learning_rate": 0.0002558356399271603, + "loss": 2.6518, + "step": 22453 + }, + { + "epoch": 0.6658363728019453, + "grad_norm": 0.11526764184236526, + "learning_rate": 0.0002557945815279898, + "loss": 2.5886, + "step": 22454 + }, + { + "epoch": 0.6658660261542567, + "grad_norm": 0.1295514851808548, + "learning_rate": 0.0002557535252913003, + "loss": 2.6148, + "step": 22455 + }, + { + "epoch": 0.6658956795065682, + "grad_norm": 0.10061702132225037, + "learning_rate": 0.0002557124712174552, + "loss": 2.5802, + "step": 22456 + }, + { + "epoch": 0.6659253328588797, + "grad_norm": 0.11541089415550232, + "learning_rate": 0.0002556714193068181, + "loss": 2.5922, + "step": 22457 + }, + { + "epoch": 0.6659549862111912, + "grad_norm": 0.1117422878742218, + "learning_rate": 0.00025563036955975255, + "loss": 2.6197, + "step": 22458 + }, + { + "epoch": 0.6659846395635026, + "grad_norm": 0.10114721208810806, + "learning_rate": 0.000255589321976622, + "loss": 2.6416, + "step": 22459 + }, + { + "epoch": 0.6660142929158142, + "grad_norm": 0.11599715054035187, + "learning_rate": 0.00025554827655779, + "loss": 2.5807, + "step": 22460 + }, + { + "epoch": 0.6660439462681256, + "grad_norm": 0.10367636382579803, + "learning_rate": 0.0002555072333036199, + "loss": 2.6204, + "step": 22461 + }, + { + "epoch": 0.6660735996204371, + "grad_norm": 0.1254178285598755, + "learning_rate": 0.0002554661922144753, + "loss": 2.6546, + "step": 22462 + }, + { + "epoch": 0.6661032529727485, + "grad_norm": 0.10216610878705978, + "learning_rate": 0.00025542515329071946, + "loss": 2.6316, + "step": 22463 + }, + { + "epoch": 0.6661329063250601, + "grad_norm": 0.11154142022132874, + "learning_rate": 0.00025538411653271587, + "loss": 2.6382, + "step": 22464 + }, + { + "epoch": 0.6661625596773715, + "grad_norm": 0.11826201528310776, + "learning_rate": 0.00025534308194082793, + "loss": 2.6304, + "step": 22465 + }, + { + "epoch": 0.666192213029683, + "grad_norm": 0.1039850115776062, + "learning_rate": 0.000255302049515419, + "loss": 2.61, + "step": 22466 + }, + { + "epoch": 0.6662218663819944, + "grad_norm": 0.11588681489229202, + "learning_rate": 0.0002552610192568524, + "loss": 2.6589, + "step": 22467 + }, + { + "epoch": 0.666251519734306, + "grad_norm": 0.10144483298063278, + "learning_rate": 0.00025521999116549175, + "loss": 2.6264, + "step": 22468 + }, + { + "epoch": 0.6662811730866175, + "grad_norm": 0.11294753849506378, + "learning_rate": 0.0002551789652416997, + "loss": 2.6073, + "step": 22469 + }, + { + "epoch": 0.6663108264389289, + "grad_norm": 0.10917112976312637, + "learning_rate": 0.0002551379414858401, + "loss": 2.6056, + "step": 22470 + }, + { + "epoch": 0.6663404797912404, + "grad_norm": 0.102692611515522, + "learning_rate": 0.000255096919898276, + "loss": 2.5922, + "step": 22471 + }, + { + "epoch": 0.6663701331435519, + "grad_norm": 0.1237020194530487, + "learning_rate": 0.0002550559004793708, + "loss": 2.6194, + "step": 22472 + }, + { + "epoch": 0.6663997864958634, + "grad_norm": 0.11352578550577164, + "learning_rate": 0.0002550148832294876, + "loss": 2.651, + "step": 22473 + }, + { + "epoch": 0.6664294398481748, + "grad_norm": 0.1102457344532013, + "learning_rate": 0.0002549738681489896, + "loss": 2.6433, + "step": 22474 + }, + { + "epoch": 0.6664590932004864, + "grad_norm": 0.10905884951353073, + "learning_rate": 0.0002549328552382402, + "loss": 2.6221, + "step": 22475 + }, + { + "epoch": 0.6664887465527978, + "grad_norm": 0.13587093353271484, + "learning_rate": 0.0002548918444976023, + "loss": 2.6696, + "step": 22476 + }, + { + "epoch": 0.6665183999051093, + "grad_norm": 0.11615722626447678, + "learning_rate": 0.0002548508359274393, + "loss": 2.6163, + "step": 22477 + }, + { + "epoch": 0.6665480532574207, + "grad_norm": 0.11735620349645615, + "learning_rate": 0.00025480982952811416, + "loss": 2.6175, + "step": 22478 + }, + { + "epoch": 0.6665777066097323, + "grad_norm": 0.10575643181800842, + "learning_rate": 0.00025476882529999024, + "loss": 2.6295, + "step": 22479 + }, + { + "epoch": 0.6666073599620437, + "grad_norm": 0.11010885238647461, + "learning_rate": 0.00025472782324343035, + "loss": 2.6407, + "step": 22480 + }, + { + "epoch": 0.6666370133143552, + "grad_norm": 0.10359746217727661, + "learning_rate": 0.0002546868233587976, + "loss": 2.605, + "step": 22481 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.09422904253005981, + "learning_rate": 0.00025464582564645497, + "loss": 2.6265, + "step": 22482 + }, + { + "epoch": 0.6666963200189782, + "grad_norm": 0.11899703741073608, + "learning_rate": 0.00025460483010676595, + "loss": 2.6053, + "step": 22483 + }, + { + "epoch": 0.6667259733712896, + "grad_norm": 0.10234720259904861, + "learning_rate": 0.0002545638367400932, + "loss": 2.6433, + "step": 22484 + }, + { + "epoch": 0.6667556267236011, + "grad_norm": 0.11582427471876144, + "learning_rate": 0.00025452284554679976, + "loss": 2.6138, + "step": 22485 + }, + { + "epoch": 0.6667852800759125, + "grad_norm": 0.11781927943229675, + "learning_rate": 0.00025448185652724874, + "loss": 2.6329, + "step": 22486 + }, + { + "epoch": 0.6668149334282241, + "grad_norm": 0.10681343078613281, + "learning_rate": 0.00025444086968180296, + "loss": 2.5767, + "step": 22487 + }, + { + "epoch": 0.6668445867805355, + "grad_norm": 0.10514199733734131, + "learning_rate": 0.00025439988501082546, + "loss": 2.5849, + "step": 22488 + }, + { + "epoch": 0.666874240132847, + "grad_norm": 0.10295797139406204, + "learning_rate": 0.0002543589025146793, + "loss": 2.6413, + "step": 22489 + }, + { + "epoch": 0.6669038934851585, + "grad_norm": 0.10420611500740051, + "learning_rate": 0.0002543179221937271, + "loss": 2.6064, + "step": 22490 + }, + { + "epoch": 0.66693354683747, + "grad_norm": 0.10713180154561996, + "learning_rate": 0.0002542769440483318, + "loss": 2.5898, + "step": 22491 + }, + { + "epoch": 0.6669632001897815, + "grad_norm": 0.09979767352342606, + "learning_rate": 0.0002542359680788564, + "loss": 2.6165, + "step": 22492 + }, + { + "epoch": 0.6669928535420929, + "grad_norm": 0.10432770848274231, + "learning_rate": 0.00025419499428566364, + "loss": 2.64, + "step": 22493 + }, + { + "epoch": 0.6670225068944045, + "grad_norm": 0.09284260123968124, + "learning_rate": 0.0002541540226691164, + "loss": 2.6056, + "step": 22494 + }, + { + "epoch": 0.6670521602467159, + "grad_norm": 0.10243973135948181, + "learning_rate": 0.00025411305322957736, + "loss": 2.5776, + "step": 22495 + }, + { + "epoch": 0.6670818135990274, + "grad_norm": 0.1002538800239563, + "learning_rate": 0.0002540720859674095, + "loss": 2.6272, + "step": 22496 + }, + { + "epoch": 0.6671114669513388, + "grad_norm": 0.09972808510065079, + "learning_rate": 0.00025403112088297566, + "loss": 2.6183, + "step": 22497 + }, + { + "epoch": 0.6671411203036504, + "grad_norm": 0.10278171300888062, + "learning_rate": 0.0002539901579766384, + "loss": 2.6252, + "step": 22498 + }, + { + "epoch": 0.6671707736559618, + "grad_norm": 0.10220646113157272, + "learning_rate": 0.0002539491972487605, + "loss": 2.6016, + "step": 22499 + }, + { + "epoch": 0.6672004270082733, + "grad_norm": 0.09639499336481094, + "learning_rate": 0.0002539082386997049, + "loss": 2.6596, + "step": 22500 + }, + { + "epoch": 0.6672300803605847, + "grad_norm": 0.10382553935050964, + "learning_rate": 0.0002538672823298339, + "loss": 2.62, + "step": 22501 + }, + { + "epoch": 0.6672597337128963, + "grad_norm": 0.09733954817056656, + "learning_rate": 0.00025382632813951043, + "loss": 2.6629, + "step": 22502 + }, + { + "epoch": 0.6672893870652077, + "grad_norm": 0.0996343269944191, + "learning_rate": 0.000253785376129097, + "loss": 2.6136, + "step": 22503 + }, + { + "epoch": 0.6673190404175192, + "grad_norm": 0.11268974095582962, + "learning_rate": 0.0002537444262989564, + "loss": 2.6021, + "step": 22504 + }, + { + "epoch": 0.6673486937698306, + "grad_norm": 0.11493421345949173, + "learning_rate": 0.0002537034786494511, + "loss": 2.6311, + "step": 22505 + }, + { + "epoch": 0.6673783471221422, + "grad_norm": 0.10943766683340073, + "learning_rate": 0.00025366253318094377, + "loss": 2.6307, + "step": 22506 + }, + { + "epoch": 0.6674080004744536, + "grad_norm": 0.10960064083337784, + "learning_rate": 0.00025362158989379705, + "loss": 2.622, + "step": 22507 + }, + { + "epoch": 0.6674376538267651, + "grad_norm": 0.11302243173122406, + "learning_rate": 0.0002535806487883732, + "loss": 2.6515, + "step": 22508 + }, + { + "epoch": 0.6674673071790765, + "grad_norm": 0.12076479941606522, + "learning_rate": 0.0002535397098650353, + "loss": 2.6299, + "step": 22509 + }, + { + "epoch": 0.6674969605313881, + "grad_norm": 0.09214340150356293, + "learning_rate": 0.0002534987731241456, + "loss": 2.6156, + "step": 22510 + }, + { + "epoch": 0.6675266138836996, + "grad_norm": 0.11780817061662674, + "learning_rate": 0.0002534578385660665, + "loss": 2.6309, + "step": 22511 + }, + { + "epoch": 0.667556267236011, + "grad_norm": 0.11431916803121567, + "learning_rate": 0.00025341690619116054, + "loss": 2.6218, + "step": 22512 + }, + { + "epoch": 0.6675859205883226, + "grad_norm": 0.10479200631380081, + "learning_rate": 0.0002533759759997901, + "loss": 2.5992, + "step": 22513 + }, + { + "epoch": 0.667615573940634, + "grad_norm": 0.10354266315698624, + "learning_rate": 0.0002533350479923179, + "loss": 2.6249, + "step": 22514 + }, + { + "epoch": 0.6676452272929455, + "grad_norm": 0.104530468583107, + "learning_rate": 0.0002532941221691061, + "loss": 2.5995, + "step": 22515 + }, + { + "epoch": 0.6676748806452569, + "grad_norm": 0.11537813395261765, + "learning_rate": 0.00025325319853051716, + "loss": 2.6289, + "step": 22516 + }, + { + "epoch": 0.6677045339975685, + "grad_norm": 0.10027434676885605, + "learning_rate": 0.0002532122770769135, + "loss": 2.5956, + "step": 22517 + }, + { + "epoch": 0.6677341873498799, + "grad_norm": 0.10455428063869476, + "learning_rate": 0.00025317135780865755, + "loss": 2.6311, + "step": 22518 + }, + { + "epoch": 0.6677638407021914, + "grad_norm": 0.1045820415019989, + "learning_rate": 0.0002531304407261116, + "loss": 2.6133, + "step": 22519 + }, + { + "epoch": 0.6677934940545028, + "grad_norm": 0.10880346596240997, + "learning_rate": 0.0002530895258296378, + "loss": 2.6087, + "step": 22520 + }, + { + "epoch": 0.6678231474068144, + "grad_norm": 0.09197881072759628, + "learning_rate": 0.00025304861311959884, + "loss": 2.6086, + "step": 22521 + }, + { + "epoch": 0.6678528007591258, + "grad_norm": 0.10680276155471802, + "learning_rate": 0.00025300770259635677, + "loss": 2.6164, + "step": 22522 + }, + { + "epoch": 0.6678824541114373, + "grad_norm": 0.09931133687496185, + "learning_rate": 0.0002529667942602738, + "loss": 2.6031, + "step": 22523 + }, + { + "epoch": 0.6679121074637487, + "grad_norm": 0.10163126140832901, + "learning_rate": 0.0002529258881117123, + "loss": 2.6301, + "step": 22524 + }, + { + "epoch": 0.6679417608160603, + "grad_norm": 0.10140743851661682, + "learning_rate": 0.0002528849841510345, + "loss": 2.6289, + "step": 22525 + }, + { + "epoch": 0.6679714141683717, + "grad_norm": 0.1017373725771904, + "learning_rate": 0.0002528440823786026, + "loss": 2.644, + "step": 22526 + }, + { + "epoch": 0.6680010675206832, + "grad_norm": 0.09442534297704697, + "learning_rate": 0.00025280318279477873, + "loss": 2.6124, + "step": 22527 + }, + { + "epoch": 0.6680307208729946, + "grad_norm": 0.1147855818271637, + "learning_rate": 0.00025276228539992506, + "loss": 2.6104, + "step": 22528 + }, + { + "epoch": 0.6680603742253062, + "grad_norm": 0.10314799100160599, + "learning_rate": 0.0002527213901944039, + "loss": 2.6231, + "step": 22529 + }, + { + "epoch": 0.6680900275776176, + "grad_norm": 0.10192716866731644, + "learning_rate": 0.0002526804971785772, + "loss": 2.5789, + "step": 22530 + }, + { + "epoch": 0.6681196809299291, + "grad_norm": 0.09532928466796875, + "learning_rate": 0.00025263960635280713, + "loss": 2.6079, + "step": 22531 + }, + { + "epoch": 0.6681493342822407, + "grad_norm": 0.09913326054811478, + "learning_rate": 0.0002525987177174559, + "loss": 2.6115, + "step": 22532 + }, + { + "epoch": 0.6681789876345521, + "grad_norm": 0.0909632071852684, + "learning_rate": 0.0002525578312728855, + "loss": 2.6382, + "step": 22533 + }, + { + "epoch": 0.6682086409868636, + "grad_norm": 0.09136702865362167, + "learning_rate": 0.0002525169470194578, + "loss": 2.6304, + "step": 22534 + }, + { + "epoch": 0.668238294339175, + "grad_norm": 0.10569067299365997, + "learning_rate": 0.0002524760649575352, + "loss": 2.6241, + "step": 22535 + }, + { + "epoch": 0.6682679476914866, + "grad_norm": 0.0975690707564354, + "learning_rate": 0.00025243518508747943, + "loss": 2.5976, + "step": 22536 + }, + { + "epoch": 0.668297601043798, + "grad_norm": 0.10347356647253036, + "learning_rate": 0.00025239430740965266, + "loss": 2.6059, + "step": 22537 + }, + { + "epoch": 0.6683272543961095, + "grad_norm": 0.09886326640844345, + "learning_rate": 0.0002523534319244167, + "loss": 2.638, + "step": 22538 + }, + { + "epoch": 0.6683569077484209, + "grad_norm": 0.09839450567960739, + "learning_rate": 0.00025231255863213364, + "loss": 2.6533, + "step": 22539 + }, + { + "epoch": 0.6683865611007325, + "grad_norm": 0.10809937119483948, + "learning_rate": 0.0002522716875331654, + "loss": 2.5918, + "step": 22540 + }, + { + "epoch": 0.6684162144530439, + "grad_norm": 0.10427675396203995, + "learning_rate": 0.00025223081862787403, + "loss": 2.6157, + "step": 22541 + }, + { + "epoch": 0.6684458678053554, + "grad_norm": 0.11593934893608093, + "learning_rate": 0.0002521899519166211, + "loss": 2.6228, + "step": 22542 + }, + { + "epoch": 0.6684755211576668, + "grad_norm": 0.11010342836380005, + "learning_rate": 0.0002521490873997687, + "loss": 2.6075, + "step": 22543 + }, + { + "epoch": 0.6685051745099784, + "grad_norm": 0.12217766791582108, + "learning_rate": 0.00025210822507767895, + "loss": 2.6198, + "step": 22544 + }, + { + "epoch": 0.6685348278622898, + "grad_norm": 0.10717377066612244, + "learning_rate": 0.0002520673649507132, + "loss": 2.6541, + "step": 22545 + }, + { + "epoch": 0.6685644812146013, + "grad_norm": 0.12477549910545349, + "learning_rate": 0.0002520265070192335, + "loss": 2.6262, + "step": 22546 + }, + { + "epoch": 0.6685941345669127, + "grad_norm": 0.13739946484565735, + "learning_rate": 0.0002519856512836014, + "loss": 2.6382, + "step": 22547 + }, + { + "epoch": 0.6686237879192243, + "grad_norm": 0.1041032150387764, + "learning_rate": 0.0002519447977441792, + "loss": 2.6401, + "step": 22548 + }, + { + "epoch": 0.6686534412715357, + "grad_norm": 0.1086573451757431, + "learning_rate": 0.0002519039464013283, + "loss": 2.6112, + "step": 22549 + }, + { + "epoch": 0.6686830946238472, + "grad_norm": 0.10924383252859116, + "learning_rate": 0.00025186309725541055, + "loss": 2.6227, + "step": 22550 + }, + { + "epoch": 0.6687127479761587, + "grad_norm": 0.09862861782312393, + "learning_rate": 0.00025182225030678774, + "loss": 2.6277, + "step": 22551 + }, + { + "epoch": 0.6687424013284702, + "grad_norm": 0.11109934747219086, + "learning_rate": 0.0002517814055558215, + "loss": 2.6395, + "step": 22552 + }, + { + "epoch": 0.6687720546807817, + "grad_norm": 0.0985666811466217, + "learning_rate": 0.00025174056300287346, + "loss": 2.6208, + "step": 22553 + }, + { + "epoch": 0.6688017080330931, + "grad_norm": 0.10562539845705032, + "learning_rate": 0.0002516997226483053, + "loss": 2.6152, + "step": 22554 + }, + { + "epoch": 0.6688313613854047, + "grad_norm": 0.10251640528440475, + "learning_rate": 0.000251658884492479, + "loss": 2.6167, + "step": 22555 + }, + { + "epoch": 0.6688610147377161, + "grad_norm": 0.11977995932102203, + "learning_rate": 0.00025161804853575577, + "loss": 2.6021, + "step": 22556 + }, + { + "epoch": 0.6688906680900276, + "grad_norm": 0.09694046527147293, + "learning_rate": 0.00025157721477849724, + "loss": 2.6233, + "step": 22557 + }, + { + "epoch": 0.668920321442339, + "grad_norm": 0.11278584599494934, + "learning_rate": 0.00025153638322106514, + "loss": 2.6053, + "step": 22558 + }, + { + "epoch": 0.6689499747946506, + "grad_norm": 0.11195239424705505, + "learning_rate": 0.00025149555386382103, + "loss": 2.6273, + "step": 22559 + }, + { + "epoch": 0.668979628146962, + "grad_norm": 0.10373635590076447, + "learning_rate": 0.00025145472670712625, + "loss": 2.6246, + "step": 22560 + }, + { + "epoch": 0.6690092814992735, + "grad_norm": 0.1044611856341362, + "learning_rate": 0.00025141390175134273, + "loss": 2.6374, + "step": 22561 + }, + { + "epoch": 0.6690389348515849, + "grad_norm": 0.09866051375865936, + "learning_rate": 0.00025137307899683175, + "loss": 2.6058, + "step": 22562 + }, + { + "epoch": 0.6690685882038965, + "grad_norm": 0.1133950874209404, + "learning_rate": 0.0002513322584439549, + "loss": 2.6254, + "step": 22563 + }, + { + "epoch": 0.6690982415562079, + "grad_norm": 0.1029781922698021, + "learning_rate": 0.0002512914400930735, + "loss": 2.6405, + "step": 22564 + }, + { + "epoch": 0.6691278949085194, + "grad_norm": 0.10820672661066055, + "learning_rate": 0.00025125062394454936, + "loss": 2.5931, + "step": 22565 + }, + { + "epoch": 0.6691575482608308, + "grad_norm": 0.10655643045902252, + "learning_rate": 0.00025120980999874333, + "loss": 2.6014, + "step": 22566 + }, + { + "epoch": 0.6691872016131424, + "grad_norm": 0.10000469535589218, + "learning_rate": 0.00025116899825601725, + "loss": 2.6308, + "step": 22567 + }, + { + "epoch": 0.6692168549654538, + "grad_norm": 0.10023998469114304, + "learning_rate": 0.0002511281887167324, + "loss": 2.6311, + "step": 22568 + }, + { + "epoch": 0.6692465083177653, + "grad_norm": 0.09988357126712799, + "learning_rate": 0.0002510873813812501, + "loss": 2.6711, + "step": 22569 + }, + { + "epoch": 0.6692761616700768, + "grad_norm": 0.10308587551116943, + "learning_rate": 0.00025104657624993177, + "loss": 2.6584, + "step": 22570 + }, + { + "epoch": 0.6693058150223883, + "grad_norm": 0.10020092129707336, + "learning_rate": 0.00025100577332313876, + "loss": 2.6138, + "step": 22571 + }, + { + "epoch": 0.6693354683746997, + "grad_norm": 0.09847987443208694, + "learning_rate": 0.0002509649726012322, + "loss": 2.6338, + "step": 22572 + }, + { + "epoch": 0.6693651217270112, + "grad_norm": 0.11150094866752625, + "learning_rate": 0.0002509241740845737, + "loss": 2.6528, + "step": 22573 + }, + { + "epoch": 0.6693947750793228, + "grad_norm": 0.11072763055562973, + "learning_rate": 0.00025088337777352435, + "loss": 2.6216, + "step": 22574 + }, + { + "epoch": 0.6694244284316342, + "grad_norm": 0.12890692055225372, + "learning_rate": 0.00025084258366844553, + "loss": 2.6344, + "step": 22575 + }, + { + "epoch": 0.6694540817839457, + "grad_norm": 0.09177358448505402, + "learning_rate": 0.00025080179176969856, + "loss": 2.6258, + "step": 22576 + }, + { + "epoch": 0.6694837351362571, + "grad_norm": 0.10626383125782013, + "learning_rate": 0.00025076100207764427, + "loss": 2.6217, + "step": 22577 + }, + { + "epoch": 0.6695133884885687, + "grad_norm": 0.11328837275505066, + "learning_rate": 0.0002507202145926442, + "loss": 2.6499, + "step": 22578 + }, + { + "epoch": 0.6695430418408801, + "grad_norm": 0.10522536933422089, + "learning_rate": 0.0002506794293150593, + "loss": 2.6338, + "step": 22579 + }, + { + "epoch": 0.6695726951931916, + "grad_norm": 0.11399148404598236, + "learning_rate": 0.0002506386462452509, + "loss": 2.6246, + "step": 22580 + }, + { + "epoch": 0.669602348545503, + "grad_norm": 0.11348606646060944, + "learning_rate": 0.0002505978653835801, + "loss": 2.616, + "step": 22581 + }, + { + "epoch": 0.6696320018978146, + "grad_norm": 0.1294514536857605, + "learning_rate": 0.000250557086730408, + "loss": 2.655, + "step": 22582 + }, + { + "epoch": 0.669661655250126, + "grad_norm": 0.11895142495632172, + "learning_rate": 0.00025051631028609575, + "loss": 2.6146, + "step": 22583 + }, + { + "epoch": 0.6696913086024375, + "grad_norm": 0.1393875628709793, + "learning_rate": 0.00025047553605100437, + "loss": 2.6447, + "step": 22584 + }, + { + "epoch": 0.669720961954749, + "grad_norm": 0.10827567428350449, + "learning_rate": 0.0002504347640254947, + "loss": 2.6182, + "step": 22585 + }, + { + "epoch": 0.6697506153070605, + "grad_norm": 0.13947151601314545, + "learning_rate": 0.0002503939942099285, + "loss": 2.6055, + "step": 22586 + }, + { + "epoch": 0.6697802686593719, + "grad_norm": 0.13222110271453857, + "learning_rate": 0.0002503532266046661, + "loss": 2.6124, + "step": 22587 + }, + { + "epoch": 0.6698099220116834, + "grad_norm": 0.118243508040905, + "learning_rate": 0.00025031246121006866, + "loss": 2.6304, + "step": 22588 + }, + { + "epoch": 0.6698395753639949, + "grad_norm": 0.12435781210660934, + "learning_rate": 0.00025027169802649727, + "loss": 2.6178, + "step": 22589 + }, + { + "epoch": 0.6698692287163064, + "grad_norm": 0.11884883791208267, + "learning_rate": 0.000250230937054313, + "loss": 2.6065, + "step": 22590 + }, + { + "epoch": 0.6698988820686178, + "grad_norm": 0.1220829039812088, + "learning_rate": 0.0002501901782938765, + "loss": 2.6122, + "step": 22591 + }, + { + "epoch": 0.6699285354209293, + "grad_norm": 0.13525724411010742, + "learning_rate": 0.0002501494217455489, + "loss": 2.6528, + "step": 22592 + }, + { + "epoch": 0.6699581887732409, + "grad_norm": 0.11173740029335022, + "learning_rate": 0.00025010866740969107, + "loss": 2.6016, + "step": 22593 + }, + { + "epoch": 0.6699878421255523, + "grad_norm": 0.11008360981941223, + "learning_rate": 0.0002500679152866638, + "loss": 2.6243, + "step": 22594 + }, + { + "epoch": 0.6700174954778638, + "grad_norm": 0.11456824839115143, + "learning_rate": 0.0002500271653768281, + "loss": 2.617, + "step": 22595 + }, + { + "epoch": 0.6700471488301752, + "grad_norm": 0.10369834303855896, + "learning_rate": 0.00024998641768054483, + "loss": 2.5978, + "step": 22596 + }, + { + "epoch": 0.6700768021824868, + "grad_norm": 0.12627719342708588, + "learning_rate": 0.0002499456721981747, + "loss": 2.6309, + "step": 22597 + }, + { + "epoch": 0.6701064555347982, + "grad_norm": 0.10563616454601288, + "learning_rate": 0.0002499049289300785, + "loss": 2.5927, + "step": 22598 + }, + { + "epoch": 0.6701361088871097, + "grad_norm": 0.11052575707435608, + "learning_rate": 0.00024986418787661713, + "loss": 2.6077, + "step": 22599 + }, + { + "epoch": 0.6701657622394211, + "grad_norm": 0.11825782060623169, + "learning_rate": 0.0002498234490381513, + "loss": 2.575, + "step": 22600 + }, + { + "epoch": 0.6701954155917327, + "grad_norm": 0.0927824079990387, + "learning_rate": 0.00024978271241504177, + "loss": 2.6223, + "step": 22601 + }, + { + "epoch": 0.6702250689440441, + "grad_norm": 0.11478740721940994, + "learning_rate": 0.00024974197800764925, + "loss": 2.6082, + "step": 22602 + }, + { + "epoch": 0.6702547222963556, + "grad_norm": 0.1133570671081543, + "learning_rate": 0.00024970124581633453, + "loss": 2.6084, + "step": 22603 + }, + { + "epoch": 0.670284375648667, + "grad_norm": 0.10145886987447739, + "learning_rate": 0.0002496605158414582, + "loss": 2.6325, + "step": 22604 + }, + { + "epoch": 0.6703140290009786, + "grad_norm": 0.12842486798763275, + "learning_rate": 0.00024961978808338097, + "loss": 2.658, + "step": 22605 + }, + { + "epoch": 0.67034368235329, + "grad_norm": 0.11074557155370712, + "learning_rate": 0.0002495790625424635, + "loss": 2.6177, + "step": 22606 + }, + { + "epoch": 0.6703733357056015, + "grad_norm": 0.10897574573755264, + "learning_rate": 0.00024953833921906646, + "loss": 2.6336, + "step": 22607 + }, + { + "epoch": 0.670402989057913, + "grad_norm": 0.11400692164897919, + "learning_rate": 0.00024949761811355036, + "loss": 2.6001, + "step": 22608 + }, + { + "epoch": 0.6704326424102245, + "grad_norm": 0.11938068270683289, + "learning_rate": 0.00024945689922627586, + "loss": 2.6241, + "step": 22609 + }, + { + "epoch": 0.6704622957625359, + "grad_norm": 0.11980722099542618, + "learning_rate": 0.0002494161825576037, + "loss": 2.5876, + "step": 22610 + }, + { + "epoch": 0.6704919491148474, + "grad_norm": 0.11821319907903671, + "learning_rate": 0.0002493754681078939, + "loss": 2.6299, + "step": 22611 + }, + { + "epoch": 0.6705216024671589, + "grad_norm": 0.12839999794960022, + "learning_rate": 0.00024933475587750754, + "loss": 2.6642, + "step": 22612 + }, + { + "epoch": 0.6705512558194704, + "grad_norm": 0.11050157248973846, + "learning_rate": 0.00024929404586680493, + "loss": 2.6301, + "step": 22613 + }, + { + "epoch": 0.6705809091717819, + "grad_norm": 0.10566119104623795, + "learning_rate": 0.00024925333807614657, + "loss": 2.6407, + "step": 22614 + }, + { + "epoch": 0.6706105625240933, + "grad_norm": 0.11799433082342148, + "learning_rate": 0.000249212632505893, + "loss": 2.626, + "step": 22615 + }, + { + "epoch": 0.6706402158764049, + "grad_norm": 0.09744491428136826, + "learning_rate": 0.00024917192915640456, + "loss": 2.6125, + "step": 22616 + }, + { + "epoch": 0.6706698692287163, + "grad_norm": 0.11608121544122696, + "learning_rate": 0.0002491312280280418, + "loss": 2.6349, + "step": 22617 + }, + { + "epoch": 0.6706995225810278, + "grad_norm": 0.10047151148319244, + "learning_rate": 0.000249090529121165, + "loss": 2.6467, + "step": 22618 + }, + { + "epoch": 0.6707291759333393, + "grad_norm": 0.09975940734148026, + "learning_rate": 0.00024904983243613467, + "loss": 2.6248, + "step": 22619 + }, + { + "epoch": 0.6707588292856508, + "grad_norm": 0.10244134068489075, + "learning_rate": 0.00024900913797331134, + "loss": 2.652, + "step": 22620 + }, + { + "epoch": 0.6707884826379622, + "grad_norm": 0.10753463953733444, + "learning_rate": 0.0002489684457330549, + "loss": 2.6178, + "step": 22621 + }, + { + "epoch": 0.6708181359902737, + "grad_norm": 0.10160116851329803, + "learning_rate": 0.00024892775571572613, + "loss": 2.6014, + "step": 22622 + }, + { + "epoch": 0.6708477893425852, + "grad_norm": 0.11458679288625717, + "learning_rate": 0.0002488870679216851, + "loss": 2.6219, + "step": 22623 + }, + { + "epoch": 0.6708774426948967, + "grad_norm": 0.10683873295783997, + "learning_rate": 0.0002488463823512919, + "loss": 2.5741, + "step": 22624 + }, + { + "epoch": 0.6709070960472081, + "grad_norm": 0.11932021379470825, + "learning_rate": 0.0002488056990049074, + "loss": 2.5854, + "step": 22625 + }, + { + "epoch": 0.6709367493995196, + "grad_norm": 0.1029137372970581, + "learning_rate": 0.00024876501788289153, + "loss": 2.6084, + "step": 22626 + }, + { + "epoch": 0.6709664027518311, + "grad_norm": 0.12356539070606232, + "learning_rate": 0.00024872433898560453, + "loss": 2.6314, + "step": 22627 + }, + { + "epoch": 0.6709960561041426, + "grad_norm": 0.11213527619838715, + "learning_rate": 0.0002486836623134067, + "loss": 2.6151, + "step": 22628 + }, + { + "epoch": 0.671025709456454, + "grad_norm": 0.10854260623455048, + "learning_rate": 0.00024864298786665814, + "loss": 2.6631, + "step": 22629 + }, + { + "epoch": 0.6710553628087655, + "grad_norm": 0.09988049417734146, + "learning_rate": 0.000248602315645719, + "loss": 2.611, + "step": 22630 + }, + { + "epoch": 0.671085016161077, + "grad_norm": 0.1102214977145195, + "learning_rate": 0.0002485616456509498, + "loss": 2.6215, + "step": 22631 + }, + { + "epoch": 0.6711146695133885, + "grad_norm": 0.10735565423965454, + "learning_rate": 0.0002485209778827101, + "loss": 2.5958, + "step": 22632 + }, + { + "epoch": 0.6711443228656999, + "grad_norm": 0.09675998240709305, + "learning_rate": 0.0002484803123413604, + "loss": 2.611, + "step": 22633 + }, + { + "epoch": 0.6711739762180114, + "grad_norm": 0.10529238730669022, + "learning_rate": 0.00024843964902726063, + "loss": 2.6178, + "step": 22634 + }, + { + "epoch": 0.671203629570323, + "grad_norm": 0.10293915122747421, + "learning_rate": 0.00024839898794077096, + "loss": 2.6234, + "step": 22635 + }, + { + "epoch": 0.6712332829226344, + "grad_norm": 0.11088075488805771, + "learning_rate": 0.0002483583290822515, + "loss": 2.6323, + "step": 22636 + }, + { + "epoch": 0.6712629362749459, + "grad_norm": 0.0917125716805458, + "learning_rate": 0.000248317672452062, + "loss": 2.6068, + "step": 22637 + }, + { + "epoch": 0.6712925896272574, + "grad_norm": 0.10701557993888855, + "learning_rate": 0.0002482770180505629, + "loss": 2.6385, + "step": 22638 + }, + { + "epoch": 0.6713222429795689, + "grad_norm": 0.10524020344018936, + "learning_rate": 0.000248236365878114, + "loss": 2.6321, + "step": 22639 + }, + { + "epoch": 0.6713518963318803, + "grad_norm": 0.1037563756108284, + "learning_rate": 0.0002481957159350753, + "loss": 2.6341, + "step": 22640 + }, + { + "epoch": 0.6713815496841918, + "grad_norm": 0.1020866334438324, + "learning_rate": 0.000248155068221807, + "loss": 2.6098, + "step": 22641 + }, + { + "epoch": 0.6714112030365033, + "grad_norm": 0.10636228322982788, + "learning_rate": 0.0002481144227386685, + "loss": 2.6359, + "step": 22642 + }, + { + "epoch": 0.6714408563888148, + "grad_norm": 0.09778206795454025, + "learning_rate": 0.00024807377948602013, + "loss": 2.6212, + "step": 22643 + }, + { + "epoch": 0.6714705097411262, + "grad_norm": 0.10746432840824127, + "learning_rate": 0.00024803313846422163, + "loss": 2.6173, + "step": 22644 + }, + { + "epoch": 0.6715001630934377, + "grad_norm": 0.11109407246112823, + "learning_rate": 0.000247992499673633, + "loss": 2.6442, + "step": 22645 + }, + { + "epoch": 0.6715298164457492, + "grad_norm": 0.11212382465600967, + "learning_rate": 0.00024795186311461396, + "loss": 2.632, + "step": 22646 + }, + { + "epoch": 0.6715594697980607, + "grad_norm": 0.1044500321149826, + "learning_rate": 0.0002479112287875245, + "loss": 2.6395, + "step": 22647 + }, + { + "epoch": 0.6715891231503721, + "grad_norm": 0.10163360089063644, + "learning_rate": 0.00024787059669272433, + "loss": 2.6289, + "step": 22648 + }, + { + "epoch": 0.6716187765026836, + "grad_norm": 0.11585980653762817, + "learning_rate": 0.0002478299668305733, + "loss": 2.5986, + "step": 22649 + }, + { + "epoch": 0.6716484298549951, + "grad_norm": 0.08838649094104767, + "learning_rate": 0.00024778933920143104, + "loss": 2.6362, + "step": 22650 + }, + { + "epoch": 0.6716780832073066, + "grad_norm": 0.11357039213180542, + "learning_rate": 0.00024774871380565765, + "loss": 2.6267, + "step": 22651 + }, + { + "epoch": 0.671707736559618, + "grad_norm": 0.09778635203838348, + "learning_rate": 0.00024770809064361286, + "loss": 2.6241, + "step": 22652 + }, + { + "epoch": 0.6717373899119295, + "grad_norm": 0.09717478603124619, + "learning_rate": 0.0002476674697156561, + "loss": 2.5967, + "step": 22653 + }, + { + "epoch": 0.671767043264241, + "grad_norm": 0.11171920597553253, + "learning_rate": 0.0002476268510221472, + "loss": 2.6126, + "step": 22654 + }, + { + "epoch": 0.6717966966165525, + "grad_norm": 0.10643338412046432, + "learning_rate": 0.0002475862345634458, + "loss": 2.6445, + "step": 22655 + }, + { + "epoch": 0.671826349968864, + "grad_norm": 0.11452557891607285, + "learning_rate": 0.00024754562033991166, + "loss": 2.622, + "step": 22656 + }, + { + "epoch": 0.6718560033211755, + "grad_norm": 0.12025585025548935, + "learning_rate": 0.0002475050083519044, + "loss": 2.5924, + "step": 22657 + }, + { + "epoch": 0.671885656673487, + "grad_norm": 0.09874651581048965, + "learning_rate": 0.0002474643985997836, + "loss": 2.5988, + "step": 22658 + }, + { + "epoch": 0.6719153100257984, + "grad_norm": 0.10652164369821548, + "learning_rate": 0.00024742379108390897, + "loss": 2.6826, + "step": 22659 + }, + { + "epoch": 0.6719449633781099, + "grad_norm": 0.10632310807704926, + "learning_rate": 0.0002473831858046399, + "loss": 2.608, + "step": 22660 + }, + { + "epoch": 0.6719746167304214, + "grad_norm": 0.09589193761348724, + "learning_rate": 0.00024734258276233617, + "loss": 2.6202, + "step": 22661 + }, + { + "epoch": 0.6720042700827329, + "grad_norm": 0.09886059910058975, + "learning_rate": 0.0002473019819573572, + "loss": 2.6211, + "step": 22662 + }, + { + "epoch": 0.6720339234350443, + "grad_norm": 0.10369161516427994, + "learning_rate": 0.00024726138339006257, + "loss": 2.6409, + "step": 22663 + }, + { + "epoch": 0.6720635767873558, + "grad_norm": 0.09722286462783813, + "learning_rate": 0.00024722078706081174, + "loss": 2.6181, + "step": 22664 + }, + { + "epoch": 0.6720932301396673, + "grad_norm": 0.09451838582754135, + "learning_rate": 0.0002471801929699643, + "loss": 2.6248, + "step": 22665 + }, + { + "epoch": 0.6721228834919788, + "grad_norm": 0.10621307790279388, + "learning_rate": 0.0002471396011178796, + "loss": 2.6108, + "step": 22666 + }, + { + "epoch": 0.6721525368442902, + "grad_norm": 0.09372364729642868, + "learning_rate": 0.00024709901150491716, + "loss": 2.5949, + "step": 22667 + }, + { + "epoch": 0.6721821901966017, + "grad_norm": 0.10722150653600693, + "learning_rate": 0.00024705842413143633, + "loss": 2.606, + "step": 22668 + }, + { + "epoch": 0.6722118435489132, + "grad_norm": 0.10643566399812698, + "learning_rate": 0.0002470178389977966, + "loss": 2.6156, + "step": 22669 + }, + { + "epoch": 0.6722414969012247, + "grad_norm": 0.0978558212518692, + "learning_rate": 0.00024697725610435734, + "loss": 2.6109, + "step": 22670 + }, + { + "epoch": 0.6722711502535361, + "grad_norm": 0.09689608216285706, + "learning_rate": 0.0002469366754514779, + "loss": 2.6215, + "step": 22671 + }, + { + "epoch": 0.6723008036058477, + "grad_norm": 0.10751379281282425, + "learning_rate": 0.00024689609703951767, + "loss": 2.6269, + "step": 22672 + }, + { + "epoch": 0.6723304569581591, + "grad_norm": 0.09610971808433533, + "learning_rate": 0.0002468555208688359, + "loss": 2.6256, + "step": 22673 + }, + { + "epoch": 0.6723601103104706, + "grad_norm": 0.11475146561861038, + "learning_rate": 0.00024681494693979197, + "loss": 2.6385, + "step": 22674 + }, + { + "epoch": 0.672389763662782, + "grad_norm": 0.08985213190317154, + "learning_rate": 0.0002467743752527453, + "loss": 2.639, + "step": 22675 + }, + { + "epoch": 0.6724194170150936, + "grad_norm": 0.11412369459867477, + "learning_rate": 0.0002467338058080546, + "loss": 2.6168, + "step": 22676 + }, + { + "epoch": 0.6724490703674051, + "grad_norm": 0.09824725240468979, + "learning_rate": 0.0002466932386060798, + "loss": 2.595, + "step": 22677 + }, + { + "epoch": 0.6724787237197165, + "grad_norm": 0.11563386023044586, + "learning_rate": 0.0002466526736471798, + "loss": 2.6367, + "step": 22678 + }, + { + "epoch": 0.672508377072028, + "grad_norm": 0.10481826961040497, + "learning_rate": 0.00024661211093171385, + "loss": 2.5912, + "step": 22679 + }, + { + "epoch": 0.6725380304243395, + "grad_norm": 0.10846813768148422, + "learning_rate": 0.0002465715504600412, + "loss": 2.6005, + "step": 22680 + }, + { + "epoch": 0.672567683776651, + "grad_norm": 0.1033826693892479, + "learning_rate": 0.0002465309922325209, + "loss": 2.6066, + "step": 22681 + }, + { + "epoch": 0.6725973371289624, + "grad_norm": 0.10373644530773163, + "learning_rate": 0.00024649043624951224, + "loss": 2.6193, + "step": 22682 + }, + { + "epoch": 0.672626990481274, + "grad_norm": 0.0984995886683464, + "learning_rate": 0.00024644988251137424, + "loss": 2.6199, + "step": 22683 + }, + { + "epoch": 0.6726566438335854, + "grad_norm": 0.10741119831800461, + "learning_rate": 0.00024640933101846606, + "loss": 2.6106, + "step": 22684 + }, + { + "epoch": 0.6726862971858969, + "grad_norm": 0.10235516726970673, + "learning_rate": 0.0002463687817711468, + "loss": 2.5637, + "step": 22685 + }, + { + "epoch": 0.6727159505382083, + "grad_norm": 0.10065009444952011, + "learning_rate": 0.00024632823476977565, + "loss": 2.5992, + "step": 22686 + }, + { + "epoch": 0.6727456038905198, + "grad_norm": 0.11178845912218094, + "learning_rate": 0.0002462876900147114, + "loss": 2.5796, + "step": 22687 + }, + { + "epoch": 0.6727752572428313, + "grad_norm": 0.10783898085355759, + "learning_rate": 0.0002462471475063132, + "loss": 2.6532, + "step": 22688 + }, + { + "epoch": 0.6728049105951428, + "grad_norm": 0.11065975576639175, + "learning_rate": 0.0002462066072449399, + "loss": 2.6242, + "step": 22689 + }, + { + "epoch": 0.6728345639474542, + "grad_norm": 0.11798962950706482, + "learning_rate": 0.00024616606923095075, + "loss": 2.6503, + "step": 22690 + }, + { + "epoch": 0.6728642172997658, + "grad_norm": 0.13084158301353455, + "learning_rate": 0.0002461255334647047, + "loss": 2.6538, + "step": 22691 + }, + { + "epoch": 0.6728938706520772, + "grad_norm": 0.10096390545368195, + "learning_rate": 0.0002460849999465606, + "loss": 2.6411, + "step": 22692 + }, + { + "epoch": 0.6729235240043887, + "grad_norm": 0.1086282879114151, + "learning_rate": 0.0002460444686768774, + "loss": 2.6215, + "step": 22693 + }, + { + "epoch": 0.6729531773567001, + "grad_norm": 0.10699532181024551, + "learning_rate": 0.00024600393965601403, + "loss": 2.6121, + "step": 22694 + }, + { + "epoch": 0.6729828307090117, + "grad_norm": 0.10271728783845901, + "learning_rate": 0.0002459634128843294, + "loss": 2.5737, + "step": 22695 + }, + { + "epoch": 0.6730124840613231, + "grad_norm": 0.10224813222885132, + "learning_rate": 0.0002459228883621825, + "loss": 2.6465, + "step": 22696 + }, + { + "epoch": 0.6730421374136346, + "grad_norm": 0.10307879000902176, + "learning_rate": 0.00024588236608993186, + "loss": 2.6111, + "step": 22697 + }, + { + "epoch": 0.6730717907659461, + "grad_norm": 0.1026049256324768, + "learning_rate": 0.0002458418460679365, + "loss": 2.632, + "step": 22698 + }, + { + "epoch": 0.6731014441182576, + "grad_norm": 0.11574511229991913, + "learning_rate": 0.0002458013282965552, + "loss": 2.6439, + "step": 22699 + }, + { + "epoch": 0.6731310974705691, + "grad_norm": 0.10402864217758179, + "learning_rate": 0.00024576081277614677, + "loss": 2.5949, + "step": 22700 + }, + { + "epoch": 0.6731607508228805, + "grad_norm": 0.10228463262319565, + "learning_rate": 0.00024572029950706997, + "loss": 2.6281, + "step": 22701 + }, + { + "epoch": 0.673190404175192, + "grad_norm": 0.1352432817220688, + "learning_rate": 0.00024567978848968343, + "loss": 2.6274, + "step": 22702 + }, + { + "epoch": 0.6732200575275035, + "grad_norm": 0.11537805944681168, + "learning_rate": 0.0002456392797243462, + "loss": 2.5975, + "step": 22703 + }, + { + "epoch": 0.673249710879815, + "grad_norm": 0.10137730836868286, + "learning_rate": 0.00024559877321141674, + "loss": 2.6222, + "step": 22704 + }, + { + "epoch": 0.6732793642321264, + "grad_norm": 0.13497136533260345, + "learning_rate": 0.0002455582689512539, + "loss": 2.6108, + "step": 22705 + }, + { + "epoch": 0.673309017584438, + "grad_norm": 0.1326473355293274, + "learning_rate": 0.0002455177669442162, + "loss": 2.6239, + "step": 22706 + }, + { + "epoch": 0.6733386709367494, + "grad_norm": 0.13221774995326996, + "learning_rate": 0.0002454772671906625, + "loss": 2.6227, + "step": 22707 + }, + { + "epoch": 0.6733683242890609, + "grad_norm": 0.11931112408638, + "learning_rate": 0.0002454367696909512, + "loss": 2.606, + "step": 22708 + }, + { + "epoch": 0.6733979776413723, + "grad_norm": 0.12299457937479019, + "learning_rate": 0.000245396274445441, + "loss": 2.6267, + "step": 22709 + }, + { + "epoch": 0.6734276309936839, + "grad_norm": 0.11911381036043167, + "learning_rate": 0.00024535578145449047, + "loss": 2.6287, + "step": 22710 + }, + { + "epoch": 0.6734572843459953, + "grad_norm": 0.11659980565309525, + "learning_rate": 0.00024531529071845817, + "loss": 2.6355, + "step": 22711 + }, + { + "epoch": 0.6734869376983068, + "grad_norm": 0.110026054084301, + "learning_rate": 0.00024527480223770267, + "loss": 2.6627, + "step": 22712 + }, + { + "epoch": 0.6735165910506182, + "grad_norm": 0.116689532995224, + "learning_rate": 0.0002452343160125826, + "loss": 2.6028, + "step": 22713 + }, + { + "epoch": 0.6735462444029298, + "grad_norm": 0.10586926341056824, + "learning_rate": 0.0002451938320434563, + "loss": 2.6056, + "step": 22714 + }, + { + "epoch": 0.6735758977552412, + "grad_norm": 0.10979682952165604, + "learning_rate": 0.0002451533503306822, + "loss": 2.6188, + "step": 22715 + }, + { + "epoch": 0.6736055511075527, + "grad_norm": 0.1102827787399292, + "learning_rate": 0.00024511287087461913, + "loss": 2.6218, + "step": 22716 + }, + { + "epoch": 0.6736352044598641, + "grad_norm": 0.12256745994091034, + "learning_rate": 0.0002450723936756255, + "loss": 2.5969, + "step": 22717 + }, + { + "epoch": 0.6736648578121757, + "grad_norm": 0.0992983728647232, + "learning_rate": 0.00024503191873405947, + "loss": 2.616, + "step": 22718 + }, + { + "epoch": 0.6736945111644872, + "grad_norm": 0.1277724653482437, + "learning_rate": 0.00024499144605027946, + "loss": 2.6173, + "step": 22719 + }, + { + "epoch": 0.6737241645167986, + "grad_norm": 0.11459436267614365, + "learning_rate": 0.000244950975624644, + "loss": 2.6331, + "step": 22720 + }, + { + "epoch": 0.6737538178691101, + "grad_norm": 0.09798083454370499, + "learning_rate": 0.0002449105074575115, + "loss": 2.6132, + "step": 22721 + }, + { + "epoch": 0.6737834712214216, + "grad_norm": 0.10667579621076584, + "learning_rate": 0.00024487004154924013, + "loss": 2.6358, + "step": 22722 + }, + { + "epoch": 0.6738131245737331, + "grad_norm": 0.11127142608165741, + "learning_rate": 0.0002448295779001884, + "loss": 2.6558, + "step": 22723 + }, + { + "epoch": 0.6738427779260445, + "grad_norm": 0.106939397752285, + "learning_rate": 0.00024478911651071457, + "loss": 2.6227, + "step": 22724 + }, + { + "epoch": 0.673872431278356, + "grad_norm": 0.10609066486358643, + "learning_rate": 0.00024474865738117693, + "loss": 2.6369, + "step": 22725 + }, + { + "epoch": 0.6739020846306675, + "grad_norm": 0.10839427262544632, + "learning_rate": 0.00024470820051193367, + "loss": 2.6184, + "step": 22726 + }, + { + "epoch": 0.673931737982979, + "grad_norm": 0.10226361453533173, + "learning_rate": 0.00024466774590334317, + "loss": 2.6074, + "step": 22727 + }, + { + "epoch": 0.6739613913352904, + "grad_norm": 0.1135195717215538, + "learning_rate": 0.0002446272935557636, + "loss": 2.6322, + "step": 22728 + }, + { + "epoch": 0.673991044687602, + "grad_norm": 0.11010569334030151, + "learning_rate": 0.0002445868434695532, + "loss": 2.6195, + "step": 22729 + }, + { + "epoch": 0.6740206980399134, + "grad_norm": 0.28763607144355774, + "learning_rate": 0.0002445463956450701, + "loss": 2.6143, + "step": 22730 + }, + { + "epoch": 0.6740503513922249, + "grad_norm": 0.10650613158941269, + "learning_rate": 0.00024450595008267253, + "loss": 2.6238, + "step": 22731 + }, + { + "epoch": 0.6740800047445363, + "grad_norm": 0.1030130386352539, + "learning_rate": 0.0002444655067827186, + "loss": 2.6022, + "step": 22732 + }, + { + "epoch": 0.6741096580968479, + "grad_norm": 0.09903358668088913, + "learning_rate": 0.0002444250657455665, + "loss": 2.6059, + "step": 22733 + }, + { + "epoch": 0.6741393114491593, + "grad_norm": 0.1082129254937172, + "learning_rate": 0.0002443846269715743, + "loss": 2.6106, + "step": 22734 + }, + { + "epoch": 0.6741689648014708, + "grad_norm": 0.09908977895975113, + "learning_rate": 0.00024434419046110014, + "loss": 2.6311, + "step": 22735 + }, + { + "epoch": 0.6741986181537822, + "grad_norm": 0.10032643377780914, + "learning_rate": 0.000244303756214502, + "loss": 2.5928, + "step": 22736 + }, + { + "epoch": 0.6742282715060938, + "grad_norm": 0.1013963520526886, + "learning_rate": 0.000244263324232138, + "loss": 2.626, + "step": 22737 + }, + { + "epoch": 0.6742579248584052, + "grad_norm": 0.09980306774377823, + "learning_rate": 0.00024422289451436616, + "loss": 2.608, + "step": 22738 + }, + { + "epoch": 0.6742875782107167, + "grad_norm": 0.08964595943689346, + "learning_rate": 0.0002441824670615445, + "loss": 2.6067, + "step": 22739 + }, + { + "epoch": 0.6743172315630283, + "grad_norm": 0.10129056125879288, + "learning_rate": 0.000244142041874031, + "loss": 2.6108, + "step": 22740 + }, + { + "epoch": 0.6743468849153397, + "grad_norm": 0.10107897222042084, + "learning_rate": 0.00024410161895218368, + "loss": 2.613, + "step": 22741 + }, + { + "epoch": 0.6743765382676512, + "grad_norm": 0.097382090985775, + "learning_rate": 0.00024406119829636043, + "loss": 2.6349, + "step": 22742 + }, + { + "epoch": 0.6744061916199626, + "grad_norm": 0.10012050718069077, + "learning_rate": 0.00024402077990691917, + "loss": 2.62, + "step": 22743 + }, + { + "epoch": 0.6744358449722742, + "grad_norm": 0.1100703552365303, + "learning_rate": 0.0002439803637842179, + "loss": 2.5995, + "step": 22744 + }, + { + "epoch": 0.6744654983245856, + "grad_norm": 0.09874848276376724, + "learning_rate": 0.0002439399499286144, + "loss": 2.5932, + "step": 22745 + }, + { + "epoch": 0.6744951516768971, + "grad_norm": 0.10091961175203323, + "learning_rate": 0.00024389953834046664, + "loss": 2.568, + "step": 22746 + }, + { + "epoch": 0.6745248050292085, + "grad_norm": 0.10356558859348297, + "learning_rate": 0.00024385912902013236, + "loss": 2.5956, + "step": 22747 + }, + { + "epoch": 0.6745544583815201, + "grad_norm": 0.11388097703456879, + "learning_rate": 0.00024381872196796951, + "loss": 2.6153, + "step": 22748 + }, + { + "epoch": 0.6745841117338315, + "grad_norm": 0.09857448190450668, + "learning_rate": 0.00024377831718433584, + "loss": 2.5974, + "step": 22749 + }, + { + "epoch": 0.674613765086143, + "grad_norm": 0.10406811535358429, + "learning_rate": 0.00024373791466958918, + "loss": 2.6374, + "step": 22750 + }, + { + "epoch": 0.6746434184384544, + "grad_norm": 0.09670273214578629, + "learning_rate": 0.0002436975144240874, + "loss": 2.6298, + "step": 22751 + }, + { + "epoch": 0.674673071790766, + "grad_norm": 0.11053169518709183, + "learning_rate": 0.00024365711644818794, + "loss": 2.6336, + "step": 22752 + }, + { + "epoch": 0.6747027251430774, + "grad_norm": 0.09693772345781326, + "learning_rate": 0.00024361672074224856, + "loss": 2.622, + "step": 22753 + }, + { + "epoch": 0.6747323784953889, + "grad_norm": 0.10549376904964447, + "learning_rate": 0.00024357632730662732, + "loss": 2.6042, + "step": 22754 + }, + { + "epoch": 0.6747620318477003, + "grad_norm": 0.1046973243355751, + "learning_rate": 0.0002435359361416817, + "loss": 2.5923, + "step": 22755 + }, + { + "epoch": 0.6747916852000119, + "grad_norm": 0.09992551058530807, + "learning_rate": 0.0002434955472477694, + "loss": 2.6044, + "step": 22756 + }, + { + "epoch": 0.6748213385523233, + "grad_norm": 0.10072194784879684, + "learning_rate": 0.00024345516062524803, + "loss": 2.6275, + "step": 22757 + }, + { + "epoch": 0.6748509919046348, + "grad_norm": 0.08978056907653809, + "learning_rate": 0.0002434147762744752, + "loss": 2.6221, + "step": 22758 + }, + { + "epoch": 0.6748806452569462, + "grad_norm": 0.10365080833435059, + "learning_rate": 0.0002433743941958087, + "loss": 2.6143, + "step": 22759 + }, + { + "epoch": 0.6749102986092578, + "grad_norm": 0.08710408955812454, + "learning_rate": 0.0002433340143896059, + "loss": 2.6484, + "step": 22760 + }, + { + "epoch": 0.6749399519615693, + "grad_norm": 0.0967956930398941, + "learning_rate": 0.00024329363685622447, + "loss": 2.5934, + "step": 22761 + }, + { + "epoch": 0.6749696053138807, + "grad_norm": 0.09085509926080704, + "learning_rate": 0.00024325326159602218, + "loss": 2.6011, + "step": 22762 + }, + { + "epoch": 0.6749992586661923, + "grad_norm": 0.10094763338565826, + "learning_rate": 0.00024321288860935608, + "loss": 2.638, + "step": 22763 + }, + { + "epoch": 0.6750289120185037, + "grad_norm": 0.10411945730447769, + "learning_rate": 0.00024317251789658396, + "loss": 2.6102, + "step": 22764 + }, + { + "epoch": 0.6750585653708152, + "grad_norm": 0.1068764179944992, + "learning_rate": 0.0002431321494580633, + "loss": 2.6425, + "step": 22765 + }, + { + "epoch": 0.6750882187231266, + "grad_norm": 0.09210896492004395, + "learning_rate": 0.00024309178329415127, + "loss": 2.6106, + "step": 22766 + }, + { + "epoch": 0.6751178720754382, + "grad_norm": 0.10126630961894989, + "learning_rate": 0.00024305141940520586, + "loss": 2.6226, + "step": 22767 + }, + { + "epoch": 0.6751475254277496, + "grad_norm": 0.08890065550804138, + "learning_rate": 0.00024301105779158422, + "loss": 2.5808, + "step": 22768 + }, + { + "epoch": 0.6751771787800611, + "grad_norm": 0.10040184110403061, + "learning_rate": 0.00024297069845364378, + "loss": 2.5863, + "step": 22769 + }, + { + "epoch": 0.6752068321323725, + "grad_norm": 0.09737401455640793, + "learning_rate": 0.00024293034139174192, + "loss": 2.6427, + "step": 22770 + }, + { + "epoch": 0.6752364854846841, + "grad_norm": 0.09579311311244965, + "learning_rate": 0.000242889986606236, + "loss": 2.5938, + "step": 22771 + }, + { + "epoch": 0.6752661388369955, + "grad_norm": 0.11880998313426971, + "learning_rate": 0.00024284963409748362, + "loss": 2.6284, + "step": 22772 + }, + { + "epoch": 0.675295792189307, + "grad_norm": 0.11076398938894272, + "learning_rate": 0.0002428092838658416, + "loss": 2.6135, + "step": 22773 + }, + { + "epoch": 0.6753254455416184, + "grad_norm": 0.11217710375785828, + "learning_rate": 0.0002427689359116676, + "loss": 2.5964, + "step": 22774 + }, + { + "epoch": 0.67535509889393, + "grad_norm": 0.10236620903015137, + "learning_rate": 0.0002427285902353188, + "loss": 2.6337, + "step": 22775 + }, + { + "epoch": 0.6753847522462414, + "grad_norm": 0.11332415044307709, + "learning_rate": 0.00024268824683715246, + "loss": 2.6176, + "step": 22776 + }, + { + "epoch": 0.6754144055985529, + "grad_norm": 0.11056677997112274, + "learning_rate": 0.00024264790571752588, + "loss": 2.6363, + "step": 22777 + }, + { + "epoch": 0.6754440589508643, + "grad_norm": 0.11022460460662842, + "learning_rate": 0.00024260756687679625, + "loss": 2.5776, + "step": 22778 + }, + { + "epoch": 0.6754737123031759, + "grad_norm": 0.09906620532274246, + "learning_rate": 0.00024256723031532062, + "loss": 2.6642, + "step": 22779 + }, + { + "epoch": 0.6755033656554873, + "grad_norm": 0.11598610877990723, + "learning_rate": 0.00024252689603345657, + "loss": 2.611, + "step": 22780 + }, + { + "epoch": 0.6755330190077988, + "grad_norm": 0.10700701922178268, + "learning_rate": 0.000242486564031561, + "loss": 2.6125, + "step": 22781 + }, + { + "epoch": 0.6755626723601104, + "grad_norm": 0.1024412289261818, + "learning_rate": 0.00024244623430999114, + "loss": 2.625, + "step": 22782 + }, + { + "epoch": 0.6755923257124218, + "grad_norm": 0.1016763374209404, + "learning_rate": 0.0002424059068691043, + "loss": 2.6139, + "step": 22783 + }, + { + "epoch": 0.6756219790647333, + "grad_norm": 0.108744315803051, + "learning_rate": 0.00024236558170925715, + "loss": 2.6489, + "step": 22784 + }, + { + "epoch": 0.6756516324170447, + "grad_norm": 0.09634848684072495, + "learning_rate": 0.00024232525883080696, + "loss": 2.6281, + "step": 22785 + }, + { + "epoch": 0.6756812857693563, + "grad_norm": 0.09606102108955383, + "learning_rate": 0.0002422849382341109, + "loss": 2.5878, + "step": 22786 + }, + { + "epoch": 0.6757109391216677, + "grad_norm": 0.10831853747367859, + "learning_rate": 0.0002422446199195259, + "loss": 2.6116, + "step": 22787 + }, + { + "epoch": 0.6757405924739792, + "grad_norm": 0.10239963233470917, + "learning_rate": 0.00024220430388740904, + "loss": 2.6063, + "step": 22788 + }, + { + "epoch": 0.6757702458262906, + "grad_norm": 0.11717492341995239, + "learning_rate": 0.00024216399013811735, + "loss": 2.5932, + "step": 22789 + }, + { + "epoch": 0.6757998991786022, + "grad_norm": 0.10566841810941696, + "learning_rate": 0.00024212367867200774, + "loss": 2.6248, + "step": 22790 + }, + { + "epoch": 0.6758295525309136, + "grad_norm": 0.12027639150619507, + "learning_rate": 0.00024208336948943726, + "loss": 2.6605, + "step": 22791 + }, + { + "epoch": 0.6758592058832251, + "grad_norm": 0.09664767235517502, + "learning_rate": 0.0002420430625907626, + "loss": 2.5956, + "step": 22792 + }, + { + "epoch": 0.6758888592355365, + "grad_norm": 0.10485542565584183, + "learning_rate": 0.0002420027579763413, + "loss": 2.6311, + "step": 22793 + }, + { + "epoch": 0.6759185125878481, + "grad_norm": 0.10615898668766022, + "learning_rate": 0.00024196245564652962, + "loss": 2.6413, + "step": 22794 + }, + { + "epoch": 0.6759481659401595, + "grad_norm": 0.09872312843799591, + "learning_rate": 0.00024192215560168474, + "loss": 2.6503, + "step": 22795 + }, + { + "epoch": 0.675977819292471, + "grad_norm": 0.10629673302173615, + "learning_rate": 0.00024188185784216349, + "loss": 2.5781, + "step": 22796 + }, + { + "epoch": 0.6760074726447824, + "grad_norm": 0.11491027474403381, + "learning_rate": 0.00024184156236832266, + "loss": 2.6132, + "step": 22797 + }, + { + "epoch": 0.676037125997094, + "grad_norm": 0.1026374027132988, + "learning_rate": 0.00024180126918051909, + "loss": 2.5993, + "step": 22798 + }, + { + "epoch": 0.6760667793494054, + "grad_norm": 0.10198941826820374, + "learning_rate": 0.00024176097827910965, + "loss": 2.6138, + "step": 22799 + }, + { + "epoch": 0.6760964327017169, + "grad_norm": 0.10785014927387238, + "learning_rate": 0.00024172068966445105, + "loss": 2.6192, + "step": 22800 + }, + { + "epoch": 0.6761260860540285, + "grad_norm": 0.09914124757051468, + "learning_rate": 0.00024168040333690007, + "loss": 2.6593, + "step": 22801 + }, + { + "epoch": 0.6761557394063399, + "grad_norm": 0.09710122644901276, + "learning_rate": 0.0002416401192968134, + "loss": 2.5796, + "step": 22802 + }, + { + "epoch": 0.6761853927586514, + "grad_norm": 0.10471480339765549, + "learning_rate": 0.0002415998375445479, + "loss": 2.6465, + "step": 22803 + }, + { + "epoch": 0.6762150461109628, + "grad_norm": 0.09064454585313797, + "learning_rate": 0.00024155955808046015, + "loss": 2.6077, + "step": 22804 + }, + { + "epoch": 0.6762446994632744, + "grad_norm": 0.0979766920208931, + "learning_rate": 0.00024151928090490694, + "loss": 2.5882, + "step": 22805 + }, + { + "epoch": 0.6762743528155858, + "grad_norm": 0.09167026728391647, + "learning_rate": 0.0002414790060182448, + "loss": 2.6255, + "step": 22806 + }, + { + "epoch": 0.6763040061678973, + "grad_norm": 0.10437260568141937, + "learning_rate": 0.00024143873342083046, + "loss": 2.5764, + "step": 22807 + }, + { + "epoch": 0.6763336595202087, + "grad_norm": 0.10717697441577911, + "learning_rate": 0.0002413984631130205, + "loss": 2.6473, + "step": 22808 + }, + { + "epoch": 0.6763633128725203, + "grad_norm": 0.09447117149829865, + "learning_rate": 0.00024135819509517155, + "loss": 2.6206, + "step": 22809 + }, + { + "epoch": 0.6763929662248317, + "grad_norm": 0.09219090640544891, + "learning_rate": 0.0002413179293676402, + "loss": 2.5887, + "step": 22810 + }, + { + "epoch": 0.6764226195771432, + "grad_norm": 0.10540821403265, + "learning_rate": 0.00024127766593078293, + "loss": 2.6162, + "step": 22811 + }, + { + "epoch": 0.6764522729294546, + "grad_norm": 0.09172641485929489, + "learning_rate": 0.00024123740478495636, + "loss": 2.6083, + "step": 22812 + }, + { + "epoch": 0.6764819262817662, + "grad_norm": 0.09464088082313538, + "learning_rate": 0.000241197145930517, + "loss": 2.6147, + "step": 22813 + }, + { + "epoch": 0.6765115796340776, + "grad_norm": 0.10528867691755295, + "learning_rate": 0.00024115688936782138, + "loss": 2.6498, + "step": 22814 + }, + { + "epoch": 0.6765412329863891, + "grad_norm": 0.10641076415777206, + "learning_rate": 0.00024111663509722585, + "loss": 2.6218, + "step": 22815 + }, + { + "epoch": 0.6765708863387006, + "grad_norm": 0.10220447927713394, + "learning_rate": 0.00024107638311908697, + "loss": 2.6332, + "step": 22816 + }, + { + "epoch": 0.6766005396910121, + "grad_norm": 0.11607760936021805, + "learning_rate": 0.0002410361334337614, + "loss": 2.6418, + "step": 22817 + }, + { + "epoch": 0.6766301930433235, + "grad_norm": 0.11285670846700668, + "learning_rate": 0.00024099588604160495, + "loss": 2.6278, + "step": 22818 + }, + { + "epoch": 0.676659846395635, + "grad_norm": 0.1008845716714859, + "learning_rate": 0.0002409556409429745, + "loss": 2.624, + "step": 22819 + }, + { + "epoch": 0.6766894997479465, + "grad_norm": 0.10350636392831802, + "learning_rate": 0.00024091539813822632, + "loss": 2.6373, + "step": 22820 + }, + { + "epoch": 0.676719153100258, + "grad_norm": 0.10151413828134537, + "learning_rate": 0.00024087515762771683, + "loss": 2.6288, + "step": 22821 + }, + { + "epoch": 0.6767488064525695, + "grad_norm": 0.10092971473932266, + "learning_rate": 0.00024083491941180224, + "loss": 2.6281, + "step": 22822 + }, + { + "epoch": 0.6767784598048809, + "grad_norm": 0.09555769711732864, + "learning_rate": 0.00024079468349083894, + "loss": 2.6239, + "step": 22823 + }, + { + "epoch": 0.6768081131571925, + "grad_norm": 0.10564056783914566, + "learning_rate": 0.00024075444986518325, + "loss": 2.5887, + "step": 22824 + }, + { + "epoch": 0.6768377665095039, + "grad_norm": 0.10289628803730011, + "learning_rate": 0.00024071421853519138, + "loss": 2.6056, + "step": 22825 + }, + { + "epoch": 0.6768674198618154, + "grad_norm": 0.11192762851715088, + "learning_rate": 0.00024067398950121955, + "loss": 2.6183, + "step": 22826 + }, + { + "epoch": 0.6768970732141268, + "grad_norm": 0.1017899289727211, + "learning_rate": 0.00024063376276362431, + "loss": 2.6254, + "step": 22827 + }, + { + "epoch": 0.6769267265664384, + "grad_norm": 0.10706210136413574, + "learning_rate": 0.00024059353832276144, + "loss": 2.5667, + "step": 22828 + }, + { + "epoch": 0.6769563799187498, + "grad_norm": 0.09704351425170898, + "learning_rate": 0.0002405533161789873, + "loss": 2.5738, + "step": 22829 + }, + { + "epoch": 0.6769860332710613, + "grad_norm": 0.10468858480453491, + "learning_rate": 0.00024051309633265806, + "loss": 2.6096, + "step": 22830 + }, + { + "epoch": 0.6770156866233727, + "grad_norm": 0.10066398233175278, + "learning_rate": 0.0002404728787841297, + "loss": 2.6208, + "step": 22831 + }, + { + "epoch": 0.6770453399756843, + "grad_norm": 0.10534151643514633, + "learning_rate": 0.00024043266353375876, + "loss": 2.6137, + "step": 22832 + }, + { + "epoch": 0.6770749933279957, + "grad_norm": 0.1050073578953743, + "learning_rate": 0.00024039245058190113, + "loss": 2.6274, + "step": 22833 + }, + { + "epoch": 0.6771046466803072, + "grad_norm": 0.09042111039161682, + "learning_rate": 0.00024035223992891286, + "loss": 2.6146, + "step": 22834 + }, + { + "epoch": 0.6771343000326187, + "grad_norm": 0.1060882955789566, + "learning_rate": 0.00024031203157515014, + "loss": 2.6198, + "step": 22835 + }, + { + "epoch": 0.6771639533849302, + "grad_norm": 0.10620179772377014, + "learning_rate": 0.00024027182552096893, + "loss": 2.5926, + "step": 22836 + }, + { + "epoch": 0.6771936067372416, + "grad_norm": 0.09828649461269379, + "learning_rate": 0.0002402316217667253, + "loss": 2.6195, + "step": 22837 + }, + { + "epoch": 0.6772232600895531, + "grad_norm": 0.11082612723112106, + "learning_rate": 0.00024019142031277542, + "loss": 2.6122, + "step": 22838 + }, + { + "epoch": 0.6772529134418646, + "grad_norm": 0.10526341199874878, + "learning_rate": 0.00024015122115947495, + "loss": 2.6506, + "step": 22839 + }, + { + "epoch": 0.6772825667941761, + "grad_norm": 0.09725140035152435, + "learning_rate": 0.00024011102430718, + "loss": 2.604, + "step": 22840 + }, + { + "epoch": 0.6773122201464875, + "grad_norm": 0.09440760314464569, + "learning_rate": 0.00024007082975624656, + "loss": 2.6198, + "step": 22841 + }, + { + "epoch": 0.677341873498799, + "grad_norm": 0.10203225910663605, + "learning_rate": 0.00024003063750703047, + "loss": 2.6003, + "step": 22842 + }, + { + "epoch": 0.6773715268511106, + "grad_norm": 0.09957108646631241, + "learning_rate": 0.00023999044755988774, + "loss": 2.6425, + "step": 22843 + }, + { + "epoch": 0.677401180203422, + "grad_norm": 0.09971190989017487, + "learning_rate": 0.0002399502599151741, + "loss": 2.5885, + "step": 22844 + }, + { + "epoch": 0.6774308335557335, + "grad_norm": 0.09490610659122467, + "learning_rate": 0.00023991007457324566, + "loss": 2.5904, + "step": 22845 + }, + { + "epoch": 0.677460486908045, + "grad_norm": 0.09576542675495148, + "learning_rate": 0.00023986989153445821, + "loss": 2.6171, + "step": 22846 + }, + { + "epoch": 0.6774901402603565, + "grad_norm": 0.10718235373497009, + "learning_rate": 0.0002398297107991675, + "loss": 2.6004, + "step": 22847 + }, + { + "epoch": 0.6775197936126679, + "grad_norm": 0.11863133311271667, + "learning_rate": 0.00023978953236772948, + "loss": 2.6019, + "step": 22848 + }, + { + "epoch": 0.6775494469649794, + "grad_norm": 0.11263591796159744, + "learning_rate": 0.0002397493562404997, + "loss": 2.6067, + "step": 22849 + }, + { + "epoch": 0.6775791003172908, + "grad_norm": 0.10449299216270447, + "learning_rate": 0.00023970918241783402, + "loss": 2.619, + "step": 22850 + }, + { + "epoch": 0.6776087536696024, + "grad_norm": 0.08737393468618393, + "learning_rate": 0.00023966901090008826, + "loss": 2.6345, + "step": 22851 + }, + { + "epoch": 0.6776384070219138, + "grad_norm": 0.10123572498559952, + "learning_rate": 0.00023962884168761802, + "loss": 2.62, + "step": 22852 + }, + { + "epoch": 0.6776680603742253, + "grad_norm": 0.09718680381774902, + "learning_rate": 0.0002395886747807791, + "loss": 2.6189, + "step": 22853 + }, + { + "epoch": 0.6776977137265368, + "grad_norm": 0.08894186466932297, + "learning_rate": 0.00023954851017992719, + "loss": 2.6637, + "step": 22854 + }, + { + "epoch": 0.6777273670788483, + "grad_norm": 0.10742398351430893, + "learning_rate": 0.0002395083478854179, + "loss": 2.6434, + "step": 22855 + }, + { + "epoch": 0.6777570204311597, + "grad_norm": 0.10104163736104965, + "learning_rate": 0.00023946818789760693, + "loss": 2.6125, + "step": 22856 + }, + { + "epoch": 0.6777866737834712, + "grad_norm": 0.1049196794629097, + "learning_rate": 0.00023942803021684966, + "loss": 2.5981, + "step": 22857 + }, + { + "epoch": 0.6778163271357827, + "grad_norm": 0.10369326919317245, + "learning_rate": 0.00023938787484350212, + "loss": 2.6281, + "step": 22858 + }, + { + "epoch": 0.6778459804880942, + "grad_norm": 0.10337232798337936, + "learning_rate": 0.00023934772177791985, + "loss": 2.6315, + "step": 22859 + }, + { + "epoch": 0.6778756338404056, + "grad_norm": 0.09776152670383453, + "learning_rate": 0.00023930757102045801, + "loss": 2.6455, + "step": 22860 + }, + { + "epoch": 0.6779052871927171, + "grad_norm": 0.10411829501390457, + "learning_rate": 0.00023926742257147248, + "loss": 2.6645, + "step": 22861 + }, + { + "epoch": 0.6779349405450286, + "grad_norm": 0.10834544897079468, + "learning_rate": 0.00023922727643131865, + "loss": 2.6369, + "step": 22862 + }, + { + "epoch": 0.6779645938973401, + "grad_norm": 0.11009054630994797, + "learning_rate": 0.00023918713260035198, + "loss": 2.632, + "step": 22863 + }, + { + "epoch": 0.6779942472496516, + "grad_norm": 0.09327571839094162, + "learning_rate": 0.00023914699107892808, + "loss": 2.6079, + "step": 22864 + }, + { + "epoch": 0.678023900601963, + "grad_norm": 0.1020822748541832, + "learning_rate": 0.00023910685186740233, + "loss": 2.6261, + "step": 22865 + }, + { + "epoch": 0.6780535539542746, + "grad_norm": 0.10027563571929932, + "learning_rate": 0.00023906671496613018, + "loss": 2.651, + "step": 22866 + }, + { + "epoch": 0.678083207306586, + "grad_norm": 0.10210326313972473, + "learning_rate": 0.0002390265803754671, + "loss": 2.6203, + "step": 22867 + }, + { + "epoch": 0.6781128606588975, + "grad_norm": 0.10245751589536667, + "learning_rate": 0.00023898644809576837, + "loss": 2.6064, + "step": 22868 + }, + { + "epoch": 0.678142514011209, + "grad_norm": 0.09826969355344772, + "learning_rate": 0.00023894631812738947, + "loss": 2.6723, + "step": 22869 + }, + { + "epoch": 0.6781721673635205, + "grad_norm": 0.10439082235097885, + "learning_rate": 0.0002389061904706858, + "loss": 2.618, + "step": 22870 + }, + { + "epoch": 0.6782018207158319, + "grad_norm": 0.10177434235811234, + "learning_rate": 0.00023886606512601256, + "loss": 2.6141, + "step": 22871 + }, + { + "epoch": 0.6782314740681434, + "grad_norm": 0.10849796235561371, + "learning_rate": 0.00023882594209372515, + "loss": 2.6125, + "step": 22872 + }, + { + "epoch": 0.6782611274204549, + "grad_norm": 0.10723907500505447, + "learning_rate": 0.00023878582137417886, + "loss": 2.5972, + "step": 22873 + }, + { + "epoch": 0.6782907807727664, + "grad_norm": 0.10606002062559128, + "learning_rate": 0.00023874570296772895, + "loss": 2.6183, + "step": 22874 + }, + { + "epoch": 0.6783204341250778, + "grad_norm": 0.11886047571897507, + "learning_rate": 0.0002387055868747307, + "loss": 2.6224, + "step": 22875 + }, + { + "epoch": 0.6783500874773893, + "grad_norm": 0.1191200390458107, + "learning_rate": 0.00023866547309553933, + "loss": 2.6338, + "step": 22876 + }, + { + "epoch": 0.6783797408297008, + "grad_norm": 0.11427290737628937, + "learning_rate": 0.00023862536163051008, + "loss": 2.5822, + "step": 22877 + }, + { + "epoch": 0.6784093941820123, + "grad_norm": 0.10396724194288254, + "learning_rate": 0.00023858525247999807, + "loss": 2.606, + "step": 22878 + }, + { + "epoch": 0.6784390475343237, + "grad_norm": 0.13489262759685516, + "learning_rate": 0.00023854514564435853, + "loss": 2.5939, + "step": 22879 + }, + { + "epoch": 0.6784687008866352, + "grad_norm": 0.11330666393041611, + "learning_rate": 0.0002385050411239466, + "loss": 2.6391, + "step": 22880 + }, + { + "epoch": 0.6784983542389467, + "grad_norm": 0.10767726600170135, + "learning_rate": 0.00023846493891911742, + "loss": 2.6349, + "step": 22881 + }, + { + "epoch": 0.6785280075912582, + "grad_norm": 0.129190593957901, + "learning_rate": 0.0002384248390302261, + "loss": 2.6039, + "step": 22882 + }, + { + "epoch": 0.6785576609435696, + "grad_norm": 0.11725790053606033, + "learning_rate": 0.00023838474145762773, + "loss": 2.5989, + "step": 22883 + }, + { + "epoch": 0.6785873142958811, + "grad_norm": 0.10643050819635391, + "learning_rate": 0.00023834464620167735, + "loss": 2.6423, + "step": 22884 + }, + { + "epoch": 0.6786169676481927, + "grad_norm": 0.11287230253219604, + "learning_rate": 0.0002383045532627301, + "loss": 2.6135, + "step": 22885 + }, + { + "epoch": 0.6786466210005041, + "grad_norm": 0.09883024543523788, + "learning_rate": 0.00023826446264114089, + "loss": 2.6097, + "step": 22886 + }, + { + "epoch": 0.6786762743528156, + "grad_norm": 0.09874773025512695, + "learning_rate": 0.00023822437433726484, + "loss": 2.628, + "step": 22887 + }, + { + "epoch": 0.678705927705127, + "grad_norm": 0.11079274117946625, + "learning_rate": 0.00023818428835145684, + "loss": 2.5904, + "step": 22888 + }, + { + "epoch": 0.6787355810574386, + "grad_norm": 0.1098121777176857, + "learning_rate": 0.00023814420468407194, + "loss": 2.6337, + "step": 22889 + }, + { + "epoch": 0.67876523440975, + "grad_norm": 0.10315509885549545, + "learning_rate": 0.00023810412333546505, + "loss": 2.6121, + "step": 22890 + }, + { + "epoch": 0.6787948877620615, + "grad_norm": 0.1174914538860321, + "learning_rate": 0.0002380640443059911, + "loss": 2.6423, + "step": 22891 + }, + { + "epoch": 0.678824541114373, + "grad_norm": 0.10661344975233078, + "learning_rate": 0.00023802396759600502, + "loss": 2.612, + "step": 22892 + }, + { + "epoch": 0.6788541944666845, + "grad_norm": 0.11002794653177261, + "learning_rate": 0.00023798389320586183, + "loss": 2.6227, + "step": 22893 + }, + { + "epoch": 0.6788838478189959, + "grad_norm": 0.10195095092058182, + "learning_rate": 0.00023794382113591605, + "loss": 2.6067, + "step": 22894 + }, + { + "epoch": 0.6789135011713074, + "grad_norm": 0.11479160189628601, + "learning_rate": 0.00023790375138652275, + "loss": 2.6227, + "step": 22895 + }, + { + "epoch": 0.6789431545236189, + "grad_norm": 0.10093090683221817, + "learning_rate": 0.00023786368395803647, + "loss": 2.6308, + "step": 22896 + }, + { + "epoch": 0.6789728078759304, + "grad_norm": 0.10685113072395325, + "learning_rate": 0.0002378236188508125, + "loss": 2.6164, + "step": 22897 + }, + { + "epoch": 0.6790024612282418, + "grad_norm": 0.09899231791496277, + "learning_rate": 0.00023778355606520542, + "loss": 2.6482, + "step": 22898 + }, + { + "epoch": 0.6790321145805533, + "grad_norm": 0.10319047421216965, + "learning_rate": 0.00023774349560156988, + "loss": 2.5837, + "step": 22899 + }, + { + "epoch": 0.6790617679328648, + "grad_norm": 0.10097410529851913, + "learning_rate": 0.00023770343746026075, + "loss": 2.6272, + "step": 22900 + }, + { + "epoch": 0.6790914212851763, + "grad_norm": 0.09327618032693863, + "learning_rate": 0.0002376633816416327, + "loss": 2.5936, + "step": 22901 + }, + { + "epoch": 0.6791210746374877, + "grad_norm": 0.09940069168806076, + "learning_rate": 0.00023762332814604044, + "loss": 2.5964, + "step": 22902 + }, + { + "epoch": 0.6791507279897993, + "grad_norm": 0.09955774247646332, + "learning_rate": 0.00023758327697383865, + "loss": 2.6316, + "step": 22903 + }, + { + "epoch": 0.6791803813421107, + "grad_norm": 0.09805002808570862, + "learning_rate": 0.00023754322812538214, + "loss": 2.5974, + "step": 22904 + }, + { + "epoch": 0.6792100346944222, + "grad_norm": 0.1161159798502922, + "learning_rate": 0.00023750318160102525, + "loss": 2.6602, + "step": 22905 + }, + { + "epoch": 0.6792396880467337, + "grad_norm": 0.10519547760486603, + "learning_rate": 0.0002374631374011227, + "loss": 2.6324, + "step": 22906 + }, + { + "epoch": 0.6792693413990452, + "grad_norm": 0.10757028311491013, + "learning_rate": 0.00023742309552602915, + "loss": 2.6247, + "step": 22907 + }, + { + "epoch": 0.6792989947513567, + "grad_norm": 0.10812085121870041, + "learning_rate": 0.00023738305597609916, + "loss": 2.5812, + "step": 22908 + }, + { + "epoch": 0.6793286481036681, + "grad_norm": 0.09650139510631561, + "learning_rate": 0.00023734301875168713, + "loss": 2.6069, + "step": 22909 + }, + { + "epoch": 0.6793583014559796, + "grad_norm": 0.09548409283161163, + "learning_rate": 0.00023730298385314785, + "loss": 2.6, + "step": 22910 + }, + { + "epoch": 0.6793879548082911, + "grad_norm": 0.09897095710039139, + "learning_rate": 0.0002372629512808358, + "loss": 2.6027, + "step": 22911 + }, + { + "epoch": 0.6794176081606026, + "grad_norm": 0.09027596563100815, + "learning_rate": 0.00023722292103510546, + "loss": 2.6149, + "step": 22912 + }, + { + "epoch": 0.679447261512914, + "grad_norm": 0.09804502874612808, + "learning_rate": 0.0002371828931163112, + "loss": 2.6241, + "step": 22913 + }, + { + "epoch": 0.6794769148652255, + "grad_norm": 0.09614194184541702, + "learning_rate": 0.00023714286752480774, + "loss": 2.6364, + "step": 22914 + }, + { + "epoch": 0.679506568217537, + "grad_norm": 0.09260084480047226, + "learning_rate": 0.00023710284426094912, + "loss": 2.6117, + "step": 22915 + }, + { + "epoch": 0.6795362215698485, + "grad_norm": 0.09417720139026642, + "learning_rate": 0.00023706282332508995, + "loss": 2.5944, + "step": 22916 + }, + { + "epoch": 0.6795658749221599, + "grad_norm": 0.09476038813591003, + "learning_rate": 0.0002370228047175846, + "loss": 2.6326, + "step": 22917 + }, + { + "epoch": 0.6795955282744714, + "grad_norm": 0.10334828495979309, + "learning_rate": 0.00023698278843878746, + "loss": 2.6195, + "step": 22918 + }, + { + "epoch": 0.6796251816267829, + "grad_norm": 0.10890910774469376, + "learning_rate": 0.00023694277448905283, + "loss": 2.6175, + "step": 22919 + }, + { + "epoch": 0.6796548349790944, + "grad_norm": 0.09880857169628143, + "learning_rate": 0.00023690276286873513, + "loss": 2.5598, + "step": 22920 + }, + { + "epoch": 0.6796844883314058, + "grad_norm": 0.10335885733366013, + "learning_rate": 0.0002368627535781886, + "loss": 2.6276, + "step": 22921 + }, + { + "epoch": 0.6797141416837174, + "grad_norm": 0.1145944595336914, + "learning_rate": 0.00023682274661776737, + "loss": 2.6312, + "step": 22922 + }, + { + "epoch": 0.6797437950360288, + "grad_norm": 0.0981755405664444, + "learning_rate": 0.00023678274198782613, + "loss": 2.6427, + "step": 22923 + }, + { + "epoch": 0.6797734483883403, + "grad_norm": 0.09873934090137482, + "learning_rate": 0.000236742739688719, + "loss": 2.629, + "step": 22924 + }, + { + "epoch": 0.6798031017406517, + "grad_norm": 0.09559397399425507, + "learning_rate": 0.00023670273972079998, + "loss": 2.6106, + "step": 22925 + }, + { + "epoch": 0.6798327550929633, + "grad_norm": 0.10202644765377045, + "learning_rate": 0.00023666274208442335, + "loss": 2.5871, + "step": 22926 + }, + { + "epoch": 0.6798624084452748, + "grad_norm": 0.10391587018966675, + "learning_rate": 0.00023662274677994338, + "loss": 2.5916, + "step": 22927 + }, + { + "epoch": 0.6798920617975862, + "grad_norm": 0.1300165206193924, + "learning_rate": 0.00023658275380771416, + "loss": 2.6572, + "step": 22928 + }, + { + "epoch": 0.6799217151498977, + "grad_norm": 0.11467760801315308, + "learning_rate": 0.00023654276316808988, + "loss": 2.5906, + "step": 22929 + }, + { + "epoch": 0.6799513685022092, + "grad_norm": 0.10292228311300278, + "learning_rate": 0.00023650277486142462, + "loss": 2.6669, + "step": 22930 + }, + { + "epoch": 0.6799810218545207, + "grad_norm": 0.10015063732862473, + "learning_rate": 0.0002364627888880726, + "loss": 2.6049, + "step": 22931 + }, + { + "epoch": 0.6800106752068321, + "grad_norm": 0.1029970645904541, + "learning_rate": 0.00023642280524838777, + "loss": 2.5607, + "step": 22932 + }, + { + "epoch": 0.6800403285591436, + "grad_norm": 0.10962287336587906, + "learning_rate": 0.00023638282394272426, + "loss": 2.5921, + "step": 22933 + }, + { + "epoch": 0.6800699819114551, + "grad_norm": 0.10629843175411224, + "learning_rate": 0.00023634284497143587, + "loss": 2.6361, + "step": 22934 + }, + { + "epoch": 0.6800996352637666, + "grad_norm": 0.1051415279507637, + "learning_rate": 0.00023630286833487723, + "loss": 2.5985, + "step": 22935 + }, + { + "epoch": 0.680129288616078, + "grad_norm": 0.10637689381837845, + "learning_rate": 0.00023626289403340178, + "loss": 2.5813, + "step": 22936 + }, + { + "epoch": 0.6801589419683896, + "grad_norm": 0.09814777970314026, + "learning_rate": 0.00023622292206736367, + "loss": 2.6274, + "step": 22937 + }, + { + "epoch": 0.680188595320701, + "grad_norm": 0.10701724141836166, + "learning_rate": 0.00023618295243711684, + "loss": 2.6002, + "step": 22938 + }, + { + "epoch": 0.6802182486730125, + "grad_norm": 0.09604386240243912, + "learning_rate": 0.0002361429851430153, + "loss": 2.6174, + "step": 22939 + }, + { + "epoch": 0.6802479020253239, + "grad_norm": 0.10557205229997635, + "learning_rate": 0.00023610302018541284, + "loss": 2.6127, + "step": 22940 + }, + { + "epoch": 0.6802775553776355, + "grad_norm": 0.11708365380764008, + "learning_rate": 0.00023606305756466352, + "loss": 2.6008, + "step": 22941 + }, + { + "epoch": 0.6803072087299469, + "grad_norm": 0.11899139732122421, + "learning_rate": 0.0002360230972811211, + "loss": 2.6281, + "step": 22942 + }, + { + "epoch": 0.6803368620822584, + "grad_norm": 0.11406941711902618, + "learning_rate": 0.00023598313933513942, + "loss": 2.5917, + "step": 22943 + }, + { + "epoch": 0.6803665154345698, + "grad_norm": 0.10501470416784286, + "learning_rate": 0.00023594318372707242, + "loss": 2.6097, + "step": 22944 + }, + { + "epoch": 0.6803961687868814, + "grad_norm": 0.11513425409793854, + "learning_rate": 0.0002359032304572738, + "loss": 2.6413, + "step": 22945 + }, + { + "epoch": 0.6804258221391928, + "grad_norm": 0.11751029640436172, + "learning_rate": 0.00023586327952609742, + "loss": 2.6347, + "step": 22946 + }, + { + "epoch": 0.6804554754915043, + "grad_norm": 0.11845937371253967, + "learning_rate": 0.00023582333093389706, + "loss": 2.624, + "step": 22947 + }, + { + "epoch": 0.6804851288438158, + "grad_norm": 0.11304350942373276, + "learning_rate": 0.00023578338468102644, + "loss": 2.6159, + "step": 22948 + }, + { + "epoch": 0.6805147821961273, + "grad_norm": 0.10767155885696411, + "learning_rate": 0.0002357434407678393, + "loss": 2.5986, + "step": 22949 + }, + { + "epoch": 0.6805444355484388, + "grad_norm": 0.1054307073354721, + "learning_rate": 0.00023570349919468936, + "loss": 2.6234, + "step": 22950 + }, + { + "epoch": 0.6805740889007502, + "grad_norm": 0.10667932778596878, + "learning_rate": 0.00023566355996193028, + "loss": 2.6128, + "step": 22951 + }, + { + "epoch": 0.6806037422530617, + "grad_norm": 0.10236530005931854, + "learning_rate": 0.00023562362306991575, + "loss": 2.592, + "step": 22952 + }, + { + "epoch": 0.6806333956053732, + "grad_norm": 0.10756563395261765, + "learning_rate": 0.00023558368851899947, + "loss": 2.6298, + "step": 22953 + }, + { + "epoch": 0.6806630489576847, + "grad_norm": 0.10083261877298355, + "learning_rate": 0.00023554375630953494, + "loss": 2.6262, + "step": 22954 + }, + { + "epoch": 0.6806927023099961, + "grad_norm": 0.10695435106754303, + "learning_rate": 0.0002355038264418759, + "loss": 2.5913, + "step": 22955 + }, + { + "epoch": 0.6807223556623077, + "grad_norm": 0.10385038703680038, + "learning_rate": 0.00023546389891637587, + "loss": 2.6464, + "step": 22956 + }, + { + "epoch": 0.6807520090146191, + "grad_norm": 0.09424068033695221, + "learning_rate": 0.00023542397373338837, + "loss": 2.5971, + "step": 22957 + }, + { + "epoch": 0.6807816623669306, + "grad_norm": 0.10374026745557785, + "learning_rate": 0.00023538405089326703, + "loss": 2.6466, + "step": 22958 + }, + { + "epoch": 0.680811315719242, + "grad_norm": 0.09409578889608383, + "learning_rate": 0.00023534413039636554, + "loss": 2.6359, + "step": 22959 + }, + { + "epoch": 0.6808409690715536, + "grad_norm": 0.09959955513477325, + "learning_rate": 0.00023530421224303682, + "loss": 2.6288, + "step": 22960 + }, + { + "epoch": 0.680870622423865, + "grad_norm": 0.11485157161951065, + "learning_rate": 0.00023526429643363488, + "loss": 2.6188, + "step": 22961 + }, + { + "epoch": 0.6809002757761765, + "grad_norm": 0.11084944009780884, + "learning_rate": 0.00023522438296851313, + "loss": 2.6167, + "step": 22962 + }, + { + "epoch": 0.6809299291284879, + "grad_norm": 0.10701944679021835, + "learning_rate": 0.00023518447184802483, + "loss": 2.6002, + "step": 22963 + }, + { + "epoch": 0.6809595824807995, + "grad_norm": 0.11612579971551895, + "learning_rate": 0.00023514456307252351, + "loss": 2.6244, + "step": 22964 + }, + { + "epoch": 0.6809892358331109, + "grad_norm": 0.10826824605464935, + "learning_rate": 0.00023510465664236259, + "loss": 2.6393, + "step": 22965 + }, + { + "epoch": 0.6810188891854224, + "grad_norm": 0.11720868200063705, + "learning_rate": 0.00023506475255789534, + "loss": 2.6313, + "step": 22966 + }, + { + "epoch": 0.6810485425377338, + "grad_norm": 0.1127614825963974, + "learning_rate": 0.00023502485081947522, + "loss": 2.5787, + "step": 22967 + }, + { + "epoch": 0.6810781958900454, + "grad_norm": 0.10482091456651688, + "learning_rate": 0.00023498495142745552, + "loss": 2.6116, + "step": 22968 + }, + { + "epoch": 0.6811078492423569, + "grad_norm": 0.10388600826263428, + "learning_rate": 0.00023494505438218977, + "loss": 2.627, + "step": 22969 + }, + { + "epoch": 0.6811375025946683, + "grad_norm": 0.10781538486480713, + "learning_rate": 0.00023490515968403082, + "loss": 2.6265, + "step": 22970 + }, + { + "epoch": 0.6811671559469799, + "grad_norm": 0.12364940345287323, + "learning_rate": 0.00023486526733333224, + "loss": 2.6204, + "step": 22971 + }, + { + "epoch": 0.6811968092992913, + "grad_norm": 0.11471662670373917, + "learning_rate": 0.00023482537733044716, + "loss": 2.6154, + "step": 22972 + }, + { + "epoch": 0.6812264626516028, + "grad_norm": 0.10184979438781738, + "learning_rate": 0.00023478548967572878, + "loss": 2.5806, + "step": 22973 + }, + { + "epoch": 0.6812561160039142, + "grad_norm": 0.1153513491153717, + "learning_rate": 0.00023474560436953057, + "loss": 2.6211, + "step": 22974 + }, + { + "epoch": 0.6812857693562258, + "grad_norm": 0.1159454956650734, + "learning_rate": 0.00023470572141220554, + "loss": 2.6335, + "step": 22975 + }, + { + "epoch": 0.6813154227085372, + "grad_norm": 0.12038097530603409, + "learning_rate": 0.00023466584080410693, + "loss": 2.5967, + "step": 22976 + }, + { + "epoch": 0.6813450760608487, + "grad_norm": 0.13080070912837982, + "learning_rate": 0.00023462596254558777, + "loss": 2.6293, + "step": 22977 + }, + { + "epoch": 0.6813747294131601, + "grad_norm": 0.09844120591878891, + "learning_rate": 0.00023458608663700132, + "loss": 2.5955, + "step": 22978 + }, + { + "epoch": 0.6814043827654717, + "grad_norm": 0.11363821476697922, + "learning_rate": 0.0002345462130787006, + "loss": 2.6081, + "step": 22979 + }, + { + "epoch": 0.6814340361177831, + "grad_norm": 0.12761783599853516, + "learning_rate": 0.00023450634187103893, + "loss": 2.6193, + "step": 22980 + }, + { + "epoch": 0.6814636894700946, + "grad_norm": 0.10557854920625687, + "learning_rate": 0.00023446647301436895, + "loss": 2.6133, + "step": 22981 + }, + { + "epoch": 0.681493342822406, + "grad_norm": 0.11303791403770447, + "learning_rate": 0.00023442660650904395, + "loss": 2.6256, + "step": 22982 + }, + { + "epoch": 0.6815229961747176, + "grad_norm": 0.11581925302743912, + "learning_rate": 0.00023438674235541697, + "loss": 2.6115, + "step": 22983 + }, + { + "epoch": 0.681552649527029, + "grad_norm": 0.09141435474157333, + "learning_rate": 0.00023434688055384097, + "loss": 2.6065, + "step": 22984 + }, + { + "epoch": 0.6815823028793405, + "grad_norm": 0.12489510327577591, + "learning_rate": 0.0002343070211046689, + "loss": 2.6447, + "step": 22985 + }, + { + "epoch": 0.6816119562316519, + "grad_norm": 0.13364849984645844, + "learning_rate": 0.0002342671640082536, + "loss": 2.6146, + "step": 22986 + }, + { + "epoch": 0.6816416095839635, + "grad_norm": 0.10461913049221039, + "learning_rate": 0.00023422730926494839, + "loss": 2.6046, + "step": 22987 + }, + { + "epoch": 0.6816712629362749, + "grad_norm": 0.11983351409435272, + "learning_rate": 0.00023418745687510596, + "loss": 2.6045, + "step": 22988 + }, + { + "epoch": 0.6817009162885864, + "grad_norm": 0.11280723661184311, + "learning_rate": 0.0002341476068390792, + "loss": 2.5811, + "step": 22989 + }, + { + "epoch": 0.681730569640898, + "grad_norm": 0.1094072014093399, + "learning_rate": 0.00023410775915722122, + "loss": 2.626, + "step": 22990 + }, + { + "epoch": 0.6817602229932094, + "grad_norm": 0.11008204519748688, + "learning_rate": 0.0002340679138298845, + "loss": 2.6528, + "step": 22991 + }, + { + "epoch": 0.6817898763455209, + "grad_norm": 0.1025678738951683, + "learning_rate": 0.000234028070857422, + "loss": 2.5944, + "step": 22992 + }, + { + "epoch": 0.6818195296978323, + "grad_norm": 0.11707829684019089, + "learning_rate": 0.00023398823024018667, + "loss": 2.6446, + "step": 22993 + }, + { + "epoch": 0.6818491830501439, + "grad_norm": 0.1117783710360527, + "learning_rate": 0.00023394839197853114, + "loss": 2.6137, + "step": 22994 + }, + { + "epoch": 0.6818788364024553, + "grad_norm": 0.11035259068012238, + "learning_rate": 0.0002339085560728083, + "loss": 2.6199, + "step": 22995 + }, + { + "epoch": 0.6819084897547668, + "grad_norm": 0.1048029288649559, + "learning_rate": 0.0002338687225233709, + "loss": 2.6001, + "step": 22996 + }, + { + "epoch": 0.6819381431070782, + "grad_norm": 0.10203373432159424, + "learning_rate": 0.00023382889133057155, + "loss": 2.6047, + "step": 22997 + }, + { + "epoch": 0.6819677964593898, + "grad_norm": 0.10672051459550858, + "learning_rate": 0.00023378906249476317, + "loss": 2.6333, + "step": 22998 + }, + { + "epoch": 0.6819974498117012, + "grad_norm": 0.12734775245189667, + "learning_rate": 0.00023374923601629806, + "loss": 2.5935, + "step": 22999 + }, + { + "epoch": 0.6820271031640127, + "grad_norm": 0.12829001247882843, + "learning_rate": 0.0002337094118955296, + "loss": 2.6558, + "step": 23000 + }, + { + "epoch": 0.6820567565163241, + "grad_norm": 0.1019856259226799, + "learning_rate": 0.0002336695901328098, + "loss": 2.6082, + "step": 23001 + }, + { + "epoch": 0.6820864098686357, + "grad_norm": 0.10894406586885452, + "learning_rate": 0.00023362977072849145, + "loss": 2.6226, + "step": 23002 + }, + { + "epoch": 0.6821160632209471, + "grad_norm": 0.10842135548591614, + "learning_rate": 0.00023358995368292723, + "loss": 2.6321, + "step": 23003 + }, + { + "epoch": 0.6821457165732586, + "grad_norm": 0.11191968619823456, + "learning_rate": 0.00023355013899646976, + "loss": 2.6284, + "step": 23004 + }, + { + "epoch": 0.68217536992557, + "grad_norm": 0.10194531828165054, + "learning_rate": 0.00023351032666947148, + "loss": 2.5769, + "step": 23005 + }, + { + "epoch": 0.6822050232778816, + "grad_norm": 0.11386469006538391, + "learning_rate": 0.00023347051670228504, + "loss": 2.6507, + "step": 23006 + }, + { + "epoch": 0.682234676630193, + "grad_norm": 0.10652086138725281, + "learning_rate": 0.00023343070909526286, + "loss": 2.612, + "step": 23007 + }, + { + "epoch": 0.6822643299825045, + "grad_norm": 0.10793018341064453, + "learning_rate": 0.00023339090384875754, + "loss": 2.6136, + "step": 23008 + }, + { + "epoch": 0.682293983334816, + "grad_norm": 0.11222048103809357, + "learning_rate": 0.00023335110096312157, + "loss": 2.5763, + "step": 23009 + }, + { + "epoch": 0.6823236366871275, + "grad_norm": 0.1006356030702591, + "learning_rate": 0.0002333113004387073, + "loss": 2.6352, + "step": 23010 + }, + { + "epoch": 0.682353290039439, + "grad_norm": 0.1011262908577919, + "learning_rate": 0.0002332715022758673, + "loss": 2.6196, + "step": 23011 + }, + { + "epoch": 0.6823829433917504, + "grad_norm": 0.09928636252880096, + "learning_rate": 0.0002332317064749539, + "loss": 2.6313, + "step": 23012 + }, + { + "epoch": 0.682412596744062, + "grad_norm": 0.09672360122203827, + "learning_rate": 0.00023319191303631953, + "loss": 2.6162, + "step": 23013 + }, + { + "epoch": 0.6824422500963734, + "grad_norm": 0.1044580489397049, + "learning_rate": 0.00023315212196031655, + "loss": 2.6254, + "step": 23014 + }, + { + "epoch": 0.6824719034486849, + "grad_norm": 0.08650905638933182, + "learning_rate": 0.00023311233324729735, + "loss": 2.6113, + "step": 23015 + }, + { + "epoch": 0.6825015568009963, + "grad_norm": 0.09784762561321259, + "learning_rate": 0.00023307254689761427, + "loss": 2.6029, + "step": 23016 + }, + { + "epoch": 0.6825312101533079, + "grad_norm": 0.09637473523616791, + "learning_rate": 0.00023303276291161964, + "loss": 2.6354, + "step": 23017 + }, + { + "epoch": 0.6825608635056193, + "grad_norm": 0.10482778400182724, + "learning_rate": 0.0002329929812896656, + "loss": 2.611, + "step": 23018 + }, + { + "epoch": 0.6825905168579308, + "grad_norm": 0.09765441715717316, + "learning_rate": 0.00023295320203210463, + "loss": 2.5965, + "step": 23019 + }, + { + "epoch": 0.6826201702102422, + "grad_norm": 0.10476604104042053, + "learning_rate": 0.00023291342513928888, + "loss": 2.6077, + "step": 23020 + }, + { + "epoch": 0.6826498235625538, + "grad_norm": 0.11236847937107086, + "learning_rate": 0.00023287365061157062, + "loss": 2.5798, + "step": 23021 + }, + { + "epoch": 0.6826794769148652, + "grad_norm": 0.09759963303804398, + "learning_rate": 0.00023283387844930199, + "loss": 2.5997, + "step": 23022 + }, + { + "epoch": 0.6827091302671767, + "grad_norm": 0.11921284347772598, + "learning_rate": 0.00023279410865283524, + "loss": 2.6061, + "step": 23023 + }, + { + "epoch": 0.6827387836194881, + "grad_norm": 0.09675098210573196, + "learning_rate": 0.0002327543412225227, + "loss": 2.5653, + "step": 23024 + }, + { + "epoch": 0.6827684369717997, + "grad_norm": 0.10204107314348221, + "learning_rate": 0.000232714576158716, + "loss": 2.6094, + "step": 23025 + }, + { + "epoch": 0.6827980903241111, + "grad_norm": 0.1095898449420929, + "learning_rate": 0.00023267481346176777, + "loss": 2.6068, + "step": 23026 + }, + { + "epoch": 0.6828277436764226, + "grad_norm": 0.10249920934438705, + "learning_rate": 0.00023263505313202992, + "loss": 2.6201, + "step": 23027 + }, + { + "epoch": 0.682857397028734, + "grad_norm": 0.09968800097703934, + "learning_rate": 0.00023259529516985461, + "loss": 2.6031, + "step": 23028 + }, + { + "epoch": 0.6828870503810456, + "grad_norm": 0.10778672248125076, + "learning_rate": 0.00023255553957559388, + "loss": 2.5988, + "step": 23029 + }, + { + "epoch": 0.6829167037333571, + "grad_norm": 0.10486814379692078, + "learning_rate": 0.0002325157863495997, + "loss": 2.6197, + "step": 23030 + }, + { + "epoch": 0.6829463570856685, + "grad_norm": 0.10907121747732162, + "learning_rate": 0.0002324760354922242, + "loss": 2.5989, + "step": 23031 + }, + { + "epoch": 0.6829760104379801, + "grad_norm": 0.09827939420938492, + "learning_rate": 0.0002324362870038193, + "loss": 2.6055, + "step": 23032 + }, + { + "epoch": 0.6830056637902915, + "grad_norm": 0.10657519847154617, + "learning_rate": 0.00023239654088473699, + "loss": 2.6097, + "step": 23033 + }, + { + "epoch": 0.683035317142603, + "grad_norm": 0.12295053899288177, + "learning_rate": 0.00023235679713532926, + "loss": 2.6089, + "step": 23034 + }, + { + "epoch": 0.6830649704949144, + "grad_norm": 0.1263585239648819, + "learning_rate": 0.0002323170557559482, + "loss": 2.6131, + "step": 23035 + }, + { + "epoch": 0.683094623847226, + "grad_norm": 0.13253135979175568, + "learning_rate": 0.00023227731674694535, + "loss": 2.626, + "step": 23036 + }, + { + "epoch": 0.6831242771995374, + "grad_norm": 0.12501277029514313, + "learning_rate": 0.0002322375801086729, + "loss": 2.5963, + "step": 23037 + }, + { + "epoch": 0.6831539305518489, + "grad_norm": 0.11089412122964859, + "learning_rate": 0.0002321978458414824, + "loss": 2.6114, + "step": 23038 + }, + { + "epoch": 0.6831835839041603, + "grad_norm": 0.11514652520418167, + "learning_rate": 0.00023215811394572611, + "loss": 2.6138, + "step": 23039 + }, + { + "epoch": 0.6832132372564719, + "grad_norm": 0.12082921713590622, + "learning_rate": 0.0002321183844217557, + "loss": 2.6121, + "step": 23040 + }, + { + "epoch": 0.6832428906087833, + "grad_norm": 0.11472789943218231, + "learning_rate": 0.000232078657269923, + "loss": 2.6525, + "step": 23041 + }, + { + "epoch": 0.6832725439610948, + "grad_norm": 0.13117903470993042, + "learning_rate": 0.0002320389324905798, + "loss": 2.624, + "step": 23042 + }, + { + "epoch": 0.6833021973134062, + "grad_norm": 0.13337989151477814, + "learning_rate": 0.0002319992100840778, + "loss": 2.6321, + "step": 23043 + }, + { + "epoch": 0.6833318506657178, + "grad_norm": 0.1239519789814949, + "learning_rate": 0.00023195949005076882, + "loss": 2.6253, + "step": 23044 + }, + { + "epoch": 0.6833615040180292, + "grad_norm": 0.11181606352329254, + "learning_rate": 0.00023191977239100475, + "loss": 2.6579, + "step": 23045 + }, + { + "epoch": 0.6833911573703407, + "grad_norm": 0.11496534943580627, + "learning_rate": 0.00023188005710513693, + "loss": 2.6189, + "step": 23046 + }, + { + "epoch": 0.6834208107226522, + "grad_norm": 0.1151345893740654, + "learning_rate": 0.00023184034419351725, + "loss": 2.6268, + "step": 23047 + }, + { + "epoch": 0.6834504640749637, + "grad_norm": 0.11842638999223709, + "learning_rate": 0.00023180063365649728, + "loss": 2.5918, + "step": 23048 + }, + { + "epoch": 0.6834801174272751, + "grad_norm": 0.10656993091106415, + "learning_rate": 0.00023176092549442878, + "loss": 2.5963, + "step": 23049 + }, + { + "epoch": 0.6835097707795866, + "grad_norm": 0.1187957152724266, + "learning_rate": 0.0002317212197076633, + "loss": 2.5868, + "step": 23050 + }, + { + "epoch": 0.6835394241318982, + "grad_norm": 0.11831466108560562, + "learning_rate": 0.00023168151629655232, + "loss": 2.5909, + "step": 23051 + }, + { + "epoch": 0.6835690774842096, + "grad_norm": 0.10570165514945984, + "learning_rate": 0.00023164181526144772, + "loss": 2.6177, + "step": 23052 + }, + { + "epoch": 0.6835987308365211, + "grad_norm": 0.11038850992918015, + "learning_rate": 0.0002316021166027009, + "loss": 2.6296, + "step": 23053 + }, + { + "epoch": 0.6836283841888325, + "grad_norm": 0.10271402448415756, + "learning_rate": 0.0002315624203206635, + "loss": 2.6159, + "step": 23054 + }, + { + "epoch": 0.6836580375411441, + "grad_norm": 0.1118030846118927, + "learning_rate": 0.00023152272641568684, + "loss": 2.671, + "step": 23055 + }, + { + "epoch": 0.6836876908934555, + "grad_norm": 0.10773541778326035, + "learning_rate": 0.00023148303488812277, + "loss": 2.6182, + "step": 23056 + }, + { + "epoch": 0.683717344245767, + "grad_norm": 0.09968136996030807, + "learning_rate": 0.00023144334573832232, + "loss": 2.6055, + "step": 23057 + }, + { + "epoch": 0.6837469975980784, + "grad_norm": 0.11281561851501465, + "learning_rate": 0.00023140365896663712, + "loss": 2.6279, + "step": 23058 + }, + { + "epoch": 0.68377665095039, + "grad_norm": 0.11481975018978119, + "learning_rate": 0.00023136397457341863, + "loss": 2.6631, + "step": 23059 + }, + { + "epoch": 0.6838063043027014, + "grad_norm": 0.10479501634836197, + "learning_rate": 0.00023132429255901828, + "loss": 2.6035, + "step": 23060 + }, + { + "epoch": 0.6838359576550129, + "grad_norm": 0.10267571359872818, + "learning_rate": 0.00023128461292378738, + "loss": 2.5788, + "step": 23061 + }, + { + "epoch": 0.6838656110073243, + "grad_norm": 0.0994017943739891, + "learning_rate": 0.0002312449356680774, + "loss": 2.6163, + "step": 23062 + }, + { + "epoch": 0.6838952643596359, + "grad_norm": 0.10751873254776001, + "learning_rate": 0.00023120526079223964, + "loss": 2.6074, + "step": 23063 + }, + { + "epoch": 0.6839249177119473, + "grad_norm": 0.10104689747095108, + "learning_rate": 0.00023116558829662525, + "loss": 2.5952, + "step": 23064 + }, + { + "epoch": 0.6839545710642588, + "grad_norm": 0.11719255894422531, + "learning_rate": 0.0002311259181815859, + "loss": 2.619, + "step": 23065 + }, + { + "epoch": 0.6839842244165703, + "grad_norm": 0.10320200771093369, + "learning_rate": 0.0002310862504474729, + "loss": 2.6323, + "step": 23066 + }, + { + "epoch": 0.6840138777688818, + "grad_norm": 0.11433843523263931, + "learning_rate": 0.0002310465850946371, + "loss": 2.5918, + "step": 23067 + }, + { + "epoch": 0.6840435311211932, + "grad_norm": 0.10108604282140732, + "learning_rate": 0.00023100692212342993, + "loss": 2.641, + "step": 23068 + }, + { + "epoch": 0.6840731844735047, + "grad_norm": 0.10903631895780563, + "learning_rate": 0.00023096726153420271, + "loss": 2.6157, + "step": 23069 + }, + { + "epoch": 0.6841028378258162, + "grad_norm": 0.10891272127628326, + "learning_rate": 0.0002309276033273065, + "loss": 2.6073, + "step": 23070 + }, + { + "epoch": 0.6841324911781277, + "grad_norm": 0.10815293341875076, + "learning_rate": 0.0002308879475030926, + "loss": 2.6117, + "step": 23071 + }, + { + "epoch": 0.6841621445304392, + "grad_norm": 0.09982377290725708, + "learning_rate": 0.0002308482940619121, + "loss": 2.6462, + "step": 23072 + }, + { + "epoch": 0.6841917978827506, + "grad_norm": 0.1103110983967781, + "learning_rate": 0.00023080864300411614, + "loss": 2.5774, + "step": 23073 + }, + { + "epoch": 0.6842214512350622, + "grad_norm": 0.10296137630939484, + "learning_rate": 0.00023076899433005588, + "loss": 2.6293, + "step": 23074 + }, + { + "epoch": 0.6842511045873736, + "grad_norm": 0.1135864332318306, + "learning_rate": 0.00023072934804008234, + "loss": 2.5994, + "step": 23075 + }, + { + "epoch": 0.6842807579396851, + "grad_norm": 0.0980154350399971, + "learning_rate": 0.00023068970413454672, + "loss": 2.6142, + "step": 23076 + }, + { + "epoch": 0.6843104112919965, + "grad_norm": 0.10961373150348663, + "learning_rate": 0.00023065006261379988, + "loss": 2.6172, + "step": 23077 + }, + { + "epoch": 0.6843400646443081, + "grad_norm": 0.1026579737663269, + "learning_rate": 0.00023061042347819307, + "loss": 2.6225, + "step": 23078 + }, + { + "epoch": 0.6843697179966195, + "grad_norm": 0.12576735019683838, + "learning_rate": 0.0002305707867280772, + "loss": 2.6015, + "step": 23079 + }, + { + "epoch": 0.684399371348931, + "grad_norm": 0.1246352344751358, + "learning_rate": 0.00023053115236380318, + "loss": 2.5767, + "step": 23080 + }, + { + "epoch": 0.6844290247012424, + "grad_norm": 0.1022171825170517, + "learning_rate": 0.00023049152038572213, + "loss": 2.5895, + "step": 23081 + }, + { + "epoch": 0.684458678053554, + "grad_norm": 0.14189192652702332, + "learning_rate": 0.00023045189079418487, + "loss": 2.6334, + "step": 23082 + }, + { + "epoch": 0.6844883314058654, + "grad_norm": 0.13158570230007172, + "learning_rate": 0.00023041226358954243, + "loss": 2.6239, + "step": 23083 + }, + { + "epoch": 0.6845179847581769, + "grad_norm": 0.11669711023569107, + "learning_rate": 0.0002303726387721457, + "loss": 2.587, + "step": 23084 + }, + { + "epoch": 0.6845476381104884, + "grad_norm": 0.1377635896205902, + "learning_rate": 0.0002303330163423455, + "loss": 2.6319, + "step": 23085 + }, + { + "epoch": 0.6845772914627999, + "grad_norm": 0.12734447419643402, + "learning_rate": 0.00023029339630049268, + "loss": 2.6222, + "step": 23086 + }, + { + "epoch": 0.6846069448151113, + "grad_norm": 0.10759666562080383, + "learning_rate": 0.0002302537786469382, + "loss": 2.5789, + "step": 23087 + }, + { + "epoch": 0.6846365981674228, + "grad_norm": 0.1257745921611786, + "learning_rate": 0.00023021416338203277, + "loss": 2.62, + "step": 23088 + }, + { + "epoch": 0.6846662515197343, + "grad_norm": 0.10464034229516983, + "learning_rate": 0.00023017455050612724, + "loss": 2.6376, + "step": 23089 + }, + { + "epoch": 0.6846959048720458, + "grad_norm": 0.1032029315829277, + "learning_rate": 0.0002301349400195724, + "loss": 2.6112, + "step": 23090 + }, + { + "epoch": 0.6847255582243572, + "grad_norm": 0.1146087720990181, + "learning_rate": 0.00023009533192271898, + "loss": 2.6405, + "step": 23091 + }, + { + "epoch": 0.6847552115766687, + "grad_norm": 0.10608097910881042, + "learning_rate": 0.0002300557262159177, + "loss": 2.6395, + "step": 23092 + }, + { + "epoch": 0.6847848649289803, + "grad_norm": 0.11465276032686234, + "learning_rate": 0.00023001612289951935, + "loss": 2.6272, + "step": 23093 + }, + { + "epoch": 0.6848145182812917, + "grad_norm": 0.09861623495817184, + "learning_rate": 0.00022997652197387453, + "loss": 2.5978, + "step": 23094 + }, + { + "epoch": 0.6848441716336032, + "grad_norm": 0.09461171925067902, + "learning_rate": 0.00022993692343933398, + "loss": 2.6243, + "step": 23095 + }, + { + "epoch": 0.6848738249859146, + "grad_norm": 0.1014186441898346, + "learning_rate": 0.0002298973272962483, + "loss": 2.6127, + "step": 23096 + }, + { + "epoch": 0.6849034783382262, + "grad_norm": 0.09315633773803711, + "learning_rate": 0.00022985773354496813, + "loss": 2.5808, + "step": 23097 + }, + { + "epoch": 0.6849331316905376, + "grad_norm": 0.1007503941655159, + "learning_rate": 0.00022981814218584417, + "loss": 2.6632, + "step": 23098 + }, + { + "epoch": 0.6849627850428491, + "grad_norm": 0.1211509183049202, + "learning_rate": 0.00022977855321922692, + "loss": 2.5916, + "step": 23099 + }, + { + "epoch": 0.6849924383951606, + "grad_norm": 0.10976053774356842, + "learning_rate": 0.00022973896664546712, + "loss": 2.6203, + "step": 23100 + }, + { + "epoch": 0.6850220917474721, + "grad_norm": 0.1065152958035469, + "learning_rate": 0.00022969938246491495, + "loss": 2.6111, + "step": 23101 + }, + { + "epoch": 0.6850517450997835, + "grad_norm": 0.10281302034854889, + "learning_rate": 0.00022965980067792119, + "loss": 2.6044, + "step": 23102 + }, + { + "epoch": 0.685081398452095, + "grad_norm": 0.11445040255784988, + "learning_rate": 0.00022962022128483612, + "loss": 2.651, + "step": 23103 + }, + { + "epoch": 0.6851110518044065, + "grad_norm": 0.09981323033571243, + "learning_rate": 0.00022958064428601056, + "loss": 2.6005, + "step": 23104 + }, + { + "epoch": 0.685140705156718, + "grad_norm": 0.10930544883012772, + "learning_rate": 0.00022954106968179483, + "loss": 2.6093, + "step": 23105 + }, + { + "epoch": 0.6851703585090294, + "grad_norm": 0.10968472808599472, + "learning_rate": 0.00022950149747253936, + "loss": 2.6334, + "step": 23106 + }, + { + "epoch": 0.6852000118613409, + "grad_norm": 0.10038536041975021, + "learning_rate": 0.00022946192765859453, + "loss": 2.6141, + "step": 23107 + }, + { + "epoch": 0.6852296652136524, + "grad_norm": 0.10746768116950989, + "learning_rate": 0.00022942236024031077, + "loss": 2.6225, + "step": 23108 + }, + { + "epoch": 0.6852593185659639, + "grad_norm": 0.1132851392030716, + "learning_rate": 0.00022938279521803845, + "loss": 2.6203, + "step": 23109 + }, + { + "epoch": 0.6852889719182753, + "grad_norm": 0.10418025404214859, + "learning_rate": 0.00022934323259212797, + "loss": 2.6243, + "step": 23110 + }, + { + "epoch": 0.6853186252705868, + "grad_norm": 0.10309728980064392, + "learning_rate": 0.00022930367236292977, + "loss": 2.6011, + "step": 23111 + }, + { + "epoch": 0.6853482786228983, + "grad_norm": 0.12339019030332565, + "learning_rate": 0.00022926411453079389, + "loss": 2.6052, + "step": 23112 + }, + { + "epoch": 0.6853779319752098, + "grad_norm": 0.09331027418375015, + "learning_rate": 0.00022922455909607064, + "loss": 2.5794, + "step": 23113 + }, + { + "epoch": 0.6854075853275213, + "grad_norm": 0.11497136205434799, + "learning_rate": 0.00022918500605911046, + "loss": 2.6228, + "step": 23114 + }, + { + "epoch": 0.6854372386798327, + "grad_norm": 0.10441050678491592, + "learning_rate": 0.00022914545542026356, + "loss": 2.6166, + "step": 23115 + }, + { + "epoch": 0.6854668920321443, + "grad_norm": 0.10702193528413773, + "learning_rate": 0.00022910590717987995, + "loss": 2.6054, + "step": 23116 + }, + { + "epoch": 0.6854965453844557, + "grad_norm": 0.10100159049034119, + "learning_rate": 0.00022906636133831015, + "loss": 2.6385, + "step": 23117 + }, + { + "epoch": 0.6855261987367672, + "grad_norm": 0.10010731220245361, + "learning_rate": 0.00022902681789590428, + "loss": 2.6582, + "step": 23118 + }, + { + "epoch": 0.6855558520890787, + "grad_norm": 0.10443094372749329, + "learning_rate": 0.00022898727685301246, + "loss": 2.6295, + "step": 23119 + }, + { + "epoch": 0.6855855054413902, + "grad_norm": 0.11640121042728424, + "learning_rate": 0.00022894773820998483, + "loss": 2.6546, + "step": 23120 + }, + { + "epoch": 0.6856151587937016, + "grad_norm": 0.11304889619350433, + "learning_rate": 0.00022890820196717166, + "loss": 2.6044, + "step": 23121 + }, + { + "epoch": 0.6856448121460131, + "grad_norm": 0.10591806471347809, + "learning_rate": 0.00022886866812492267, + "loss": 2.6588, + "step": 23122 + }, + { + "epoch": 0.6856744654983246, + "grad_norm": 0.11325598508119583, + "learning_rate": 0.00022882913668358822, + "loss": 2.6526, + "step": 23123 + }, + { + "epoch": 0.6857041188506361, + "grad_norm": 0.1036924347281456, + "learning_rate": 0.00022878960764351831, + "loss": 2.6263, + "step": 23124 + }, + { + "epoch": 0.6857337722029475, + "grad_norm": 0.11753258109092712, + "learning_rate": 0.00022875008100506296, + "loss": 2.6411, + "step": 23125 + }, + { + "epoch": 0.685763425555259, + "grad_norm": 0.11467237025499344, + "learning_rate": 0.00022871055676857222, + "loss": 2.6212, + "step": 23126 + }, + { + "epoch": 0.6857930789075705, + "grad_norm": 0.0949568822979927, + "learning_rate": 0.00022867103493439607, + "loss": 2.6268, + "step": 23127 + }, + { + "epoch": 0.685822732259882, + "grad_norm": 0.11930146813392639, + "learning_rate": 0.00022863151550288425, + "loss": 2.605, + "step": 23128 + }, + { + "epoch": 0.6858523856121934, + "grad_norm": 0.09892459213733673, + "learning_rate": 0.00022859199847438718, + "loss": 2.6096, + "step": 23129 + }, + { + "epoch": 0.685882038964505, + "grad_norm": 0.1138724759221077, + "learning_rate": 0.00022855248384925448, + "loss": 2.6171, + "step": 23130 + }, + { + "epoch": 0.6859116923168164, + "grad_norm": 0.1111626848578453, + "learning_rate": 0.00022851297162783618, + "loss": 2.5688, + "step": 23131 + }, + { + "epoch": 0.6859413456691279, + "grad_norm": 0.10617227107286453, + "learning_rate": 0.00022847346181048228, + "loss": 2.5865, + "step": 23132 + }, + { + "epoch": 0.6859709990214393, + "grad_norm": 0.11101634055376053, + "learning_rate": 0.00022843395439754233, + "loss": 2.6207, + "step": 23133 + }, + { + "epoch": 0.6860006523737509, + "grad_norm": 0.10133162885904312, + "learning_rate": 0.00022839444938936628, + "loss": 2.6292, + "step": 23134 + }, + { + "epoch": 0.6860303057260624, + "grad_norm": 0.10857206583023071, + "learning_rate": 0.00022835494678630404, + "loss": 2.624, + "step": 23135 + }, + { + "epoch": 0.6860599590783738, + "grad_norm": 0.10085851699113846, + "learning_rate": 0.00022831544658870535, + "loss": 2.6179, + "step": 23136 + }, + { + "epoch": 0.6860896124306853, + "grad_norm": 0.10390222817659378, + "learning_rate": 0.00022827594879692005, + "loss": 2.6451, + "step": 23137 + }, + { + "epoch": 0.6861192657829968, + "grad_norm": 0.10483994334936142, + "learning_rate": 0.00022823645341129783, + "loss": 2.6102, + "step": 23138 + }, + { + "epoch": 0.6861489191353083, + "grad_norm": 0.11526413261890411, + "learning_rate": 0.00022819696043218846, + "loss": 2.5866, + "step": 23139 + }, + { + "epoch": 0.6861785724876197, + "grad_norm": 0.10722038894891739, + "learning_rate": 0.0002281574698599417, + "loss": 2.5802, + "step": 23140 + }, + { + "epoch": 0.6862082258399312, + "grad_norm": 0.10537408292293549, + "learning_rate": 0.00022811798169490694, + "loss": 2.6208, + "step": 23141 + }, + { + "epoch": 0.6862378791922427, + "grad_norm": 0.11311300098896027, + "learning_rate": 0.00022807849593743456, + "loss": 2.5894, + "step": 23142 + }, + { + "epoch": 0.6862675325445542, + "grad_norm": 0.1202329620718956, + "learning_rate": 0.00022803901258787356, + "loss": 2.5838, + "step": 23143 + }, + { + "epoch": 0.6862971858968656, + "grad_norm": 0.10380105674266815, + "learning_rate": 0.00022799953164657382, + "loss": 2.625, + "step": 23144 + }, + { + "epoch": 0.6863268392491771, + "grad_norm": 0.11827652901411057, + "learning_rate": 0.0002279600531138849, + "loss": 2.5917, + "step": 23145 + }, + { + "epoch": 0.6863564926014886, + "grad_norm": 0.10818508267402649, + "learning_rate": 0.0002279205769901564, + "loss": 2.6031, + "step": 23146 + }, + { + "epoch": 0.6863861459538001, + "grad_norm": 0.10580108314752579, + "learning_rate": 0.00022788110327573785, + "loss": 2.6034, + "step": 23147 + }, + { + "epoch": 0.6864157993061115, + "grad_norm": 0.11238875985145569, + "learning_rate": 0.00022784163197097891, + "loss": 2.5998, + "step": 23148 + }, + { + "epoch": 0.686445452658423, + "grad_norm": 0.1060432717204094, + "learning_rate": 0.00022780216307622896, + "loss": 2.6036, + "step": 23149 + }, + { + "epoch": 0.6864751060107345, + "grad_norm": 0.10187052935361862, + "learning_rate": 0.00022776269659183763, + "loss": 2.6009, + "step": 23150 + }, + { + "epoch": 0.686504759363046, + "grad_norm": 0.10002269595861435, + "learning_rate": 0.00022772323251815435, + "loss": 2.5988, + "step": 23151 + }, + { + "epoch": 0.6865344127153574, + "grad_norm": 0.11597215384244919, + "learning_rate": 0.00022768377085552856, + "loss": 2.6029, + "step": 23152 + }, + { + "epoch": 0.686564066067669, + "grad_norm": 0.1039978414773941, + "learning_rate": 0.00022764431160430976, + "loss": 2.6392, + "step": 23153 + }, + { + "epoch": 0.6865937194199804, + "grad_norm": 0.11399117112159729, + "learning_rate": 0.00022760485476484727, + "loss": 2.6063, + "step": 23154 + }, + { + "epoch": 0.6866233727722919, + "grad_norm": 0.0981234610080719, + "learning_rate": 0.00022756540033749058, + "loss": 2.5771, + "step": 23155 + }, + { + "epoch": 0.6866530261246034, + "grad_norm": 0.1080184280872345, + "learning_rate": 0.00022752594832258904, + "loss": 2.6398, + "step": 23156 + }, + { + "epoch": 0.6866826794769149, + "grad_norm": 0.09648305177688599, + "learning_rate": 0.00022748649872049198, + "loss": 2.5991, + "step": 23157 + }, + { + "epoch": 0.6867123328292264, + "grad_norm": 0.10249707102775574, + "learning_rate": 0.00022744705153154876, + "loss": 2.6265, + "step": 23158 + }, + { + "epoch": 0.6867419861815378, + "grad_norm": 0.09714067727327347, + "learning_rate": 0.0002274076067561087, + "loss": 2.6055, + "step": 23159 + }, + { + "epoch": 0.6867716395338493, + "grad_norm": 0.09289515763521194, + "learning_rate": 0.00022736816439452106, + "loss": 2.6023, + "step": 23160 + }, + { + "epoch": 0.6868012928861608, + "grad_norm": 0.11056103557348251, + "learning_rate": 0.0002273287244471351, + "loss": 2.6542, + "step": 23161 + }, + { + "epoch": 0.6868309462384723, + "grad_norm": 0.10745532065629959, + "learning_rate": 0.0002272892869143001, + "loss": 2.5933, + "step": 23162 + }, + { + "epoch": 0.6868605995907837, + "grad_norm": 0.110688216984272, + "learning_rate": 0.00022724985179636533, + "loss": 2.6331, + "step": 23163 + }, + { + "epoch": 0.6868902529430952, + "grad_norm": 0.11401878297328949, + "learning_rate": 0.00022721041909367983, + "loss": 2.6085, + "step": 23164 + }, + { + "epoch": 0.6869199062954067, + "grad_norm": 0.12192763388156891, + "learning_rate": 0.00022717098880659298, + "loss": 2.641, + "step": 23165 + }, + { + "epoch": 0.6869495596477182, + "grad_norm": 0.09374614804983139, + "learning_rate": 0.000227131560935454, + "loss": 2.6117, + "step": 23166 + }, + { + "epoch": 0.6869792130000296, + "grad_norm": 0.10664384067058563, + "learning_rate": 0.00022709213548061154, + "loss": 2.6289, + "step": 23167 + }, + { + "epoch": 0.6870088663523412, + "grad_norm": 0.10262559354305267, + "learning_rate": 0.00022705271244241522, + "loss": 2.6632, + "step": 23168 + }, + { + "epoch": 0.6870385197046526, + "grad_norm": 0.10737346112728119, + "learning_rate": 0.000227013291821214, + "loss": 2.611, + "step": 23169 + }, + { + "epoch": 0.6870681730569641, + "grad_norm": 0.10843702405691147, + "learning_rate": 0.00022697387361735695, + "loss": 2.599, + "step": 23170 + }, + { + "epoch": 0.6870978264092755, + "grad_norm": 0.08909029513597488, + "learning_rate": 0.0002269344578311931, + "loss": 2.5997, + "step": 23171 + }, + { + "epoch": 0.6871274797615871, + "grad_norm": 0.11328429728746414, + "learning_rate": 0.00022689504446307148, + "loss": 2.6169, + "step": 23172 + }, + { + "epoch": 0.6871571331138985, + "grad_norm": 0.10928763449192047, + "learning_rate": 0.00022685563351334116, + "loss": 2.594, + "step": 23173 + }, + { + "epoch": 0.68718678646621, + "grad_norm": 0.10441460460424423, + "learning_rate": 0.00022681622498235105, + "loss": 2.6104, + "step": 23174 + }, + { + "epoch": 0.6872164398185214, + "grad_norm": 0.1119932234287262, + "learning_rate": 0.00022677681887045017, + "loss": 2.5936, + "step": 23175 + }, + { + "epoch": 0.687246093170833, + "grad_norm": 0.10585629194974899, + "learning_rate": 0.00022673741517798763, + "loss": 2.5955, + "step": 23176 + }, + { + "epoch": 0.6872757465231445, + "grad_norm": 0.11655879765748978, + "learning_rate": 0.00022669801390531202, + "loss": 2.603, + "step": 23177 + }, + { + "epoch": 0.6873053998754559, + "grad_norm": 0.10270626097917557, + "learning_rate": 0.0002266586150527724, + "loss": 2.5783, + "step": 23178 + }, + { + "epoch": 0.6873350532277674, + "grad_norm": 0.09897409379482269, + "learning_rate": 0.00022661921862071767, + "loss": 2.617, + "step": 23179 + }, + { + "epoch": 0.6873647065800789, + "grad_norm": 0.10082627832889557, + "learning_rate": 0.0002265798246094965, + "loss": 2.6094, + "step": 23180 + }, + { + "epoch": 0.6873943599323904, + "grad_norm": 0.10718618333339691, + "learning_rate": 0.00022654043301945808, + "loss": 2.6471, + "step": 23181 + }, + { + "epoch": 0.6874240132847018, + "grad_norm": 0.10817452520132065, + "learning_rate": 0.0002265010438509511, + "loss": 2.6227, + "step": 23182 + }, + { + "epoch": 0.6874536666370133, + "grad_norm": 0.09554068744182587, + "learning_rate": 0.00022646165710432425, + "loss": 2.6381, + "step": 23183 + }, + { + "epoch": 0.6874833199893248, + "grad_norm": 0.10948538780212402, + "learning_rate": 0.00022642227277992644, + "loss": 2.6157, + "step": 23184 + }, + { + "epoch": 0.6875129733416363, + "grad_norm": 0.10144127905368805, + "learning_rate": 0.00022638289087810638, + "loss": 2.6187, + "step": 23185 + }, + { + "epoch": 0.6875426266939477, + "grad_norm": 0.11126487702131271, + "learning_rate": 0.00022634351139921277, + "loss": 2.6251, + "step": 23186 + }, + { + "epoch": 0.6875722800462593, + "grad_norm": 0.09900226444005966, + "learning_rate": 0.00022630413434359447, + "loss": 2.5977, + "step": 23187 + }, + { + "epoch": 0.6876019333985707, + "grad_norm": 0.09428805857896805, + "learning_rate": 0.00022626475971159994, + "loss": 2.6051, + "step": 23188 + }, + { + "epoch": 0.6876315867508822, + "grad_norm": 0.11581850796937943, + "learning_rate": 0.0002262253875035779, + "loss": 2.613, + "step": 23189 + }, + { + "epoch": 0.6876612401031936, + "grad_norm": 0.09853648394346237, + "learning_rate": 0.00022618601771987707, + "loss": 2.6103, + "step": 23190 + }, + { + "epoch": 0.6876908934555052, + "grad_norm": 0.10337786376476288, + "learning_rate": 0.00022614665036084603, + "loss": 2.6153, + "step": 23191 + }, + { + "epoch": 0.6877205468078166, + "grad_norm": 0.10681962221860886, + "learning_rate": 0.0002261072854268334, + "loss": 2.625, + "step": 23192 + }, + { + "epoch": 0.6877502001601281, + "grad_norm": 0.11773297935724258, + "learning_rate": 0.0002260679229181876, + "loss": 2.6408, + "step": 23193 + }, + { + "epoch": 0.6877798535124395, + "grad_norm": 0.10611248016357422, + "learning_rate": 0.0002260285628352575, + "loss": 2.5764, + "step": 23194 + }, + { + "epoch": 0.6878095068647511, + "grad_norm": 0.10227945446968079, + "learning_rate": 0.00022598920517839162, + "loss": 2.6075, + "step": 23195 + }, + { + "epoch": 0.6878391602170625, + "grad_norm": 0.1160859614610672, + "learning_rate": 0.00022594984994793826, + "loss": 2.6049, + "step": 23196 + }, + { + "epoch": 0.687868813569374, + "grad_norm": 0.1018269807100296, + "learning_rate": 0.00022591049714424622, + "loss": 2.6183, + "step": 23197 + }, + { + "epoch": 0.6878984669216855, + "grad_norm": 0.11074888706207275, + "learning_rate": 0.00022587114676766363, + "loss": 2.6215, + "step": 23198 + }, + { + "epoch": 0.687928120273997, + "grad_norm": 0.10748157650232315, + "learning_rate": 0.00022583179881853905, + "loss": 2.6186, + "step": 23199 + }, + { + "epoch": 0.6879577736263085, + "grad_norm": 0.10999609529972076, + "learning_rate": 0.000225792453297221, + "loss": 2.6031, + "step": 23200 + }, + { + "epoch": 0.6879874269786199, + "grad_norm": 0.10708824545145035, + "learning_rate": 0.00022575311020405774, + "loss": 2.6095, + "step": 23201 + }, + { + "epoch": 0.6880170803309315, + "grad_norm": 0.10685036331415176, + "learning_rate": 0.00022571376953939786, + "loss": 2.6371, + "step": 23202 + }, + { + "epoch": 0.6880467336832429, + "grad_norm": 0.10570037364959717, + "learning_rate": 0.0002256744313035896, + "loss": 2.6069, + "step": 23203 + }, + { + "epoch": 0.6880763870355544, + "grad_norm": 0.10155536979436874, + "learning_rate": 0.00022563509549698135, + "loss": 2.6247, + "step": 23204 + }, + { + "epoch": 0.6881060403878658, + "grad_norm": 0.10179643332958221, + "learning_rate": 0.00022559576211992144, + "loss": 2.628, + "step": 23205 + }, + { + "epoch": 0.6881356937401774, + "grad_norm": 0.10313821583986282, + "learning_rate": 0.00022555643117275792, + "loss": 2.6196, + "step": 23206 + }, + { + "epoch": 0.6881653470924888, + "grad_norm": 0.10691115260124207, + "learning_rate": 0.00022551710265583953, + "loss": 2.6354, + "step": 23207 + }, + { + "epoch": 0.6881950004448003, + "grad_norm": 0.09521627426147461, + "learning_rate": 0.00022547777656951445, + "loss": 2.6115, + "step": 23208 + }, + { + "epoch": 0.6882246537971117, + "grad_norm": 0.11064042896032333, + "learning_rate": 0.00022543845291413068, + "loss": 2.6186, + "step": 23209 + }, + { + "epoch": 0.6882543071494233, + "grad_norm": 0.10145378857851028, + "learning_rate": 0.00022539913169003644, + "loss": 2.6402, + "step": 23210 + }, + { + "epoch": 0.6882839605017347, + "grad_norm": 0.12751713395118713, + "learning_rate": 0.00022535981289758012, + "loss": 2.6106, + "step": 23211 + }, + { + "epoch": 0.6883136138540462, + "grad_norm": 0.10691529512405396, + "learning_rate": 0.00022532049653710973, + "loss": 2.6222, + "step": 23212 + }, + { + "epoch": 0.6883432672063576, + "grad_norm": 0.11101683974266052, + "learning_rate": 0.00022528118260897352, + "loss": 2.6296, + "step": 23213 + }, + { + "epoch": 0.6883729205586692, + "grad_norm": 0.11787174642086029, + "learning_rate": 0.00022524187111351958, + "loss": 2.5996, + "step": 23214 + }, + { + "epoch": 0.6884025739109806, + "grad_norm": 0.10177246481180191, + "learning_rate": 0.00022520256205109602, + "loss": 2.6134, + "step": 23215 + }, + { + "epoch": 0.6884322272632921, + "grad_norm": 0.1148918941617012, + "learning_rate": 0.00022516325542205095, + "loss": 2.6269, + "step": 23216 + }, + { + "epoch": 0.6884618806156035, + "grad_norm": 0.10089292377233505, + "learning_rate": 0.00022512395122673245, + "loss": 2.6473, + "step": 23217 + }, + { + "epoch": 0.6884915339679151, + "grad_norm": 0.12923786044120789, + "learning_rate": 0.00022508464946548857, + "loss": 2.6163, + "step": 23218 + }, + { + "epoch": 0.6885211873202266, + "grad_norm": 0.11148316413164139, + "learning_rate": 0.00022504535013866722, + "loss": 2.627, + "step": 23219 + }, + { + "epoch": 0.688550840672538, + "grad_norm": 0.11150661110877991, + "learning_rate": 0.00022500605324661654, + "loss": 2.6276, + "step": 23220 + }, + { + "epoch": 0.6885804940248496, + "grad_norm": 0.10703462362289429, + "learning_rate": 0.0002249667587896845, + "loss": 2.6076, + "step": 23221 + }, + { + "epoch": 0.688610147377161, + "grad_norm": 0.12930026650428772, + "learning_rate": 0.00022492746676821895, + "loss": 2.6577, + "step": 23222 + }, + { + "epoch": 0.6886398007294725, + "grad_norm": 0.10492253303527832, + "learning_rate": 0.00022488817718256793, + "loss": 2.6115, + "step": 23223 + }, + { + "epoch": 0.6886694540817839, + "grad_norm": 0.11504679918289185, + "learning_rate": 0.00022484889003307934, + "loss": 2.6084, + "step": 23224 + }, + { + "epoch": 0.6886991074340955, + "grad_norm": 0.10213584452867508, + "learning_rate": 0.00022480960532010103, + "loss": 2.5995, + "step": 23225 + }, + { + "epoch": 0.6887287607864069, + "grad_norm": 0.1128382608294487, + "learning_rate": 0.00022477032304398092, + "loss": 2.6204, + "step": 23226 + }, + { + "epoch": 0.6887584141387184, + "grad_norm": 0.11179185658693314, + "learning_rate": 0.00022473104320506682, + "loss": 2.6385, + "step": 23227 + }, + { + "epoch": 0.6887880674910298, + "grad_norm": 0.0964096263051033, + "learning_rate": 0.0002246917658037066, + "loss": 2.6093, + "step": 23228 + }, + { + "epoch": 0.6888177208433414, + "grad_norm": 0.1069900244474411, + "learning_rate": 0.00022465249084024802, + "loss": 2.6234, + "step": 23229 + }, + { + "epoch": 0.6888473741956528, + "grad_norm": 0.10217541456222534, + "learning_rate": 0.00022461321831503895, + "loss": 2.6276, + "step": 23230 + }, + { + "epoch": 0.6888770275479643, + "grad_norm": 0.10192539542913437, + "learning_rate": 0.00022457394822842725, + "loss": 2.6157, + "step": 23231 + }, + { + "epoch": 0.6889066809002757, + "grad_norm": 0.10194564610719681, + "learning_rate": 0.00022453468058076015, + "loss": 2.6157, + "step": 23232 + }, + { + "epoch": 0.6889363342525873, + "grad_norm": 0.11294865608215332, + "learning_rate": 0.00022449541537238589, + "loss": 2.582, + "step": 23233 + }, + { + "epoch": 0.6889659876048987, + "grad_norm": 0.0945020467042923, + "learning_rate": 0.00022445615260365204, + "loss": 2.6059, + "step": 23234 + }, + { + "epoch": 0.6889956409572102, + "grad_norm": 0.12099438160657883, + "learning_rate": 0.0002244168922749063, + "loss": 2.6128, + "step": 23235 + }, + { + "epoch": 0.6890252943095216, + "grad_norm": 0.09264776110649109, + "learning_rate": 0.0002243776343864962, + "loss": 2.6276, + "step": 23236 + }, + { + "epoch": 0.6890549476618332, + "grad_norm": 0.12481766194105148, + "learning_rate": 0.00022433837893876953, + "loss": 2.5638, + "step": 23237 + }, + { + "epoch": 0.6890846010141447, + "grad_norm": 0.09263147413730621, + "learning_rate": 0.0002242991259320738, + "loss": 2.6198, + "step": 23238 + }, + { + "epoch": 0.6891142543664561, + "grad_norm": 0.11478962749242783, + "learning_rate": 0.00022425987536675663, + "loss": 2.661, + "step": 23239 + }, + { + "epoch": 0.6891439077187677, + "grad_norm": 0.09320830553770065, + "learning_rate": 0.0002242206272431656, + "loss": 2.6243, + "step": 23240 + }, + { + "epoch": 0.6891735610710791, + "grad_norm": 0.10758527368307114, + "learning_rate": 0.00022418138156164824, + "loss": 2.6346, + "step": 23241 + }, + { + "epoch": 0.6892032144233906, + "grad_norm": 0.10223700106143951, + "learning_rate": 0.00022414213832255232, + "loss": 2.6032, + "step": 23242 + }, + { + "epoch": 0.689232867775702, + "grad_norm": 0.09262669086456299, + "learning_rate": 0.00022410289752622487, + "loss": 2.6032, + "step": 23243 + }, + { + "epoch": 0.6892625211280136, + "grad_norm": 0.0914408266544342, + "learning_rate": 0.00022406365917301363, + "loss": 2.6139, + "step": 23244 + }, + { + "epoch": 0.689292174480325, + "grad_norm": 0.10016752779483795, + "learning_rate": 0.00022402442326326593, + "loss": 2.5723, + "step": 23245 + }, + { + "epoch": 0.6893218278326365, + "grad_norm": 0.10665223002433777, + "learning_rate": 0.00022398518979732947, + "loss": 2.6092, + "step": 23246 + }, + { + "epoch": 0.6893514811849479, + "grad_norm": 0.09409194439649582, + "learning_rate": 0.00022394595877555152, + "loss": 2.6319, + "step": 23247 + }, + { + "epoch": 0.6893811345372595, + "grad_norm": 0.09449116140604019, + "learning_rate": 0.0002239067301982795, + "loss": 2.64, + "step": 23248 + }, + { + "epoch": 0.6894107878895709, + "grad_norm": 0.09254623204469681, + "learning_rate": 0.00022386750406586076, + "loss": 2.6123, + "step": 23249 + }, + { + "epoch": 0.6894404412418824, + "grad_norm": 0.08936901390552521, + "learning_rate": 0.00022382828037864272, + "loss": 2.6214, + "step": 23250 + }, + { + "epoch": 0.6894700945941938, + "grad_norm": 0.09551772475242615, + "learning_rate": 0.00022378905913697262, + "loss": 2.5968, + "step": 23251 + }, + { + "epoch": 0.6894997479465054, + "grad_norm": 0.08721422404050827, + "learning_rate": 0.000223749840341198, + "loss": 2.6044, + "step": 23252 + }, + { + "epoch": 0.6895294012988168, + "grad_norm": 0.09473036229610443, + "learning_rate": 0.0002237106239916658, + "loss": 2.6339, + "step": 23253 + }, + { + "epoch": 0.6895590546511283, + "grad_norm": 0.10659563541412354, + "learning_rate": 0.00022367141008872344, + "loss": 2.6304, + "step": 23254 + }, + { + "epoch": 0.6895887080034397, + "grad_norm": 0.09591927379369736, + "learning_rate": 0.00022363219863271816, + "loss": 2.6143, + "step": 23255 + }, + { + "epoch": 0.6896183613557513, + "grad_norm": 0.10521609336137772, + "learning_rate": 0.00022359298962399722, + "loss": 2.6184, + "step": 23256 + }, + { + "epoch": 0.6896480147080627, + "grad_norm": 0.10715050250291824, + "learning_rate": 0.00022355378306290774, + "loss": 2.5715, + "step": 23257 + }, + { + "epoch": 0.6896776680603742, + "grad_norm": 0.10769319534301758, + "learning_rate": 0.0002235145789497968, + "loss": 2.5915, + "step": 23258 + }, + { + "epoch": 0.6897073214126858, + "grad_norm": 0.1200462132692337, + "learning_rate": 0.00022347537728501199, + "loss": 2.6143, + "step": 23259 + }, + { + "epoch": 0.6897369747649972, + "grad_norm": 0.10018622875213623, + "learning_rate": 0.0002234361780689001, + "loss": 2.6029, + "step": 23260 + }, + { + "epoch": 0.6897666281173087, + "grad_norm": 0.11875618249177933, + "learning_rate": 0.00022339698130180837, + "loss": 2.6117, + "step": 23261 + }, + { + "epoch": 0.6897962814696201, + "grad_norm": 0.10194815695285797, + "learning_rate": 0.0002233577869840838, + "loss": 2.6057, + "step": 23262 + }, + { + "epoch": 0.6898259348219317, + "grad_norm": 0.1143411248922348, + "learning_rate": 0.0002233185951160737, + "loss": 2.5928, + "step": 23263 + }, + { + "epoch": 0.6898555881742431, + "grad_norm": 0.10008837282657623, + "learning_rate": 0.00022327940569812478, + "loss": 2.6449, + "step": 23264 + }, + { + "epoch": 0.6898852415265546, + "grad_norm": 0.1085088700056076, + "learning_rate": 0.0002232402187305842, + "loss": 2.6025, + "step": 23265 + }, + { + "epoch": 0.689914894878866, + "grad_norm": 0.09917312115430832, + "learning_rate": 0.000223201034213799, + "loss": 2.6533, + "step": 23266 + }, + { + "epoch": 0.6899445482311776, + "grad_norm": 0.1000918373465538, + "learning_rate": 0.00022316185214811614, + "loss": 2.5955, + "step": 23267 + }, + { + "epoch": 0.689974201583489, + "grad_norm": 0.10165229439735413, + "learning_rate": 0.00022312267253388264, + "loss": 2.6615, + "step": 23268 + }, + { + "epoch": 0.6900038549358005, + "grad_norm": 0.11273905634880066, + "learning_rate": 0.00022308349537144534, + "loss": 2.5983, + "step": 23269 + }, + { + "epoch": 0.6900335082881119, + "grad_norm": 0.10931151360273361, + "learning_rate": 0.00022304432066115127, + "loss": 2.6007, + "step": 23270 + }, + { + "epoch": 0.6900631616404235, + "grad_norm": 0.11052883416414261, + "learning_rate": 0.00022300514840334706, + "loss": 2.61, + "step": 23271 + }, + { + "epoch": 0.6900928149927349, + "grad_norm": 0.10670818388462067, + "learning_rate": 0.00022296597859838003, + "loss": 2.5909, + "step": 23272 + }, + { + "epoch": 0.6901224683450464, + "grad_norm": 0.1234196349978447, + "learning_rate": 0.00022292681124659697, + "loss": 2.612, + "step": 23273 + }, + { + "epoch": 0.6901521216973578, + "grad_norm": 0.12405113130807877, + "learning_rate": 0.00022288764634834435, + "loss": 2.6163, + "step": 23274 + }, + { + "epoch": 0.6901817750496694, + "grad_norm": 0.095669224858284, + "learning_rate": 0.00022284848390396921, + "loss": 2.6037, + "step": 23275 + }, + { + "epoch": 0.6902114284019808, + "grad_norm": 0.12314830720424652, + "learning_rate": 0.0002228093239138183, + "loss": 2.5917, + "step": 23276 + }, + { + "epoch": 0.6902410817542923, + "grad_norm": 0.10911551117897034, + "learning_rate": 0.00022277016637823843, + "loss": 2.573, + "step": 23277 + }, + { + "epoch": 0.6902707351066037, + "grad_norm": 0.11901558190584183, + "learning_rate": 0.00022273101129757634, + "loss": 2.5891, + "step": 23278 + }, + { + "epoch": 0.6903003884589153, + "grad_norm": 0.10654182732105255, + "learning_rate": 0.00022269185867217868, + "loss": 2.5611, + "step": 23279 + }, + { + "epoch": 0.6903300418112268, + "grad_norm": 0.1312159299850464, + "learning_rate": 0.00022265270850239228, + "loss": 2.6274, + "step": 23280 + }, + { + "epoch": 0.6903596951635382, + "grad_norm": 0.10537263751029968, + "learning_rate": 0.0002226135607885637, + "loss": 2.6388, + "step": 23281 + }, + { + "epoch": 0.6903893485158498, + "grad_norm": 0.1135135367512703, + "learning_rate": 0.00022257441553103963, + "loss": 2.6586, + "step": 23282 + }, + { + "epoch": 0.6904190018681612, + "grad_norm": 0.10073812305927277, + "learning_rate": 0.00022253527273016676, + "loss": 2.5931, + "step": 23283 + }, + { + "epoch": 0.6904486552204727, + "grad_norm": 0.16290241479873657, + "learning_rate": 0.00022249613238629174, + "loss": 2.6132, + "step": 23284 + }, + { + "epoch": 0.6904783085727841, + "grad_norm": 0.09029919654130936, + "learning_rate": 0.00022245699449976102, + "loss": 2.6004, + "step": 23285 + }, + { + "epoch": 0.6905079619250957, + "grad_norm": 0.12348684668540955, + "learning_rate": 0.00022241785907092126, + "loss": 2.6445, + "step": 23286 + }, + { + "epoch": 0.6905376152774071, + "grad_norm": 0.09051833301782608, + "learning_rate": 0.00022237872610011907, + "loss": 2.6101, + "step": 23287 + }, + { + "epoch": 0.6905672686297186, + "grad_norm": 0.11533872038125992, + "learning_rate": 0.00022233959558770085, + "loss": 2.5893, + "step": 23288 + }, + { + "epoch": 0.69059692198203, + "grad_norm": 0.10223711282014847, + "learning_rate": 0.00022230046753401317, + "loss": 2.6044, + "step": 23289 + }, + { + "epoch": 0.6906265753343416, + "grad_norm": 0.1040896400809288, + "learning_rate": 0.00022226134193940257, + "loss": 2.6371, + "step": 23290 + }, + { + "epoch": 0.690656228686653, + "grad_norm": 0.09960996359586716, + "learning_rate": 0.0002222222188042154, + "loss": 2.6028, + "step": 23291 + }, + { + "epoch": 0.6906858820389645, + "grad_norm": 0.10759799927473068, + "learning_rate": 0.00022218309812879817, + "loss": 2.5784, + "step": 23292 + }, + { + "epoch": 0.690715535391276, + "grad_norm": 0.10749027132987976, + "learning_rate": 0.00022214397991349733, + "loss": 2.6329, + "step": 23293 + }, + { + "epoch": 0.6907451887435875, + "grad_norm": 0.10731799900531769, + "learning_rate": 0.00022210486415865922, + "loss": 2.6156, + "step": 23294 + }, + { + "epoch": 0.6907748420958989, + "grad_norm": 0.10231614112854004, + "learning_rate": 0.0002220657508646302, + "loss": 2.6293, + "step": 23295 + }, + { + "epoch": 0.6908044954482104, + "grad_norm": 0.09891096502542496, + "learning_rate": 0.0002220266400317567, + "loss": 2.5919, + "step": 23296 + }, + { + "epoch": 0.6908341488005219, + "grad_norm": 0.10364345461130142, + "learning_rate": 0.00022198753166038506, + "loss": 2.5863, + "step": 23297 + }, + { + "epoch": 0.6908638021528334, + "grad_norm": 0.10336710512638092, + "learning_rate": 0.00022194842575086148, + "loss": 2.6407, + "step": 23298 + }, + { + "epoch": 0.6908934555051448, + "grad_norm": 0.10731012374162674, + "learning_rate": 0.00022190932230353234, + "loss": 2.629, + "step": 23299 + }, + { + "epoch": 0.6909231088574563, + "grad_norm": 0.10118110477924347, + "learning_rate": 0.0002218702213187439, + "loss": 2.5995, + "step": 23300 + }, + { + "epoch": 0.6909527622097679, + "grad_norm": 0.10958243906497955, + "learning_rate": 0.00022183112279684235, + "loss": 2.6311, + "step": 23301 + }, + { + "epoch": 0.6909824155620793, + "grad_norm": 0.09733335673809052, + "learning_rate": 0.000221792026738174, + "loss": 2.5938, + "step": 23302 + }, + { + "epoch": 0.6910120689143908, + "grad_norm": 0.10292308777570724, + "learning_rate": 0.00022175293314308497, + "loss": 2.5814, + "step": 23303 + }, + { + "epoch": 0.6910417222667022, + "grad_norm": 0.10392369329929352, + "learning_rate": 0.00022171384201192151, + "loss": 2.6174, + "step": 23304 + }, + { + "epoch": 0.6910713756190138, + "grad_norm": 0.10306252539157867, + "learning_rate": 0.00022167475334502974, + "loss": 2.6627, + "step": 23305 + }, + { + "epoch": 0.6911010289713252, + "grad_norm": 0.09692022204399109, + "learning_rate": 0.00022163566714275578, + "loss": 2.5902, + "step": 23306 + }, + { + "epoch": 0.6911306823236367, + "grad_norm": 0.1234351396560669, + "learning_rate": 0.00022159658340544598, + "loss": 2.6175, + "step": 23307 + }, + { + "epoch": 0.6911603356759481, + "grad_norm": 0.10715869069099426, + "learning_rate": 0.000221557502133446, + "loss": 2.5501, + "step": 23308 + }, + { + "epoch": 0.6911899890282597, + "grad_norm": 0.10089118778705597, + "learning_rate": 0.00022151842332710197, + "loss": 2.6057, + "step": 23309 + }, + { + "epoch": 0.6912196423805711, + "grad_norm": 0.09900783747434616, + "learning_rate": 0.0002214793469867603, + "loss": 2.6331, + "step": 23310 + }, + { + "epoch": 0.6912492957328826, + "grad_norm": 0.10111294686794281, + "learning_rate": 0.00022144027311276683, + "loss": 2.5804, + "step": 23311 + }, + { + "epoch": 0.691278949085194, + "grad_norm": 0.1012873575091362, + "learning_rate": 0.00022140120170546752, + "loss": 2.6326, + "step": 23312 + }, + { + "epoch": 0.6913086024375056, + "grad_norm": 0.10227102786302567, + "learning_rate": 0.0002213621327652084, + "loss": 2.635, + "step": 23313 + }, + { + "epoch": 0.691338255789817, + "grad_norm": 0.11674998700618744, + "learning_rate": 0.00022132306629233539, + "loss": 2.5797, + "step": 23314 + }, + { + "epoch": 0.6913679091421285, + "grad_norm": 0.1036873385310173, + "learning_rate": 0.00022128400228719453, + "loss": 2.6292, + "step": 23315 + }, + { + "epoch": 0.69139756249444, + "grad_norm": 0.1020810455083847, + "learning_rate": 0.00022124494075013163, + "loss": 2.5704, + "step": 23316 + }, + { + "epoch": 0.6914272158467515, + "grad_norm": 0.1073421910405159, + "learning_rate": 0.00022120588168149263, + "loss": 2.6063, + "step": 23317 + }, + { + "epoch": 0.6914568691990629, + "grad_norm": 0.1050616055727005, + "learning_rate": 0.00022116682508162362, + "loss": 2.6073, + "step": 23318 + }, + { + "epoch": 0.6914865225513744, + "grad_norm": 0.11420369148254395, + "learning_rate": 0.00022112777095087, + "loss": 2.5809, + "step": 23319 + }, + { + "epoch": 0.6915161759036859, + "grad_norm": 0.10516293346881866, + "learning_rate": 0.00022108871928957786, + "loss": 2.6198, + "step": 23320 + }, + { + "epoch": 0.6915458292559974, + "grad_norm": 0.11635179817676544, + "learning_rate": 0.00022104967009809297, + "loss": 2.6164, + "step": 23321 + }, + { + "epoch": 0.6915754826083089, + "grad_norm": 0.11101596802473068, + "learning_rate": 0.000221010623376761, + "loss": 2.5951, + "step": 23322 + }, + { + "epoch": 0.6916051359606203, + "grad_norm": 0.11642211675643921, + "learning_rate": 0.00022097157912592797, + "loss": 2.608, + "step": 23323 + }, + { + "epoch": 0.6916347893129319, + "grad_norm": 0.10277996212244034, + "learning_rate": 0.00022093253734593955, + "loss": 2.6243, + "step": 23324 + }, + { + "epoch": 0.6916644426652433, + "grad_norm": 0.12088752537965775, + "learning_rate": 0.00022089349803714137, + "loss": 2.5829, + "step": 23325 + }, + { + "epoch": 0.6916940960175548, + "grad_norm": 0.10266873240470886, + "learning_rate": 0.0002208544611998792, + "loss": 2.5971, + "step": 23326 + }, + { + "epoch": 0.6917237493698662, + "grad_norm": 0.0999172106385231, + "learning_rate": 0.00022081542683449867, + "loss": 2.5918, + "step": 23327 + }, + { + "epoch": 0.6917534027221778, + "grad_norm": 0.1044330820441246, + "learning_rate": 0.00022077639494134566, + "loss": 2.6133, + "step": 23328 + }, + { + "epoch": 0.6917830560744892, + "grad_norm": 0.11421415209770203, + "learning_rate": 0.00022073736552076539, + "loss": 2.6343, + "step": 23329 + }, + { + "epoch": 0.6918127094268007, + "grad_norm": 0.10115737468004227, + "learning_rate": 0.00022069833857310374, + "loss": 2.6218, + "step": 23330 + }, + { + "epoch": 0.6918423627791122, + "grad_norm": 0.10851666331291199, + "learning_rate": 0.00022065931409870622, + "loss": 2.6244, + "step": 23331 + }, + { + "epoch": 0.6918720161314237, + "grad_norm": 0.10737958550453186, + "learning_rate": 0.0002206202920979184, + "loss": 2.6124, + "step": 23332 + }, + { + "epoch": 0.6919016694837351, + "grad_norm": 0.11425319314002991, + "learning_rate": 0.00022058127257108584, + "loss": 2.6092, + "step": 23333 + }, + { + "epoch": 0.6919313228360466, + "grad_norm": 0.09642373770475388, + "learning_rate": 0.00022054225551855405, + "loss": 2.6034, + "step": 23334 + }, + { + "epoch": 0.6919609761883581, + "grad_norm": 0.10521283000707626, + "learning_rate": 0.00022050324094066842, + "loss": 2.5721, + "step": 23335 + }, + { + "epoch": 0.6919906295406696, + "grad_norm": 0.09457116574048996, + "learning_rate": 0.0002204642288377748, + "loss": 2.5909, + "step": 23336 + }, + { + "epoch": 0.692020282892981, + "grad_norm": 0.10345252603292465, + "learning_rate": 0.0002204252192102183, + "loss": 2.618, + "step": 23337 + }, + { + "epoch": 0.6920499362452925, + "grad_norm": 0.10690789669752121, + "learning_rate": 0.00022038621205834453, + "loss": 2.6081, + "step": 23338 + }, + { + "epoch": 0.692079589597604, + "grad_norm": 0.09311843663454056, + "learning_rate": 0.00022034720738249903, + "loss": 2.6114, + "step": 23339 + }, + { + "epoch": 0.6921092429499155, + "grad_norm": 0.1086203083395958, + "learning_rate": 0.00022030820518302685, + "loss": 2.6239, + "step": 23340 + }, + { + "epoch": 0.6921388963022269, + "grad_norm": 0.10332171618938446, + "learning_rate": 0.0002202692054602735, + "loss": 2.6198, + "step": 23341 + }, + { + "epoch": 0.6921685496545384, + "grad_norm": 0.09878060221672058, + "learning_rate": 0.00022023020821458432, + "loss": 2.6063, + "step": 23342 + }, + { + "epoch": 0.69219820300685, + "grad_norm": 0.10356088727712631, + "learning_rate": 0.00022019121344630472, + "loss": 2.6335, + "step": 23343 + }, + { + "epoch": 0.6922278563591614, + "grad_norm": 0.10840986669063568, + "learning_rate": 0.00022015222115577993, + "loss": 2.6043, + "step": 23344 + }, + { + "epoch": 0.6922575097114729, + "grad_norm": 0.09964677691459656, + "learning_rate": 0.00022011323134335525, + "loss": 2.6214, + "step": 23345 + }, + { + "epoch": 0.6922871630637843, + "grad_norm": 0.1012786403298378, + "learning_rate": 0.00022007424400937597, + "loss": 2.6124, + "step": 23346 + }, + { + "epoch": 0.6923168164160959, + "grad_norm": 0.10986791551113129, + "learning_rate": 0.00022003525915418733, + "loss": 2.6382, + "step": 23347 + }, + { + "epoch": 0.6923464697684073, + "grad_norm": 0.10206466913223267, + "learning_rate": 0.00021999627677813426, + "loss": 2.6185, + "step": 23348 + }, + { + "epoch": 0.6923761231207188, + "grad_norm": 0.12585173547267914, + "learning_rate": 0.00021995729688156264, + "loss": 2.6182, + "step": 23349 + }, + { + "epoch": 0.6924057764730303, + "grad_norm": 0.10387080907821655, + "learning_rate": 0.00021991831946481706, + "loss": 2.6085, + "step": 23350 + }, + { + "epoch": 0.6924354298253418, + "grad_norm": 0.12909264862537384, + "learning_rate": 0.00021987934452824282, + "loss": 2.6388, + "step": 23351 + }, + { + "epoch": 0.6924650831776532, + "grad_norm": 0.10059837251901627, + "learning_rate": 0.00021984037207218506, + "loss": 2.6084, + "step": 23352 + }, + { + "epoch": 0.6924947365299647, + "grad_norm": 0.11568048596382141, + "learning_rate": 0.0002198014020969889, + "loss": 2.625, + "step": 23353 + }, + { + "epoch": 0.6925243898822762, + "grad_norm": 0.10151185095310211, + "learning_rate": 0.0002197624346029994, + "loss": 2.5898, + "step": 23354 + }, + { + "epoch": 0.6925540432345877, + "grad_norm": 0.1092619076371193, + "learning_rate": 0.00021972346959056168, + "loss": 2.6467, + "step": 23355 + }, + { + "epoch": 0.6925836965868991, + "grad_norm": 0.10605066269636154, + "learning_rate": 0.0002196845070600207, + "loss": 2.602, + "step": 23356 + }, + { + "epoch": 0.6926133499392106, + "grad_norm": 0.1083502545952797, + "learning_rate": 0.0002196455470117215, + "loss": 2.5822, + "step": 23357 + }, + { + "epoch": 0.6926430032915221, + "grad_norm": 0.1280050426721573, + "learning_rate": 0.00021960658944600919, + "loss": 2.6029, + "step": 23358 + }, + { + "epoch": 0.6926726566438336, + "grad_norm": 0.10126163065433502, + "learning_rate": 0.00021956763436322863, + "loss": 2.6191, + "step": 23359 + }, + { + "epoch": 0.692702309996145, + "grad_norm": 0.11948706954717636, + "learning_rate": 0.00021952868176372477, + "loss": 2.5838, + "step": 23360 + }, + { + "epoch": 0.6927319633484565, + "grad_norm": 0.1041751280426979, + "learning_rate": 0.00021948973164784258, + "loss": 2.6138, + "step": 23361 + }, + { + "epoch": 0.692761616700768, + "grad_norm": 0.11964000761508942, + "learning_rate": 0.000219450784015927, + "loss": 2.6534, + "step": 23362 + }, + { + "epoch": 0.6927912700530795, + "grad_norm": 0.1046958863735199, + "learning_rate": 0.0002194118388683229, + "loss": 2.5786, + "step": 23363 + }, + { + "epoch": 0.692820923405391, + "grad_norm": 0.11297222971916199, + "learning_rate": 0.0002193728962053751, + "loss": 2.6506, + "step": 23364 + }, + { + "epoch": 0.6928505767577025, + "grad_norm": 0.11468997597694397, + "learning_rate": 0.00021933395602742844, + "loss": 2.5949, + "step": 23365 + }, + { + "epoch": 0.692880230110014, + "grad_norm": 0.10602473467588425, + "learning_rate": 0.0002192950183348278, + "loss": 2.6028, + "step": 23366 + }, + { + "epoch": 0.6929098834623254, + "grad_norm": 0.1087777316570282, + "learning_rate": 0.00021925608312791794, + "loss": 2.6382, + "step": 23367 + }, + { + "epoch": 0.6929395368146369, + "grad_norm": 0.10797801613807678, + "learning_rate": 0.0002192171504070437, + "loss": 2.6127, + "step": 23368 + }, + { + "epoch": 0.6929691901669484, + "grad_norm": 0.11103877425193787, + "learning_rate": 0.00021917822017254978, + "loss": 2.6387, + "step": 23369 + }, + { + "epoch": 0.6929988435192599, + "grad_norm": 0.11520001292228699, + "learning_rate": 0.00021913929242478087, + "loss": 2.5933, + "step": 23370 + }, + { + "epoch": 0.6930284968715713, + "grad_norm": 0.11300688236951828, + "learning_rate": 0.00021910036716408176, + "loss": 2.6084, + "step": 23371 + }, + { + "epoch": 0.6930581502238828, + "grad_norm": 0.10022681951522827, + "learning_rate": 0.00021906144439079716, + "loss": 2.6207, + "step": 23372 + }, + { + "epoch": 0.6930878035761943, + "grad_norm": 0.1096942126750946, + "learning_rate": 0.00021902252410527185, + "loss": 2.643, + "step": 23373 + }, + { + "epoch": 0.6931174569285058, + "grad_norm": 0.09699193388223648, + "learning_rate": 0.00021898360630784991, + "loss": 2.621, + "step": 23374 + }, + { + "epoch": 0.6931471102808172, + "grad_norm": 0.1055300310254097, + "learning_rate": 0.0002189446909988766, + "loss": 2.577, + "step": 23375 + }, + { + "epoch": 0.6931767636331287, + "grad_norm": 0.10095054656267166, + "learning_rate": 0.0002189057781786963, + "loss": 2.58, + "step": 23376 + }, + { + "epoch": 0.6932064169854402, + "grad_norm": 0.10150771588087082, + "learning_rate": 0.00021886686784765354, + "loss": 2.611, + "step": 23377 + }, + { + "epoch": 0.6932360703377517, + "grad_norm": 0.12081214040517807, + "learning_rate": 0.00021882796000609296, + "loss": 2.5715, + "step": 23378 + }, + { + "epoch": 0.6932657236900631, + "grad_norm": 0.09478341042995453, + "learning_rate": 0.00021878905465435905, + "loss": 2.6198, + "step": 23379 + }, + { + "epoch": 0.6932953770423746, + "grad_norm": 0.11244557797908783, + "learning_rate": 0.00021875015179279627, + "loss": 2.6227, + "step": 23380 + }, + { + "epoch": 0.6933250303946861, + "grad_norm": 0.11896511167287827, + "learning_rate": 0.00021871125142174924, + "loss": 2.6493, + "step": 23381 + }, + { + "epoch": 0.6933546837469976, + "grad_norm": 0.10237232595682144, + "learning_rate": 0.00021867235354156234, + "loss": 2.6182, + "step": 23382 + }, + { + "epoch": 0.693384337099309, + "grad_norm": 0.11531945317983627, + "learning_rate": 0.00021863345815258006, + "loss": 2.5748, + "step": 23383 + }, + { + "epoch": 0.6934139904516206, + "grad_norm": 0.11560455709695816, + "learning_rate": 0.00021859456525514698, + "loss": 2.6255, + "step": 23384 + }, + { + "epoch": 0.6934436438039321, + "grad_norm": 0.12042749673128128, + "learning_rate": 0.00021855567484960716, + "loss": 2.6234, + "step": 23385 + }, + { + "epoch": 0.6934732971562435, + "grad_norm": 0.1095040962100029, + "learning_rate": 0.0002185167869363051, + "loss": 2.584, + "step": 23386 + }, + { + "epoch": 0.693502950508555, + "grad_norm": 0.12335950136184692, + "learning_rate": 0.00021847790151558505, + "loss": 2.576, + "step": 23387 + }, + { + "epoch": 0.6935326038608665, + "grad_norm": 0.13017168641090393, + "learning_rate": 0.0002184390185877917, + "loss": 2.616, + "step": 23388 + }, + { + "epoch": 0.693562257213178, + "grad_norm": 0.10920495539903641, + "learning_rate": 0.00021840013815326915, + "loss": 2.6041, + "step": 23389 + }, + { + "epoch": 0.6935919105654894, + "grad_norm": 0.10548409819602966, + "learning_rate": 0.0002183612602123617, + "loss": 2.6463, + "step": 23390 + }, + { + "epoch": 0.6936215639178009, + "grad_norm": 0.12664256989955902, + "learning_rate": 0.00021832238476541366, + "loss": 2.6073, + "step": 23391 + }, + { + "epoch": 0.6936512172701124, + "grad_norm": 0.11455687135457993, + "learning_rate": 0.00021828351181276922, + "loss": 2.6089, + "step": 23392 + }, + { + "epoch": 0.6936808706224239, + "grad_norm": 0.10487083345651627, + "learning_rate": 0.00021824464135477268, + "loss": 2.5968, + "step": 23393 + }, + { + "epoch": 0.6937105239747353, + "grad_norm": 0.1296403706073761, + "learning_rate": 0.0002182057733917684, + "loss": 2.6431, + "step": 23394 + }, + { + "epoch": 0.6937401773270468, + "grad_norm": 0.1036609411239624, + "learning_rate": 0.0002181669079241001, + "loss": 2.6507, + "step": 23395 + }, + { + "epoch": 0.6937698306793583, + "grad_norm": 0.12165168672800064, + "learning_rate": 0.00021812804495211231, + "loss": 2.6357, + "step": 23396 + }, + { + "epoch": 0.6937994840316698, + "grad_norm": 0.11074072867631912, + "learning_rate": 0.00021808918447614902, + "loss": 2.6431, + "step": 23397 + }, + { + "epoch": 0.6938291373839812, + "grad_norm": 0.11136512458324432, + "learning_rate": 0.00021805032649655436, + "loss": 2.5894, + "step": 23398 + }, + { + "epoch": 0.6938587907362928, + "grad_norm": 0.1128987967967987, + "learning_rate": 0.00021801147101367248, + "loss": 2.6115, + "step": 23399 + }, + { + "epoch": 0.6938884440886042, + "grad_norm": 0.11218906193971634, + "learning_rate": 0.00021797261802784725, + "loss": 2.6335, + "step": 23400 + }, + { + "epoch": 0.6939180974409157, + "grad_norm": 0.12081470340490341, + "learning_rate": 0.00021793376753942307, + "loss": 2.6256, + "step": 23401 + }, + { + "epoch": 0.6939477507932271, + "grad_norm": 0.10432308912277222, + "learning_rate": 0.0002178949195487438, + "loss": 2.609, + "step": 23402 + }, + { + "epoch": 0.6939774041455387, + "grad_norm": 0.11469617486000061, + "learning_rate": 0.00021785607405615343, + "loss": 2.6241, + "step": 23403 + }, + { + "epoch": 0.6940070574978501, + "grad_norm": 0.10808025300502777, + "learning_rate": 0.00021781723106199615, + "loss": 2.5871, + "step": 23404 + }, + { + "epoch": 0.6940367108501616, + "grad_norm": 0.12038274109363556, + "learning_rate": 0.00021777839056661552, + "loss": 2.6214, + "step": 23405 + }, + { + "epoch": 0.6940663642024731, + "grad_norm": 0.10993696749210358, + "learning_rate": 0.0002177395525703557, + "loss": 2.6165, + "step": 23406 + }, + { + "epoch": 0.6940960175547846, + "grad_norm": 0.10943669080734253, + "learning_rate": 0.00021770071707356058, + "loss": 2.607, + "step": 23407 + }, + { + "epoch": 0.6941256709070961, + "grad_norm": 0.11118375509977341, + "learning_rate": 0.00021766188407657406, + "loss": 2.5782, + "step": 23408 + }, + { + "epoch": 0.6941553242594075, + "grad_norm": 0.11333432048559189, + "learning_rate": 0.00021762305357974005, + "loss": 2.6081, + "step": 23409 + }, + { + "epoch": 0.694184977611719, + "grad_norm": 0.09605181962251663, + "learning_rate": 0.00021758422558340235, + "loss": 2.5843, + "step": 23410 + }, + { + "epoch": 0.6942146309640305, + "grad_norm": 0.10239355266094208, + "learning_rate": 0.00021754540008790484, + "loss": 2.6027, + "step": 23411 + }, + { + "epoch": 0.694244284316342, + "grad_norm": 0.09455804526805878, + "learning_rate": 0.00021750657709359127, + "loss": 2.5925, + "step": 23412 + }, + { + "epoch": 0.6942739376686534, + "grad_norm": 0.10422445088624954, + "learning_rate": 0.00021746775660080525, + "loss": 2.6367, + "step": 23413 + }, + { + "epoch": 0.694303591020965, + "grad_norm": 0.10271204262971878, + "learning_rate": 0.00021742893860989093, + "loss": 2.6198, + "step": 23414 + }, + { + "epoch": 0.6943332443732764, + "grad_norm": 0.09441652148962021, + "learning_rate": 0.00021739012312119207, + "loss": 2.5791, + "step": 23415 + }, + { + "epoch": 0.6943628977255879, + "grad_norm": 0.09204696118831635, + "learning_rate": 0.00021735131013505198, + "loss": 2.6183, + "step": 23416 + }, + { + "epoch": 0.6943925510778993, + "grad_norm": 0.10239231586456299, + "learning_rate": 0.00021731249965181455, + "loss": 2.591, + "step": 23417 + }, + { + "epoch": 0.6944222044302109, + "grad_norm": 0.10363820195198059, + "learning_rate": 0.00021727369167182347, + "loss": 2.6025, + "step": 23418 + }, + { + "epoch": 0.6944518577825223, + "grad_norm": 0.11401831358671188, + "learning_rate": 0.00021723488619542237, + "loss": 2.6243, + "step": 23419 + }, + { + "epoch": 0.6944815111348338, + "grad_norm": 0.09669125825166702, + "learning_rate": 0.00021719608322295493, + "loss": 2.6062, + "step": 23420 + }, + { + "epoch": 0.6945111644871452, + "grad_norm": 0.11998708546161652, + "learning_rate": 0.00021715728275476464, + "loss": 2.6124, + "step": 23421 + }, + { + "epoch": 0.6945408178394568, + "grad_norm": 0.10840407758951187, + "learning_rate": 0.00021711848479119523, + "loss": 2.6246, + "step": 23422 + }, + { + "epoch": 0.6945704711917682, + "grad_norm": 0.10125544667243958, + "learning_rate": 0.00021707968933259015, + "loss": 2.6208, + "step": 23423 + }, + { + "epoch": 0.6946001245440797, + "grad_norm": 0.10762403160333633, + "learning_rate": 0.00021704089637929297, + "loss": 2.6186, + "step": 23424 + }, + { + "epoch": 0.6946297778963911, + "grad_norm": 0.0980336144566536, + "learning_rate": 0.00021700210593164726, + "loss": 2.6521, + "step": 23425 + }, + { + "epoch": 0.6946594312487027, + "grad_norm": 0.11055746674537659, + "learning_rate": 0.00021696331798999648, + "loss": 2.6098, + "step": 23426 + }, + { + "epoch": 0.6946890846010142, + "grad_norm": 0.11063911020755768, + "learning_rate": 0.0002169245325546841, + "loss": 2.6338, + "step": 23427 + }, + { + "epoch": 0.6947187379533256, + "grad_norm": 0.10423071682453156, + "learning_rate": 0.0002168857496260535, + "loss": 2.6249, + "step": 23428 + }, + { + "epoch": 0.6947483913056371, + "grad_norm": 0.10219580680131912, + "learning_rate": 0.0002168469692044483, + "loss": 2.6126, + "step": 23429 + }, + { + "epoch": 0.6947780446579486, + "grad_norm": 0.10417735576629639, + "learning_rate": 0.00021680819129021173, + "loss": 2.6213, + "step": 23430 + }, + { + "epoch": 0.6948076980102601, + "grad_norm": 0.09789304435253143, + "learning_rate": 0.0002167694158836872, + "loss": 2.6151, + "step": 23431 + }, + { + "epoch": 0.6948373513625715, + "grad_norm": 0.10359072685241699, + "learning_rate": 0.00021673064298521815, + "loss": 2.5963, + "step": 23432 + }, + { + "epoch": 0.694867004714883, + "grad_norm": 0.11323525756597519, + "learning_rate": 0.0002166918725951479, + "loss": 2.5658, + "step": 23433 + }, + { + "epoch": 0.6948966580671945, + "grad_norm": 0.10892727971076965, + "learning_rate": 0.00021665310471381973, + "loss": 2.6441, + "step": 23434 + }, + { + "epoch": 0.694926311419506, + "grad_norm": 0.10153244435787201, + "learning_rate": 0.00021661433934157693, + "loss": 2.6218, + "step": 23435 + }, + { + "epoch": 0.6949559647718174, + "grad_norm": 0.10492927581071854, + "learning_rate": 0.0002165755764787628, + "loss": 2.651, + "step": 23436 + }, + { + "epoch": 0.694985618124129, + "grad_norm": 0.10890063643455505, + "learning_rate": 0.00021653681612572064, + "loss": 2.628, + "step": 23437 + }, + { + "epoch": 0.6950152714764404, + "grad_norm": 0.10645212233066559, + "learning_rate": 0.00021649805828279358, + "loss": 2.6108, + "step": 23438 + }, + { + "epoch": 0.6950449248287519, + "grad_norm": 0.10834719240665436, + "learning_rate": 0.0002164593029503249, + "loss": 2.6015, + "step": 23439 + }, + { + "epoch": 0.6950745781810633, + "grad_norm": 0.13022619485855103, + "learning_rate": 0.00021642055012865773, + "loss": 2.5936, + "step": 23440 + }, + { + "epoch": 0.6951042315333749, + "grad_norm": 0.11159760504961014, + "learning_rate": 0.00021638179981813528, + "loss": 2.6317, + "step": 23441 + }, + { + "epoch": 0.6951338848856863, + "grad_norm": 0.10807802528142929, + "learning_rate": 0.0002163430520191007, + "loss": 2.6209, + "step": 23442 + }, + { + "epoch": 0.6951635382379978, + "grad_norm": 0.10209271311759949, + "learning_rate": 0.00021630430673189705, + "loss": 2.6083, + "step": 23443 + }, + { + "epoch": 0.6951931915903092, + "grad_norm": 0.12063644081354141, + "learning_rate": 0.00021626556395686747, + "loss": 2.6309, + "step": 23444 + }, + { + "epoch": 0.6952228449426208, + "grad_norm": 0.09532345831394196, + "learning_rate": 0.00021622682369435505, + "loss": 2.6103, + "step": 23445 + }, + { + "epoch": 0.6952524982949323, + "grad_norm": 0.10147412121295929, + "learning_rate": 0.0002161880859447028, + "loss": 2.6155, + "step": 23446 + }, + { + "epoch": 0.6952821516472437, + "grad_norm": 0.10445265471935272, + "learning_rate": 0.00021614935070825376, + "loss": 2.6485, + "step": 23447 + }, + { + "epoch": 0.6953118049995552, + "grad_norm": 0.10266897082328796, + "learning_rate": 0.00021611061798535092, + "loss": 2.5771, + "step": 23448 + }, + { + "epoch": 0.6953414583518667, + "grad_norm": 0.10045524686574936, + "learning_rate": 0.00021607188777633752, + "loss": 2.6185, + "step": 23449 + }, + { + "epoch": 0.6953711117041782, + "grad_norm": 0.09916000068187714, + "learning_rate": 0.00021603316008155605, + "loss": 2.5707, + "step": 23450 + }, + { + "epoch": 0.6954007650564896, + "grad_norm": 0.10238544642925262, + "learning_rate": 0.00021599443490134975, + "loss": 2.5876, + "step": 23451 + }, + { + "epoch": 0.6954304184088012, + "grad_norm": 0.1013735756278038, + "learning_rate": 0.0002159557122360612, + "loss": 2.6326, + "step": 23452 + }, + { + "epoch": 0.6954600717611126, + "grad_norm": 0.1017150729894638, + "learning_rate": 0.00021591699208603382, + "loss": 2.622, + "step": 23453 + }, + { + "epoch": 0.6954897251134241, + "grad_norm": 0.09039273113012314, + "learning_rate": 0.00021587827445161022, + "loss": 2.6204, + "step": 23454 + }, + { + "epoch": 0.6955193784657355, + "grad_norm": 0.10414505004882812, + "learning_rate": 0.0002158395593331333, + "loss": 2.6, + "step": 23455 + }, + { + "epoch": 0.6955490318180471, + "grad_norm": 0.09442494064569473, + "learning_rate": 0.00021580084673094584, + "loss": 2.619, + "step": 23456 + }, + { + "epoch": 0.6955786851703585, + "grad_norm": 0.10938379168510437, + "learning_rate": 0.00021576213664539068, + "loss": 2.6591, + "step": 23457 + }, + { + "epoch": 0.69560833852267, + "grad_norm": 0.1114545464515686, + "learning_rate": 0.0002157234290768106, + "loss": 2.6649, + "step": 23458 + }, + { + "epoch": 0.6956379918749814, + "grad_norm": 0.10302673280239105, + "learning_rate": 0.0002156847240255483, + "loss": 2.6337, + "step": 23459 + }, + { + "epoch": 0.695667645227293, + "grad_norm": 0.10278067737817764, + "learning_rate": 0.0002156460214919468, + "loss": 2.6231, + "step": 23460 + }, + { + "epoch": 0.6956972985796044, + "grad_norm": 0.11446495354175568, + "learning_rate": 0.00021560732147634837, + "loss": 2.6223, + "step": 23461 + }, + { + "epoch": 0.6957269519319159, + "grad_norm": 0.11286872625350952, + "learning_rate": 0.00021556862397909593, + "loss": 2.5755, + "step": 23462 + }, + { + "epoch": 0.6957566052842273, + "grad_norm": 0.09742322564125061, + "learning_rate": 0.00021552992900053213, + "loss": 2.609, + "step": 23463 + }, + { + "epoch": 0.6957862586365389, + "grad_norm": 0.1146143302321434, + "learning_rate": 0.00021549123654099968, + "loss": 2.6348, + "step": 23464 + }, + { + "epoch": 0.6958159119888503, + "grad_norm": 0.11092005670070648, + "learning_rate": 0.00021545254660084096, + "loss": 2.6396, + "step": 23465 + }, + { + "epoch": 0.6958455653411618, + "grad_norm": 0.10387901961803436, + "learning_rate": 0.00021541385918039897, + "loss": 2.5955, + "step": 23466 + }, + { + "epoch": 0.6958752186934734, + "grad_norm": 0.09778066724538803, + "learning_rate": 0.00021537517428001614, + "loss": 2.5901, + "step": 23467 + }, + { + "epoch": 0.6959048720457848, + "grad_norm": 0.10447901487350464, + "learning_rate": 0.0002153364919000349, + "loss": 2.6271, + "step": 23468 + }, + { + "epoch": 0.6959345253980963, + "grad_norm": 0.09972598403692245, + "learning_rate": 0.00021529781204079795, + "loss": 2.5871, + "step": 23469 + }, + { + "epoch": 0.6959641787504077, + "grad_norm": 0.09180420637130737, + "learning_rate": 0.00021525913470264797, + "loss": 2.6056, + "step": 23470 + }, + { + "epoch": 0.6959938321027193, + "grad_norm": 0.10379116982221603, + "learning_rate": 0.000215220459885927, + "loss": 2.6038, + "step": 23471 + }, + { + "epoch": 0.6960234854550307, + "grad_norm": 0.10013451427221298, + "learning_rate": 0.00021518178759097773, + "loss": 2.5946, + "step": 23472 + }, + { + "epoch": 0.6960531388073422, + "grad_norm": 0.10962909460067749, + "learning_rate": 0.0002151431178181426, + "loss": 2.6012, + "step": 23473 + }, + { + "epoch": 0.6960827921596536, + "grad_norm": 0.11350270360708237, + "learning_rate": 0.00021510445056776407, + "loss": 2.6182, + "step": 23474 + }, + { + "epoch": 0.6961124455119652, + "grad_norm": 0.11708877235651016, + "learning_rate": 0.00021506578584018455, + "loss": 2.6339, + "step": 23475 + }, + { + "epoch": 0.6961420988642766, + "grad_norm": 0.12665916979312897, + "learning_rate": 0.0002150271236357464, + "loss": 2.6152, + "step": 23476 + }, + { + "epoch": 0.6961717522165881, + "grad_norm": 0.10716275125741959, + "learning_rate": 0.00021498846395479198, + "loss": 2.6044, + "step": 23477 + }, + { + "epoch": 0.6962014055688995, + "grad_norm": 0.11171405017375946, + "learning_rate": 0.00021494980679766346, + "loss": 2.6005, + "step": 23478 + }, + { + "epoch": 0.6962310589212111, + "grad_norm": 0.10581451654434204, + "learning_rate": 0.00021491115216470353, + "loss": 2.6158, + "step": 23479 + }, + { + "epoch": 0.6962607122735225, + "grad_norm": 0.10595890134572983, + "learning_rate": 0.00021487250005625442, + "loss": 2.6375, + "step": 23480 + }, + { + "epoch": 0.696290365625834, + "grad_norm": 0.10064545273780823, + "learning_rate": 0.00021483385047265814, + "loss": 2.6051, + "step": 23481 + }, + { + "epoch": 0.6963200189781454, + "grad_norm": 0.11200936883687973, + "learning_rate": 0.000214795203414257, + "loss": 2.5867, + "step": 23482 + }, + { + "epoch": 0.696349672330457, + "grad_norm": 0.10869738459587097, + "learning_rate": 0.00021475655888139334, + "loss": 2.6181, + "step": 23483 + }, + { + "epoch": 0.6963793256827684, + "grad_norm": 0.10655951499938965, + "learning_rate": 0.00021471791687440928, + "loss": 2.5969, + "step": 23484 + }, + { + "epoch": 0.6964089790350799, + "grad_norm": 0.09838126599788666, + "learning_rate": 0.00021467927739364702, + "loss": 2.5997, + "step": 23485 + }, + { + "epoch": 0.6964386323873913, + "grad_norm": 0.10510797798633575, + "learning_rate": 0.00021464064043944875, + "loss": 2.6299, + "step": 23486 + }, + { + "epoch": 0.6964682857397029, + "grad_norm": 0.10133723169565201, + "learning_rate": 0.00021460200601215658, + "loss": 2.5862, + "step": 23487 + }, + { + "epoch": 0.6964979390920144, + "grad_norm": 0.1099824607372284, + "learning_rate": 0.00021456337411211268, + "loss": 2.616, + "step": 23488 + }, + { + "epoch": 0.6965275924443258, + "grad_norm": 0.10097327083349228, + "learning_rate": 0.00021452474473965906, + "loss": 2.6331, + "step": 23489 + }, + { + "epoch": 0.6965572457966374, + "grad_norm": 0.10875090211629868, + "learning_rate": 0.00021448611789513767, + "loss": 2.6353, + "step": 23490 + }, + { + "epoch": 0.6965868991489488, + "grad_norm": 0.09432618319988251, + "learning_rate": 0.00021444749357889104, + "loss": 2.6215, + "step": 23491 + }, + { + "epoch": 0.6966165525012603, + "grad_norm": 0.10380648076534271, + "learning_rate": 0.00021440887179126068, + "loss": 2.6139, + "step": 23492 + }, + { + "epoch": 0.6966462058535717, + "grad_norm": 0.09902539104223251, + "learning_rate": 0.0002143702525325888, + "loss": 2.6263, + "step": 23493 + }, + { + "epoch": 0.6966758592058833, + "grad_norm": 0.10326457768678665, + "learning_rate": 0.0002143316358032174, + "loss": 2.6295, + "step": 23494 + }, + { + "epoch": 0.6967055125581947, + "grad_norm": 0.10987058281898499, + "learning_rate": 0.00021429302160348834, + "loss": 2.6404, + "step": 23495 + }, + { + "epoch": 0.6967351659105062, + "grad_norm": 0.10077621787786484, + "learning_rate": 0.00021425440993374367, + "loss": 2.5615, + "step": 23496 + }, + { + "epoch": 0.6967648192628176, + "grad_norm": 0.10042936354875565, + "learning_rate": 0.0002142158007943252, + "loss": 2.6231, + "step": 23497 + }, + { + "epoch": 0.6967944726151292, + "grad_norm": 0.10856036841869354, + "learning_rate": 0.00021417719418557492, + "loss": 2.6158, + "step": 23498 + }, + { + "epoch": 0.6968241259674406, + "grad_norm": 0.0932777002453804, + "learning_rate": 0.0002141385901078346, + "loss": 2.5973, + "step": 23499 + }, + { + "epoch": 0.6968537793197521, + "grad_norm": 0.10088856518268585, + "learning_rate": 0.00021409998856144615, + "loss": 2.6327, + "step": 23500 + }, + { + "epoch": 0.6968834326720635, + "grad_norm": 0.10082872211933136, + "learning_rate": 0.00021406138954675136, + "loss": 2.6227, + "step": 23501 + }, + { + "epoch": 0.6969130860243751, + "grad_norm": 0.09187585115432739, + "learning_rate": 0.00021402279306409206, + "loss": 2.6276, + "step": 23502 + }, + { + "epoch": 0.6969427393766865, + "grad_norm": 0.10274749249219894, + "learning_rate": 0.00021398419911381, + "loss": 2.6091, + "step": 23503 + }, + { + "epoch": 0.696972392728998, + "grad_norm": 0.09884664416313171, + "learning_rate": 0.00021394560769624695, + "loss": 2.6079, + "step": 23504 + }, + { + "epoch": 0.6970020460813094, + "grad_norm": 0.10008378326892853, + "learning_rate": 0.0002139070188117447, + "loss": 2.641, + "step": 23505 + }, + { + "epoch": 0.697031699433621, + "grad_norm": 0.09398239105939865, + "learning_rate": 0.00021386843246064486, + "loss": 2.6394, + "step": 23506 + }, + { + "epoch": 0.6970613527859324, + "grad_norm": 0.09957849979400635, + "learning_rate": 0.00021382984864328915, + "loss": 2.6191, + "step": 23507 + }, + { + "epoch": 0.6970910061382439, + "grad_norm": 0.09862682968378067, + "learning_rate": 0.00021379126736001926, + "loss": 2.5964, + "step": 23508 + }, + { + "epoch": 0.6971206594905555, + "grad_norm": 0.09917663782835007, + "learning_rate": 0.0002137526886111768, + "loss": 2.6292, + "step": 23509 + }, + { + "epoch": 0.6971503128428669, + "grad_norm": 0.1037122905254364, + "learning_rate": 0.00021371411239710347, + "loss": 2.5853, + "step": 23510 + }, + { + "epoch": 0.6971799661951784, + "grad_norm": 0.0932081863284111, + "learning_rate": 0.00021367553871814082, + "loss": 2.6211, + "step": 23511 + }, + { + "epoch": 0.6972096195474898, + "grad_norm": 0.10893924534320831, + "learning_rate": 0.00021363696757463035, + "loss": 2.6318, + "step": 23512 + }, + { + "epoch": 0.6972392728998014, + "grad_norm": 0.09478447586297989, + "learning_rate": 0.00021359839896691374, + "loss": 2.6189, + "step": 23513 + }, + { + "epoch": 0.6972689262521128, + "grad_norm": 0.10124589502811432, + "learning_rate": 0.00021355983289533248, + "loss": 2.612, + "step": 23514 + }, + { + "epoch": 0.6972985796044243, + "grad_norm": 0.086970254778862, + "learning_rate": 0.0002135212693602282, + "loss": 2.6114, + "step": 23515 + }, + { + "epoch": 0.6973282329567357, + "grad_norm": 0.09446831792593002, + "learning_rate": 0.00021348270836194194, + "loss": 2.5791, + "step": 23516 + }, + { + "epoch": 0.6973578863090473, + "grad_norm": 0.09995155036449432, + "learning_rate": 0.0002134441499008156, + "loss": 2.5975, + "step": 23517 + }, + { + "epoch": 0.6973875396613587, + "grad_norm": 0.09592308104038239, + "learning_rate": 0.00021340559397719055, + "loss": 2.6405, + "step": 23518 + }, + { + "epoch": 0.6974171930136702, + "grad_norm": 0.09817230701446533, + "learning_rate": 0.00021336704059140817, + "loss": 2.6056, + "step": 23519 + }, + { + "epoch": 0.6974468463659816, + "grad_norm": 0.11458328366279602, + "learning_rate": 0.0002133284897438098, + "loss": 2.6212, + "step": 23520 + }, + { + "epoch": 0.6974764997182932, + "grad_norm": 0.11262429505586624, + "learning_rate": 0.00021328994143473691, + "loss": 2.6272, + "step": 23521 + }, + { + "epoch": 0.6975061530706046, + "grad_norm": 0.09703060239553452, + "learning_rate": 0.00021325139566453078, + "loss": 2.6545, + "step": 23522 + }, + { + "epoch": 0.6975358064229161, + "grad_norm": 0.0980689749121666, + "learning_rate": 0.00021321285243353273, + "loss": 2.5797, + "step": 23523 + }, + { + "epoch": 0.6975654597752275, + "grad_norm": 0.11553968489170074, + "learning_rate": 0.00021317431174208414, + "loss": 2.6214, + "step": 23524 + }, + { + "epoch": 0.6975951131275391, + "grad_norm": 0.10693435370922089, + "learning_rate": 0.0002131357735905264, + "loss": 2.6259, + "step": 23525 + }, + { + "epoch": 0.6976247664798505, + "grad_norm": 0.10695325583219528, + "learning_rate": 0.00021309723797920043, + "loss": 2.6176, + "step": 23526 + }, + { + "epoch": 0.697654419832162, + "grad_norm": 0.10914435237646103, + "learning_rate": 0.00021305870490844769, + "loss": 2.6009, + "step": 23527 + }, + { + "epoch": 0.6976840731844735, + "grad_norm": 0.11425402015447617, + "learning_rate": 0.0002130201743786094, + "loss": 2.6268, + "step": 23528 + }, + { + "epoch": 0.697713726536785, + "grad_norm": 0.11269479990005493, + "learning_rate": 0.0002129816463900265, + "loss": 2.6126, + "step": 23529 + }, + { + "epoch": 0.6977433798890965, + "grad_norm": 0.12174566090106964, + "learning_rate": 0.00021294312094304057, + "loss": 2.628, + "step": 23530 + }, + { + "epoch": 0.6977730332414079, + "grad_norm": 0.10619241744279861, + "learning_rate": 0.00021290459803799261, + "loss": 2.58, + "step": 23531 + }, + { + "epoch": 0.6978026865937195, + "grad_norm": 0.10552798211574554, + "learning_rate": 0.0002128660776752237, + "loss": 2.5976, + "step": 23532 + }, + { + "epoch": 0.6978323399460309, + "grad_norm": 0.10156918317079544, + "learning_rate": 0.00021282755985507497, + "loss": 2.6093, + "step": 23533 + }, + { + "epoch": 0.6978619932983424, + "grad_norm": 0.10399433970451355, + "learning_rate": 0.00021278904457788745, + "loss": 2.6021, + "step": 23534 + }, + { + "epoch": 0.6978916466506538, + "grad_norm": 0.11238224059343338, + "learning_rate": 0.0002127505318440023, + "loss": 2.6415, + "step": 23535 + }, + { + "epoch": 0.6979213000029654, + "grad_norm": 0.11088676005601883, + "learning_rate": 0.0002127120216537606, + "loss": 2.6195, + "step": 23536 + }, + { + "epoch": 0.6979509533552768, + "grad_norm": 0.10497593879699707, + "learning_rate": 0.00021267351400750312, + "loss": 2.5864, + "step": 23537 + }, + { + "epoch": 0.6979806067075883, + "grad_norm": 0.11678008735179901, + "learning_rate": 0.00021263500890557097, + "loss": 2.6166, + "step": 23538 + }, + { + "epoch": 0.6980102600598997, + "grad_norm": 0.10361815989017487, + "learning_rate": 0.00021259650634830517, + "loss": 2.5857, + "step": 23539 + }, + { + "epoch": 0.6980399134122113, + "grad_norm": 0.1087329164147377, + "learning_rate": 0.00021255800633604666, + "loss": 2.6553, + "step": 23540 + }, + { + "epoch": 0.6980695667645227, + "grad_norm": 0.11600183695554733, + "learning_rate": 0.0002125195088691363, + "loss": 2.6111, + "step": 23541 + }, + { + "epoch": 0.6980992201168342, + "grad_norm": 0.10860566794872284, + "learning_rate": 0.00021248101394791486, + "loss": 2.6068, + "step": 23542 + }, + { + "epoch": 0.6981288734691456, + "grad_norm": 0.11464093625545502, + "learning_rate": 0.0002124425215727236, + "loss": 2.6288, + "step": 23543 + }, + { + "epoch": 0.6981585268214572, + "grad_norm": 0.09670910984277725, + "learning_rate": 0.00021240403174390315, + "loss": 2.608, + "step": 23544 + }, + { + "epoch": 0.6981881801737686, + "grad_norm": 0.11476821452379227, + "learning_rate": 0.00021236554446179434, + "loss": 2.618, + "step": 23545 + }, + { + "epoch": 0.6982178335260801, + "grad_norm": 0.11926990747451782, + "learning_rate": 0.0002123270597267382, + "loss": 2.6089, + "step": 23546 + }, + { + "epoch": 0.6982474868783916, + "grad_norm": 0.11150253564119339, + "learning_rate": 0.00021228857753907522, + "loss": 2.6083, + "step": 23547 + }, + { + "epoch": 0.6982771402307031, + "grad_norm": 0.10540922731161118, + "learning_rate": 0.00021225009789914618, + "loss": 2.6259, + "step": 23548 + }, + { + "epoch": 0.6983067935830145, + "grad_norm": 0.11868729442358017, + "learning_rate": 0.00021221162080729196, + "loss": 2.6195, + "step": 23549 + }, + { + "epoch": 0.698336446935326, + "grad_norm": 0.10952375829219818, + "learning_rate": 0.0002121731462638532, + "loss": 2.6211, + "step": 23550 + }, + { + "epoch": 0.6983661002876376, + "grad_norm": 0.11381512135267258, + "learning_rate": 0.00021213467426917066, + "loss": 2.6237, + "step": 23551 + }, + { + "epoch": 0.698395753639949, + "grad_norm": 0.1757289171218872, + "learning_rate": 0.00021209620482358498, + "loss": 2.591, + "step": 23552 + }, + { + "epoch": 0.6984254069922605, + "grad_norm": 0.1024821475148201, + "learning_rate": 0.00021205773792743683, + "loss": 2.6437, + "step": 23553 + }, + { + "epoch": 0.6984550603445719, + "grad_norm": 0.11058644950389862, + "learning_rate": 0.00021201927358106682, + "loss": 2.6145, + "step": 23554 + }, + { + "epoch": 0.6984847136968835, + "grad_norm": 0.12264551967382431, + "learning_rate": 0.00021198081178481543, + "loss": 2.6331, + "step": 23555 + }, + { + "epoch": 0.6985143670491949, + "grad_norm": 0.10300824046134949, + "learning_rate": 0.00021194235253902357, + "loss": 2.5935, + "step": 23556 + }, + { + "epoch": 0.6985440204015064, + "grad_norm": 0.11836882680654526, + "learning_rate": 0.00021190389584403174, + "loss": 2.616, + "step": 23557 + }, + { + "epoch": 0.6985736737538178, + "grad_norm": 0.10828039795160294, + "learning_rate": 0.00021186544170018025, + "loss": 2.605, + "step": 23558 + }, + { + "epoch": 0.6986033271061294, + "grad_norm": 0.11158347874879837, + "learning_rate": 0.00021182699010780965, + "loss": 2.6262, + "step": 23559 + }, + { + "epoch": 0.6986329804584408, + "grad_norm": 0.10773975402116776, + "learning_rate": 0.00021178854106726057, + "loss": 2.5875, + "step": 23560 + }, + { + "epoch": 0.6986626338107523, + "grad_norm": 0.10856077075004578, + "learning_rate": 0.00021175009457887346, + "loss": 2.5969, + "step": 23561 + }, + { + "epoch": 0.6986922871630638, + "grad_norm": 0.10260796546936035, + "learning_rate": 0.00021171165064298868, + "loss": 2.564, + "step": 23562 + }, + { + "epoch": 0.6987219405153753, + "grad_norm": 0.1126386895775795, + "learning_rate": 0.00021167320925994678, + "loss": 2.6121, + "step": 23563 + }, + { + "epoch": 0.6987515938676867, + "grad_norm": 0.10512048751115799, + "learning_rate": 0.00021163477043008804, + "loss": 2.5723, + "step": 23564 + }, + { + "epoch": 0.6987812472199982, + "grad_norm": 0.10599040240049362, + "learning_rate": 0.00021159633415375297, + "loss": 2.6253, + "step": 23565 + }, + { + "epoch": 0.6988109005723097, + "grad_norm": 0.1096312403678894, + "learning_rate": 0.0002115579004312818, + "loss": 2.602, + "step": 23566 + }, + { + "epoch": 0.6988405539246212, + "grad_norm": 0.1170593649148941, + "learning_rate": 0.00021151946926301497, + "loss": 2.6014, + "step": 23567 + }, + { + "epoch": 0.6988702072769326, + "grad_norm": 0.10797420889139175, + "learning_rate": 0.00021148104064929273, + "loss": 2.595, + "step": 23568 + }, + { + "epoch": 0.6988998606292441, + "grad_norm": 0.10674727708101273, + "learning_rate": 0.00021144261459045543, + "loss": 2.6183, + "step": 23569 + }, + { + "epoch": 0.6989295139815556, + "grad_norm": 0.12783236801624298, + "learning_rate": 0.00021140419108684334, + "loss": 2.5843, + "step": 23570 + }, + { + "epoch": 0.6989591673338671, + "grad_norm": 0.11187009513378143, + "learning_rate": 0.0002113657701387966, + "loss": 2.6111, + "step": 23571 + }, + { + "epoch": 0.6989888206861786, + "grad_norm": 0.10632996261119843, + "learning_rate": 0.00021132735174665557, + "loss": 2.5936, + "step": 23572 + }, + { + "epoch": 0.69901847403849, + "grad_norm": 0.10808350890874863, + "learning_rate": 0.0002112889359107603, + "loss": 2.5928, + "step": 23573 + }, + { + "epoch": 0.6990481273908016, + "grad_norm": 0.09854169189929962, + "learning_rate": 0.00021125052263145118, + "loss": 2.6069, + "step": 23574 + }, + { + "epoch": 0.699077780743113, + "grad_norm": 0.11209910362958908, + "learning_rate": 0.00021121211190906815, + "loss": 2.5662, + "step": 23575 + }, + { + "epoch": 0.6991074340954245, + "grad_norm": 0.10670452564954758, + "learning_rate": 0.0002111737037439515, + "loss": 2.5738, + "step": 23576 + }, + { + "epoch": 0.699137087447736, + "grad_norm": 0.11428917199373245, + "learning_rate": 0.00021113529813644122, + "loss": 2.6496, + "step": 23577 + }, + { + "epoch": 0.6991667408000475, + "grad_norm": 0.11044607311487198, + "learning_rate": 0.00021109689508687753, + "loss": 2.6279, + "step": 23578 + }, + { + "epoch": 0.6991963941523589, + "grad_norm": 0.10339292138814926, + "learning_rate": 0.00021105849459560034, + "loss": 2.6288, + "step": 23579 + }, + { + "epoch": 0.6992260475046704, + "grad_norm": 0.11381015926599503, + "learning_rate": 0.00021102009666295002, + "loss": 2.628, + "step": 23580 + }, + { + "epoch": 0.6992557008569819, + "grad_norm": 0.10721451044082642, + "learning_rate": 0.00021098170128926598, + "loss": 2.6373, + "step": 23581 + }, + { + "epoch": 0.6992853542092934, + "grad_norm": 0.11022607982158661, + "learning_rate": 0.00021094330847488873, + "loss": 2.6391, + "step": 23582 + }, + { + "epoch": 0.6993150075616048, + "grad_norm": 0.10268940776586533, + "learning_rate": 0.00021090491822015812, + "loss": 2.5992, + "step": 23583 + }, + { + "epoch": 0.6993446609139163, + "grad_norm": 0.10977554321289062, + "learning_rate": 0.0002108665305254141, + "loss": 2.6143, + "step": 23584 + }, + { + "epoch": 0.6993743142662278, + "grad_norm": 0.1062714159488678, + "learning_rate": 0.00021082814539099653, + "loss": 2.6097, + "step": 23585 + }, + { + "epoch": 0.6994039676185393, + "grad_norm": 0.10500849783420563, + "learning_rate": 0.00021078976281724542, + "loss": 2.6199, + "step": 23586 + }, + { + "epoch": 0.6994336209708507, + "grad_norm": 0.0943511500954628, + "learning_rate": 0.00021075138280450062, + "loss": 2.5925, + "step": 23587 + }, + { + "epoch": 0.6994632743231622, + "grad_norm": 0.11681206524372101, + "learning_rate": 0.0002107130053531019, + "loss": 2.6361, + "step": 23588 + }, + { + "epoch": 0.6994929276754737, + "grad_norm": 0.0984424352645874, + "learning_rate": 0.00021067463046338925, + "loss": 2.6021, + "step": 23589 + }, + { + "epoch": 0.6995225810277852, + "grad_norm": 0.09745241701602936, + "learning_rate": 0.00021063625813570237, + "loss": 2.6098, + "step": 23590 + }, + { + "epoch": 0.6995522343800966, + "grad_norm": 0.10311733186244965, + "learning_rate": 0.00021059788837038125, + "loss": 2.6021, + "step": 23591 + }, + { + "epoch": 0.6995818877324081, + "grad_norm": 0.10084527730941772, + "learning_rate": 0.00021055952116776533, + "loss": 2.6017, + "step": 23592 + }, + { + "epoch": 0.6996115410847197, + "grad_norm": 0.09888385981321335, + "learning_rate": 0.0002105211565281946, + "loss": 2.6512, + "step": 23593 + }, + { + "epoch": 0.6996411944370311, + "grad_norm": 0.10056169331073761, + "learning_rate": 0.00021048279445200847, + "loss": 2.6176, + "step": 23594 + }, + { + "epoch": 0.6996708477893426, + "grad_norm": 0.09459000825881958, + "learning_rate": 0.00021044443493954706, + "loss": 2.61, + "step": 23595 + }, + { + "epoch": 0.699700501141654, + "grad_norm": 0.09328508377075195, + "learning_rate": 0.00021040607799114992, + "loss": 2.5896, + "step": 23596 + }, + { + "epoch": 0.6997301544939656, + "grad_norm": 0.09730187803506851, + "learning_rate": 0.00021036772360715666, + "loss": 2.6184, + "step": 23597 + }, + { + "epoch": 0.699759807846277, + "grad_norm": 0.1141517162322998, + "learning_rate": 0.0002103293717879069, + "loss": 2.6172, + "step": 23598 + }, + { + "epoch": 0.6997894611985885, + "grad_norm": 0.09857654571533203, + "learning_rate": 0.00021029102253374032, + "loss": 2.6238, + "step": 23599 + }, + { + "epoch": 0.6998191145509, + "grad_norm": 0.10577596724033356, + "learning_rate": 0.00021025267584499641, + "loss": 2.6101, + "step": 23600 + }, + { + "epoch": 0.6998487679032115, + "grad_norm": 0.09396784752607346, + "learning_rate": 0.00021021433172201503, + "loss": 2.6246, + "step": 23601 + }, + { + "epoch": 0.6998784212555229, + "grad_norm": 0.10063496977090836, + "learning_rate": 0.00021017599016513527, + "loss": 2.5738, + "step": 23602 + }, + { + "epoch": 0.6999080746078344, + "grad_norm": 0.1114993765950203, + "learning_rate": 0.00021013765117469684, + "loss": 2.6154, + "step": 23603 + }, + { + "epoch": 0.6999377279601459, + "grad_norm": 0.10242269933223724, + "learning_rate": 0.00021009931475103927, + "loss": 2.5819, + "step": 23604 + }, + { + "epoch": 0.6999673813124574, + "grad_norm": 0.10429271310567856, + "learning_rate": 0.0002100609808945021, + "loss": 2.6008, + "step": 23605 + }, + { + "epoch": 0.6999970346647688, + "grad_norm": 0.09889756888151169, + "learning_rate": 0.0002100226496054246, + "loss": 2.6329, + "step": 23606 + }, + { + "epoch": 0.7000266880170803, + "grad_norm": 0.12530899047851562, + "learning_rate": 0.00020998432088414621, + "loss": 2.6098, + "step": 23607 + }, + { + "epoch": 0.7000563413693918, + "grad_norm": 0.10919201374053955, + "learning_rate": 0.00020994599473100661, + "loss": 2.5628, + "step": 23608 + }, + { + "epoch": 0.7000859947217033, + "grad_norm": 0.10905082523822784, + "learning_rate": 0.00020990767114634502, + "loss": 2.6032, + "step": 23609 + }, + { + "epoch": 0.7001156480740147, + "grad_norm": 0.11141311377286911, + "learning_rate": 0.00020986935013050074, + "loss": 2.6085, + "step": 23610 + }, + { + "epoch": 0.7001453014263262, + "grad_norm": 0.11571098864078522, + "learning_rate": 0.00020983103168381324, + "loss": 2.6165, + "step": 23611 + }, + { + "epoch": 0.7001749547786377, + "grad_norm": 0.0994226261973381, + "learning_rate": 0.00020979271580662192, + "loss": 2.6362, + "step": 23612 + }, + { + "epoch": 0.7002046081309492, + "grad_norm": 0.12511779367923737, + "learning_rate": 0.0002097544024992657, + "loss": 2.586, + "step": 23613 + }, + { + "epoch": 0.7002342614832607, + "grad_norm": 0.09991591423749924, + "learning_rate": 0.00020971609176208405, + "loss": 2.5958, + "step": 23614 + }, + { + "epoch": 0.7002639148355722, + "grad_norm": 0.10379701107740402, + "learning_rate": 0.00020967778359541627, + "loss": 2.6134, + "step": 23615 + }, + { + "epoch": 0.7002935681878837, + "grad_norm": 0.10745218396186829, + "learning_rate": 0.00020963947799960153, + "loss": 2.5948, + "step": 23616 + }, + { + "epoch": 0.7003232215401951, + "grad_norm": 0.09765829890966415, + "learning_rate": 0.0002096011749749791, + "loss": 2.5954, + "step": 23617 + }, + { + "epoch": 0.7003528748925066, + "grad_norm": 0.10451506078243256, + "learning_rate": 0.0002095628745218881, + "loss": 2.6223, + "step": 23618 + }, + { + "epoch": 0.7003825282448181, + "grad_norm": 0.10755879431962967, + "learning_rate": 0.00020952457664066766, + "loss": 2.644, + "step": 23619 + }, + { + "epoch": 0.7004121815971296, + "grad_norm": 0.10387039184570312, + "learning_rate": 0.0002094862813316568, + "loss": 2.6175, + "step": 23620 + }, + { + "epoch": 0.700441834949441, + "grad_norm": 0.1074817106127739, + "learning_rate": 0.00020944798859519494, + "loss": 2.6163, + "step": 23621 + }, + { + "epoch": 0.7004714883017525, + "grad_norm": 0.09176189452409744, + "learning_rate": 0.0002094096984316212, + "loss": 2.6232, + "step": 23622 + }, + { + "epoch": 0.700501141654064, + "grad_norm": 0.10313351452350616, + "learning_rate": 0.0002093714108412743, + "loss": 2.5981, + "step": 23623 + }, + { + "epoch": 0.7005307950063755, + "grad_norm": 0.088786281645298, + "learning_rate": 0.00020933312582449343, + "loss": 2.6027, + "step": 23624 + }, + { + "epoch": 0.7005604483586869, + "grad_norm": 0.10251012444496155, + "learning_rate": 0.00020929484338161764, + "loss": 2.6062, + "step": 23625 + }, + { + "epoch": 0.7005901017109984, + "grad_norm": 0.09772200882434845, + "learning_rate": 0.0002092565635129859, + "loss": 2.6201, + "step": 23626 + }, + { + "epoch": 0.7006197550633099, + "grad_norm": 0.11268720775842667, + "learning_rate": 0.00020921828621893717, + "loss": 2.6308, + "step": 23627 + }, + { + "epoch": 0.7006494084156214, + "grad_norm": 0.09384211152791977, + "learning_rate": 0.00020918001149981046, + "loss": 2.6165, + "step": 23628 + }, + { + "epoch": 0.7006790617679328, + "grad_norm": 0.115128293633461, + "learning_rate": 0.00020914173935594467, + "loss": 2.5989, + "step": 23629 + }, + { + "epoch": 0.7007087151202444, + "grad_norm": 0.10623413324356079, + "learning_rate": 0.00020910346978767868, + "loss": 2.6469, + "step": 23630 + }, + { + "epoch": 0.7007383684725558, + "grad_norm": 0.1024266853928566, + "learning_rate": 0.0002090652027953514, + "loss": 2.6282, + "step": 23631 + }, + { + "epoch": 0.7007680218248673, + "grad_norm": 0.0967777669429779, + "learning_rate": 0.0002090269383793017, + "loss": 2.6292, + "step": 23632 + }, + { + "epoch": 0.7007976751771787, + "grad_norm": 0.10830987989902496, + "learning_rate": 0.0002089886765398684, + "loss": 2.6078, + "step": 23633 + }, + { + "epoch": 0.7008273285294903, + "grad_norm": 0.10759046673774719, + "learning_rate": 0.00020895041727739033, + "loss": 2.622, + "step": 23634 + }, + { + "epoch": 0.7008569818818018, + "grad_norm": 0.10603111237287521, + "learning_rate": 0.0002089121605922063, + "loss": 2.6182, + "step": 23635 + }, + { + "epoch": 0.7008866352341132, + "grad_norm": 0.09936551749706268, + "learning_rate": 0.000208873906484655, + "loss": 2.6117, + "step": 23636 + }, + { + "epoch": 0.7009162885864247, + "grad_norm": 0.09973668307065964, + "learning_rate": 0.00020883565495507522, + "loss": 2.5982, + "step": 23637 + }, + { + "epoch": 0.7009459419387362, + "grad_norm": 0.09623448550701141, + "learning_rate": 0.00020879740600380576, + "loss": 2.611, + "step": 23638 + }, + { + "epoch": 0.7009755952910477, + "grad_norm": 0.10608984529972076, + "learning_rate": 0.00020875915963118518, + "loss": 2.6111, + "step": 23639 + }, + { + "epoch": 0.7010052486433591, + "grad_norm": 0.09735241532325745, + "learning_rate": 0.00020872091583755233, + "loss": 2.6416, + "step": 23640 + }, + { + "epoch": 0.7010349019956706, + "grad_norm": 0.09783729165792465, + "learning_rate": 0.00020868267462324569, + "loss": 2.6148, + "step": 23641 + }, + { + "epoch": 0.7010645553479821, + "grad_norm": 0.10526294261217117, + "learning_rate": 0.00020864443598860393, + "loss": 2.6002, + "step": 23642 + }, + { + "epoch": 0.7010942087002936, + "grad_norm": 0.09417963773012161, + "learning_rate": 0.0002086061999339658, + "loss": 2.6242, + "step": 23643 + }, + { + "epoch": 0.701123862052605, + "grad_norm": 0.10242436081171036, + "learning_rate": 0.00020856796645966975, + "loss": 2.5949, + "step": 23644 + }, + { + "epoch": 0.7011535154049165, + "grad_norm": 0.09457051753997803, + "learning_rate": 0.00020852973556605438, + "loss": 2.6213, + "step": 23645 + }, + { + "epoch": 0.701183168757228, + "grad_norm": 0.104641392827034, + "learning_rate": 0.00020849150725345817, + "loss": 2.6105, + "step": 23646 + }, + { + "epoch": 0.7012128221095395, + "grad_norm": 0.10727085173130035, + "learning_rate": 0.00020845328152221975, + "loss": 2.627, + "step": 23647 + }, + { + "epoch": 0.7012424754618509, + "grad_norm": 0.09932464361190796, + "learning_rate": 0.0002084150583726776, + "loss": 2.6059, + "step": 23648 + }, + { + "epoch": 0.7012721288141625, + "grad_norm": 0.10057927668094635, + "learning_rate": 0.00020837683780517013, + "loss": 2.6194, + "step": 23649 + }, + { + "epoch": 0.7013017821664739, + "grad_norm": 0.11076454073190689, + "learning_rate": 0.00020833861982003581, + "loss": 2.5925, + "step": 23650 + }, + { + "epoch": 0.7013314355187854, + "grad_norm": 0.10268506407737732, + "learning_rate": 0.00020830040441761305, + "loss": 2.6063, + "step": 23651 + }, + { + "epoch": 0.7013610888710968, + "grad_norm": 0.10720369219779968, + "learning_rate": 0.00020826219159824033, + "loss": 2.5943, + "step": 23652 + }, + { + "epoch": 0.7013907422234084, + "grad_norm": 0.10990142077207565, + "learning_rate": 0.00020822398136225595, + "loss": 2.6271, + "step": 23653 + }, + { + "epoch": 0.7014203955757199, + "grad_norm": 0.11567336320877075, + "learning_rate": 0.0002081857737099983, + "loss": 2.5904, + "step": 23654 + }, + { + "epoch": 0.7014500489280313, + "grad_norm": 0.09529582411050797, + "learning_rate": 0.00020814756864180573, + "loss": 2.6163, + "step": 23655 + }, + { + "epoch": 0.7014797022803428, + "grad_norm": 0.12829111516475677, + "learning_rate": 0.0002081093661580167, + "loss": 2.6043, + "step": 23656 + }, + { + "epoch": 0.7015093556326543, + "grad_norm": 0.10747010260820389, + "learning_rate": 0.00020807116625896915, + "loss": 2.5761, + "step": 23657 + }, + { + "epoch": 0.7015390089849658, + "grad_norm": 0.11104309558868408, + "learning_rate": 0.00020803296894500158, + "loss": 2.5931, + "step": 23658 + }, + { + "epoch": 0.7015686623372772, + "grad_norm": 0.1167357861995697, + "learning_rate": 0.00020799477421645196, + "loss": 2.6187, + "step": 23659 + }, + { + "epoch": 0.7015983156895887, + "grad_norm": 0.11400368809700012, + "learning_rate": 0.0002079565820736589, + "loss": 2.6151, + "step": 23660 + }, + { + "epoch": 0.7016279690419002, + "grad_norm": 0.10685040056705475, + "learning_rate": 0.00020791839251696042, + "loss": 2.6085, + "step": 23661 + }, + { + "epoch": 0.7016576223942117, + "grad_norm": 0.09525250643491745, + "learning_rate": 0.00020788020554669479, + "loss": 2.5785, + "step": 23662 + }, + { + "epoch": 0.7016872757465231, + "grad_norm": 0.11379586160182953, + "learning_rate": 0.0002078420211632, + "loss": 2.5992, + "step": 23663 + }, + { + "epoch": 0.7017169290988347, + "grad_norm": 0.08874356001615524, + "learning_rate": 0.00020780383936681436, + "loss": 2.6095, + "step": 23664 + }, + { + "epoch": 0.7017465824511461, + "grad_norm": 0.09850549697875977, + "learning_rate": 0.0002077656601578758, + "loss": 2.6156, + "step": 23665 + }, + { + "epoch": 0.7017762358034576, + "grad_norm": 0.10362184792757034, + "learning_rate": 0.0002077274835367225, + "loss": 2.6109, + "step": 23666 + }, + { + "epoch": 0.701805889155769, + "grad_norm": 0.10224074870347977, + "learning_rate": 0.00020768930950369264, + "loss": 2.6305, + "step": 23667 + }, + { + "epoch": 0.7018355425080806, + "grad_norm": 0.10315252840518951, + "learning_rate": 0.000207651138059124, + "loss": 2.6099, + "step": 23668 + }, + { + "epoch": 0.701865195860392, + "grad_norm": 0.10648492723703384, + "learning_rate": 0.0002076129692033547, + "loss": 2.5807, + "step": 23669 + }, + { + "epoch": 0.7018948492127035, + "grad_norm": 0.10859618335962296, + "learning_rate": 0.0002075748029367227, + "loss": 2.6227, + "step": 23670 + }, + { + "epoch": 0.7019245025650149, + "grad_norm": 0.10314083099365234, + "learning_rate": 0.00020753663925956607, + "loss": 2.6254, + "step": 23671 + }, + { + "epoch": 0.7019541559173265, + "grad_norm": 0.11673898249864578, + "learning_rate": 0.0002074984781722225, + "loss": 2.6109, + "step": 23672 + }, + { + "epoch": 0.7019838092696379, + "grad_norm": 0.11310994625091553, + "learning_rate": 0.00020746031967503026, + "loss": 2.6071, + "step": 23673 + }, + { + "epoch": 0.7020134626219494, + "grad_norm": 0.11331450194120407, + "learning_rate": 0.00020742216376832718, + "loss": 2.6255, + "step": 23674 + }, + { + "epoch": 0.702043115974261, + "grad_norm": 0.09402187913656235, + "learning_rate": 0.000207384010452451, + "loss": 2.6417, + "step": 23675 + }, + { + "epoch": 0.7020727693265724, + "grad_norm": 0.11273249238729477, + "learning_rate": 0.00020734585972773968, + "loss": 2.6212, + "step": 23676 + }, + { + "epoch": 0.7021024226788839, + "grad_norm": 0.10217174887657166, + "learning_rate": 0.0002073077115945311, + "loss": 2.5887, + "step": 23677 + }, + { + "epoch": 0.7021320760311953, + "grad_norm": 0.10094644874334335, + "learning_rate": 0.0002072695660531629, + "loss": 2.6358, + "step": 23678 + }, + { + "epoch": 0.7021617293835068, + "grad_norm": 0.09727008640766144, + "learning_rate": 0.00020723142310397286, + "loss": 2.603, + "step": 23679 + }, + { + "epoch": 0.7021913827358183, + "grad_norm": 0.10863389074802399, + "learning_rate": 0.00020719328274729887, + "loss": 2.6571, + "step": 23680 + }, + { + "epoch": 0.7022210360881298, + "grad_norm": 0.10480735450983047, + "learning_rate": 0.00020715514498347858, + "loss": 2.6202, + "step": 23681 + }, + { + "epoch": 0.7022506894404412, + "grad_norm": 0.10787687450647354, + "learning_rate": 0.00020711700981284977, + "loss": 2.5935, + "step": 23682 + }, + { + "epoch": 0.7022803427927528, + "grad_norm": 0.11352464556694031, + "learning_rate": 0.00020707887723575008, + "loss": 2.6312, + "step": 23683 + }, + { + "epoch": 0.7023099961450642, + "grad_norm": 0.11331488937139511, + "learning_rate": 0.0002070407472525171, + "loss": 2.6092, + "step": 23684 + }, + { + "epoch": 0.7023396494973757, + "grad_norm": 0.09709293395280838, + "learning_rate": 0.00020700261986348874, + "loss": 2.6011, + "step": 23685 + }, + { + "epoch": 0.7023693028496871, + "grad_norm": 0.10468171536922455, + "learning_rate": 0.00020696449506900244, + "loss": 2.6057, + "step": 23686 + }, + { + "epoch": 0.7023989562019987, + "grad_norm": 0.0974080041050911, + "learning_rate": 0.00020692637286939586, + "loss": 2.6058, + "step": 23687 + }, + { + "epoch": 0.7024286095543101, + "grad_norm": 0.10040763020515442, + "learning_rate": 0.00020688825326500672, + "loss": 2.6173, + "step": 23688 + }, + { + "epoch": 0.7024582629066216, + "grad_norm": 0.10753936320543289, + "learning_rate": 0.0002068501362561722, + "loss": 2.621, + "step": 23689 + }, + { + "epoch": 0.702487916258933, + "grad_norm": 0.11311905086040497, + "learning_rate": 0.00020681202184323012, + "loss": 2.6023, + "step": 23690 + }, + { + "epoch": 0.7025175696112446, + "grad_norm": 0.10587790608406067, + "learning_rate": 0.00020677391002651784, + "loss": 2.5996, + "step": 23691 + }, + { + "epoch": 0.702547222963556, + "grad_norm": 0.10934188961982727, + "learning_rate": 0.000206735800806373, + "loss": 2.6331, + "step": 23692 + }, + { + "epoch": 0.7025768763158675, + "grad_norm": 0.11487778276205063, + "learning_rate": 0.00020669769418313295, + "loss": 2.6458, + "step": 23693 + }, + { + "epoch": 0.7026065296681789, + "grad_norm": 0.1022520661354065, + "learning_rate": 0.00020665959015713514, + "loss": 2.6208, + "step": 23694 + }, + { + "epoch": 0.7026361830204905, + "grad_norm": 0.10617157071828842, + "learning_rate": 0.000206621488728717, + "loss": 2.6457, + "step": 23695 + }, + { + "epoch": 0.702665836372802, + "grad_norm": 0.10449802875518799, + "learning_rate": 0.00020658338989821596, + "loss": 2.5681, + "step": 23696 + }, + { + "epoch": 0.7026954897251134, + "grad_norm": 0.09019312262535095, + "learning_rate": 0.00020654529366596918, + "loss": 2.5941, + "step": 23697 + }, + { + "epoch": 0.702725143077425, + "grad_norm": 0.09797976911067963, + "learning_rate": 0.00020650720003231456, + "loss": 2.5982, + "step": 23698 + }, + { + "epoch": 0.7027547964297364, + "grad_norm": 0.10762985050678253, + "learning_rate": 0.00020646910899758887, + "loss": 2.597, + "step": 23699 + }, + { + "epoch": 0.7027844497820479, + "grad_norm": 0.08812930434942245, + "learning_rate": 0.00020643102056212958, + "loss": 2.5895, + "step": 23700 + }, + { + "epoch": 0.7028141031343593, + "grad_norm": 0.1147010549902916, + "learning_rate": 0.00020639293472627401, + "loss": 2.58, + "step": 23701 + }, + { + "epoch": 0.7028437564866709, + "grad_norm": 0.09787359088659286, + "learning_rate": 0.00020635485149035943, + "loss": 2.5979, + "step": 23702 + }, + { + "epoch": 0.7028734098389823, + "grad_norm": 0.10911699384450912, + "learning_rate": 0.00020631677085472305, + "loss": 2.6185, + "step": 23703 + }, + { + "epoch": 0.7029030631912938, + "grad_norm": 0.10297354310750961, + "learning_rate": 0.000206278692819702, + "loss": 2.5366, + "step": 23704 + }, + { + "epoch": 0.7029327165436052, + "grad_norm": 0.12155994772911072, + "learning_rate": 0.00020624061738563367, + "loss": 2.6065, + "step": 23705 + }, + { + "epoch": 0.7029623698959168, + "grad_norm": 0.10919728130102158, + "learning_rate": 0.00020620254455285499, + "loss": 2.5804, + "step": 23706 + }, + { + "epoch": 0.7029920232482282, + "grad_norm": 0.11447476595640182, + "learning_rate": 0.0002061644743217032, + "loss": 2.5912, + "step": 23707 + }, + { + "epoch": 0.7030216766005397, + "grad_norm": 0.10154441744089127, + "learning_rate": 0.0002061264066925155, + "loss": 2.6489, + "step": 23708 + }, + { + "epoch": 0.7030513299528511, + "grad_norm": 0.10109782963991165, + "learning_rate": 0.00020608834166562884, + "loss": 2.5881, + "step": 23709 + }, + { + "epoch": 0.7030809833051627, + "grad_norm": 0.10137774795293808, + "learning_rate": 0.00020605027924138042, + "loss": 2.6062, + "step": 23710 + }, + { + "epoch": 0.7031106366574741, + "grad_norm": 0.09665287286043167, + "learning_rate": 0.00020601221942010724, + "loss": 2.6436, + "step": 23711 + }, + { + "epoch": 0.7031402900097856, + "grad_norm": 0.10201513767242432, + "learning_rate": 0.00020597416220214626, + "loss": 2.5968, + "step": 23712 + }, + { + "epoch": 0.703169943362097, + "grad_norm": 0.09702041745185852, + "learning_rate": 0.0002059361075878346, + "loss": 2.5971, + "step": 23713 + }, + { + "epoch": 0.7031995967144086, + "grad_norm": 0.09296724200248718, + "learning_rate": 0.00020589805557750913, + "loss": 2.6586, + "step": 23714 + }, + { + "epoch": 0.70322925006672, + "grad_norm": 0.1005493700504303, + "learning_rate": 0.00020586000617150692, + "loss": 2.5599, + "step": 23715 + }, + { + "epoch": 0.7032589034190315, + "grad_norm": 0.10023710876703262, + "learning_rate": 0.0002058219593701648, + "loss": 2.5953, + "step": 23716 + }, + { + "epoch": 0.703288556771343, + "grad_norm": 0.09356401115655899, + "learning_rate": 0.00020578391517381972, + "loss": 2.6302, + "step": 23717 + }, + { + "epoch": 0.7033182101236545, + "grad_norm": 0.09649470448493958, + "learning_rate": 0.00020574587358280854, + "loss": 2.608, + "step": 23718 + }, + { + "epoch": 0.703347863475966, + "grad_norm": 0.10090164840221405, + "learning_rate": 0.00020570783459746822, + "loss": 2.618, + "step": 23719 + }, + { + "epoch": 0.7033775168282774, + "grad_norm": 0.10616949200630188, + "learning_rate": 0.00020566979821813554, + "loss": 2.5797, + "step": 23720 + }, + { + "epoch": 0.703407170180589, + "grad_norm": 0.09558966010808945, + "learning_rate": 0.0002056317644451473, + "loss": 2.6218, + "step": 23721 + }, + { + "epoch": 0.7034368235329004, + "grad_norm": 0.10111739486455917, + "learning_rate": 0.00020559373327884052, + "loss": 2.6128, + "step": 23722 + }, + { + "epoch": 0.7034664768852119, + "grad_norm": 0.10728054493665695, + "learning_rate": 0.00020555570471955138, + "loss": 2.5836, + "step": 23723 + }, + { + "epoch": 0.7034961302375233, + "grad_norm": 0.11485182493925095, + "learning_rate": 0.00020551767876761718, + "loss": 2.6091, + "step": 23724 + }, + { + "epoch": 0.7035257835898349, + "grad_norm": 0.09155827760696411, + "learning_rate": 0.00020547965542337448, + "loss": 2.6311, + "step": 23725 + }, + { + "epoch": 0.7035554369421463, + "grad_norm": 0.10379286110401154, + "learning_rate": 0.00020544163468716003, + "loss": 2.6405, + "step": 23726 + }, + { + "epoch": 0.7035850902944578, + "grad_norm": 0.11045540124177933, + "learning_rate": 0.00020540361655931044, + "loss": 2.594, + "step": 23727 + }, + { + "epoch": 0.7036147436467692, + "grad_norm": 0.09309455752372742, + "learning_rate": 0.00020536560104016238, + "loss": 2.6115, + "step": 23728 + }, + { + "epoch": 0.7036443969990808, + "grad_norm": 0.12541069090366364, + "learning_rate": 0.00020532758813005247, + "loss": 2.6, + "step": 23729 + }, + { + "epoch": 0.7036740503513922, + "grad_norm": 0.11000743508338928, + "learning_rate": 0.0002052895778293174, + "loss": 2.6352, + "step": 23730 + }, + { + "epoch": 0.7037037037037037, + "grad_norm": 0.09488928318023682, + "learning_rate": 0.00020525157013829372, + "loss": 2.5938, + "step": 23731 + }, + { + "epoch": 0.7037333570560151, + "grad_norm": 0.10846365243196487, + "learning_rate": 0.0002052135650573181, + "loss": 2.5773, + "step": 23732 + }, + { + "epoch": 0.7037630104083267, + "grad_norm": 0.10713914781808853, + "learning_rate": 0.0002051755625867268, + "loss": 2.5886, + "step": 23733 + }, + { + "epoch": 0.7037926637606381, + "grad_norm": 0.09565415233373642, + "learning_rate": 0.00020513756272685652, + "loss": 2.5799, + "step": 23734 + }, + { + "epoch": 0.7038223171129496, + "grad_norm": 0.12311644852161407, + "learning_rate": 0.0002050995654780437, + "loss": 2.5979, + "step": 23735 + }, + { + "epoch": 0.703851970465261, + "grad_norm": 0.11603893339633942, + "learning_rate": 0.00020506157084062472, + "loss": 2.5878, + "step": 23736 + }, + { + "epoch": 0.7038816238175726, + "grad_norm": 0.11204317212104797, + "learning_rate": 0.00020502357881493628, + "loss": 2.6275, + "step": 23737 + }, + { + "epoch": 0.7039112771698841, + "grad_norm": 0.10794688016176224, + "learning_rate": 0.00020498558940131473, + "loss": 2.6133, + "step": 23738 + }, + { + "epoch": 0.7039409305221955, + "grad_norm": 0.11935783922672272, + "learning_rate": 0.00020494760260009637, + "loss": 2.6454, + "step": 23739 + }, + { + "epoch": 0.7039705838745071, + "grad_norm": 0.09363449364900589, + "learning_rate": 0.0002049096184116177, + "loss": 2.5751, + "step": 23740 + }, + { + "epoch": 0.7040002372268185, + "grad_norm": 0.10867644846439362, + "learning_rate": 0.00020487163683621497, + "loss": 2.5827, + "step": 23741 + }, + { + "epoch": 0.70402989057913, + "grad_norm": 0.11779287457466125, + "learning_rate": 0.00020483365787422452, + "loss": 2.5893, + "step": 23742 + }, + { + "epoch": 0.7040595439314414, + "grad_norm": 0.09965632855892181, + "learning_rate": 0.00020479568152598294, + "loss": 2.5922, + "step": 23743 + }, + { + "epoch": 0.704089197283753, + "grad_norm": 0.10493193566799164, + "learning_rate": 0.00020475770779182606, + "loss": 2.6253, + "step": 23744 + }, + { + "epoch": 0.7041188506360644, + "grad_norm": 0.09893898665904999, + "learning_rate": 0.00020471973667209037, + "loss": 2.6444, + "step": 23745 + }, + { + "epoch": 0.7041485039883759, + "grad_norm": 0.10944090783596039, + "learning_rate": 0.0002046817681671121, + "loss": 2.5998, + "step": 23746 + }, + { + "epoch": 0.7041781573406873, + "grad_norm": 0.09781930595636368, + "learning_rate": 0.00020464380227722747, + "loss": 2.6071, + "step": 23747 + }, + { + "epoch": 0.7042078106929989, + "grad_norm": 0.10606762021780014, + "learning_rate": 0.00020460583900277262, + "loss": 2.609, + "step": 23748 + }, + { + "epoch": 0.7042374640453103, + "grad_norm": 0.10357122123241425, + "learning_rate": 0.0002045678783440836, + "loss": 2.6084, + "step": 23749 + }, + { + "epoch": 0.7042671173976218, + "grad_norm": 0.11109017580747604, + "learning_rate": 0.0002045299203014969, + "loss": 2.6187, + "step": 23750 + }, + { + "epoch": 0.7042967707499332, + "grad_norm": 0.11079373955726624, + "learning_rate": 0.00020449196487534854, + "loss": 2.593, + "step": 23751 + }, + { + "epoch": 0.7043264241022448, + "grad_norm": 0.11061792075634003, + "learning_rate": 0.00020445401206597446, + "loss": 2.5732, + "step": 23752 + }, + { + "epoch": 0.7043560774545562, + "grad_norm": 0.10749121755361557, + "learning_rate": 0.00020441606187371109, + "loss": 2.6191, + "step": 23753 + }, + { + "epoch": 0.7043857308068677, + "grad_norm": 0.14829692244529724, + "learning_rate": 0.000204378114298894, + "loss": 2.561, + "step": 23754 + }, + { + "epoch": 0.7044153841591791, + "grad_norm": 0.10618558526039124, + "learning_rate": 0.0002043401693418595, + "loss": 2.5905, + "step": 23755 + }, + { + "epoch": 0.7044450375114907, + "grad_norm": 0.35412511229515076, + "learning_rate": 0.00020430222700294355, + "loss": 2.584, + "step": 23756 + }, + { + "epoch": 0.7044746908638021, + "grad_norm": 0.11508891731500626, + "learning_rate": 0.0002042642872824821, + "loss": 2.6007, + "step": 23757 + }, + { + "epoch": 0.7045043442161136, + "grad_norm": 0.14267708361148834, + "learning_rate": 0.00020422635018081114, + "loss": 2.6417, + "step": 23758 + }, + { + "epoch": 0.7045339975684252, + "grad_norm": 0.12659788131713867, + "learning_rate": 0.00020418841569826664, + "loss": 2.6091, + "step": 23759 + }, + { + "epoch": 0.7045636509207366, + "grad_norm": 0.12892857193946838, + "learning_rate": 0.00020415048383518448, + "loss": 2.6112, + "step": 23760 + }, + { + "epoch": 0.7045933042730481, + "grad_norm": 0.11731239408254623, + "learning_rate": 0.0002041125545919006, + "loss": 2.6233, + "step": 23761 + }, + { + "epoch": 0.7046229576253595, + "grad_norm": 0.11131415516138077, + "learning_rate": 0.00020407462796875065, + "loss": 2.6366, + "step": 23762 + }, + { + "epoch": 0.7046526109776711, + "grad_norm": 0.13627885282039642, + "learning_rate": 0.00020403670396607082, + "loss": 2.6272, + "step": 23763 + }, + { + "epoch": 0.7046822643299825, + "grad_norm": 0.11070757359266281, + "learning_rate": 0.00020399878258419696, + "loss": 2.61, + "step": 23764 + }, + { + "epoch": 0.704711917682294, + "grad_norm": 0.11369968205690384, + "learning_rate": 0.00020396086382346452, + "loss": 2.6022, + "step": 23765 + }, + { + "epoch": 0.7047415710346054, + "grad_norm": 0.10667412728071213, + "learning_rate": 0.00020392294768420944, + "loss": 2.6224, + "step": 23766 + }, + { + "epoch": 0.704771224386917, + "grad_norm": 0.11236646771430969, + "learning_rate": 0.00020388503416676746, + "loss": 2.633, + "step": 23767 + }, + { + "epoch": 0.7048008777392284, + "grad_norm": 0.1032889261841774, + "learning_rate": 0.00020384712327147436, + "loss": 2.6209, + "step": 23768 + }, + { + "epoch": 0.7048305310915399, + "grad_norm": 0.11485651880502701, + "learning_rate": 0.0002038092149986658, + "loss": 2.6233, + "step": 23769 + }, + { + "epoch": 0.7048601844438513, + "grad_norm": 0.10049796849489212, + "learning_rate": 0.0002037713093486775, + "loss": 2.6386, + "step": 23770 + }, + { + "epoch": 0.7048898377961629, + "grad_norm": 0.11041729152202606, + "learning_rate": 0.00020373340632184506, + "loss": 2.6184, + "step": 23771 + }, + { + "epoch": 0.7049194911484743, + "grad_norm": 0.09885431081056595, + "learning_rate": 0.00020369550591850421, + "loss": 2.6225, + "step": 23772 + }, + { + "epoch": 0.7049491445007858, + "grad_norm": 0.10952533781528473, + "learning_rate": 0.0002036576081389905, + "loss": 2.621, + "step": 23773 + }, + { + "epoch": 0.7049787978530972, + "grad_norm": 0.1038118451833725, + "learning_rate": 0.0002036197129836395, + "loss": 2.6146, + "step": 23774 + }, + { + "epoch": 0.7050084512054088, + "grad_norm": 0.10552947968244553, + "learning_rate": 0.0002035818204527869, + "loss": 2.6192, + "step": 23775 + }, + { + "epoch": 0.7050381045577202, + "grad_norm": 0.09956086426973343, + "learning_rate": 0.00020354393054676807, + "loss": 2.626, + "step": 23776 + }, + { + "epoch": 0.7050677579100317, + "grad_norm": 0.1128799319267273, + "learning_rate": 0.00020350604326591865, + "loss": 2.6104, + "step": 23777 + }, + { + "epoch": 0.7050974112623432, + "grad_norm": 0.11046499013900757, + "learning_rate": 0.00020346815861057416, + "loss": 2.6069, + "step": 23778 + }, + { + "epoch": 0.7051270646146547, + "grad_norm": 0.11226849257946014, + "learning_rate": 0.00020343027658106995, + "loss": 2.5936, + "step": 23779 + }, + { + "epoch": 0.7051567179669662, + "grad_norm": 0.10505908727645874, + "learning_rate": 0.00020339239717774162, + "loss": 2.6093, + "step": 23780 + }, + { + "epoch": 0.7051863713192776, + "grad_norm": 0.1105046421289444, + "learning_rate": 0.00020335452040092444, + "loss": 2.5978, + "step": 23781 + }, + { + "epoch": 0.7052160246715892, + "grad_norm": 0.09469196945428848, + "learning_rate": 0.00020331664625095397, + "loss": 2.5914, + "step": 23782 + }, + { + "epoch": 0.7052456780239006, + "grad_norm": 0.10825742781162262, + "learning_rate": 0.00020327877472816548, + "loss": 2.612, + "step": 23783 + }, + { + "epoch": 0.7052753313762121, + "grad_norm": 0.1082645133137703, + "learning_rate": 0.00020324090583289438, + "loss": 2.597, + "step": 23784 + }, + { + "epoch": 0.7053049847285235, + "grad_norm": 0.10366763174533844, + "learning_rate": 0.00020320303956547603, + "loss": 2.5917, + "step": 23785 + }, + { + "epoch": 0.7053346380808351, + "grad_norm": 0.10481686145067215, + "learning_rate": 0.00020316517592624568, + "loss": 2.6041, + "step": 23786 + }, + { + "epoch": 0.7053642914331465, + "grad_norm": 0.08909960836172104, + "learning_rate": 0.00020312731491553866, + "loss": 2.5685, + "step": 23787 + }, + { + "epoch": 0.705393944785458, + "grad_norm": 0.10086388140916824, + "learning_rate": 0.0002030894565336902, + "loss": 2.5993, + "step": 23788 + }, + { + "epoch": 0.7054235981377694, + "grad_norm": 0.10177303105592728, + "learning_rate": 0.00020305160078103556, + "loss": 2.6109, + "step": 23789 + }, + { + "epoch": 0.705453251490081, + "grad_norm": 0.1010516807436943, + "learning_rate": 0.00020301374765790996, + "loss": 2.5776, + "step": 23790 + }, + { + "epoch": 0.7054829048423924, + "grad_norm": 0.09804730862379074, + "learning_rate": 0.00020297589716464866, + "loss": 2.5961, + "step": 23791 + }, + { + "epoch": 0.7055125581947039, + "grad_norm": 0.10206146538257599, + "learning_rate": 0.00020293804930158677, + "loss": 2.5685, + "step": 23792 + }, + { + "epoch": 0.7055422115470154, + "grad_norm": 0.10586890578269958, + "learning_rate": 0.0002029002040690594, + "loss": 2.5994, + "step": 23793 + }, + { + "epoch": 0.7055718648993269, + "grad_norm": 0.09994630515575409, + "learning_rate": 0.00020286236146740172, + "loss": 2.6062, + "step": 23794 + }, + { + "epoch": 0.7056015182516383, + "grad_norm": 0.0976002961397171, + "learning_rate": 0.00020282452149694886, + "loss": 2.5966, + "step": 23795 + }, + { + "epoch": 0.7056311716039498, + "grad_norm": 0.10524874180555344, + "learning_rate": 0.00020278668415803585, + "loss": 2.5749, + "step": 23796 + }, + { + "epoch": 0.7056608249562613, + "grad_norm": 0.09400252252817154, + "learning_rate": 0.00020274884945099781, + "loss": 2.5965, + "step": 23797 + }, + { + "epoch": 0.7056904783085728, + "grad_norm": 0.0978243425488472, + "learning_rate": 0.00020271101737616987, + "loss": 2.6128, + "step": 23798 + }, + { + "epoch": 0.7057201316608842, + "grad_norm": 0.08548180013895035, + "learning_rate": 0.00020267318793388672, + "loss": 2.5843, + "step": 23799 + }, + { + "epoch": 0.7057497850131957, + "grad_norm": 0.09489672631025314, + "learning_rate": 0.00020263536112448356, + "loss": 2.6135, + "step": 23800 + }, + { + "epoch": 0.7057794383655073, + "grad_norm": 0.094384104013443, + "learning_rate": 0.00020259753694829507, + "loss": 2.6574, + "step": 23801 + }, + { + "epoch": 0.7058090917178187, + "grad_norm": 0.09984230250120163, + "learning_rate": 0.0002025597154056567, + "loss": 2.592, + "step": 23802 + }, + { + "epoch": 0.7058387450701302, + "grad_norm": 0.101899154484272, + "learning_rate": 0.00020252189649690305, + "loss": 2.5714, + "step": 23803 + }, + { + "epoch": 0.7058683984224416, + "grad_norm": 0.08948197215795517, + "learning_rate": 0.0002024840802223691, + "loss": 2.6179, + "step": 23804 + }, + { + "epoch": 0.7058980517747532, + "grad_norm": 0.10108716040849686, + "learning_rate": 0.0002024462665823897, + "loss": 2.5902, + "step": 23805 + }, + { + "epoch": 0.7059277051270646, + "grad_norm": 0.08431930094957352, + "learning_rate": 0.00020240845557729963, + "loss": 2.5838, + "step": 23806 + }, + { + "epoch": 0.7059573584793761, + "grad_norm": 0.0988040640950203, + "learning_rate": 0.0002023706472074338, + "loss": 2.6292, + "step": 23807 + }, + { + "epoch": 0.7059870118316875, + "grad_norm": 0.0935870110988617, + "learning_rate": 0.00020233284147312713, + "loss": 2.5751, + "step": 23808 + }, + { + "epoch": 0.7060166651839991, + "grad_norm": 0.09880287945270538, + "learning_rate": 0.0002022950383747141, + "loss": 2.5853, + "step": 23809 + }, + { + "epoch": 0.7060463185363105, + "grad_norm": 0.09891272336244583, + "learning_rate": 0.0002022572379125296, + "loss": 2.5999, + "step": 23810 + }, + { + "epoch": 0.706075971888622, + "grad_norm": 0.10184157639741898, + "learning_rate": 0.00020221944008690836, + "loss": 2.6357, + "step": 23811 + }, + { + "epoch": 0.7061056252409335, + "grad_norm": 0.10139252245426178, + "learning_rate": 0.00020218164489818507, + "loss": 2.5837, + "step": 23812 + }, + { + "epoch": 0.706135278593245, + "grad_norm": 0.10911638289690018, + "learning_rate": 0.00020214385234669442, + "loss": 2.6009, + "step": 23813 + }, + { + "epoch": 0.7061649319455564, + "grad_norm": 0.1015973687171936, + "learning_rate": 0.00020210606243277097, + "loss": 2.5941, + "step": 23814 + }, + { + "epoch": 0.7061945852978679, + "grad_norm": 0.10106445848941803, + "learning_rate": 0.0002020682751567496, + "loss": 2.634, + "step": 23815 + }, + { + "epoch": 0.7062242386501794, + "grad_norm": 0.10094723850488663, + "learning_rate": 0.00020203049051896482, + "loss": 2.6253, + "step": 23816 + }, + { + "epoch": 0.7062538920024909, + "grad_norm": 0.10980435460805893, + "learning_rate": 0.00020199270851975116, + "loss": 2.5989, + "step": 23817 + }, + { + "epoch": 0.7062835453548023, + "grad_norm": 0.0984710231423378, + "learning_rate": 0.00020195492915944325, + "loss": 2.6101, + "step": 23818 + }, + { + "epoch": 0.7063131987071138, + "grad_norm": 0.1099301278591156, + "learning_rate": 0.00020191715243837578, + "loss": 2.6532, + "step": 23819 + }, + { + "epoch": 0.7063428520594253, + "grad_norm": 0.09683995693922043, + "learning_rate": 0.00020187937835688285, + "loss": 2.6399, + "step": 23820 + }, + { + "epoch": 0.7063725054117368, + "grad_norm": 0.10481009632349014, + "learning_rate": 0.0002018416069152993, + "loss": 2.5933, + "step": 23821 + }, + { + "epoch": 0.7064021587640483, + "grad_norm": 0.10062813013792038, + "learning_rate": 0.0002018038381139594, + "loss": 2.6164, + "step": 23822 + }, + { + "epoch": 0.7064318121163597, + "grad_norm": 0.09841147065162659, + "learning_rate": 0.00020176607195319775, + "loss": 2.6111, + "step": 23823 + }, + { + "epoch": 0.7064614654686713, + "grad_norm": 0.10458271205425262, + "learning_rate": 0.00020172830843334872, + "loss": 2.6034, + "step": 23824 + }, + { + "epoch": 0.7064911188209827, + "grad_norm": 0.10275686532258987, + "learning_rate": 0.0002016905475547467, + "loss": 2.609, + "step": 23825 + }, + { + "epoch": 0.7065207721732942, + "grad_norm": 0.10797888785600662, + "learning_rate": 0.00020165278931772612, + "loss": 2.6307, + "step": 23826 + }, + { + "epoch": 0.7065504255256057, + "grad_norm": 0.10645291209220886, + "learning_rate": 0.0002016150337226211, + "loss": 2.613, + "step": 23827 + }, + { + "epoch": 0.7065800788779172, + "grad_norm": 0.09606671333312988, + "learning_rate": 0.00020157728076976633, + "loss": 2.6088, + "step": 23828 + }, + { + "epoch": 0.7066097322302286, + "grad_norm": 0.10768425464630127, + "learning_rate": 0.0002015395304594962, + "loss": 2.6018, + "step": 23829 + }, + { + "epoch": 0.7066393855825401, + "grad_norm": 0.1015750914812088, + "learning_rate": 0.00020150178279214453, + "loss": 2.6239, + "step": 23830 + }, + { + "epoch": 0.7066690389348516, + "grad_norm": 0.09484751522541046, + "learning_rate": 0.00020146403776804585, + "loss": 2.6154, + "step": 23831 + }, + { + "epoch": 0.7066986922871631, + "grad_norm": 0.11119748651981354, + "learning_rate": 0.00020142629538753433, + "loss": 2.6294, + "step": 23832 + }, + { + "epoch": 0.7067283456394745, + "grad_norm": 0.10306733846664429, + "learning_rate": 0.00020138855565094416, + "loss": 2.5846, + "step": 23833 + }, + { + "epoch": 0.706757998991786, + "grad_norm": 0.10530838370323181, + "learning_rate": 0.00020135081855860966, + "loss": 2.6029, + "step": 23834 + }, + { + "epoch": 0.7067876523440975, + "grad_norm": 0.10457002371549606, + "learning_rate": 0.0002013130841108649, + "loss": 2.5941, + "step": 23835 + }, + { + "epoch": 0.706817305696409, + "grad_norm": 0.0987473726272583, + "learning_rate": 0.00020127535230804407, + "loss": 2.6272, + "step": 23836 + }, + { + "epoch": 0.7068469590487204, + "grad_norm": 0.10202687978744507, + "learning_rate": 0.00020123762315048122, + "loss": 2.6176, + "step": 23837 + }, + { + "epoch": 0.706876612401032, + "grad_norm": 0.10803552716970444, + "learning_rate": 0.00020119989663851056, + "loss": 2.581, + "step": 23838 + }, + { + "epoch": 0.7069062657533434, + "grad_norm": 0.0886782705783844, + "learning_rate": 0.00020116217277246608, + "loss": 2.5854, + "step": 23839 + }, + { + "epoch": 0.7069359191056549, + "grad_norm": 0.10011765360832214, + "learning_rate": 0.00020112445155268184, + "loss": 2.6106, + "step": 23840 + }, + { + "epoch": 0.7069655724579663, + "grad_norm": 0.09503251314163208, + "learning_rate": 0.0002010867329794919, + "loss": 2.5547, + "step": 23841 + }, + { + "epoch": 0.7069952258102778, + "grad_norm": 0.09979062527418137, + "learning_rate": 0.00020104901705323027, + "loss": 2.5928, + "step": 23842 + }, + { + "epoch": 0.7070248791625894, + "grad_norm": 0.08852806687355042, + "learning_rate": 0.00020101130377423088, + "loss": 2.6457, + "step": 23843 + }, + { + "epoch": 0.7070545325149008, + "grad_norm": 0.09865529835224152, + "learning_rate": 0.0002009735931428277, + "loss": 2.5951, + "step": 23844 + }, + { + "epoch": 0.7070841858672123, + "grad_norm": 0.09351318329572678, + "learning_rate": 0.00020093588515935468, + "loss": 2.6223, + "step": 23845 + }, + { + "epoch": 0.7071138392195238, + "grad_norm": 0.11605499684810638, + "learning_rate": 0.00020089817982414575, + "loss": 2.6062, + "step": 23846 + }, + { + "epoch": 0.7071434925718353, + "grad_norm": 0.11090023070573807, + "learning_rate": 0.0002008604771375348, + "loss": 2.6109, + "step": 23847 + }, + { + "epoch": 0.7071731459241467, + "grad_norm": 0.10428372770547867, + "learning_rate": 0.00020082277709985562, + "loss": 2.6311, + "step": 23848 + }, + { + "epoch": 0.7072027992764582, + "grad_norm": 0.12376919388771057, + "learning_rate": 0.00020078507971144215, + "loss": 2.6232, + "step": 23849 + }, + { + "epoch": 0.7072324526287697, + "grad_norm": 0.12276676297187805, + "learning_rate": 0.0002007473849726281, + "loss": 2.626, + "step": 23850 + }, + { + "epoch": 0.7072621059810812, + "grad_norm": 0.11155975610017776, + "learning_rate": 0.0002007096928837474, + "loss": 2.6182, + "step": 23851 + }, + { + "epoch": 0.7072917593333926, + "grad_norm": 0.13610519468784332, + "learning_rate": 0.0002006720034451337, + "loss": 2.5789, + "step": 23852 + }, + { + "epoch": 0.7073214126857041, + "grad_norm": 0.12280335277318954, + "learning_rate": 0.0002006343166571208, + "loss": 2.628, + "step": 23853 + }, + { + "epoch": 0.7073510660380156, + "grad_norm": 0.10461980849504471, + "learning_rate": 0.0002005966325200424, + "loss": 2.6428, + "step": 23854 + }, + { + "epoch": 0.7073807193903271, + "grad_norm": 0.1377621293067932, + "learning_rate": 0.00020055895103423223, + "loss": 2.6308, + "step": 23855 + }, + { + "epoch": 0.7074103727426385, + "grad_norm": 0.10670366883277893, + "learning_rate": 0.00020052127220002391, + "loss": 2.6231, + "step": 23856 + }, + { + "epoch": 0.70744002609495, + "grad_norm": 0.11352561414241791, + "learning_rate": 0.00020048359601775113, + "loss": 2.6287, + "step": 23857 + }, + { + "epoch": 0.7074696794472615, + "grad_norm": 0.14589226245880127, + "learning_rate": 0.00020044592248774756, + "loss": 2.6094, + "step": 23858 + }, + { + "epoch": 0.707499332799573, + "grad_norm": 0.12294133007526398, + "learning_rate": 0.00020040825161034672, + "loss": 2.5925, + "step": 23859 + }, + { + "epoch": 0.7075289861518844, + "grad_norm": 0.10659116506576538, + "learning_rate": 0.00020037058338588225, + "loss": 2.6329, + "step": 23860 + }, + { + "epoch": 0.707558639504196, + "grad_norm": 0.13148151338100433, + "learning_rate": 0.0002003329178146877, + "loss": 2.5726, + "step": 23861 + }, + { + "epoch": 0.7075882928565075, + "grad_norm": 0.0993756428360939, + "learning_rate": 0.0002002952548970966, + "loss": 2.6544, + "step": 23862 + }, + { + "epoch": 0.7076179462088189, + "grad_norm": 0.11984120309352875, + "learning_rate": 0.00020025759463344247, + "loss": 2.6084, + "step": 23863 + }, + { + "epoch": 0.7076475995611304, + "grad_norm": 0.11457205563783646, + "learning_rate": 0.000200219937024059, + "loss": 2.6111, + "step": 23864 + }, + { + "epoch": 0.7076772529134419, + "grad_norm": 0.1169142872095108, + "learning_rate": 0.00020018228206927901, + "loss": 2.6239, + "step": 23865 + }, + { + "epoch": 0.7077069062657534, + "grad_norm": 0.1244986355304718, + "learning_rate": 0.0002001446297694366, + "loss": 2.6309, + "step": 23866 + }, + { + "epoch": 0.7077365596180648, + "grad_norm": 0.10063406825065613, + "learning_rate": 0.00020010698012486495, + "loss": 2.6104, + "step": 23867 + }, + { + "epoch": 0.7077662129703763, + "grad_norm": 0.1180340051651001, + "learning_rate": 0.0002000693331358975, + "loss": 2.5898, + "step": 23868 + }, + { + "epoch": 0.7077958663226878, + "grad_norm": 0.10270615667104721, + "learning_rate": 0.00020003168880286754, + "loss": 2.5972, + "step": 23869 + }, + { + "epoch": 0.7078255196749993, + "grad_norm": 0.11238320916891098, + "learning_rate": 0.00019999404712610846, + "loss": 2.5499, + "step": 23870 + }, + { + "epoch": 0.7078551730273107, + "grad_norm": 0.09557770937681198, + "learning_rate": 0.00019995640810595367, + "loss": 2.5948, + "step": 23871 + }, + { + "epoch": 0.7078848263796222, + "grad_norm": 0.09810201823711395, + "learning_rate": 0.0001999187717427363, + "loss": 2.5885, + "step": 23872 + }, + { + "epoch": 0.7079144797319337, + "grad_norm": 0.09694914519786835, + "learning_rate": 0.00019988113803678977, + "loss": 2.5843, + "step": 23873 + }, + { + "epoch": 0.7079441330842452, + "grad_norm": 0.09773322194814682, + "learning_rate": 0.00019984350698844738, + "loss": 2.6018, + "step": 23874 + }, + { + "epoch": 0.7079737864365566, + "grad_norm": 0.10705442726612091, + "learning_rate": 0.00019980587859804217, + "loss": 2.6138, + "step": 23875 + }, + { + "epoch": 0.7080034397888681, + "grad_norm": 0.09626791626214981, + "learning_rate": 0.00019976825286590737, + "loss": 2.6115, + "step": 23876 + }, + { + "epoch": 0.7080330931411796, + "grad_norm": 0.10279368609189987, + "learning_rate": 0.00019973062979237628, + "loss": 2.6112, + "step": 23877 + }, + { + "epoch": 0.7080627464934911, + "grad_norm": 0.0968397706747055, + "learning_rate": 0.00019969300937778178, + "loss": 2.6229, + "step": 23878 + }, + { + "epoch": 0.7080923998458025, + "grad_norm": 0.11399859935045242, + "learning_rate": 0.00019965539162245744, + "loss": 2.6301, + "step": 23879 + }, + { + "epoch": 0.708122053198114, + "grad_norm": 0.1030309721827507, + "learning_rate": 0.00019961777652673614, + "loss": 2.6113, + "step": 23880 + }, + { + "epoch": 0.7081517065504255, + "grad_norm": 0.0984591543674469, + "learning_rate": 0.000199580164090951, + "loss": 2.6042, + "step": 23881 + }, + { + "epoch": 0.708181359902737, + "grad_norm": 0.10474595427513123, + "learning_rate": 0.00019954255431543506, + "loss": 2.6344, + "step": 23882 + }, + { + "epoch": 0.7082110132550485, + "grad_norm": 0.09003936499357224, + "learning_rate": 0.00019950494720052137, + "loss": 2.601, + "step": 23883 + }, + { + "epoch": 0.70824066660736, + "grad_norm": 0.09584818035364151, + "learning_rate": 0.0001994673427465431, + "loss": 2.5633, + "step": 23884 + }, + { + "epoch": 0.7082703199596715, + "grad_norm": 0.10773824155330658, + "learning_rate": 0.00019942974095383298, + "loss": 2.5968, + "step": 23885 + }, + { + "epoch": 0.7082999733119829, + "grad_norm": 0.1034260094165802, + "learning_rate": 0.0001993921418227241, + "loss": 2.5652, + "step": 23886 + }, + { + "epoch": 0.7083296266642944, + "grad_norm": 0.11307106912136078, + "learning_rate": 0.00019935454535354935, + "loss": 2.6106, + "step": 23887 + }, + { + "epoch": 0.7083592800166059, + "grad_norm": 0.10204768925905228, + "learning_rate": 0.0001993169515466417, + "loss": 2.632, + "step": 23888 + }, + { + "epoch": 0.7083889333689174, + "grad_norm": 0.112339548766613, + "learning_rate": 0.00019927936040233413, + "loss": 2.6174, + "step": 23889 + }, + { + "epoch": 0.7084185867212288, + "grad_norm": 0.09477241337299347, + "learning_rate": 0.00019924177192095938, + "loss": 2.5924, + "step": 23890 + }, + { + "epoch": 0.7084482400735403, + "grad_norm": 0.11031854897737503, + "learning_rate": 0.00019920418610285017, + "loss": 2.6277, + "step": 23891 + }, + { + "epoch": 0.7084778934258518, + "grad_norm": 0.10288196802139282, + "learning_rate": 0.00019916660294833977, + "loss": 2.6003, + "step": 23892 + }, + { + "epoch": 0.7085075467781633, + "grad_norm": 0.09734728932380676, + "learning_rate": 0.0001991290224577607, + "loss": 2.6291, + "step": 23893 + }, + { + "epoch": 0.7085372001304747, + "grad_norm": 0.10026400536298752, + "learning_rate": 0.0001990914446314458, + "loss": 2.6192, + "step": 23894 + }, + { + "epoch": 0.7085668534827863, + "grad_norm": 0.0952315405011177, + "learning_rate": 0.0001990538694697279, + "loss": 2.5943, + "step": 23895 + }, + { + "epoch": 0.7085965068350977, + "grad_norm": 0.10205842554569244, + "learning_rate": 0.00019901629697293961, + "loss": 2.6472, + "step": 23896 + }, + { + "epoch": 0.7086261601874092, + "grad_norm": 0.09741794317960739, + "learning_rate": 0.00019897872714141356, + "loss": 2.6102, + "step": 23897 + }, + { + "epoch": 0.7086558135397206, + "grad_norm": 0.09595456719398499, + "learning_rate": 0.00019894115997548257, + "loss": 2.632, + "step": 23898 + }, + { + "epoch": 0.7086854668920322, + "grad_norm": 0.08933040499687195, + "learning_rate": 0.0001989035954754793, + "loss": 2.6339, + "step": 23899 + }, + { + "epoch": 0.7087151202443436, + "grad_norm": 0.10147900134325027, + "learning_rate": 0.00019886603364173639, + "loss": 2.5899, + "step": 23900 + }, + { + "epoch": 0.7087447735966551, + "grad_norm": 0.09546275436878204, + "learning_rate": 0.00019882847447458645, + "loss": 2.6208, + "step": 23901 + }, + { + "epoch": 0.7087744269489665, + "grad_norm": 0.09291894733905792, + "learning_rate": 0.00019879091797436204, + "loss": 2.599, + "step": 23902 + }, + { + "epoch": 0.7088040803012781, + "grad_norm": 0.09295212477445602, + "learning_rate": 0.00019875336414139572, + "loss": 2.6123, + "step": 23903 + }, + { + "epoch": 0.7088337336535896, + "grad_norm": 0.094593346118927, + "learning_rate": 0.00019871581297601992, + "loss": 2.593, + "step": 23904 + }, + { + "epoch": 0.708863387005901, + "grad_norm": 0.0974205732345581, + "learning_rate": 0.0001986782644785677, + "loss": 2.6193, + "step": 23905 + }, + { + "epoch": 0.7088930403582125, + "grad_norm": 0.0987602099776268, + "learning_rate": 0.00019864071864937095, + "loss": 2.6153, + "step": 23906 + }, + { + "epoch": 0.708922693710524, + "grad_norm": 0.09609686583280563, + "learning_rate": 0.00019860317548876238, + "loss": 2.6492, + "step": 23907 + }, + { + "epoch": 0.7089523470628355, + "grad_norm": 0.10151571035385132, + "learning_rate": 0.00019856563499707442, + "loss": 2.6225, + "step": 23908 + }, + { + "epoch": 0.7089820004151469, + "grad_norm": 0.0944146141409874, + "learning_rate": 0.00019852809717463954, + "loss": 2.6123, + "step": 23909 + }, + { + "epoch": 0.7090116537674584, + "grad_norm": 0.10176824778318405, + "learning_rate": 0.00019849056202179006, + "loss": 2.6153, + "step": 23910 + }, + { + "epoch": 0.7090413071197699, + "grad_norm": 0.09399191290140152, + "learning_rate": 0.0001984530295388584, + "loss": 2.5928, + "step": 23911 + }, + { + "epoch": 0.7090709604720814, + "grad_norm": 0.10549923032522202, + "learning_rate": 0.00019841549972617696, + "loss": 2.6255, + "step": 23912 + }, + { + "epoch": 0.7091006138243928, + "grad_norm": 0.11025793850421906, + "learning_rate": 0.000198377972584078, + "loss": 2.6099, + "step": 23913 + }, + { + "epoch": 0.7091302671767044, + "grad_norm": 0.11274191737174988, + "learning_rate": 0.00019834044811289393, + "loss": 2.6328, + "step": 23914 + }, + { + "epoch": 0.7091599205290158, + "grad_norm": 0.10029935091733932, + "learning_rate": 0.00019830292631295687, + "loss": 2.5992, + "step": 23915 + }, + { + "epoch": 0.7091895738813273, + "grad_norm": 0.1058255285024643, + "learning_rate": 0.00019826540718459928, + "loss": 2.6384, + "step": 23916 + }, + { + "epoch": 0.7092192272336387, + "grad_norm": 0.1064145490527153, + "learning_rate": 0.00019822789072815322, + "loss": 2.594, + "step": 23917 + }, + { + "epoch": 0.7092488805859503, + "grad_norm": 0.09845870733261108, + "learning_rate": 0.00019819037694395104, + "loss": 2.6126, + "step": 23918 + }, + { + "epoch": 0.7092785339382617, + "grad_norm": 0.11811579763889313, + "learning_rate": 0.0001981528658323249, + "loss": 2.6314, + "step": 23919 + }, + { + "epoch": 0.7093081872905732, + "grad_norm": 0.11547791212797165, + "learning_rate": 0.00019811535739360685, + "loss": 2.5994, + "step": 23920 + }, + { + "epoch": 0.7093378406428846, + "grad_norm": 0.10502277314662933, + "learning_rate": 0.0001980778516281292, + "loss": 2.575, + "step": 23921 + }, + { + "epoch": 0.7093674939951962, + "grad_norm": 0.10246744006872177, + "learning_rate": 0.000198040348536224, + "loss": 2.6199, + "step": 23922 + }, + { + "epoch": 0.7093971473475076, + "grad_norm": 0.10542251169681549, + "learning_rate": 0.00019800284811822332, + "loss": 2.583, + "step": 23923 + }, + { + "epoch": 0.7094268006998191, + "grad_norm": 0.1032119020819664, + "learning_rate": 0.0001979653503744593, + "loss": 2.5802, + "step": 23924 + }, + { + "epoch": 0.7094564540521306, + "grad_norm": 0.1130688339471817, + "learning_rate": 0.00019792785530526385, + "loss": 2.5898, + "step": 23925 + }, + { + "epoch": 0.7094861074044421, + "grad_norm": 0.09669932723045349, + "learning_rate": 0.00019789036291096917, + "loss": 2.5702, + "step": 23926 + }, + { + "epoch": 0.7095157607567536, + "grad_norm": 0.10229357331991196, + "learning_rate": 0.0001978528731919072, + "loss": 2.6042, + "step": 23927 + }, + { + "epoch": 0.709545414109065, + "grad_norm": 0.11236602813005447, + "learning_rate": 0.00019781538614840983, + "loss": 2.646, + "step": 23928 + }, + { + "epoch": 0.7095750674613766, + "grad_norm": 0.10660547763109207, + "learning_rate": 0.00019777790178080934, + "loss": 2.6303, + "step": 23929 + }, + { + "epoch": 0.709604720813688, + "grad_norm": 0.09718943387269974, + "learning_rate": 0.000197740420089437, + "loss": 2.6185, + "step": 23930 + }, + { + "epoch": 0.7096343741659995, + "grad_norm": 0.10415059328079224, + "learning_rate": 0.00019770294107462528, + "loss": 2.606, + "step": 23931 + }, + { + "epoch": 0.7096640275183109, + "grad_norm": 0.09355952590703964, + "learning_rate": 0.00019766546473670593, + "loss": 2.6113, + "step": 23932 + }, + { + "epoch": 0.7096936808706225, + "grad_norm": 0.09696779400110245, + "learning_rate": 0.0001976279910760108, + "loss": 2.6371, + "step": 23933 + }, + { + "epoch": 0.7097233342229339, + "grad_norm": 0.0995769277215004, + "learning_rate": 0.0001975905200928717, + "loss": 2.6058, + "step": 23934 + }, + { + "epoch": 0.7097529875752454, + "grad_norm": 0.09873685240745544, + "learning_rate": 0.00019755305178762044, + "loss": 2.6165, + "step": 23935 + }, + { + "epoch": 0.7097826409275568, + "grad_norm": 0.09565957635641098, + "learning_rate": 0.00019751558616058884, + "loss": 2.5816, + "step": 23936 + }, + { + "epoch": 0.7098122942798684, + "grad_norm": 0.10311844944953918, + "learning_rate": 0.00019747812321210868, + "loss": 2.6245, + "step": 23937 + }, + { + "epoch": 0.7098419476321798, + "grad_norm": 0.09847098588943481, + "learning_rate": 0.00019744066294251163, + "loss": 2.579, + "step": 23938 + }, + { + "epoch": 0.7098716009844913, + "grad_norm": 0.09656032174825668, + "learning_rate": 0.00019740320535212942, + "loss": 2.6075, + "step": 23939 + }, + { + "epoch": 0.7099012543368027, + "grad_norm": 0.10064023733139038, + "learning_rate": 0.00019736575044129395, + "loss": 2.5825, + "step": 23940 + }, + { + "epoch": 0.7099309076891143, + "grad_norm": 0.09929665178060532, + "learning_rate": 0.00019732829821033653, + "loss": 2.6208, + "step": 23941 + }, + { + "epoch": 0.7099605610414257, + "grad_norm": 0.10991054773330688, + "learning_rate": 0.000197290848659589, + "loss": 2.642, + "step": 23942 + }, + { + "epoch": 0.7099902143937372, + "grad_norm": 0.10249267518520355, + "learning_rate": 0.0001972534017893828, + "loss": 2.59, + "step": 23943 + }, + { + "epoch": 0.7100198677460486, + "grad_norm": 0.10551050305366516, + "learning_rate": 0.00019721595760004984, + "loss": 2.5766, + "step": 23944 + }, + { + "epoch": 0.7100495210983602, + "grad_norm": 0.11646591871976852, + "learning_rate": 0.00019717851609192156, + "loss": 2.5867, + "step": 23945 + }, + { + "epoch": 0.7100791744506717, + "grad_norm": 0.10207418352365494, + "learning_rate": 0.00019714107726532949, + "loss": 2.6071, + "step": 23946 + }, + { + "epoch": 0.7101088278029831, + "grad_norm": 0.11692464351654053, + "learning_rate": 0.00019710364112060519, + "loss": 2.6407, + "step": 23947 + }, + { + "epoch": 0.7101384811552947, + "grad_norm": 0.09299471974372864, + "learning_rate": 0.00019706620765808008, + "loss": 2.5834, + "step": 23948 + }, + { + "epoch": 0.7101681345076061, + "grad_norm": 0.10709185153245926, + "learning_rate": 0.00019702877687808573, + "loss": 2.602, + "step": 23949 + }, + { + "epoch": 0.7101977878599176, + "grad_norm": 0.09888403862714767, + "learning_rate": 0.00019699134878095376, + "loss": 2.57, + "step": 23950 + }, + { + "epoch": 0.710227441212229, + "grad_norm": 0.11957370489835739, + "learning_rate": 0.00019695392336701524, + "loss": 2.5906, + "step": 23951 + }, + { + "epoch": 0.7102570945645406, + "grad_norm": 0.11490363627672195, + "learning_rate": 0.00019691650063660172, + "loss": 2.6261, + "step": 23952 + }, + { + "epoch": 0.710286747916852, + "grad_norm": 0.10345509648323059, + "learning_rate": 0.0001968790805900446, + "loss": 2.6112, + "step": 23953 + }, + { + "epoch": 0.7103164012691635, + "grad_norm": 0.11216368526220322, + "learning_rate": 0.00019684166322767532, + "loss": 2.5945, + "step": 23954 + }, + { + "epoch": 0.7103460546214749, + "grad_norm": 0.10093601047992706, + "learning_rate": 0.00019680424854982508, + "loss": 2.5922, + "step": 23955 + }, + { + "epoch": 0.7103757079737865, + "grad_norm": 0.11010784655809402, + "learning_rate": 0.00019676683655682508, + "loss": 2.604, + "step": 23956 + }, + { + "epoch": 0.7104053613260979, + "grad_norm": 0.09442032873630524, + "learning_rate": 0.00019672942724900699, + "loss": 2.6226, + "step": 23957 + }, + { + "epoch": 0.7104350146784094, + "grad_norm": 0.10535822063684464, + "learning_rate": 0.0001966920206267019, + "loss": 2.5941, + "step": 23958 + }, + { + "epoch": 0.7104646680307208, + "grad_norm": 0.0963204950094223, + "learning_rate": 0.00019665461669024105, + "loss": 2.6492, + "step": 23959 + }, + { + "epoch": 0.7104943213830324, + "grad_norm": 0.11656174063682556, + "learning_rate": 0.0001966172154399556, + "loss": 2.5962, + "step": 23960 + }, + { + "epoch": 0.7105239747353438, + "grad_norm": 0.12388242781162262, + "learning_rate": 0.00019657981687617692, + "loss": 2.6049, + "step": 23961 + }, + { + "epoch": 0.7105536280876553, + "grad_norm": 0.10254183411598206, + "learning_rate": 0.00019654242099923592, + "loss": 2.5871, + "step": 23962 + }, + { + "epoch": 0.7105832814399667, + "grad_norm": 0.13010744750499725, + "learning_rate": 0.00019650502780946383, + "loss": 2.6108, + "step": 23963 + }, + { + "epoch": 0.7106129347922783, + "grad_norm": 0.11475929617881775, + "learning_rate": 0.00019646763730719186, + "loss": 2.6159, + "step": 23964 + }, + { + "epoch": 0.7106425881445897, + "grad_norm": 0.11782380193471909, + "learning_rate": 0.00019643024949275102, + "loss": 2.5885, + "step": 23965 + }, + { + "epoch": 0.7106722414969012, + "grad_norm": 0.11206979304552078, + "learning_rate": 0.00019639286436647248, + "loss": 2.635, + "step": 23966 + }, + { + "epoch": 0.7107018948492128, + "grad_norm": 0.11675277352333069, + "learning_rate": 0.0001963554819286872, + "loss": 2.5946, + "step": 23967 + }, + { + "epoch": 0.7107315482015242, + "grad_norm": 0.09893065690994263, + "learning_rate": 0.00019631810217972623, + "loss": 2.6204, + "step": 23968 + }, + { + "epoch": 0.7107612015538357, + "grad_norm": 0.11942283064126968, + "learning_rate": 0.00019628072511992045, + "loss": 2.572, + "step": 23969 + }, + { + "epoch": 0.7107908549061471, + "grad_norm": 0.09866786748170853, + "learning_rate": 0.00019624335074960116, + "loss": 2.6258, + "step": 23970 + }, + { + "epoch": 0.7108205082584587, + "grad_norm": 0.12497151643037796, + "learning_rate": 0.00019620597906909925, + "loss": 2.6013, + "step": 23971 + }, + { + "epoch": 0.7108501616107701, + "grad_norm": 0.11592377722263336, + "learning_rate": 0.00019616861007874543, + "loss": 2.6174, + "step": 23972 + }, + { + "epoch": 0.7108798149630816, + "grad_norm": 0.1123804897069931, + "learning_rate": 0.0001961312437788707, + "loss": 2.5579, + "step": 23973 + }, + { + "epoch": 0.710909468315393, + "grad_norm": 0.131300151348114, + "learning_rate": 0.000196093880169806, + "loss": 2.6162, + "step": 23974 + }, + { + "epoch": 0.7109391216677046, + "grad_norm": 0.10061049461364746, + "learning_rate": 0.00019605651925188211, + "loss": 2.569, + "step": 23975 + }, + { + "epoch": 0.710968775020016, + "grad_norm": 0.10727290064096451, + "learning_rate": 0.00019601916102542994, + "loss": 2.594, + "step": 23976 + }, + { + "epoch": 0.7109984283723275, + "grad_norm": 0.10946100205183029, + "learning_rate": 0.00019598180549078027, + "loss": 2.6105, + "step": 23977 + }, + { + "epoch": 0.7110280817246389, + "grad_norm": 0.10466432571411133, + "learning_rate": 0.00019594445264826388, + "loss": 2.5903, + "step": 23978 + }, + { + "epoch": 0.7110577350769505, + "grad_norm": 0.11185599863529205, + "learning_rate": 0.0001959071024982116, + "loss": 2.6187, + "step": 23979 + }, + { + "epoch": 0.7110873884292619, + "grad_norm": 0.09565027058124542, + "learning_rate": 0.00019586975504095407, + "loss": 2.6085, + "step": 23980 + }, + { + "epoch": 0.7111170417815734, + "grad_norm": 0.0978788435459137, + "learning_rate": 0.00019583241027682207, + "loss": 2.571, + "step": 23981 + }, + { + "epoch": 0.7111466951338848, + "grad_norm": 0.10115645080804825, + "learning_rate": 0.0001957950682061463, + "loss": 2.6084, + "step": 23982 + }, + { + "epoch": 0.7111763484861964, + "grad_norm": 0.10452264547348022, + "learning_rate": 0.0001957577288292574, + "loss": 2.6227, + "step": 23983 + }, + { + "epoch": 0.7112060018385078, + "grad_norm": 0.10160302370786667, + "learning_rate": 0.00019572039214648608, + "loss": 2.6139, + "step": 23984 + }, + { + "epoch": 0.7112356551908193, + "grad_norm": 0.10026578605175018, + "learning_rate": 0.00019568305815816285, + "loss": 2.5919, + "step": 23985 + }, + { + "epoch": 0.7112653085431307, + "grad_norm": 0.09741617739200592, + "learning_rate": 0.0001956457268646184, + "loss": 2.5871, + "step": 23986 + }, + { + "epoch": 0.7112949618954423, + "grad_norm": 0.10538328438997269, + "learning_rate": 0.00019560839826618333, + "loss": 2.6315, + "step": 23987 + }, + { + "epoch": 0.7113246152477538, + "grad_norm": 0.11151296645402908, + "learning_rate": 0.00019557107236318805, + "loss": 2.6426, + "step": 23988 + }, + { + "epoch": 0.7113542686000652, + "grad_norm": 0.08850830048322678, + "learning_rate": 0.00019553374915596327, + "loss": 2.5977, + "step": 23989 + }, + { + "epoch": 0.7113839219523768, + "grad_norm": 0.11124924570322037, + "learning_rate": 0.00019549642864483935, + "loss": 2.6273, + "step": 23990 + }, + { + "epoch": 0.7114135753046882, + "grad_norm": 0.0904650166630745, + "learning_rate": 0.00019545911083014683, + "loss": 2.6292, + "step": 23991 + }, + { + "epoch": 0.7114432286569997, + "grad_norm": 0.1008504331111908, + "learning_rate": 0.00019542179571221618, + "loss": 2.6043, + "step": 23992 + }, + { + "epoch": 0.7114728820093111, + "grad_norm": 0.09846428036689758, + "learning_rate": 0.00019538448329137775, + "loss": 2.6267, + "step": 23993 + }, + { + "epoch": 0.7115025353616227, + "grad_norm": 0.09705939888954163, + "learning_rate": 0.00019534717356796204, + "loss": 2.6121, + "step": 23994 + }, + { + "epoch": 0.7115321887139341, + "grad_norm": 0.10606203973293304, + "learning_rate": 0.00019530986654229943, + "loss": 2.6125, + "step": 23995 + }, + { + "epoch": 0.7115618420662456, + "grad_norm": 0.10615924745798111, + "learning_rate": 0.00019527256221472022, + "loss": 2.6299, + "step": 23996 + }, + { + "epoch": 0.711591495418557, + "grad_norm": 0.0979161486029625, + "learning_rate": 0.0001952352605855548, + "loss": 2.6294, + "step": 23997 + }, + { + "epoch": 0.7116211487708686, + "grad_norm": 0.0983419343829155, + "learning_rate": 0.0001951979616551335, + "loss": 2.6053, + "step": 23998 + }, + { + "epoch": 0.71165080212318, + "grad_norm": 0.10993025451898575, + "learning_rate": 0.00019516066542378646, + "loss": 2.5958, + "step": 23999 + }, + { + "epoch": 0.7116804554754915, + "grad_norm": 0.09989224374294281, + "learning_rate": 0.00019512337189184415, + "loss": 2.6165, + "step": 24000 + }, + { + "epoch": 0.711710108827803, + "grad_norm": 0.09260914474725723, + "learning_rate": 0.00019508608105963665, + "loss": 2.6025, + "step": 24001 + }, + { + "epoch": 0.7117397621801145, + "grad_norm": 0.10186043381690979, + "learning_rate": 0.00019504879292749427, + "loss": 2.6339, + "step": 24002 + }, + { + "epoch": 0.7117694155324259, + "grad_norm": 0.10677161067724228, + "learning_rate": 0.00019501150749574715, + "loss": 2.625, + "step": 24003 + }, + { + "epoch": 0.7117990688847374, + "grad_norm": 0.10124924033880234, + "learning_rate": 0.0001949742247647255, + "loss": 2.6101, + "step": 24004 + }, + { + "epoch": 0.7118287222370488, + "grad_norm": 0.09793851524591446, + "learning_rate": 0.00019493694473475965, + "loss": 2.5795, + "step": 24005 + }, + { + "epoch": 0.7118583755893604, + "grad_norm": 0.10042530298233032, + "learning_rate": 0.00019489966740617926, + "loss": 2.6205, + "step": 24006 + }, + { + "epoch": 0.7118880289416718, + "grad_norm": 0.09140238165855408, + "learning_rate": 0.0001948623927793148, + "loss": 2.6092, + "step": 24007 + }, + { + "epoch": 0.7119176822939833, + "grad_norm": 0.10004600882530212, + "learning_rate": 0.00019482512085449593, + "loss": 2.6327, + "step": 24008 + }, + { + "epoch": 0.7119473356462949, + "grad_norm": 0.10180290043354034, + "learning_rate": 0.00019478785163205327, + "loss": 2.5948, + "step": 24009 + }, + { + "epoch": 0.7119769889986063, + "grad_norm": 0.09636468440294266, + "learning_rate": 0.00019475058511231653, + "loss": 2.5813, + "step": 24010 + }, + { + "epoch": 0.7120066423509178, + "grad_norm": 0.10768944025039673, + "learning_rate": 0.00019471332129561574, + "loss": 2.6273, + "step": 24011 + }, + { + "epoch": 0.7120362957032292, + "grad_norm": 0.11497580260038376, + "learning_rate": 0.0001946760601822809, + "loss": 2.6305, + "step": 24012 + }, + { + "epoch": 0.7120659490555408, + "grad_norm": 0.11124217510223389, + "learning_rate": 0.00019463880177264197, + "loss": 2.5929, + "step": 24013 + }, + { + "epoch": 0.7120956024078522, + "grad_norm": 0.11170704662799835, + "learning_rate": 0.00019460154606702884, + "loss": 2.6469, + "step": 24014 + }, + { + "epoch": 0.7121252557601637, + "grad_norm": 0.10893918573856354, + "learning_rate": 0.0001945642930657715, + "loss": 2.5898, + "step": 24015 + }, + { + "epoch": 0.7121549091124751, + "grad_norm": 0.10220137983560562, + "learning_rate": 0.00019452704276919991, + "loss": 2.6091, + "step": 24016 + }, + { + "epoch": 0.7121845624647867, + "grad_norm": 0.09681473672389984, + "learning_rate": 0.0001944897951776436, + "loss": 2.6443, + "step": 24017 + }, + { + "epoch": 0.7122142158170981, + "grad_norm": 0.11516941338777542, + "learning_rate": 0.0001944525502914326, + "loss": 2.6052, + "step": 24018 + }, + { + "epoch": 0.7122438691694096, + "grad_norm": 0.11009775102138519, + "learning_rate": 0.00019441530811089674, + "loss": 2.6271, + "step": 24019 + }, + { + "epoch": 0.712273522521721, + "grad_norm": 0.10470490902662277, + "learning_rate": 0.00019437806863636576, + "loss": 2.6482, + "step": 24020 + }, + { + "epoch": 0.7123031758740326, + "grad_norm": 0.10599108785390854, + "learning_rate": 0.00019434083186816925, + "loss": 2.6152, + "step": 24021 + }, + { + "epoch": 0.712332829226344, + "grad_norm": 0.09879089891910553, + "learning_rate": 0.0001943035978066373, + "loss": 2.6, + "step": 24022 + }, + { + "epoch": 0.7123624825786555, + "grad_norm": 0.09789168834686279, + "learning_rate": 0.00019426636645209955, + "loss": 2.6079, + "step": 24023 + }, + { + "epoch": 0.712392135930967, + "grad_norm": 0.09361442923545837, + "learning_rate": 0.00019422913780488556, + "loss": 2.6053, + "step": 24024 + }, + { + "epoch": 0.7124217892832785, + "grad_norm": 0.10053817927837372, + "learning_rate": 0.00019419191186532497, + "loss": 2.6049, + "step": 24025 + }, + { + "epoch": 0.7124514426355899, + "grad_norm": 0.10011691600084305, + "learning_rate": 0.00019415468863374775, + "loss": 2.5883, + "step": 24026 + }, + { + "epoch": 0.7124810959879014, + "grad_norm": 0.09733342379331589, + "learning_rate": 0.00019411746811048302, + "loss": 2.5811, + "step": 24027 + }, + { + "epoch": 0.7125107493402129, + "grad_norm": 0.10161416977643967, + "learning_rate": 0.00019408025029586068, + "loss": 2.6356, + "step": 24028 + }, + { + "epoch": 0.7125404026925244, + "grad_norm": 0.09260979294776917, + "learning_rate": 0.00019404303519021023, + "loss": 2.6239, + "step": 24029 + }, + { + "epoch": 0.7125700560448359, + "grad_norm": 0.09414573758840561, + "learning_rate": 0.0001940058227938612, + "loss": 2.622, + "step": 24030 + }, + { + "epoch": 0.7125997093971473, + "grad_norm": 0.09663739055395126, + "learning_rate": 0.00019396861310714308, + "loss": 2.6276, + "step": 24031 + }, + { + "epoch": 0.7126293627494589, + "grad_norm": 0.09620499610900879, + "learning_rate": 0.0001939314061303855, + "loss": 2.6239, + "step": 24032 + }, + { + "epoch": 0.7126590161017703, + "grad_norm": 0.10344146937131882, + "learning_rate": 0.00019389420186391782, + "loss": 2.6206, + "step": 24033 + }, + { + "epoch": 0.7126886694540818, + "grad_norm": 0.10024535655975342, + "learning_rate": 0.00019385700030806935, + "loss": 2.6277, + "step": 24034 + }, + { + "epoch": 0.7127183228063932, + "grad_norm": 0.08411496877670288, + "learning_rate": 0.00019381980146316991, + "loss": 2.5979, + "step": 24035 + }, + { + "epoch": 0.7127479761587048, + "grad_norm": 0.09536290168762207, + "learning_rate": 0.00019378260532954862, + "loss": 2.6226, + "step": 24036 + }, + { + "epoch": 0.7127776295110162, + "grad_norm": 0.09378065913915634, + "learning_rate": 0.00019374541190753515, + "loss": 2.6448, + "step": 24037 + }, + { + "epoch": 0.7128072828633277, + "grad_norm": 0.10360409319400787, + "learning_rate": 0.0001937082211974584, + "loss": 2.5867, + "step": 24038 + }, + { + "epoch": 0.7128369362156391, + "grad_norm": 0.09622154384851456, + "learning_rate": 0.00019367103319964797, + "loss": 2.6505, + "step": 24039 + }, + { + "epoch": 0.7128665895679507, + "grad_norm": 0.09822964668273926, + "learning_rate": 0.00019363384791443305, + "loss": 2.5976, + "step": 24040 + }, + { + "epoch": 0.7128962429202621, + "grad_norm": 0.10130506008863449, + "learning_rate": 0.00019359666534214304, + "loss": 2.6199, + "step": 24041 + }, + { + "epoch": 0.7129258962725736, + "grad_norm": 0.10078414529561996, + "learning_rate": 0.00019355948548310713, + "loss": 2.6109, + "step": 24042 + }, + { + "epoch": 0.712955549624885, + "grad_norm": 0.0929919183254242, + "learning_rate": 0.00019352230833765465, + "loss": 2.5693, + "step": 24043 + }, + { + "epoch": 0.7129852029771966, + "grad_norm": 0.10710839927196503, + "learning_rate": 0.00019348513390611465, + "loss": 2.6018, + "step": 24044 + }, + { + "epoch": 0.713014856329508, + "grad_norm": 0.09241028130054474, + "learning_rate": 0.00019344796218881644, + "loss": 2.6022, + "step": 24045 + }, + { + "epoch": 0.7130445096818195, + "grad_norm": 0.09773394465446472, + "learning_rate": 0.000193410793186089, + "loss": 2.6422, + "step": 24046 + }, + { + "epoch": 0.713074163034131, + "grad_norm": 0.1141607016324997, + "learning_rate": 0.00019337362689826195, + "loss": 2.6055, + "step": 24047 + }, + { + "epoch": 0.7131038163864425, + "grad_norm": 0.1054837629199028, + "learning_rate": 0.00019333646332566384, + "loss": 2.6634, + "step": 24048 + }, + { + "epoch": 0.7131334697387539, + "grad_norm": 0.09546545147895813, + "learning_rate": 0.00019329930246862403, + "loss": 2.5861, + "step": 24049 + }, + { + "epoch": 0.7131631230910654, + "grad_norm": 0.11088298261165619, + "learning_rate": 0.00019326214432747153, + "loss": 2.5968, + "step": 24050 + }, + { + "epoch": 0.713192776443377, + "grad_norm": 0.10780511051416397, + "learning_rate": 0.00019322498890253536, + "loss": 2.5971, + "step": 24051 + }, + { + "epoch": 0.7132224297956884, + "grad_norm": 0.1008680909872055, + "learning_rate": 0.0001931878361941446, + "loss": 2.6108, + "step": 24052 + }, + { + "epoch": 0.7132520831479999, + "grad_norm": 0.10343066602945328, + "learning_rate": 0.0001931506862026282, + "loss": 2.6303, + "step": 24053 + }, + { + "epoch": 0.7132817365003113, + "grad_norm": 0.10383515805006027, + "learning_rate": 0.0001931135389283152, + "loss": 2.5886, + "step": 24054 + }, + { + "epoch": 0.7133113898526229, + "grad_norm": 0.10408630222082138, + "learning_rate": 0.0001930763943715344, + "loss": 2.631, + "step": 24055 + }, + { + "epoch": 0.7133410432049343, + "grad_norm": 0.11218459904193878, + "learning_rate": 0.0001930392525326149, + "loss": 2.63, + "step": 24056 + }, + { + "epoch": 0.7133706965572458, + "grad_norm": 0.109665647149086, + "learning_rate": 0.00019300211341188544, + "loss": 2.6184, + "step": 24057 + }, + { + "epoch": 0.7134003499095573, + "grad_norm": 0.1034989282488823, + "learning_rate": 0.00019296497700967496, + "loss": 2.6445, + "step": 24058 + }, + { + "epoch": 0.7134300032618688, + "grad_norm": 0.10705827921628952, + "learning_rate": 0.00019292784332631237, + "loss": 2.6049, + "step": 24059 + }, + { + "epoch": 0.7134596566141802, + "grad_norm": 0.11455435305833817, + "learning_rate": 0.0001928907123621264, + "loss": 2.5999, + "step": 24060 + }, + { + "epoch": 0.7134893099664917, + "grad_norm": 0.10952328145503998, + "learning_rate": 0.00019285358411744586, + "loss": 2.5952, + "step": 24061 + }, + { + "epoch": 0.7135189633188032, + "grad_norm": 0.09854761511087418, + "learning_rate": 0.0001928164585925996, + "loss": 2.5595, + "step": 24062 + }, + { + "epoch": 0.7135486166711147, + "grad_norm": 0.11410447955131531, + "learning_rate": 0.0001927793357879163, + "loss": 2.6042, + "step": 24063 + }, + { + "epoch": 0.7135782700234261, + "grad_norm": 0.1144871786236763, + "learning_rate": 0.00019274221570372474, + "loss": 2.6335, + "step": 24064 + }, + { + "epoch": 0.7136079233757376, + "grad_norm": 0.11061011999845505, + "learning_rate": 0.00019270509834035354, + "loss": 2.6228, + "step": 24065 + }, + { + "epoch": 0.7136375767280491, + "grad_norm": 0.09469583630561829, + "learning_rate": 0.00019266798369813154, + "loss": 2.604, + "step": 24066 + }, + { + "epoch": 0.7136672300803606, + "grad_norm": 0.11712120473384857, + "learning_rate": 0.0001926308717773872, + "loss": 2.6143, + "step": 24067 + }, + { + "epoch": 0.713696883432672, + "grad_norm": 0.10026828199625015, + "learning_rate": 0.0001925937625784493, + "loss": 2.6226, + "step": 24068 + }, + { + "epoch": 0.7137265367849835, + "grad_norm": 0.1009451150894165, + "learning_rate": 0.0001925566561016464, + "loss": 2.6087, + "step": 24069 + }, + { + "epoch": 0.713756190137295, + "grad_norm": 0.10098090767860413, + "learning_rate": 0.00019251955234730707, + "loss": 2.6226, + "step": 24070 + }, + { + "epoch": 0.7137858434896065, + "grad_norm": 0.11723482608795166, + "learning_rate": 0.00019248245131576002, + "loss": 2.6185, + "step": 24071 + }, + { + "epoch": 0.713815496841918, + "grad_norm": 0.11215273290872574, + "learning_rate": 0.0001924453530073334, + "loss": 2.6126, + "step": 24072 + }, + { + "epoch": 0.7138451501942294, + "grad_norm": 0.1093781515955925, + "learning_rate": 0.00019240825742235606, + "loss": 2.6032, + "step": 24073 + }, + { + "epoch": 0.713874803546541, + "grad_norm": 0.12061361968517303, + "learning_rate": 0.00019237116456115644, + "loss": 2.5618, + "step": 24074 + }, + { + "epoch": 0.7139044568988524, + "grad_norm": 0.12398622184991837, + "learning_rate": 0.00019233407442406292, + "loss": 2.6209, + "step": 24075 + }, + { + "epoch": 0.7139341102511639, + "grad_norm": 0.10458192974328995, + "learning_rate": 0.00019229698701140403, + "loss": 2.6216, + "step": 24076 + }, + { + "epoch": 0.7139637636034754, + "grad_norm": 0.12077835202217102, + "learning_rate": 0.00019225990232350808, + "loss": 2.5906, + "step": 24077 + }, + { + "epoch": 0.7139934169557869, + "grad_norm": 0.13069920241832733, + "learning_rate": 0.00019222282036070355, + "loss": 2.5881, + "step": 24078 + }, + { + "epoch": 0.7140230703080983, + "grad_norm": 0.11322485655546188, + "learning_rate": 0.00019218574112331877, + "loss": 2.6254, + "step": 24079 + }, + { + "epoch": 0.7140527236604098, + "grad_norm": 0.11340884119272232, + "learning_rate": 0.00019214866461168207, + "loss": 2.6062, + "step": 24080 + }, + { + "epoch": 0.7140823770127213, + "grad_norm": 0.12679311633110046, + "learning_rate": 0.000192111590826122, + "loss": 2.6319, + "step": 24081 + }, + { + "epoch": 0.7141120303650328, + "grad_norm": 0.10635799169540405, + "learning_rate": 0.00019207451976696645, + "loss": 2.6069, + "step": 24082 + }, + { + "epoch": 0.7141416837173442, + "grad_norm": 0.11662624031305313, + "learning_rate": 0.00019203745143454387, + "loss": 2.6284, + "step": 24083 + }, + { + "epoch": 0.7141713370696557, + "grad_norm": 0.11448194831609726, + "learning_rate": 0.00019200038582918255, + "loss": 2.6036, + "step": 24084 + }, + { + "epoch": 0.7142009904219672, + "grad_norm": 0.09788866341114044, + "learning_rate": 0.0001919633229512105, + "loss": 2.5963, + "step": 24085 + }, + { + "epoch": 0.7142306437742787, + "grad_norm": 0.11481861025094986, + "learning_rate": 0.0001919262628009562, + "loss": 2.6014, + "step": 24086 + }, + { + "epoch": 0.7142602971265901, + "grad_norm": 0.09719870984554291, + "learning_rate": 0.00019188920537874783, + "loss": 2.6086, + "step": 24087 + }, + { + "epoch": 0.7142899504789016, + "grad_norm": 0.10467584431171417, + "learning_rate": 0.00019185215068491336, + "loss": 2.5969, + "step": 24088 + }, + { + "epoch": 0.7143196038312131, + "grad_norm": 0.1066136285662651, + "learning_rate": 0.00019181509871978104, + "loss": 2.6271, + "step": 24089 + }, + { + "epoch": 0.7143492571835246, + "grad_norm": 0.0930560901761055, + "learning_rate": 0.00019177804948367888, + "loss": 2.6275, + "step": 24090 + }, + { + "epoch": 0.7143789105358361, + "grad_norm": 0.1102137491106987, + "learning_rate": 0.00019174100297693504, + "loss": 2.6342, + "step": 24091 + }, + { + "epoch": 0.7144085638881476, + "grad_norm": 0.10612849146127701, + "learning_rate": 0.00019170395919987767, + "loss": 2.6151, + "step": 24092 + }, + { + "epoch": 0.7144382172404591, + "grad_norm": 0.10756665468215942, + "learning_rate": 0.0001916669181528345, + "loss": 2.6259, + "step": 24093 + }, + { + "epoch": 0.7144678705927705, + "grad_norm": 0.23656630516052246, + "learning_rate": 0.00019162987983613368, + "loss": 2.6052, + "step": 24094 + }, + { + "epoch": 0.714497523945082, + "grad_norm": 0.09382190555334091, + "learning_rate": 0.0001915928442501032, + "loss": 2.5969, + "step": 24095 + }, + { + "epoch": 0.7145271772973935, + "grad_norm": 0.10835102945566177, + "learning_rate": 0.000191555811395071, + "loss": 2.5801, + "step": 24096 + }, + { + "epoch": 0.714556830649705, + "grad_norm": 0.09692250192165375, + "learning_rate": 0.00019151878127136507, + "loss": 2.6281, + "step": 24097 + }, + { + "epoch": 0.7145864840020164, + "grad_norm": 0.10683529078960419, + "learning_rate": 0.0001914817538793131, + "loss": 2.6184, + "step": 24098 + }, + { + "epoch": 0.7146161373543279, + "grad_norm": 0.0923239141702652, + "learning_rate": 0.00019144472921924332, + "loss": 2.6144, + "step": 24099 + }, + { + "epoch": 0.7146457907066394, + "grad_norm": 0.10257289558649063, + "learning_rate": 0.00019140770729148348, + "loss": 2.5925, + "step": 24100 + }, + { + "epoch": 0.7146754440589509, + "grad_norm": 0.08790762722492218, + "learning_rate": 0.00019137068809636134, + "loss": 2.5868, + "step": 24101 + }, + { + "epoch": 0.7147050974112623, + "grad_norm": 0.10789214074611664, + "learning_rate": 0.00019133367163420484, + "loss": 2.612, + "step": 24102 + }, + { + "epoch": 0.7147347507635738, + "grad_norm": 0.08740068972110748, + "learning_rate": 0.00019129665790534157, + "loss": 2.61, + "step": 24103 + }, + { + "epoch": 0.7147644041158853, + "grad_norm": 0.09559950977563858, + "learning_rate": 0.00019125964691009935, + "loss": 2.5958, + "step": 24104 + }, + { + "epoch": 0.7147940574681968, + "grad_norm": 0.08399196714162827, + "learning_rate": 0.0001912226386488059, + "loss": 2.5806, + "step": 24105 + }, + { + "epoch": 0.7148237108205082, + "grad_norm": 0.10673648118972778, + "learning_rate": 0.00019118563312178904, + "loss": 2.6277, + "step": 24106 + }, + { + "epoch": 0.7148533641728197, + "grad_norm": 0.08640366047620773, + "learning_rate": 0.0001911486303293764, + "loss": 2.5819, + "step": 24107 + }, + { + "epoch": 0.7148830175251312, + "grad_norm": 0.09941668808460236, + "learning_rate": 0.00019111163027189565, + "loss": 2.6201, + "step": 24108 + }, + { + "epoch": 0.7149126708774427, + "grad_norm": 0.09584492444992065, + "learning_rate": 0.00019107463294967448, + "loss": 2.6076, + "step": 24109 + }, + { + "epoch": 0.7149423242297541, + "grad_norm": 0.09120247513055801, + "learning_rate": 0.0001910376383630404, + "loss": 2.5725, + "step": 24110 + }, + { + "epoch": 0.7149719775820657, + "grad_norm": 0.1008351743221283, + "learning_rate": 0.0001910006465123209, + "loss": 2.624, + "step": 24111 + }, + { + "epoch": 0.7150016309343772, + "grad_norm": 0.09317710995674133, + "learning_rate": 0.0001909636573978439, + "loss": 2.5819, + "step": 24112 + }, + { + "epoch": 0.7150312842866886, + "grad_norm": 0.10169358551502228, + "learning_rate": 0.00019092667101993694, + "loss": 2.6275, + "step": 24113 + }, + { + "epoch": 0.7150609376390001, + "grad_norm": 0.10663386434316635, + "learning_rate": 0.00019088968737892716, + "loss": 2.599, + "step": 24114 + }, + { + "epoch": 0.7150905909913116, + "grad_norm": 0.10115604847669601, + "learning_rate": 0.00019085270647514226, + "loss": 2.5654, + "step": 24115 + }, + { + "epoch": 0.7151202443436231, + "grad_norm": 0.10875831544399261, + "learning_rate": 0.0001908157283089097, + "loss": 2.5702, + "step": 24116 + }, + { + "epoch": 0.7151498976959345, + "grad_norm": 0.10778863728046417, + "learning_rate": 0.00019077875288055695, + "loss": 2.6328, + "step": 24117 + }, + { + "epoch": 0.715179551048246, + "grad_norm": 0.10852321982383728, + "learning_rate": 0.00019074178019041143, + "loss": 2.5866, + "step": 24118 + }, + { + "epoch": 0.7152092044005575, + "grad_norm": 0.11594785749912262, + "learning_rate": 0.00019070481023880054, + "loss": 2.6482, + "step": 24119 + }, + { + "epoch": 0.715238857752869, + "grad_norm": 0.11173985153436661, + "learning_rate": 0.00019066784302605166, + "loss": 2.5962, + "step": 24120 + }, + { + "epoch": 0.7152685111051804, + "grad_norm": 0.10193516314029694, + "learning_rate": 0.00019063087855249205, + "loss": 2.581, + "step": 24121 + }, + { + "epoch": 0.715298164457492, + "grad_norm": 0.11535318195819855, + "learning_rate": 0.00019059391681844917, + "loss": 2.6136, + "step": 24122 + }, + { + "epoch": 0.7153278178098034, + "grad_norm": 0.10565754026174545, + "learning_rate": 0.00019055695782425026, + "loss": 2.6253, + "step": 24123 + }, + { + "epoch": 0.7153574711621149, + "grad_norm": 0.10089834034442902, + "learning_rate": 0.0001905200015702226, + "loss": 2.604, + "step": 24124 + }, + { + "epoch": 0.7153871245144263, + "grad_norm": 0.10340286046266556, + "learning_rate": 0.00019048304805669347, + "loss": 2.6117, + "step": 24125 + }, + { + "epoch": 0.7154167778667379, + "grad_norm": 0.10209651291370392, + "learning_rate": 0.00019044609728399004, + "loss": 2.5742, + "step": 24126 + }, + { + "epoch": 0.7154464312190493, + "grad_norm": 0.09961683303117752, + "learning_rate": 0.00019040914925243956, + "loss": 2.6057, + "step": 24127 + }, + { + "epoch": 0.7154760845713608, + "grad_norm": 0.10967481881380081, + "learning_rate": 0.0001903722039623692, + "loss": 2.6195, + "step": 24128 + }, + { + "epoch": 0.7155057379236722, + "grad_norm": 0.10386344790458679, + "learning_rate": 0.00019033526141410618, + "loss": 2.5953, + "step": 24129 + }, + { + "epoch": 0.7155353912759838, + "grad_norm": 0.10792838037014008, + "learning_rate": 0.00019029832160797749, + "loss": 2.6062, + "step": 24130 + }, + { + "epoch": 0.7155650446282952, + "grad_norm": 0.10403437167406082, + "learning_rate": 0.00019026138454431035, + "loss": 2.6079, + "step": 24131 + }, + { + "epoch": 0.7155946979806067, + "grad_norm": 0.10553400218486786, + "learning_rate": 0.00019022445022343182, + "loss": 2.5906, + "step": 24132 + }, + { + "epoch": 0.7156243513329182, + "grad_norm": 0.10914649069309235, + "learning_rate": 0.00019018751864566897, + "loss": 2.5593, + "step": 24133 + }, + { + "epoch": 0.7156540046852297, + "grad_norm": 0.10209112614393234, + "learning_rate": 0.0001901505898113488, + "loss": 2.63, + "step": 24134 + }, + { + "epoch": 0.7156836580375412, + "grad_norm": 0.10996682941913605, + "learning_rate": 0.00019011366372079835, + "loss": 2.5946, + "step": 24135 + }, + { + "epoch": 0.7157133113898526, + "grad_norm": 0.0988161489367485, + "learning_rate": 0.0001900767403743448, + "loss": 2.5877, + "step": 24136 + }, + { + "epoch": 0.7157429647421641, + "grad_norm": 0.10425551980733871, + "learning_rate": 0.00019003981977231454, + "loss": 2.6146, + "step": 24137 + }, + { + "epoch": 0.7157726180944756, + "grad_norm": 0.09506679326295853, + "learning_rate": 0.00019000290191503505, + "loss": 2.5785, + "step": 24138 + }, + { + "epoch": 0.7158022714467871, + "grad_norm": 0.09452307969331741, + "learning_rate": 0.00018996598680283305, + "loss": 2.6107, + "step": 24139 + }, + { + "epoch": 0.7158319247990985, + "grad_norm": 0.09806884825229645, + "learning_rate": 0.00018992907443603552, + "loss": 2.5796, + "step": 24140 + }, + { + "epoch": 0.71586157815141, + "grad_norm": 0.09873591363430023, + "learning_rate": 0.00018989216481496917, + "loss": 2.6076, + "step": 24141 + }, + { + "epoch": 0.7158912315037215, + "grad_norm": 0.10518202185630798, + "learning_rate": 0.000189855257939961, + "loss": 2.6394, + "step": 24142 + }, + { + "epoch": 0.715920884856033, + "grad_norm": 0.10307559370994568, + "learning_rate": 0.00018981835381133778, + "loss": 2.6272, + "step": 24143 + }, + { + "epoch": 0.7159505382083444, + "grad_norm": 0.09650891274213791, + "learning_rate": 0.00018978145242942618, + "loss": 2.5927, + "step": 24144 + }, + { + "epoch": 0.715980191560656, + "grad_norm": 0.1028045192360878, + "learning_rate": 0.00018974455379455312, + "loss": 2.6341, + "step": 24145 + }, + { + "epoch": 0.7160098449129674, + "grad_norm": 0.09767447412014008, + "learning_rate": 0.0001897076579070453, + "loss": 2.6164, + "step": 24146 + }, + { + "epoch": 0.7160394982652789, + "grad_norm": 0.09610936790704727, + "learning_rate": 0.0001896707647672296, + "loss": 2.584, + "step": 24147 + }, + { + "epoch": 0.7160691516175903, + "grad_norm": 0.10426509380340576, + "learning_rate": 0.0001896338743754324, + "loss": 2.6208, + "step": 24148 + }, + { + "epoch": 0.7160988049699019, + "grad_norm": 0.09356793761253357, + "learning_rate": 0.00018959698673198046, + "loss": 2.616, + "step": 24149 + }, + { + "epoch": 0.7161284583222133, + "grad_norm": 0.09414391219615936, + "learning_rate": 0.00018956010183720034, + "loss": 2.6012, + "step": 24150 + }, + { + "epoch": 0.7161581116745248, + "grad_norm": 0.09389084577560425, + "learning_rate": 0.00018952321969141895, + "loss": 2.6024, + "step": 24151 + }, + { + "epoch": 0.7161877650268362, + "grad_norm": 0.09451289474964142, + "learning_rate": 0.00018948634029496275, + "loss": 2.6357, + "step": 24152 + }, + { + "epoch": 0.7162174183791478, + "grad_norm": 0.09878839552402496, + "learning_rate": 0.00018944946364815834, + "loss": 2.6055, + "step": 24153 + }, + { + "epoch": 0.7162470717314593, + "grad_norm": 0.09571079909801483, + "learning_rate": 0.00018941258975133224, + "loss": 2.5983, + "step": 24154 + }, + { + "epoch": 0.7162767250837707, + "grad_norm": 0.10093455016613007, + "learning_rate": 0.0001893757186048109, + "loss": 2.6121, + "step": 24155 + }, + { + "epoch": 0.7163063784360822, + "grad_norm": 0.09393086284399033, + "learning_rate": 0.00018933885020892095, + "loss": 2.5962, + "step": 24156 + }, + { + "epoch": 0.7163360317883937, + "grad_norm": 0.0995088592171669, + "learning_rate": 0.00018930198456398894, + "loss": 2.5846, + "step": 24157 + }, + { + "epoch": 0.7163656851407052, + "grad_norm": 0.09695488214492798, + "learning_rate": 0.00018926512167034105, + "loss": 2.6014, + "step": 24158 + }, + { + "epoch": 0.7163953384930166, + "grad_norm": 0.09814484417438507, + "learning_rate": 0.00018922826152830387, + "loss": 2.5722, + "step": 24159 + }, + { + "epoch": 0.7164249918453282, + "grad_norm": 0.09960421919822693, + "learning_rate": 0.00018919140413820368, + "loss": 2.6447, + "step": 24160 + }, + { + "epoch": 0.7164546451976396, + "grad_norm": 0.10448377579450607, + "learning_rate": 0.00018915454950036703, + "loss": 2.6295, + "step": 24161 + }, + { + "epoch": 0.7164842985499511, + "grad_norm": 0.09488289803266525, + "learning_rate": 0.0001891176976151202, + "loss": 2.59, + "step": 24162 + }, + { + "epoch": 0.7165139519022625, + "grad_norm": 0.0961090549826622, + "learning_rate": 0.00018908084848278934, + "loss": 2.5823, + "step": 24163 + }, + { + "epoch": 0.7165436052545741, + "grad_norm": 0.09961199015378952, + "learning_rate": 0.00018904400210370109, + "loss": 2.6168, + "step": 24164 + }, + { + "epoch": 0.7165732586068855, + "grad_norm": 0.09550692141056061, + "learning_rate": 0.00018900715847818157, + "loss": 2.6309, + "step": 24165 + }, + { + "epoch": 0.716602911959197, + "grad_norm": 0.10124808549880981, + "learning_rate": 0.00018897031760655708, + "loss": 2.5718, + "step": 24166 + }, + { + "epoch": 0.7166325653115084, + "grad_norm": 0.10505900532007217, + "learning_rate": 0.0001889334794891538, + "loss": 2.616, + "step": 24167 + }, + { + "epoch": 0.71666221866382, + "grad_norm": 0.10358767211437225, + "learning_rate": 0.0001888966441262981, + "loss": 2.6324, + "step": 24168 + }, + { + "epoch": 0.7166918720161314, + "grad_norm": 0.10112432390451431, + "learning_rate": 0.0001888598115183159, + "loss": 2.6362, + "step": 24169 + }, + { + "epoch": 0.7167215253684429, + "grad_norm": 0.09727810323238373, + "learning_rate": 0.00018882298166553342, + "loss": 2.5899, + "step": 24170 + }, + { + "epoch": 0.7167511787207543, + "grad_norm": 0.10141640156507492, + "learning_rate": 0.00018878615456827686, + "loss": 2.5969, + "step": 24171 + }, + { + "epoch": 0.7167808320730659, + "grad_norm": 0.10980687290430069, + "learning_rate": 0.0001887493302268723, + "loss": 2.599, + "step": 24172 + }, + { + "epoch": 0.7168104854253773, + "grad_norm": 0.08921501785516739, + "learning_rate": 0.0001887125086416459, + "loss": 2.6158, + "step": 24173 + }, + { + "epoch": 0.7168401387776888, + "grad_norm": 0.11168791353702545, + "learning_rate": 0.00018867568981292365, + "loss": 2.6239, + "step": 24174 + }, + { + "epoch": 0.7168697921300003, + "grad_norm": 0.09781600534915924, + "learning_rate": 0.00018863887374103154, + "loss": 2.6249, + "step": 24175 + }, + { + "epoch": 0.7168994454823118, + "grad_norm": 0.110487200319767, + "learning_rate": 0.00018860206042629557, + "loss": 2.5925, + "step": 24176 + }, + { + "epoch": 0.7169290988346233, + "grad_norm": 0.11303164809942245, + "learning_rate": 0.00018856524986904196, + "loss": 2.604, + "step": 24177 + }, + { + "epoch": 0.7169587521869347, + "grad_norm": 0.1098693385720253, + "learning_rate": 0.00018852844206959662, + "loss": 2.5895, + "step": 24178 + }, + { + "epoch": 0.7169884055392463, + "grad_norm": 0.10505890101194382, + "learning_rate": 0.00018849163702828531, + "loss": 2.6094, + "step": 24179 + }, + { + "epoch": 0.7170180588915577, + "grad_norm": 0.10651243478059769, + "learning_rate": 0.00018845483474543394, + "loss": 2.5979, + "step": 24180 + }, + { + "epoch": 0.7170477122438692, + "grad_norm": 0.10410348325967789, + "learning_rate": 0.00018841803522136858, + "loss": 2.6429, + "step": 24181 + }, + { + "epoch": 0.7170773655961806, + "grad_norm": 0.09726180881261826, + "learning_rate": 0.0001883812384564149, + "loss": 2.5882, + "step": 24182 + }, + { + "epoch": 0.7171070189484922, + "grad_norm": 0.1206011176109314, + "learning_rate": 0.00018834444445089892, + "loss": 2.607, + "step": 24183 + }, + { + "epoch": 0.7171366723008036, + "grad_norm": 0.11265875399112701, + "learning_rate": 0.00018830765320514636, + "loss": 2.5982, + "step": 24184 + }, + { + "epoch": 0.7171663256531151, + "grad_norm": 0.09747636318206787, + "learning_rate": 0.00018827086471948301, + "loss": 2.6189, + "step": 24185 + }, + { + "epoch": 0.7171959790054265, + "grad_norm": 0.10687199980020523, + "learning_rate": 0.00018823407899423467, + "loss": 2.6095, + "step": 24186 + }, + { + "epoch": 0.7172256323577381, + "grad_norm": 0.09802302718162537, + "learning_rate": 0.00018819729602972707, + "loss": 2.5691, + "step": 24187 + }, + { + "epoch": 0.7172552857100495, + "grad_norm": 0.11072739213705063, + "learning_rate": 0.00018816051582628597, + "loss": 2.5744, + "step": 24188 + }, + { + "epoch": 0.717284939062361, + "grad_norm": 0.09028130769729614, + "learning_rate": 0.00018812373838423697, + "loss": 2.6215, + "step": 24189 + }, + { + "epoch": 0.7173145924146724, + "grad_norm": 0.10581044852733612, + "learning_rate": 0.00018808696370390584, + "loss": 2.5951, + "step": 24190 + }, + { + "epoch": 0.717344245766984, + "grad_norm": 0.10013413429260254, + "learning_rate": 0.00018805019178561817, + "loss": 2.624, + "step": 24191 + }, + { + "epoch": 0.7173738991192954, + "grad_norm": 0.09923192858695984, + "learning_rate": 0.0001880134226296996, + "loss": 2.6196, + "step": 24192 + }, + { + "epoch": 0.7174035524716069, + "grad_norm": 0.10760512202978134, + "learning_rate": 0.00018797665623647574, + "loss": 2.5671, + "step": 24193 + }, + { + "epoch": 0.7174332058239183, + "grad_norm": 0.10293098539113998, + "learning_rate": 0.00018793989260627216, + "loss": 2.5787, + "step": 24194 + }, + { + "epoch": 0.7174628591762299, + "grad_norm": 0.1149538978934288, + "learning_rate": 0.0001879031317394143, + "loss": 2.6391, + "step": 24195 + }, + { + "epoch": 0.7174925125285414, + "grad_norm": 0.10713769495487213, + "learning_rate": 0.00018786637363622788, + "loss": 2.5965, + "step": 24196 + }, + { + "epoch": 0.7175221658808528, + "grad_norm": 0.11052680015563965, + "learning_rate": 0.00018782961829703825, + "loss": 2.5938, + "step": 24197 + }, + { + "epoch": 0.7175518192331644, + "grad_norm": 0.11063007265329361, + "learning_rate": 0.00018779286572217097, + "loss": 2.6225, + "step": 24198 + }, + { + "epoch": 0.7175814725854758, + "grad_norm": 0.11393418163061142, + "learning_rate": 0.00018775611591195142, + "loss": 2.6157, + "step": 24199 + }, + { + "epoch": 0.7176111259377873, + "grad_norm": 0.09690946340560913, + "learning_rate": 0.0001877193688667051, + "loss": 2.5897, + "step": 24200 + }, + { + "epoch": 0.7176407792900987, + "grad_norm": 0.10882935672998428, + "learning_rate": 0.00018768262458675734, + "loss": 2.6451, + "step": 24201 + }, + { + "epoch": 0.7176704326424103, + "grad_norm": 0.09503861516714096, + "learning_rate": 0.00018764588307243352, + "loss": 2.5919, + "step": 24202 + }, + { + "epoch": 0.7177000859947217, + "grad_norm": 0.10942460596561432, + "learning_rate": 0.00018760914432405906, + "loss": 2.618, + "step": 24203 + }, + { + "epoch": 0.7177297393470332, + "grad_norm": 0.10813729465007782, + "learning_rate": 0.0001875724083419592, + "loss": 2.6196, + "step": 24204 + }, + { + "epoch": 0.7177593926993446, + "grad_norm": 0.10373852401971817, + "learning_rate": 0.00018753567512645936, + "loss": 2.6074, + "step": 24205 + }, + { + "epoch": 0.7177890460516562, + "grad_norm": 0.11503837257623672, + "learning_rate": 0.00018749894467788475, + "loss": 2.591, + "step": 24206 + }, + { + "epoch": 0.7178186994039676, + "grad_norm": 0.1015392541885376, + "learning_rate": 0.0001874622169965606, + "loss": 2.5737, + "step": 24207 + }, + { + "epoch": 0.7178483527562791, + "grad_norm": 0.11103663593530655, + "learning_rate": 0.00018742549208281212, + "loss": 2.6072, + "step": 24208 + }, + { + "epoch": 0.7178780061085905, + "grad_norm": 0.10284143686294556, + "learning_rate": 0.00018738876993696464, + "loss": 2.6154, + "step": 24209 + }, + { + "epoch": 0.7179076594609021, + "grad_norm": 0.10375756025314331, + "learning_rate": 0.00018735205055934318, + "loss": 2.5742, + "step": 24210 + }, + { + "epoch": 0.7179373128132135, + "grad_norm": 0.09991052746772766, + "learning_rate": 0.00018731533395027305, + "loss": 2.5844, + "step": 24211 + }, + { + "epoch": 0.717966966165525, + "grad_norm": 0.0996970608830452, + "learning_rate": 0.00018727862011007945, + "loss": 2.5929, + "step": 24212 + }, + { + "epoch": 0.7179966195178364, + "grad_norm": 0.10423842817544937, + "learning_rate": 0.00018724190903908716, + "loss": 2.6125, + "step": 24213 + }, + { + "epoch": 0.718026272870148, + "grad_norm": 0.0960841104388237, + "learning_rate": 0.00018720520073762148, + "loss": 2.5957, + "step": 24214 + }, + { + "epoch": 0.7180559262224594, + "grad_norm": 0.10047737509012222, + "learning_rate": 0.00018716849520600726, + "loss": 2.6251, + "step": 24215 + }, + { + "epoch": 0.7180855795747709, + "grad_norm": 0.10085339099168777, + "learning_rate": 0.00018713179244456984, + "loss": 2.5702, + "step": 24216 + }, + { + "epoch": 0.7181152329270825, + "grad_norm": 0.10917253792285919, + "learning_rate": 0.00018709509245363415, + "loss": 2.6084, + "step": 24217 + }, + { + "epoch": 0.7181448862793939, + "grad_norm": 0.10638605058193207, + "learning_rate": 0.00018705839523352514, + "loss": 2.5777, + "step": 24218 + }, + { + "epoch": 0.7181745396317054, + "grad_norm": 0.1063966378569603, + "learning_rate": 0.00018702170078456772, + "loss": 2.607, + "step": 24219 + }, + { + "epoch": 0.7182041929840168, + "grad_norm": 0.10908452421426773, + "learning_rate": 0.00018698500910708688, + "loss": 2.6166, + "step": 24220 + }, + { + "epoch": 0.7182338463363284, + "grad_norm": 0.14133690297603607, + "learning_rate": 0.0001869483202014075, + "loss": 2.615, + "step": 24221 + }, + { + "epoch": 0.7182634996886398, + "grad_norm": 0.11469414085149765, + "learning_rate": 0.0001869116340678545, + "loss": 2.6053, + "step": 24222 + }, + { + "epoch": 0.7182931530409513, + "grad_norm": 0.11326249688863754, + "learning_rate": 0.00018687495070675287, + "loss": 2.5905, + "step": 24223 + }, + { + "epoch": 0.7183228063932627, + "grad_norm": 0.11411909013986588, + "learning_rate": 0.00018683827011842713, + "loss": 2.5545, + "step": 24224 + }, + { + "epoch": 0.7183524597455743, + "grad_norm": 0.1066087856888771, + "learning_rate": 0.00018680159230320226, + "loss": 2.5901, + "step": 24225 + }, + { + "epoch": 0.7183821130978857, + "grad_norm": 0.1073664054274559, + "learning_rate": 0.00018676491726140305, + "loss": 2.5857, + "step": 24226 + }, + { + "epoch": 0.7184117664501972, + "grad_norm": 0.10878997296094894, + "learning_rate": 0.0001867282449933541, + "loss": 2.6207, + "step": 24227 + }, + { + "epoch": 0.7184414198025086, + "grad_norm": 0.11108113080263138, + "learning_rate": 0.0001866915754993805, + "loss": 2.6344, + "step": 24228 + }, + { + "epoch": 0.7184710731548202, + "grad_norm": 0.09741702675819397, + "learning_rate": 0.00018665490877980674, + "loss": 2.6383, + "step": 24229 + }, + { + "epoch": 0.7185007265071316, + "grad_norm": 0.11368949711322784, + "learning_rate": 0.00018661824483495753, + "loss": 2.5883, + "step": 24230 + }, + { + "epoch": 0.7185303798594431, + "grad_norm": 0.11249292641878128, + "learning_rate": 0.00018658158366515766, + "loss": 2.6272, + "step": 24231 + }, + { + "epoch": 0.7185600332117545, + "grad_norm": 0.10013645887374878, + "learning_rate": 0.00018654492527073158, + "loss": 2.6112, + "step": 24232 + }, + { + "epoch": 0.7185896865640661, + "grad_norm": 0.09993311017751694, + "learning_rate": 0.00018650826965200417, + "loss": 2.5918, + "step": 24233 + }, + { + "epoch": 0.7186193399163775, + "grad_norm": 0.10224609822034836, + "learning_rate": 0.00018647161680929964, + "loss": 2.5892, + "step": 24234 + }, + { + "epoch": 0.718648993268689, + "grad_norm": 0.10163545608520508, + "learning_rate": 0.0001864349667429428, + "loss": 2.5728, + "step": 24235 + }, + { + "epoch": 0.7186786466210004, + "grad_norm": 0.09634559601545334, + "learning_rate": 0.0001863983194532582, + "loss": 2.5903, + "step": 24236 + }, + { + "epoch": 0.718708299973312, + "grad_norm": 0.10106318444013596, + "learning_rate": 0.00018636167494057022, + "loss": 2.5997, + "step": 24237 + }, + { + "epoch": 0.7187379533256235, + "grad_norm": 0.09514019638299942, + "learning_rate": 0.0001863250332052035, + "loss": 2.641, + "step": 24238 + }, + { + "epoch": 0.7187676066779349, + "grad_norm": 0.1056540235877037, + "learning_rate": 0.00018628839424748245, + "loss": 2.5908, + "step": 24239 + }, + { + "epoch": 0.7187972600302465, + "grad_norm": 0.10296082496643066, + "learning_rate": 0.00018625175806773133, + "loss": 2.6077, + "step": 24240 + }, + { + "epoch": 0.7188269133825579, + "grad_norm": 0.10588356852531433, + "learning_rate": 0.00018621512466627488, + "loss": 2.6076, + "step": 24241 + }, + { + "epoch": 0.7188565667348694, + "grad_norm": 0.11100923269987106, + "learning_rate": 0.0001861784940434374, + "loss": 2.5882, + "step": 24242 + }, + { + "epoch": 0.7188862200871808, + "grad_norm": 0.09477919340133667, + "learning_rate": 0.00018614186619954326, + "loss": 2.6081, + "step": 24243 + }, + { + "epoch": 0.7189158734394924, + "grad_norm": 0.11654689908027649, + "learning_rate": 0.00018610524113491688, + "loss": 2.5924, + "step": 24244 + }, + { + "epoch": 0.7189455267918038, + "grad_norm": 0.11512138694524765, + "learning_rate": 0.0001860686188498823, + "loss": 2.601, + "step": 24245 + }, + { + "epoch": 0.7189751801441153, + "grad_norm": 0.10675238817930222, + "learning_rate": 0.000186031999344764, + "loss": 2.599, + "step": 24246 + }, + { + "epoch": 0.7190048334964267, + "grad_norm": 0.11693493276834488, + "learning_rate": 0.00018599538261988624, + "loss": 2.6187, + "step": 24247 + }, + { + "epoch": 0.7190344868487383, + "grad_norm": 0.12369395047426224, + "learning_rate": 0.00018595876867557332, + "loss": 2.6052, + "step": 24248 + }, + { + "epoch": 0.7190641402010497, + "grad_norm": 0.1123914048075676, + "learning_rate": 0.0001859221575121493, + "loss": 2.5775, + "step": 24249 + }, + { + "epoch": 0.7190937935533612, + "grad_norm": 0.10490734130144119, + "learning_rate": 0.00018588554912993854, + "loss": 2.6184, + "step": 24250 + }, + { + "epoch": 0.7191234469056726, + "grad_norm": 0.11193299293518066, + "learning_rate": 0.00018584894352926518, + "loss": 2.6221, + "step": 24251 + }, + { + "epoch": 0.7191531002579842, + "grad_norm": 0.11172796785831451, + "learning_rate": 0.00018581234071045333, + "loss": 2.6026, + "step": 24252 + }, + { + "epoch": 0.7191827536102956, + "grad_norm": 0.09418341517448425, + "learning_rate": 0.00018577574067382696, + "loss": 2.5826, + "step": 24253 + }, + { + "epoch": 0.7192124069626071, + "grad_norm": 0.10183368623256683, + "learning_rate": 0.0001857391434197107, + "loss": 2.6028, + "step": 24254 + }, + { + "epoch": 0.7192420603149186, + "grad_norm": 0.09883134812116623, + "learning_rate": 0.00018570254894842807, + "loss": 2.5928, + "step": 24255 + }, + { + "epoch": 0.7192717136672301, + "grad_norm": 0.09293846040964127, + "learning_rate": 0.00018566595726030334, + "loss": 2.5807, + "step": 24256 + }, + { + "epoch": 0.7193013670195415, + "grad_norm": 0.10259266942739487, + "learning_rate": 0.00018562936835566052, + "loss": 2.6117, + "step": 24257 + }, + { + "epoch": 0.719331020371853, + "grad_norm": 0.09728538244962692, + "learning_rate": 0.00018559278223482357, + "loss": 2.5748, + "step": 24258 + }, + { + "epoch": 0.7193606737241646, + "grad_norm": 0.0979418084025383, + "learning_rate": 0.00018555619889811653, + "loss": 2.5876, + "step": 24259 + }, + { + "epoch": 0.719390327076476, + "grad_norm": 0.10240185260772705, + "learning_rate": 0.0001855196183458633, + "loss": 2.6212, + "step": 24260 + }, + { + "epoch": 0.7194199804287875, + "grad_norm": 0.1031024232506752, + "learning_rate": 0.00018548304057838783, + "loss": 2.5997, + "step": 24261 + }, + { + "epoch": 0.7194496337810989, + "grad_norm": 0.08862069994211197, + "learning_rate": 0.00018544646559601403, + "loss": 2.5886, + "step": 24262 + }, + { + "epoch": 0.7194792871334105, + "grad_norm": 0.10244658589363098, + "learning_rate": 0.00018540989339906579, + "loss": 2.5744, + "step": 24263 + }, + { + "epoch": 0.7195089404857219, + "grad_norm": 0.09800776094198227, + "learning_rate": 0.00018537332398786688, + "loss": 2.5815, + "step": 24264 + }, + { + "epoch": 0.7195385938380334, + "grad_norm": 0.10181844234466553, + "learning_rate": 0.00018533675736274124, + "loss": 2.6068, + "step": 24265 + }, + { + "epoch": 0.7195682471903448, + "grad_norm": 0.08649855107069016, + "learning_rate": 0.00018530019352401263, + "loss": 2.6146, + "step": 24266 + }, + { + "epoch": 0.7195979005426564, + "grad_norm": 0.10071659088134766, + "learning_rate": 0.00018526363247200483, + "loss": 2.5936, + "step": 24267 + }, + { + "epoch": 0.7196275538949678, + "grad_norm": 0.09127479046583176, + "learning_rate": 0.00018522707420704155, + "loss": 2.6151, + "step": 24268 + }, + { + "epoch": 0.7196572072472793, + "grad_norm": 0.10487598925828934, + "learning_rate": 0.00018519051872944658, + "loss": 2.5915, + "step": 24269 + }, + { + "epoch": 0.7196868605995907, + "grad_norm": 0.09022225439548492, + "learning_rate": 0.0001851539660395436, + "loss": 2.5428, + "step": 24270 + }, + { + "epoch": 0.7197165139519023, + "grad_norm": 0.10876400768756866, + "learning_rate": 0.0001851174161376563, + "loss": 2.6041, + "step": 24271 + }, + { + "epoch": 0.7197461673042137, + "grad_norm": 0.10306322574615479, + "learning_rate": 0.0001850808690241083, + "loss": 2.5935, + "step": 24272 + }, + { + "epoch": 0.7197758206565252, + "grad_norm": 0.09214282035827637, + "learning_rate": 0.00018504432469922333, + "loss": 2.5593, + "step": 24273 + }, + { + "epoch": 0.7198054740088367, + "grad_norm": 0.1118982657790184, + "learning_rate": 0.0001850077831633249, + "loss": 2.5881, + "step": 24274 + }, + { + "epoch": 0.7198351273611482, + "grad_norm": 0.10213936120271683, + "learning_rate": 0.00018497124441673658, + "loss": 2.5818, + "step": 24275 + }, + { + "epoch": 0.7198647807134596, + "grad_norm": 0.10058707743883133, + "learning_rate": 0.00018493470845978193, + "loss": 2.5453, + "step": 24276 + }, + { + "epoch": 0.7198944340657711, + "grad_norm": 0.10570785403251648, + "learning_rate": 0.00018489817529278462, + "loss": 2.616, + "step": 24277 + }, + { + "epoch": 0.7199240874180826, + "grad_norm": 0.10461015999317169, + "learning_rate": 0.00018486164491606817, + "loss": 2.5954, + "step": 24278 + }, + { + "epoch": 0.7199537407703941, + "grad_norm": 0.10083998739719391, + "learning_rate": 0.0001848251173299556, + "loss": 2.6168, + "step": 24279 + }, + { + "epoch": 0.7199833941227056, + "grad_norm": 0.10290465503931046, + "learning_rate": 0.00018478859253477092, + "loss": 2.5885, + "step": 24280 + }, + { + "epoch": 0.720013047475017, + "grad_norm": 0.10120737552642822, + "learning_rate": 0.00018475207053083732, + "loss": 2.5934, + "step": 24281 + }, + { + "epoch": 0.7200427008273286, + "grad_norm": 0.10444754362106323, + "learning_rate": 0.0001847155513184783, + "loss": 2.5899, + "step": 24282 + }, + { + "epoch": 0.72007235417964, + "grad_norm": 0.09332683682441711, + "learning_rate": 0.00018467903489801713, + "loss": 2.6294, + "step": 24283 + }, + { + "epoch": 0.7201020075319515, + "grad_norm": 0.10180441290140152, + "learning_rate": 0.0001846425212697772, + "loss": 2.6261, + "step": 24284 + }, + { + "epoch": 0.720131660884263, + "grad_norm": 0.09311255812644958, + "learning_rate": 0.00018460601043408198, + "loss": 2.5892, + "step": 24285 + }, + { + "epoch": 0.7201613142365745, + "grad_norm": 0.09402202814817429, + "learning_rate": 0.0001845695023912546, + "loss": 2.5763, + "step": 24286 + }, + { + "epoch": 0.7201909675888859, + "grad_norm": 0.09686242043972015, + "learning_rate": 0.0001845329971416184, + "loss": 2.6032, + "step": 24287 + }, + { + "epoch": 0.7202206209411974, + "grad_norm": 0.10747268795967102, + "learning_rate": 0.0001844964946854969, + "loss": 2.6339, + "step": 24288 + }, + { + "epoch": 0.7202502742935089, + "grad_norm": 0.10655428469181061, + "learning_rate": 0.00018445999502321293, + "loss": 2.6505, + "step": 24289 + }, + { + "epoch": 0.7202799276458204, + "grad_norm": 0.09336981922388077, + "learning_rate": 0.0001844234981550898, + "loss": 2.6126, + "step": 24290 + }, + { + "epoch": 0.7203095809981318, + "grad_norm": 0.10808728635311127, + "learning_rate": 0.0001843870040814508, + "loss": 2.6036, + "step": 24291 + }, + { + "epoch": 0.7203392343504433, + "grad_norm": 0.09078222513198853, + "learning_rate": 0.0001843505128026189, + "loss": 2.573, + "step": 24292 + }, + { + "epoch": 0.7203688877027548, + "grad_norm": 0.10548721998929977, + "learning_rate": 0.00018431402431891752, + "loss": 2.6064, + "step": 24293 + }, + { + "epoch": 0.7203985410550663, + "grad_norm": 0.09892299026250839, + "learning_rate": 0.00018427753863066966, + "loss": 2.6288, + "step": 24294 + }, + { + "epoch": 0.7204281944073777, + "grad_norm": 0.10089938342571259, + "learning_rate": 0.00018424105573819837, + "loss": 2.6291, + "step": 24295 + }, + { + "epoch": 0.7204578477596892, + "grad_norm": 0.10718698054552078, + "learning_rate": 0.00018420457564182675, + "loss": 2.6004, + "step": 24296 + }, + { + "epoch": 0.7204875011120007, + "grad_norm": 0.10982447117567062, + "learning_rate": 0.00018416809834187782, + "loss": 2.6086, + "step": 24297 + }, + { + "epoch": 0.7205171544643122, + "grad_norm": 0.09982333332300186, + "learning_rate": 0.0001841316238386746, + "loss": 2.6244, + "step": 24298 + }, + { + "epoch": 0.7205468078166237, + "grad_norm": 0.1030503511428833, + "learning_rate": 0.00018409515213254019, + "loss": 2.5975, + "step": 24299 + }, + { + "epoch": 0.7205764611689351, + "grad_norm": 0.10800360888242722, + "learning_rate": 0.00018405868322379733, + "loss": 2.6096, + "step": 24300 + }, + { + "epoch": 0.7206061145212467, + "grad_norm": 0.1060507595539093, + "learning_rate": 0.000184022217112769, + "loss": 2.6289, + "step": 24301 + }, + { + "epoch": 0.7206357678735581, + "grad_norm": 0.10400636494159698, + "learning_rate": 0.00018398575379977822, + "loss": 2.6397, + "step": 24302 + }, + { + "epoch": 0.7206654212258696, + "grad_norm": 0.1064026802778244, + "learning_rate": 0.0001839492932851478, + "loss": 2.6024, + "step": 24303 + }, + { + "epoch": 0.720695074578181, + "grad_norm": 0.10889691859483719, + "learning_rate": 0.0001839128355692007, + "loss": 2.5721, + "step": 24304 + }, + { + "epoch": 0.7207247279304926, + "grad_norm": 0.10603219270706177, + "learning_rate": 0.00018387638065225941, + "loss": 2.5861, + "step": 24305 + }, + { + "epoch": 0.720754381282804, + "grad_norm": 0.11139383912086487, + "learning_rate": 0.00018383992853464732, + "loss": 2.5946, + "step": 24306 + }, + { + "epoch": 0.7207840346351155, + "grad_norm": 0.10244669765233994, + "learning_rate": 0.00018380347921668688, + "loss": 2.6386, + "step": 24307 + }, + { + "epoch": 0.720813687987427, + "grad_norm": 0.10325241088867188, + "learning_rate": 0.0001837670326987009, + "loss": 2.6049, + "step": 24308 + }, + { + "epoch": 0.7208433413397385, + "grad_norm": 0.10554023087024689, + "learning_rate": 0.0001837305889810123, + "loss": 2.6097, + "step": 24309 + }, + { + "epoch": 0.7208729946920499, + "grad_norm": 0.10443215072154999, + "learning_rate": 0.00018369414806394346, + "loss": 2.5775, + "step": 24310 + }, + { + "epoch": 0.7209026480443614, + "grad_norm": 0.11281602084636688, + "learning_rate": 0.00018365770994781722, + "loss": 2.6181, + "step": 24311 + }, + { + "epoch": 0.7209323013966729, + "grad_norm": 0.10711098462343216, + "learning_rate": 0.00018362127463295624, + "loss": 2.5685, + "step": 24312 + }, + { + "epoch": 0.7209619547489844, + "grad_norm": 0.11138567328453064, + "learning_rate": 0.00018358484211968324, + "loss": 2.6332, + "step": 24313 + }, + { + "epoch": 0.7209916081012958, + "grad_norm": 0.10355249047279358, + "learning_rate": 0.00018354841240832072, + "loss": 2.5904, + "step": 24314 + }, + { + "epoch": 0.7210212614536073, + "grad_norm": 0.10943392664194107, + "learning_rate": 0.00018351198549919134, + "loss": 2.6055, + "step": 24315 + }, + { + "epoch": 0.7210509148059188, + "grad_norm": 0.10158883035182953, + "learning_rate": 0.00018347556139261767, + "loss": 2.6017, + "step": 24316 + }, + { + "epoch": 0.7210805681582303, + "grad_norm": 0.10946536809206009, + "learning_rate": 0.0001834391400889222, + "loss": 2.607, + "step": 24317 + }, + { + "epoch": 0.7211102215105417, + "grad_norm": 0.09268473088741302, + "learning_rate": 0.00018340272158842735, + "loss": 2.5826, + "step": 24318 + }, + { + "epoch": 0.7211398748628532, + "grad_norm": 0.10597027838230133, + "learning_rate": 0.00018336630589145593, + "loss": 2.5903, + "step": 24319 + }, + { + "epoch": 0.7211695282151648, + "grad_norm": 0.0949113667011261, + "learning_rate": 0.00018332989299833037, + "loss": 2.6241, + "step": 24320 + }, + { + "epoch": 0.7211991815674762, + "grad_norm": 0.12177615612745285, + "learning_rate": 0.00018329348290937276, + "loss": 2.5849, + "step": 24321 + }, + { + "epoch": 0.7212288349197877, + "grad_norm": 0.10370717197656631, + "learning_rate": 0.00018325707562490574, + "loss": 2.5975, + "step": 24322 + }, + { + "epoch": 0.7212584882720992, + "grad_norm": 0.09573841094970703, + "learning_rate": 0.00018322067114525165, + "loss": 2.5935, + "step": 24323 + }, + { + "epoch": 0.7212881416244107, + "grad_norm": 0.09374119341373444, + "learning_rate": 0.00018318426947073297, + "loss": 2.6033, + "step": 24324 + }, + { + "epoch": 0.7213177949767221, + "grad_norm": 0.10754755884408951, + "learning_rate": 0.00018314787060167193, + "loss": 2.6188, + "step": 24325 + }, + { + "epoch": 0.7213474483290336, + "grad_norm": 0.1005336120724678, + "learning_rate": 0.00018311147453839083, + "loss": 2.5851, + "step": 24326 + }, + { + "epoch": 0.7213771016813451, + "grad_norm": 0.11493554711341858, + "learning_rate": 0.00018307508128121209, + "loss": 2.6594, + "step": 24327 + }, + { + "epoch": 0.7214067550336566, + "grad_norm": 0.09376727044582367, + "learning_rate": 0.00018303869083045787, + "loss": 2.6188, + "step": 24328 + }, + { + "epoch": 0.721436408385968, + "grad_norm": 0.1085764616727829, + "learning_rate": 0.00018300230318645045, + "loss": 2.6001, + "step": 24329 + }, + { + "epoch": 0.7214660617382795, + "grad_norm": 0.09571901708841324, + "learning_rate": 0.00018296591834951204, + "loss": 2.6154, + "step": 24330 + }, + { + "epoch": 0.721495715090591, + "grad_norm": 0.10833659023046494, + "learning_rate": 0.00018292953631996484, + "loss": 2.595, + "step": 24331 + }, + { + "epoch": 0.7215253684429025, + "grad_norm": 0.09645266085863113, + "learning_rate": 0.00018289315709813103, + "loss": 2.6096, + "step": 24332 + }, + { + "epoch": 0.7215550217952139, + "grad_norm": 0.09641485661268234, + "learning_rate": 0.00018285678068433276, + "loss": 2.6149, + "step": 24333 + }, + { + "epoch": 0.7215846751475254, + "grad_norm": 0.10515221208333969, + "learning_rate": 0.00018282040707889215, + "loss": 2.6067, + "step": 24334 + }, + { + "epoch": 0.7216143284998369, + "grad_norm": 0.09607177972793579, + "learning_rate": 0.00018278403628213126, + "loss": 2.5905, + "step": 24335 + }, + { + "epoch": 0.7216439818521484, + "grad_norm": 0.11027088761329651, + "learning_rate": 0.00018274766829437218, + "loss": 2.5949, + "step": 24336 + }, + { + "epoch": 0.7216736352044598, + "grad_norm": 0.11439727991819382, + "learning_rate": 0.00018271130311593692, + "loss": 2.6118, + "step": 24337 + }, + { + "epoch": 0.7217032885567713, + "grad_norm": 0.09386283159255981, + "learning_rate": 0.00018267494074714752, + "loss": 2.6144, + "step": 24338 + }, + { + "epoch": 0.7217329419090828, + "grad_norm": 0.12860211730003357, + "learning_rate": 0.00018263858118832606, + "loss": 2.6016, + "step": 24339 + }, + { + "epoch": 0.7217625952613943, + "grad_norm": 0.12720592319965363, + "learning_rate": 0.0001826022244397944, + "loss": 2.5846, + "step": 24340 + }, + { + "epoch": 0.7217922486137058, + "grad_norm": 0.10128235816955566, + "learning_rate": 0.0001825658705018745, + "loss": 2.5848, + "step": 24341 + }, + { + "epoch": 0.7218219019660173, + "grad_norm": 0.12707306444644928, + "learning_rate": 0.00018252951937488833, + "loss": 2.6041, + "step": 24342 + }, + { + "epoch": 0.7218515553183288, + "grad_norm": 0.12430461496114731, + "learning_rate": 0.00018249317105915774, + "loss": 2.5941, + "step": 24343 + }, + { + "epoch": 0.7218812086706402, + "grad_norm": 0.09202053397893906, + "learning_rate": 0.00018245682555500465, + "loss": 2.5641, + "step": 24344 + }, + { + "epoch": 0.7219108620229517, + "grad_norm": 0.11595287173986435, + "learning_rate": 0.00018242048286275087, + "loss": 2.5968, + "step": 24345 + }, + { + "epoch": 0.7219405153752632, + "grad_norm": 0.08930493891239166, + "learning_rate": 0.0001823841429827182, + "loss": 2.5974, + "step": 24346 + }, + { + "epoch": 0.7219701687275747, + "grad_norm": 0.11413007229566574, + "learning_rate": 0.00018234780591522848, + "loss": 2.5973, + "step": 24347 + }, + { + "epoch": 0.7219998220798861, + "grad_norm": 0.10113123059272766, + "learning_rate": 0.00018231147166060347, + "loss": 2.6383, + "step": 24348 + }, + { + "epoch": 0.7220294754321976, + "grad_norm": 0.1146983951330185, + "learning_rate": 0.00018227514021916492, + "loss": 2.613, + "step": 24349 + }, + { + "epoch": 0.7220591287845091, + "grad_norm": 0.09813062846660614, + "learning_rate": 0.0001822388115912345, + "loss": 2.5981, + "step": 24350 + }, + { + "epoch": 0.7220887821368206, + "grad_norm": 0.1080547645688057, + "learning_rate": 0.00018220248577713395, + "loss": 2.601, + "step": 24351 + }, + { + "epoch": 0.722118435489132, + "grad_norm": 0.09644728899002075, + "learning_rate": 0.00018216616277718495, + "loss": 2.597, + "step": 24352 + }, + { + "epoch": 0.7221480888414435, + "grad_norm": 0.0912834033370018, + "learning_rate": 0.00018212984259170916, + "loss": 2.5852, + "step": 24353 + }, + { + "epoch": 0.722177742193755, + "grad_norm": 0.09343928098678589, + "learning_rate": 0.00018209352522102835, + "loss": 2.5816, + "step": 24354 + }, + { + "epoch": 0.7222073955460665, + "grad_norm": 0.09256822615861893, + "learning_rate": 0.0001820572106654637, + "loss": 2.6151, + "step": 24355 + }, + { + "epoch": 0.7222370488983779, + "grad_norm": 0.10337953269481659, + "learning_rate": 0.00018202089892533708, + "loss": 2.621, + "step": 24356 + }, + { + "epoch": 0.7222667022506895, + "grad_norm": 0.09051113575696945, + "learning_rate": 0.0001819845900009698, + "loss": 2.5967, + "step": 24357 + }, + { + "epoch": 0.7222963556030009, + "grad_norm": 0.09538029879331589, + "learning_rate": 0.00018194828389268375, + "loss": 2.6327, + "step": 24358 + }, + { + "epoch": 0.7223260089553124, + "grad_norm": 0.08885271102190018, + "learning_rate": 0.00018191198060080023, + "loss": 2.5605, + "step": 24359 + }, + { + "epoch": 0.7223556623076238, + "grad_norm": 0.0903245061635971, + "learning_rate": 0.00018187568012564072, + "loss": 2.5924, + "step": 24360 + }, + { + "epoch": 0.7223853156599354, + "grad_norm": 0.09377356618642807, + "learning_rate": 0.00018183938246752664, + "loss": 2.6185, + "step": 24361 + }, + { + "epoch": 0.7224149690122469, + "grad_norm": 0.08690743893384933, + "learning_rate": 0.00018180308762677944, + "loss": 2.6088, + "step": 24362 + }, + { + "epoch": 0.7224446223645583, + "grad_norm": 0.0909976065158844, + "learning_rate": 0.00018176679560372055, + "loss": 2.6001, + "step": 24363 + }, + { + "epoch": 0.7224742757168698, + "grad_norm": 0.08454578369855881, + "learning_rate": 0.00018173050639867146, + "loss": 2.5906, + "step": 24364 + }, + { + "epoch": 0.7225039290691813, + "grad_norm": 0.09609717130661011, + "learning_rate": 0.0001816942200119532, + "loss": 2.5602, + "step": 24365 + }, + { + "epoch": 0.7225335824214928, + "grad_norm": 0.09271574020385742, + "learning_rate": 0.00018165793644388728, + "loss": 2.5899, + "step": 24366 + }, + { + "epoch": 0.7225632357738042, + "grad_norm": 0.08529862761497498, + "learning_rate": 0.00018162165569479493, + "loss": 2.6197, + "step": 24367 + }, + { + "epoch": 0.7225928891261157, + "grad_norm": 0.09149829298257828, + "learning_rate": 0.0001815853777649975, + "loss": 2.5962, + "step": 24368 + }, + { + "epoch": 0.7226225424784272, + "grad_norm": 0.09297379851341248, + "learning_rate": 0.0001815491026548162, + "loss": 2.6074, + "step": 24369 + }, + { + "epoch": 0.7226521958307387, + "grad_norm": 0.0992286428809166, + "learning_rate": 0.00018151283036457213, + "loss": 2.6033, + "step": 24370 + }, + { + "epoch": 0.7226818491830501, + "grad_norm": 0.10055206716060638, + "learning_rate": 0.00018147656089458669, + "loss": 2.5902, + "step": 24371 + }, + { + "epoch": 0.7227115025353616, + "grad_norm": 0.10142985731363297, + "learning_rate": 0.00018144029424518106, + "loss": 2.6216, + "step": 24372 + }, + { + "epoch": 0.7227411558876731, + "grad_norm": 0.09617328643798828, + "learning_rate": 0.00018140403041667626, + "loss": 2.5708, + "step": 24373 + }, + { + "epoch": 0.7227708092399846, + "grad_norm": 0.10382114350795746, + "learning_rate": 0.00018136776940939347, + "loss": 2.5926, + "step": 24374 + }, + { + "epoch": 0.722800462592296, + "grad_norm": 0.09586970508098602, + "learning_rate": 0.00018133151122365392, + "loss": 2.5892, + "step": 24375 + }, + { + "epoch": 0.7228301159446076, + "grad_norm": 0.12863491475582123, + "learning_rate": 0.0001812952558597784, + "loss": 2.6272, + "step": 24376 + }, + { + "epoch": 0.722859769296919, + "grad_norm": 0.11012955754995346, + "learning_rate": 0.0001812590033180881, + "loss": 2.6367, + "step": 24377 + }, + { + "epoch": 0.7228894226492305, + "grad_norm": 0.11240918934345245, + "learning_rate": 0.000181222753598904, + "loss": 2.5528, + "step": 24378 + }, + { + "epoch": 0.7229190760015419, + "grad_norm": 0.10775395482778549, + "learning_rate": 0.00018118650670254717, + "loss": 2.599, + "step": 24379 + }, + { + "epoch": 0.7229487293538535, + "grad_norm": 0.10585971176624298, + "learning_rate": 0.00018115026262933854, + "loss": 2.5709, + "step": 24380 + }, + { + "epoch": 0.7229783827061649, + "grad_norm": 0.11254996806383133, + "learning_rate": 0.00018111402137959903, + "loss": 2.6263, + "step": 24381 + }, + { + "epoch": 0.7230080360584764, + "grad_norm": 0.1036405861377716, + "learning_rate": 0.00018107778295364961, + "loss": 2.6346, + "step": 24382 + }, + { + "epoch": 0.7230376894107879, + "grad_norm": 0.10788025707006454, + "learning_rate": 0.00018104154735181104, + "loss": 2.595, + "step": 24383 + }, + { + "epoch": 0.7230673427630994, + "grad_norm": 0.09004289656877518, + "learning_rate": 0.00018100531457440445, + "loss": 2.5664, + "step": 24384 + }, + { + "epoch": 0.7230969961154109, + "grad_norm": 0.11845693737268448, + "learning_rate": 0.00018096908462175072, + "loss": 2.621, + "step": 24385 + }, + { + "epoch": 0.7231266494677223, + "grad_norm": 0.09548462182283401, + "learning_rate": 0.00018093285749417036, + "loss": 2.6331, + "step": 24386 + }, + { + "epoch": 0.7231563028200338, + "grad_norm": 0.10939323157072067, + "learning_rate": 0.0001808966331919843, + "loss": 2.6099, + "step": 24387 + }, + { + "epoch": 0.7231859561723453, + "grad_norm": 0.10785092413425446, + "learning_rate": 0.00018086041171551333, + "loss": 2.5782, + "step": 24388 + }, + { + "epoch": 0.7232156095246568, + "grad_norm": 0.09794100373983383, + "learning_rate": 0.0001808241930650782, + "loss": 2.5674, + "step": 24389 + }, + { + "epoch": 0.7232452628769682, + "grad_norm": 0.09821109473705292, + "learning_rate": 0.0001807879772409996, + "loss": 2.6082, + "step": 24390 + }, + { + "epoch": 0.7232749162292798, + "grad_norm": 0.09286647289991379, + "learning_rate": 0.00018075176424359825, + "loss": 2.6051, + "step": 24391 + }, + { + "epoch": 0.7233045695815912, + "grad_norm": 0.1113882064819336, + "learning_rate": 0.00018071555407319484, + "loss": 2.6141, + "step": 24392 + }, + { + "epoch": 0.7233342229339027, + "grad_norm": 0.0927039384841919, + "learning_rate": 0.00018067934673011006, + "loss": 2.6181, + "step": 24393 + }, + { + "epoch": 0.7233638762862141, + "grad_norm": 0.10014495253562927, + "learning_rate": 0.0001806431422146644, + "loss": 2.5753, + "step": 24394 + }, + { + "epoch": 0.7233935296385257, + "grad_norm": 0.09569203853607178, + "learning_rate": 0.00018060694052717858, + "loss": 2.5555, + "step": 24395 + }, + { + "epoch": 0.7234231829908371, + "grad_norm": 0.11245740205049515, + "learning_rate": 0.00018057074166797304, + "loss": 2.5953, + "step": 24396 + }, + { + "epoch": 0.7234528363431486, + "grad_norm": 0.10407228767871857, + "learning_rate": 0.00018053454563736847, + "loss": 2.5738, + "step": 24397 + }, + { + "epoch": 0.72348248969546, + "grad_norm": 0.11538471281528473, + "learning_rate": 0.00018049835243568536, + "loss": 2.6249, + "step": 24398 + }, + { + "epoch": 0.7235121430477716, + "grad_norm": 0.10610302537679672, + "learning_rate": 0.00018046216206324417, + "loss": 2.5664, + "step": 24399 + }, + { + "epoch": 0.723541796400083, + "grad_norm": 0.10745036602020264, + "learning_rate": 0.00018042597452036535, + "loss": 2.6219, + "step": 24400 + }, + { + "epoch": 0.7235714497523945, + "grad_norm": 0.10368639975786209, + "learning_rate": 0.00018038978980736942, + "loss": 2.5604, + "step": 24401 + }, + { + "epoch": 0.7236011031047059, + "grad_norm": 0.09218431264162064, + "learning_rate": 0.0001803536079245767, + "loss": 2.5847, + "step": 24402 + }, + { + "epoch": 0.7236307564570175, + "grad_norm": 0.10199326276779175, + "learning_rate": 0.00018031742887230772, + "loss": 2.5964, + "step": 24403 + }, + { + "epoch": 0.723660409809329, + "grad_norm": 0.09869545698165894, + "learning_rate": 0.00018028125265088274, + "loss": 2.5873, + "step": 24404 + }, + { + "epoch": 0.7236900631616404, + "grad_norm": 0.09086482226848602, + "learning_rate": 0.00018024507926062217, + "loss": 2.5576, + "step": 24405 + }, + { + "epoch": 0.723719716513952, + "grad_norm": 0.09697434306144714, + "learning_rate": 0.0001802089087018463, + "loss": 2.609, + "step": 24406 + }, + { + "epoch": 0.7237493698662634, + "grad_norm": 0.1004500612616539, + "learning_rate": 0.00018017274097487546, + "loss": 2.5884, + "step": 24407 + }, + { + "epoch": 0.7237790232185749, + "grad_norm": 0.09437950700521469, + "learning_rate": 0.00018013657608002985, + "loss": 2.6229, + "step": 24408 + }, + { + "epoch": 0.7238086765708863, + "grad_norm": 0.09573261439800262, + "learning_rate": 0.00018010041401762976, + "loss": 2.5892, + "step": 24409 + }, + { + "epoch": 0.7238383299231979, + "grad_norm": 0.09927540272474289, + "learning_rate": 0.00018006425478799543, + "loss": 2.579, + "step": 24410 + }, + { + "epoch": 0.7238679832755093, + "grad_norm": 0.10136948525905609, + "learning_rate": 0.00018002809839144708, + "loss": 2.6364, + "step": 24411 + }, + { + "epoch": 0.7238976366278208, + "grad_norm": 0.09633610397577286, + "learning_rate": 0.00017999194482830476, + "loss": 2.6294, + "step": 24412 + }, + { + "epoch": 0.7239272899801322, + "grad_norm": 0.10153902322053909, + "learning_rate": 0.00017995579409888879, + "loss": 2.6231, + "step": 24413 + }, + { + "epoch": 0.7239569433324438, + "grad_norm": 0.10077261179685593, + "learning_rate": 0.00017991964620351914, + "loss": 2.6005, + "step": 24414 + }, + { + "epoch": 0.7239865966847552, + "grad_norm": 0.10524572432041168, + "learning_rate": 0.00017988350114251595, + "loss": 2.5857, + "step": 24415 + }, + { + "epoch": 0.7240162500370667, + "grad_norm": 0.10107962042093277, + "learning_rate": 0.00017984735891619935, + "loss": 2.6364, + "step": 24416 + }, + { + "epoch": 0.7240459033893781, + "grad_norm": 0.0975659117102623, + "learning_rate": 0.00017981121952488933, + "loss": 2.6006, + "step": 24417 + }, + { + "epoch": 0.7240755567416897, + "grad_norm": 0.1012427881360054, + "learning_rate": 0.00017977508296890588, + "loss": 2.5842, + "step": 24418 + }, + { + "epoch": 0.7241052100940011, + "grad_norm": 0.10597214102745056, + "learning_rate": 0.0001797389492485691, + "loss": 2.5769, + "step": 24419 + }, + { + "epoch": 0.7241348634463126, + "grad_norm": 0.10058950632810593, + "learning_rate": 0.000179702818364199, + "loss": 2.6185, + "step": 24420 + }, + { + "epoch": 0.724164516798624, + "grad_norm": 0.10573665052652359, + "learning_rate": 0.0001796666903161151, + "loss": 2.5759, + "step": 24421 + }, + { + "epoch": 0.7241941701509356, + "grad_norm": 0.09635855257511139, + "learning_rate": 0.00017963056510463782, + "loss": 2.5872, + "step": 24422 + }, + { + "epoch": 0.724223823503247, + "grad_norm": 0.09728952497243881, + "learning_rate": 0.0001795944427300869, + "loss": 2.6206, + "step": 24423 + }, + { + "epoch": 0.7242534768555585, + "grad_norm": 0.11705698072910309, + "learning_rate": 0.0001795583231927822, + "loss": 2.5797, + "step": 24424 + }, + { + "epoch": 0.72428313020787, + "grad_norm": 0.09962265193462372, + "learning_rate": 0.00017952220649304352, + "loss": 2.5851, + "step": 24425 + }, + { + "epoch": 0.7243127835601815, + "grad_norm": 0.1064363420009613, + "learning_rate": 0.0001794860926311907, + "loss": 2.624, + "step": 24426 + }, + { + "epoch": 0.724342436912493, + "grad_norm": 0.10613780468702316, + "learning_rate": 0.00017944998160754355, + "loss": 2.5998, + "step": 24427 + }, + { + "epoch": 0.7243720902648044, + "grad_norm": 0.11206523329019547, + "learning_rate": 0.00017941387342242183, + "loss": 2.6354, + "step": 24428 + }, + { + "epoch": 0.724401743617116, + "grad_norm": 0.10211730003356934, + "learning_rate": 0.00017937776807614532, + "loss": 2.6225, + "step": 24429 + }, + { + "epoch": 0.7244313969694274, + "grad_norm": 0.10712762922048569, + "learning_rate": 0.00017934166556903385, + "loss": 2.5872, + "step": 24430 + }, + { + "epoch": 0.7244610503217389, + "grad_norm": 0.10661005973815918, + "learning_rate": 0.00017930556590140678, + "loss": 2.6019, + "step": 24431 + }, + { + "epoch": 0.7244907036740503, + "grad_norm": 0.10475461184978485, + "learning_rate": 0.00017926946907358403, + "loss": 2.6649, + "step": 24432 + }, + { + "epoch": 0.7245203570263619, + "grad_norm": 0.11608521640300751, + "learning_rate": 0.00017923337508588517, + "loss": 2.6299, + "step": 24433 + }, + { + "epoch": 0.7245500103786733, + "grad_norm": 0.10361535847187042, + "learning_rate": 0.0001791972839386296, + "loss": 2.5734, + "step": 24434 + }, + { + "epoch": 0.7245796637309848, + "grad_norm": 0.11483120918273926, + "learning_rate": 0.0001791611956321374, + "loss": 2.6059, + "step": 24435 + }, + { + "epoch": 0.7246093170832962, + "grad_norm": 0.10774000734090805, + "learning_rate": 0.00017912511016672782, + "loss": 2.5942, + "step": 24436 + }, + { + "epoch": 0.7246389704356078, + "grad_norm": 0.10845398157835007, + "learning_rate": 0.0001790890275427205, + "loss": 2.593, + "step": 24437 + }, + { + "epoch": 0.7246686237879192, + "grad_norm": 0.11223174631595612, + "learning_rate": 0.0001790529477604349, + "loss": 2.5779, + "step": 24438 + }, + { + "epoch": 0.7246982771402307, + "grad_norm": 0.12369462847709656, + "learning_rate": 0.00017901687082019058, + "loss": 2.6177, + "step": 24439 + }, + { + "epoch": 0.7247279304925421, + "grad_norm": 0.0998692587018013, + "learning_rate": 0.00017898079672230694, + "loss": 2.5997, + "step": 24440 + }, + { + "epoch": 0.7247575838448537, + "grad_norm": 0.11107496917247772, + "learning_rate": 0.0001789447254671036, + "loss": 2.6135, + "step": 24441 + }, + { + "epoch": 0.7247872371971651, + "grad_norm": 0.10266181081533432, + "learning_rate": 0.00017890865705489967, + "loss": 2.6094, + "step": 24442 + }, + { + "epoch": 0.7248168905494766, + "grad_norm": 0.09877905249595642, + "learning_rate": 0.0001788725914860147, + "loss": 2.6029, + "step": 24443 + }, + { + "epoch": 0.724846543901788, + "grad_norm": 0.09623556584119797, + "learning_rate": 0.00017883652876076806, + "loss": 2.5851, + "step": 24444 + }, + { + "epoch": 0.7248761972540996, + "grad_norm": 0.0991639569401741, + "learning_rate": 0.0001788004688794791, + "loss": 2.6182, + "step": 24445 + }, + { + "epoch": 0.7249058506064111, + "grad_norm": 0.09899156540632248, + "learning_rate": 0.00017876441184246707, + "loss": 2.6281, + "step": 24446 + }, + { + "epoch": 0.7249355039587225, + "grad_norm": 0.10490317642688751, + "learning_rate": 0.0001787283576500512, + "loss": 2.614, + "step": 24447 + }, + { + "epoch": 0.7249651573110341, + "grad_norm": 0.10408739000558853, + "learning_rate": 0.00017869230630255102, + "loss": 2.6087, + "step": 24448 + }, + { + "epoch": 0.7249948106633455, + "grad_norm": 0.09750587493181229, + "learning_rate": 0.00017865625780028561, + "loss": 2.6005, + "step": 24449 + }, + { + "epoch": 0.725024464015657, + "grad_norm": 0.10223488509654999, + "learning_rate": 0.00017862021214357416, + "loss": 2.5783, + "step": 24450 + }, + { + "epoch": 0.7250541173679684, + "grad_norm": 0.10106124728918076, + "learning_rate": 0.0001785841693327361, + "loss": 2.6131, + "step": 24451 + }, + { + "epoch": 0.72508377072028, + "grad_norm": 0.09670937806367874, + "learning_rate": 0.00017854812936809024, + "loss": 2.5994, + "step": 24452 + }, + { + "epoch": 0.7251134240725914, + "grad_norm": 0.10531069338321686, + "learning_rate": 0.00017851209224995586, + "loss": 2.636, + "step": 24453 + }, + { + "epoch": 0.7251430774249029, + "grad_norm": 0.09516966342926025, + "learning_rate": 0.00017847605797865207, + "loss": 2.5733, + "step": 24454 + }, + { + "epoch": 0.7251727307772143, + "grad_norm": 0.09532535821199417, + "learning_rate": 0.00017844002655449797, + "loss": 2.5826, + "step": 24455 + }, + { + "epoch": 0.7252023841295259, + "grad_norm": 0.10571535676717758, + "learning_rate": 0.00017840399797781266, + "loss": 2.6138, + "step": 24456 + }, + { + "epoch": 0.7252320374818373, + "grad_norm": 0.10220025479793549, + "learning_rate": 0.0001783679722489151, + "loss": 2.5973, + "step": 24457 + }, + { + "epoch": 0.7252616908341488, + "grad_norm": 0.09231115132570267, + "learning_rate": 0.00017833194936812437, + "loss": 2.6181, + "step": 24458 + }, + { + "epoch": 0.7252913441864602, + "grad_norm": 0.0976901650428772, + "learning_rate": 0.00017829592933575944, + "loss": 2.5985, + "step": 24459 + }, + { + "epoch": 0.7253209975387718, + "grad_norm": 0.09346932172775269, + "learning_rate": 0.00017825991215213917, + "loss": 2.5489, + "step": 24460 + }, + { + "epoch": 0.7253506508910832, + "grad_norm": 0.09434553980827332, + "learning_rate": 0.00017822389781758287, + "loss": 2.6025, + "step": 24461 + }, + { + "epoch": 0.7253803042433947, + "grad_norm": 0.09551830589771271, + "learning_rate": 0.00017818788633240906, + "loss": 2.5878, + "step": 24462 + }, + { + "epoch": 0.7254099575957061, + "grad_norm": 0.09684531390666962, + "learning_rate": 0.00017815187769693676, + "loss": 2.6275, + "step": 24463 + }, + { + "epoch": 0.7254396109480177, + "grad_norm": 0.09559644013643265, + "learning_rate": 0.00017811587191148476, + "loss": 2.5834, + "step": 24464 + }, + { + "epoch": 0.7254692643003291, + "grad_norm": 0.10175430029630661, + "learning_rate": 0.00017807986897637202, + "loss": 2.6298, + "step": 24465 + }, + { + "epoch": 0.7254989176526406, + "grad_norm": 0.09553055465221405, + "learning_rate": 0.00017804386889191725, + "loss": 2.5827, + "step": 24466 + }, + { + "epoch": 0.7255285710049522, + "grad_norm": 0.10178521275520325, + "learning_rate": 0.00017800787165843935, + "loss": 2.6193, + "step": 24467 + }, + { + "epoch": 0.7255582243572636, + "grad_norm": 0.09371235966682434, + "learning_rate": 0.00017797187727625698, + "loss": 2.6198, + "step": 24468 + }, + { + "epoch": 0.7255878777095751, + "grad_norm": 0.10473645478487015, + "learning_rate": 0.00017793588574568892, + "loss": 2.6138, + "step": 24469 + }, + { + "epoch": 0.7256175310618865, + "grad_norm": 0.10014871507883072, + "learning_rate": 0.00017789989706705389, + "loss": 2.6088, + "step": 24470 + }, + { + "epoch": 0.7256471844141981, + "grad_norm": 0.09701766073703766, + "learning_rate": 0.00017786391124067054, + "loss": 2.5873, + "step": 24471 + }, + { + "epoch": 0.7256768377665095, + "grad_norm": 0.09869259595870972, + "learning_rate": 0.00017782792826685756, + "loss": 2.5597, + "step": 24472 + }, + { + "epoch": 0.725706491118821, + "grad_norm": 0.09040941298007965, + "learning_rate": 0.00017779194814593352, + "loss": 2.5969, + "step": 24473 + }, + { + "epoch": 0.7257361444711324, + "grad_norm": 0.10061939805746078, + "learning_rate": 0.00017775597087821716, + "loss": 2.6251, + "step": 24474 + }, + { + "epoch": 0.725765797823444, + "grad_norm": 0.0932934358716011, + "learning_rate": 0.00017771999646402697, + "loss": 2.6351, + "step": 24475 + }, + { + "epoch": 0.7257954511757554, + "grad_norm": 0.09226945787668228, + "learning_rate": 0.00017768402490368152, + "loss": 2.6282, + "step": 24476 + }, + { + "epoch": 0.7258251045280669, + "grad_norm": 0.1064029112458229, + "learning_rate": 0.00017764805619749935, + "loss": 2.5764, + "step": 24477 + }, + { + "epoch": 0.7258547578803783, + "grad_norm": 0.09588227421045303, + "learning_rate": 0.00017761209034579902, + "loss": 2.6008, + "step": 24478 + }, + { + "epoch": 0.7258844112326899, + "grad_norm": 0.10232865065336227, + "learning_rate": 0.00017757612734889888, + "loss": 2.596, + "step": 24479 + }, + { + "epoch": 0.7259140645850013, + "grad_norm": 0.10761300474405289, + "learning_rate": 0.00017754016720711752, + "loss": 2.6201, + "step": 24480 + }, + { + "epoch": 0.7259437179373128, + "grad_norm": 0.10034393519163132, + "learning_rate": 0.00017750420992077332, + "loss": 2.6069, + "step": 24481 + }, + { + "epoch": 0.7259733712896242, + "grad_norm": 0.1078014224767685, + "learning_rate": 0.00017746825549018474, + "loss": 2.6466, + "step": 24482 + }, + { + "epoch": 0.7260030246419358, + "grad_norm": 0.10128441452980042, + "learning_rate": 0.00017743230391567005, + "loss": 2.5591, + "step": 24483 + }, + { + "epoch": 0.7260326779942472, + "grad_norm": 0.09787493944168091, + "learning_rate": 0.00017739635519754777, + "loss": 2.6055, + "step": 24484 + }, + { + "epoch": 0.7260623313465587, + "grad_norm": 0.09805212914943695, + "learning_rate": 0.00017736040933613622, + "loss": 2.5872, + "step": 24485 + }, + { + "epoch": 0.7260919846988702, + "grad_norm": 0.09040717035531998, + "learning_rate": 0.00017732446633175332, + "loss": 2.6158, + "step": 24486 + }, + { + "epoch": 0.7261216380511817, + "grad_norm": 0.11607994139194489, + "learning_rate": 0.00017728852618471787, + "loss": 2.6217, + "step": 24487 + }, + { + "epoch": 0.7261512914034932, + "grad_norm": 0.09422361850738525, + "learning_rate": 0.00017725258889534785, + "loss": 2.6218, + "step": 24488 + }, + { + "epoch": 0.7261809447558046, + "grad_norm": 0.10354015976190567, + "learning_rate": 0.00017721665446396157, + "loss": 2.5941, + "step": 24489 + }, + { + "epoch": 0.7262105981081162, + "grad_norm": 0.10336793959140778, + "learning_rate": 0.0001771807228908772, + "loss": 2.6075, + "step": 24490 + }, + { + "epoch": 0.7262402514604276, + "grad_norm": 0.10213246196508408, + "learning_rate": 0.00017714479417641298, + "loss": 2.6007, + "step": 24491 + }, + { + "epoch": 0.7262699048127391, + "grad_norm": 0.10558052361011505, + "learning_rate": 0.00017710886832088702, + "loss": 2.5849, + "step": 24492 + }, + { + "epoch": 0.7262995581650505, + "grad_norm": 0.09852142632007599, + "learning_rate": 0.00017707294532461743, + "loss": 2.6154, + "step": 24493 + }, + { + "epoch": 0.7263292115173621, + "grad_norm": 0.11259579658508301, + "learning_rate": 0.00017703702518792236, + "loss": 2.5866, + "step": 24494 + }, + { + "epoch": 0.7263588648696735, + "grad_norm": 0.0925372838973999, + "learning_rate": 0.00017700110791111985, + "loss": 2.6305, + "step": 24495 + }, + { + "epoch": 0.726388518221985, + "grad_norm": 0.11235357075929642, + "learning_rate": 0.00017696519349452816, + "loss": 2.564, + "step": 24496 + }, + { + "epoch": 0.7264181715742964, + "grad_norm": 0.10136300325393677, + "learning_rate": 0.0001769292819384649, + "loss": 2.5792, + "step": 24497 + }, + { + "epoch": 0.726447824926608, + "grad_norm": 0.11274371296167374, + "learning_rate": 0.0001768933732432484, + "loss": 2.5986, + "step": 24498 + }, + { + "epoch": 0.7264774782789194, + "grad_norm": 0.10179143399000168, + "learning_rate": 0.00017685746740919633, + "loss": 2.5994, + "step": 24499 + }, + { + "epoch": 0.7265071316312309, + "grad_norm": 0.10702623426914215, + "learning_rate": 0.00017682156443662702, + "loss": 2.5607, + "step": 24500 + }, + { + "epoch": 0.7265367849835423, + "grad_norm": 0.11238163709640503, + "learning_rate": 0.00017678566432585818, + "loss": 2.6207, + "step": 24501 + }, + { + "epoch": 0.7265664383358539, + "grad_norm": 0.11508238315582275, + "learning_rate": 0.00017674976707720785, + "loss": 2.5867, + "step": 24502 + }, + { + "epoch": 0.7265960916881653, + "grad_norm": 0.1197316125035286, + "learning_rate": 0.00017671387269099377, + "loss": 2.6444, + "step": 24503 + }, + { + "epoch": 0.7266257450404768, + "grad_norm": 0.1061507984995842, + "learning_rate": 0.00017667798116753386, + "loss": 2.5485, + "step": 24504 + }, + { + "epoch": 0.7266553983927883, + "grad_norm": 0.10883306711912155, + "learning_rate": 0.00017664209250714593, + "loss": 2.6221, + "step": 24505 + }, + { + "epoch": 0.7266850517450998, + "grad_norm": 0.09799879789352417, + "learning_rate": 0.00017660620671014788, + "loss": 2.6052, + "step": 24506 + }, + { + "epoch": 0.7267147050974113, + "grad_norm": 0.10888003557920456, + "learning_rate": 0.00017657032377685727, + "loss": 2.6019, + "step": 24507 + }, + { + "epoch": 0.7267443584497227, + "grad_norm": 0.0952850803732872, + "learning_rate": 0.0001765344437075919, + "loss": 2.5855, + "step": 24508 + }, + { + "epoch": 0.7267740118020343, + "grad_norm": 0.10483866184949875, + "learning_rate": 0.0001764985665026696, + "loss": 2.59, + "step": 24509 + }, + { + "epoch": 0.7268036651543457, + "grad_norm": 0.08871877938508987, + "learning_rate": 0.00017646269216240802, + "loss": 2.616, + "step": 24510 + }, + { + "epoch": 0.7268333185066572, + "grad_norm": 0.10903408378362656, + "learning_rate": 0.00017642682068712485, + "loss": 2.5705, + "step": 24511 + }, + { + "epoch": 0.7268629718589686, + "grad_norm": 0.10015712678432465, + "learning_rate": 0.00017639095207713752, + "loss": 2.5859, + "step": 24512 + }, + { + "epoch": 0.7268926252112802, + "grad_norm": 0.10633029043674469, + "learning_rate": 0.00017635508633276405, + "loss": 2.6291, + "step": 24513 + }, + { + "epoch": 0.7269222785635916, + "grad_norm": 0.09483388066291809, + "learning_rate": 0.00017631922345432184, + "loss": 2.5701, + "step": 24514 + }, + { + "epoch": 0.7269519319159031, + "grad_norm": 0.10830976068973541, + "learning_rate": 0.00017628336344212848, + "loss": 2.6074, + "step": 24515 + }, + { + "epoch": 0.7269815852682145, + "grad_norm": 0.08734194934368134, + "learning_rate": 0.0001762475062965015, + "loss": 2.5946, + "step": 24516 + }, + { + "epoch": 0.7270112386205261, + "grad_norm": 0.09490444511175156, + "learning_rate": 0.0001762116520177585, + "loss": 2.5913, + "step": 24517 + }, + { + "epoch": 0.7270408919728375, + "grad_norm": 0.0951165035367012, + "learning_rate": 0.00017617580060621686, + "loss": 2.6254, + "step": 24518 + }, + { + "epoch": 0.727070545325149, + "grad_norm": 0.1018420085310936, + "learning_rate": 0.00017613995206219402, + "loss": 2.5819, + "step": 24519 + }, + { + "epoch": 0.7271001986774605, + "grad_norm": 0.09682879596948624, + "learning_rate": 0.00017610410638600749, + "loss": 2.6381, + "step": 24520 + }, + { + "epoch": 0.727129852029772, + "grad_norm": 0.08984175324440002, + "learning_rate": 0.00017606826357797472, + "loss": 2.5681, + "step": 24521 + }, + { + "epoch": 0.7271595053820834, + "grad_norm": 0.09693745523691177, + "learning_rate": 0.0001760324236384131, + "loss": 2.5982, + "step": 24522 + }, + { + "epoch": 0.7271891587343949, + "grad_norm": 0.10601179301738739, + "learning_rate": 0.00017599658656763996, + "loss": 2.5664, + "step": 24523 + }, + { + "epoch": 0.7272188120867064, + "grad_norm": 0.09416723251342773, + "learning_rate": 0.0001759607523659726, + "loss": 2.5989, + "step": 24524 + }, + { + "epoch": 0.7272484654390179, + "grad_norm": 0.11040829122066498, + "learning_rate": 0.0001759249210337283, + "loss": 2.5749, + "step": 24525 + }, + { + "epoch": 0.7272781187913293, + "grad_norm": 0.0955849140882492, + "learning_rate": 0.0001758890925712246, + "loss": 2.6364, + "step": 24526 + }, + { + "epoch": 0.7273077721436408, + "grad_norm": 0.10252886265516281, + "learning_rate": 0.0001758532669787788, + "loss": 2.6044, + "step": 24527 + }, + { + "epoch": 0.7273374254959524, + "grad_norm": 0.09030380845069885, + "learning_rate": 0.00017581744425670777, + "loss": 2.5856, + "step": 24528 + }, + { + "epoch": 0.7273670788482638, + "grad_norm": 0.10177980363368988, + "learning_rate": 0.00017578162440532892, + "loss": 2.6159, + "step": 24529 + }, + { + "epoch": 0.7273967322005753, + "grad_norm": 0.10405462235212326, + "learning_rate": 0.00017574580742495943, + "loss": 2.6071, + "step": 24530 + }, + { + "epoch": 0.7274263855528867, + "grad_norm": 0.09324218332767487, + "learning_rate": 0.00017570999331591647, + "loss": 2.6165, + "step": 24531 + }, + { + "epoch": 0.7274560389051983, + "grad_norm": 0.10230676084756851, + "learning_rate": 0.0001756741820785172, + "loss": 2.6334, + "step": 24532 + }, + { + "epoch": 0.7274856922575097, + "grad_norm": 0.11565793305635452, + "learning_rate": 0.00017563837371307873, + "loss": 2.5925, + "step": 24533 + }, + { + "epoch": 0.7275153456098212, + "grad_norm": 0.10255371034145355, + "learning_rate": 0.0001756025682199181, + "loss": 2.5956, + "step": 24534 + }, + { + "epoch": 0.7275449989621326, + "grad_norm": 0.09257709980010986, + "learning_rate": 0.0001755667655993524, + "loss": 2.6382, + "step": 24535 + }, + { + "epoch": 0.7275746523144442, + "grad_norm": 0.09913036972284317, + "learning_rate": 0.00017553096585169874, + "loss": 2.6042, + "step": 24536 + }, + { + "epoch": 0.7276043056667556, + "grad_norm": 0.10174436867237091, + "learning_rate": 0.00017549516897727403, + "loss": 2.6238, + "step": 24537 + }, + { + "epoch": 0.7276339590190671, + "grad_norm": 0.09481301158666611, + "learning_rate": 0.0001754593749763953, + "loss": 2.61, + "step": 24538 + }, + { + "epoch": 0.7276636123713786, + "grad_norm": 0.10382696241140366, + "learning_rate": 0.0001754235838493795, + "loss": 2.5801, + "step": 24539 + }, + { + "epoch": 0.7276932657236901, + "grad_norm": 0.10092637687921524, + "learning_rate": 0.0001753877955965436, + "loss": 2.5887, + "step": 24540 + }, + { + "epoch": 0.7277229190760015, + "grad_norm": 0.11630378663539886, + "learning_rate": 0.00017535201021820452, + "loss": 2.6033, + "step": 24541 + }, + { + "epoch": 0.727752572428313, + "grad_norm": 0.09940234571695328, + "learning_rate": 0.00017531622771467908, + "loss": 2.633, + "step": 24542 + }, + { + "epoch": 0.7277822257806245, + "grad_norm": 0.10136493295431137, + "learning_rate": 0.00017528044808628418, + "loss": 2.574, + "step": 24543 + }, + { + "epoch": 0.727811879132936, + "grad_norm": 0.10558163374662399, + "learning_rate": 0.00017524467133333665, + "loss": 2.6093, + "step": 24544 + }, + { + "epoch": 0.7278415324852474, + "grad_norm": 0.09689989686012268, + "learning_rate": 0.0001752088974561533, + "loss": 2.5943, + "step": 24545 + }, + { + "epoch": 0.7278711858375589, + "grad_norm": 0.1096758246421814, + "learning_rate": 0.0001751731264550509, + "loss": 2.6053, + "step": 24546 + }, + { + "epoch": 0.7279008391898704, + "grad_norm": 0.10962241142988205, + "learning_rate": 0.00017513735833034623, + "loss": 2.5675, + "step": 24547 + }, + { + "epoch": 0.7279304925421819, + "grad_norm": 0.09366852790117264, + "learning_rate": 0.000175101593082356, + "loss": 2.5898, + "step": 24548 + }, + { + "epoch": 0.7279601458944934, + "grad_norm": 0.11069923639297485, + "learning_rate": 0.000175065830711397, + "loss": 2.5916, + "step": 24549 + }, + { + "epoch": 0.7279897992468048, + "grad_norm": 0.10285104066133499, + "learning_rate": 0.00017503007121778575, + "loss": 2.6186, + "step": 24550 + }, + { + "epoch": 0.7280194525991164, + "grad_norm": 0.09697301685810089, + "learning_rate": 0.000174994314601839, + "loss": 2.5845, + "step": 24551 + }, + { + "epoch": 0.7280491059514278, + "grad_norm": 0.11260323971509933, + "learning_rate": 0.00017495856086387345, + "loss": 2.5749, + "step": 24552 + }, + { + "epoch": 0.7280787593037393, + "grad_norm": 0.10643663257360458, + "learning_rate": 0.00017492281000420563, + "loss": 2.6012, + "step": 24553 + }, + { + "epoch": 0.7281084126560508, + "grad_norm": 0.09568719565868378, + "learning_rate": 0.00017488706202315208, + "loss": 2.6097, + "step": 24554 + }, + { + "epoch": 0.7281380660083623, + "grad_norm": 0.1020304262638092, + "learning_rate": 0.00017485131692102941, + "loss": 2.6038, + "step": 24555 + }, + { + "epoch": 0.7281677193606737, + "grad_norm": 0.10206884890794754, + "learning_rate": 0.00017481557469815412, + "loss": 2.5735, + "step": 24556 + }, + { + "epoch": 0.7281973727129852, + "grad_norm": 0.10998865216970444, + "learning_rate": 0.00017477983535484282, + "loss": 2.6261, + "step": 24557 + }, + { + "epoch": 0.7282270260652967, + "grad_norm": 0.09704963117837906, + "learning_rate": 0.0001747440988914118, + "loss": 2.5696, + "step": 24558 + }, + { + "epoch": 0.7282566794176082, + "grad_norm": 0.10009491443634033, + "learning_rate": 0.00017470836530817768, + "loss": 2.6173, + "step": 24559 + }, + { + "epoch": 0.7282863327699196, + "grad_norm": 0.09731942415237427, + "learning_rate": 0.0001746726346054568, + "loss": 2.6217, + "step": 24560 + }, + { + "epoch": 0.7283159861222311, + "grad_norm": 0.09098923951387405, + "learning_rate": 0.00017463690678356576, + "loss": 2.6274, + "step": 24561 + }, + { + "epoch": 0.7283456394745426, + "grad_norm": 0.1082134023308754, + "learning_rate": 0.0001746011818428206, + "loss": 2.6225, + "step": 24562 + }, + { + "epoch": 0.7283752928268541, + "grad_norm": 0.0929543673992157, + "learning_rate": 0.00017456545978353777, + "loss": 2.6176, + "step": 24563 + }, + { + "epoch": 0.7284049461791655, + "grad_norm": 0.10634011775255203, + "learning_rate": 0.00017452974060603354, + "loss": 2.5848, + "step": 24564 + }, + { + "epoch": 0.728434599531477, + "grad_norm": 0.09115312993526459, + "learning_rate": 0.00017449402431062445, + "loss": 2.6107, + "step": 24565 + }, + { + "epoch": 0.7284642528837885, + "grad_norm": 0.11188440769910812, + "learning_rate": 0.0001744583108976267, + "loss": 2.5839, + "step": 24566 + }, + { + "epoch": 0.7284939062361, + "grad_norm": 0.0929386168718338, + "learning_rate": 0.00017442260036735647, + "loss": 2.6179, + "step": 24567 + }, + { + "epoch": 0.7285235595884114, + "grad_norm": 0.11322837322950363, + "learning_rate": 0.00017438689272012998, + "loss": 2.5915, + "step": 24568 + }, + { + "epoch": 0.728553212940723, + "grad_norm": 0.09641791135072708, + "learning_rate": 0.00017435118795626343, + "loss": 2.6077, + "step": 24569 + }, + { + "epoch": 0.7285828662930345, + "grad_norm": 0.10108121484518051, + "learning_rate": 0.00017431548607607306, + "loss": 2.6361, + "step": 24570 + }, + { + "epoch": 0.7286125196453459, + "grad_norm": 0.09631387144327164, + "learning_rate": 0.0001742797870798749, + "loss": 2.6255, + "step": 24571 + }, + { + "epoch": 0.7286421729976574, + "grad_norm": 0.10583219677209854, + "learning_rate": 0.00017424409096798534, + "loss": 2.6125, + "step": 24572 + }, + { + "epoch": 0.7286718263499689, + "grad_norm": 0.09106221795082092, + "learning_rate": 0.0001742083977407201, + "loss": 2.6065, + "step": 24573 + }, + { + "epoch": 0.7287014797022804, + "grad_norm": 0.10923467576503754, + "learning_rate": 0.00017417270739839542, + "loss": 2.562, + "step": 24574 + }, + { + "epoch": 0.7287311330545918, + "grad_norm": 0.09705956280231476, + "learning_rate": 0.00017413701994132736, + "loss": 2.6043, + "step": 24575 + }, + { + "epoch": 0.7287607864069033, + "grad_norm": 0.10548383742570877, + "learning_rate": 0.00017410133536983191, + "loss": 2.6053, + "step": 24576 + }, + { + "epoch": 0.7287904397592148, + "grad_norm": 0.10080111026763916, + "learning_rate": 0.00017406565368422488, + "loss": 2.5926, + "step": 24577 + }, + { + "epoch": 0.7288200931115263, + "grad_norm": 0.10146187990903854, + "learning_rate": 0.00017402997488482263, + "loss": 2.6037, + "step": 24578 + }, + { + "epoch": 0.7288497464638377, + "grad_norm": 0.10059348493814468, + "learning_rate": 0.00017399429897194091, + "loss": 2.5894, + "step": 24579 + }, + { + "epoch": 0.7288793998161492, + "grad_norm": 0.11009608954191208, + "learning_rate": 0.00017395862594589556, + "loss": 2.5703, + "step": 24580 + }, + { + "epoch": 0.7289090531684607, + "grad_norm": 0.09545352309942245, + "learning_rate": 0.00017392295580700263, + "loss": 2.6017, + "step": 24581 + }, + { + "epoch": 0.7289387065207722, + "grad_norm": 0.10590122640132904, + "learning_rate": 0.00017388728855557802, + "loss": 2.6041, + "step": 24582 + }, + { + "epoch": 0.7289683598730836, + "grad_norm": 0.09052985161542892, + "learning_rate": 0.00017385162419193727, + "loss": 2.5912, + "step": 24583 + }, + { + "epoch": 0.7289980132253951, + "grad_norm": 0.09866136312484741, + "learning_rate": 0.00017381596271639645, + "loss": 2.6002, + "step": 24584 + }, + { + "epoch": 0.7290276665777066, + "grad_norm": 0.10258805751800537, + "learning_rate": 0.0001737803041292712, + "loss": 2.5768, + "step": 24585 + }, + { + "epoch": 0.7290573199300181, + "grad_norm": 0.09131210297346115, + "learning_rate": 0.00017374464843087733, + "loss": 2.6086, + "step": 24586 + }, + { + "epoch": 0.7290869732823295, + "grad_norm": 0.1058654636144638, + "learning_rate": 0.00017370899562153065, + "loss": 2.6178, + "step": 24587 + }, + { + "epoch": 0.729116626634641, + "grad_norm": 0.09637648612260818, + "learning_rate": 0.0001736733457015468, + "loss": 2.6091, + "step": 24588 + }, + { + "epoch": 0.7291462799869525, + "grad_norm": 0.1013491302728653, + "learning_rate": 0.00017363769867124147, + "loss": 2.6259, + "step": 24589 + }, + { + "epoch": 0.729175933339264, + "grad_norm": 0.0972745418548584, + "learning_rate": 0.0001736020545309302, + "loss": 2.5789, + "step": 24590 + }, + { + "epoch": 0.7292055866915755, + "grad_norm": 0.09639737010002136, + "learning_rate": 0.00017356641328092893, + "loss": 2.5722, + "step": 24591 + }, + { + "epoch": 0.729235240043887, + "grad_norm": 0.0997677892446518, + "learning_rate": 0.00017353077492155306, + "loss": 2.5798, + "step": 24592 + }, + { + "epoch": 0.7292648933961985, + "grad_norm": 0.10842575877904892, + "learning_rate": 0.00017349513945311845, + "loss": 2.615, + "step": 24593 + }, + { + "epoch": 0.7292945467485099, + "grad_norm": 0.09906283766031265, + "learning_rate": 0.0001734595068759402, + "loss": 2.6081, + "step": 24594 + }, + { + "epoch": 0.7293242001008214, + "grad_norm": 0.10536528378725052, + "learning_rate": 0.0001734238771903341, + "loss": 2.6463, + "step": 24595 + }, + { + "epoch": 0.7293538534531329, + "grad_norm": 0.10704638063907623, + "learning_rate": 0.0001733882503966156, + "loss": 2.5745, + "step": 24596 + }, + { + "epoch": 0.7293835068054444, + "grad_norm": 0.10481065511703491, + "learning_rate": 0.0001733526264951002, + "loss": 2.6104, + "step": 24597 + }, + { + "epoch": 0.7294131601577558, + "grad_norm": 0.09470026195049286, + "learning_rate": 0.00017331700548610341, + "loss": 2.5947, + "step": 24598 + }, + { + "epoch": 0.7294428135100673, + "grad_norm": 0.10215716809034348, + "learning_rate": 0.00017328138736994058, + "loss": 2.6015, + "step": 24599 + }, + { + "epoch": 0.7294724668623788, + "grad_norm": 0.10086297243833542, + "learning_rate": 0.0001732457721469271, + "loss": 2.5986, + "step": 24600 + }, + { + "epoch": 0.7295021202146903, + "grad_norm": 0.11742596328258514, + "learning_rate": 0.00017321015981737846, + "loss": 2.5735, + "step": 24601 + }, + { + "epoch": 0.7295317735670017, + "grad_norm": 0.10213736444711685, + "learning_rate": 0.0001731745503816098, + "loss": 2.6655, + "step": 24602 + }, + { + "epoch": 0.7295614269193132, + "grad_norm": 0.12244513630867004, + "learning_rate": 0.00017313894383993682, + "loss": 2.5491, + "step": 24603 + }, + { + "epoch": 0.7295910802716247, + "grad_norm": 0.09305867552757263, + "learning_rate": 0.00017310334019267454, + "loss": 2.5844, + "step": 24604 + }, + { + "epoch": 0.7296207336239362, + "grad_norm": 0.11346817761659622, + "learning_rate": 0.00017306773944013827, + "loss": 2.6352, + "step": 24605 + }, + { + "epoch": 0.7296503869762476, + "grad_norm": 0.09551909565925598, + "learning_rate": 0.00017303214158264325, + "loss": 2.6188, + "step": 24606 + }, + { + "epoch": 0.7296800403285592, + "grad_norm": 0.09582938253879547, + "learning_rate": 0.00017299654662050474, + "loss": 2.5771, + "step": 24607 + }, + { + "epoch": 0.7297096936808706, + "grad_norm": 0.0952054113149643, + "learning_rate": 0.00017296095455403794, + "loss": 2.6127, + "step": 24608 + }, + { + "epoch": 0.7297393470331821, + "grad_norm": 0.09061739593744278, + "learning_rate": 0.0001729253653835581, + "loss": 2.6174, + "step": 24609 + }, + { + "epoch": 0.7297690003854935, + "grad_norm": 0.10121764987707138, + "learning_rate": 0.0001728897791093802, + "loss": 2.604, + "step": 24610 + }, + { + "epoch": 0.7297986537378051, + "grad_norm": 0.09251508116722107, + "learning_rate": 0.0001728541957318195, + "loss": 2.6403, + "step": 24611 + }, + { + "epoch": 0.7298283070901166, + "grad_norm": 0.09849358350038528, + "learning_rate": 0.00017281861525119096, + "loss": 2.5879, + "step": 24612 + }, + { + "epoch": 0.729857960442428, + "grad_norm": 0.10734626650810242, + "learning_rate": 0.00017278303766780985, + "loss": 2.603, + "step": 24613 + }, + { + "epoch": 0.7298876137947395, + "grad_norm": 0.0882638618350029, + "learning_rate": 0.00017274746298199107, + "loss": 2.5912, + "step": 24614 + }, + { + "epoch": 0.729917267147051, + "grad_norm": 0.11085180193185806, + "learning_rate": 0.00017271189119404966, + "loss": 2.5987, + "step": 24615 + }, + { + "epoch": 0.7299469204993625, + "grad_norm": 0.10123831778764725, + "learning_rate": 0.0001726763223043007, + "loss": 2.5988, + "step": 24616 + }, + { + "epoch": 0.7299765738516739, + "grad_norm": 0.10979872941970825, + "learning_rate": 0.00017264075631305903, + "loss": 2.5999, + "step": 24617 + }, + { + "epoch": 0.7300062272039854, + "grad_norm": 0.10251936316490173, + "learning_rate": 0.00017260519322063968, + "loss": 2.6192, + "step": 24618 + }, + { + "epoch": 0.7300358805562969, + "grad_norm": 0.10111060738563538, + "learning_rate": 0.0001725696330273575, + "loss": 2.5804, + "step": 24619 + }, + { + "epoch": 0.7300655339086084, + "grad_norm": 0.09383048862218857, + "learning_rate": 0.00017253407573352743, + "loss": 2.5922, + "step": 24620 + }, + { + "epoch": 0.7300951872609198, + "grad_norm": 0.09504973143339157, + "learning_rate": 0.00017249852133946437, + "loss": 2.6182, + "step": 24621 + }, + { + "epoch": 0.7301248406132314, + "grad_norm": 0.11048698425292969, + "learning_rate": 0.0001724629698454831, + "loss": 2.5974, + "step": 24622 + }, + { + "epoch": 0.7301544939655428, + "grad_norm": 0.0973716527223587, + "learning_rate": 0.00017242742125189842, + "loss": 2.5698, + "step": 24623 + }, + { + "epoch": 0.7301841473178543, + "grad_norm": 0.09651952236890793, + "learning_rate": 0.0001723918755590252, + "loss": 2.5927, + "step": 24624 + }, + { + "epoch": 0.7302138006701657, + "grad_norm": 0.09795106947422028, + "learning_rate": 0.0001723563327671781, + "loss": 2.5818, + "step": 24625 + }, + { + "epoch": 0.7302434540224773, + "grad_norm": 0.1035551130771637, + "learning_rate": 0.00017232079287667195, + "loss": 2.6157, + "step": 24626 + }, + { + "epoch": 0.7302731073747887, + "grad_norm": 0.09932549297809601, + "learning_rate": 0.00017228525588782161, + "loss": 2.5983, + "step": 24627 + }, + { + "epoch": 0.7303027607271002, + "grad_norm": 0.10111091285943985, + "learning_rate": 0.00017224972180094123, + "loss": 2.6141, + "step": 24628 + }, + { + "epoch": 0.7303324140794116, + "grad_norm": 0.10909845679998398, + "learning_rate": 0.00017221419061634596, + "loss": 2.6213, + "step": 24629 + }, + { + "epoch": 0.7303620674317232, + "grad_norm": 0.10010547190904617, + "learning_rate": 0.0001721786623343503, + "loss": 2.5894, + "step": 24630 + }, + { + "epoch": 0.7303917207840346, + "grad_norm": 0.10365443676710129, + "learning_rate": 0.00017214313695526888, + "loss": 2.6, + "step": 24631 + }, + { + "epoch": 0.7304213741363461, + "grad_norm": 0.09940272569656372, + "learning_rate": 0.00017210761447941625, + "loss": 2.6176, + "step": 24632 + }, + { + "epoch": 0.7304510274886576, + "grad_norm": 0.10022563487291336, + "learning_rate": 0.00017207209490710702, + "loss": 2.5764, + "step": 24633 + }, + { + "epoch": 0.7304806808409691, + "grad_norm": 0.10205462574958801, + "learning_rate": 0.00017203657823865558, + "loss": 2.5929, + "step": 24634 + }, + { + "epoch": 0.7305103341932806, + "grad_norm": 0.09827837347984314, + "learning_rate": 0.00017200106447437662, + "loss": 2.5833, + "step": 24635 + }, + { + "epoch": 0.730539987545592, + "grad_norm": 0.10414908826351166, + "learning_rate": 0.00017196555361458448, + "loss": 2.614, + "step": 24636 + }, + { + "epoch": 0.7305696408979035, + "grad_norm": 0.10397481918334961, + "learning_rate": 0.0001719300456595939, + "loss": 2.6241, + "step": 24637 + }, + { + "epoch": 0.730599294250215, + "grad_norm": 0.10390476882457733, + "learning_rate": 0.00017189454060971887, + "loss": 2.5792, + "step": 24638 + }, + { + "epoch": 0.7306289476025265, + "grad_norm": 0.11468515545129776, + "learning_rate": 0.000171859038465274, + "loss": 2.6005, + "step": 24639 + }, + { + "epoch": 0.7306586009548379, + "grad_norm": 0.10258589684963226, + "learning_rate": 0.00017182353922657367, + "loss": 2.589, + "step": 24640 + }, + { + "epoch": 0.7306882543071495, + "grad_norm": 0.09771901369094849, + "learning_rate": 0.00017178804289393206, + "loss": 2.6285, + "step": 24641 + }, + { + "epoch": 0.7307179076594609, + "grad_norm": 0.09800925105810165, + "learning_rate": 0.00017175254946766384, + "loss": 2.5978, + "step": 24642 + }, + { + "epoch": 0.7307475610117724, + "grad_norm": 0.10053852200508118, + "learning_rate": 0.00017171705894808315, + "loss": 2.6045, + "step": 24643 + }, + { + "epoch": 0.7307772143640838, + "grad_norm": 0.09610328078269958, + "learning_rate": 0.00017168157133550421, + "loss": 2.6283, + "step": 24644 + }, + { + "epoch": 0.7308068677163954, + "grad_norm": 0.10589111596345901, + "learning_rate": 0.00017164608663024133, + "loss": 2.5992, + "step": 24645 + }, + { + "epoch": 0.7308365210687068, + "grad_norm": 0.09478311240673065, + "learning_rate": 0.00017161060483260877, + "loss": 2.6007, + "step": 24646 + }, + { + "epoch": 0.7308661744210183, + "grad_norm": 0.10060635954141617, + "learning_rate": 0.00017157512594292063, + "loss": 2.6009, + "step": 24647 + }, + { + "epoch": 0.7308958277733297, + "grad_norm": 0.09909297525882721, + "learning_rate": 0.00017153964996149124, + "loss": 2.5856, + "step": 24648 + }, + { + "epoch": 0.7309254811256413, + "grad_norm": 0.09803591668605804, + "learning_rate": 0.00017150417688863452, + "loss": 2.6351, + "step": 24649 + }, + { + "epoch": 0.7309551344779527, + "grad_norm": 0.09919236600399017, + "learning_rate": 0.0001714687067246647, + "loss": 2.6094, + "step": 24650 + }, + { + "epoch": 0.7309847878302642, + "grad_norm": 0.09584126621484756, + "learning_rate": 0.00017143323946989587, + "loss": 2.62, + "step": 24651 + }, + { + "epoch": 0.7310144411825756, + "grad_norm": 0.09899647533893585, + "learning_rate": 0.00017139777512464204, + "loss": 2.5874, + "step": 24652 + }, + { + "epoch": 0.7310440945348872, + "grad_norm": 0.11014842987060547, + "learning_rate": 0.0001713623136892174, + "loss": 2.6065, + "step": 24653 + }, + { + "epoch": 0.7310737478871987, + "grad_norm": 0.10682693123817444, + "learning_rate": 0.0001713268551639357, + "loss": 2.6038, + "step": 24654 + }, + { + "epoch": 0.7311034012395101, + "grad_norm": 0.094106525182724, + "learning_rate": 0.00017129139954911128, + "loss": 2.6107, + "step": 24655 + }, + { + "epoch": 0.7311330545918217, + "grad_norm": 0.09939561784267426, + "learning_rate": 0.0001712559468450579, + "loss": 2.6354, + "step": 24656 + }, + { + "epoch": 0.7311627079441331, + "grad_norm": 0.09982295334339142, + "learning_rate": 0.00017122049705208954, + "loss": 2.619, + "step": 24657 + }, + { + "epoch": 0.7311923612964446, + "grad_norm": 0.10217691957950592, + "learning_rate": 0.00017118505017052032, + "loss": 2.6059, + "step": 24658 + }, + { + "epoch": 0.731222014648756, + "grad_norm": 0.09023305773735046, + "learning_rate": 0.00017114960620066372, + "loss": 2.5975, + "step": 24659 + }, + { + "epoch": 0.7312516680010676, + "grad_norm": 0.10037004947662354, + "learning_rate": 0.00017111416514283383, + "loss": 2.6147, + "step": 24660 + }, + { + "epoch": 0.731281321353379, + "grad_norm": 0.08873192220926285, + "learning_rate": 0.00017107872699734445, + "loss": 2.6044, + "step": 24661 + }, + { + "epoch": 0.7313109747056905, + "grad_norm": 0.10397592931985855, + "learning_rate": 0.0001710432917645094, + "loss": 2.6427, + "step": 24662 + }, + { + "epoch": 0.7313406280580019, + "grad_norm": 0.09903231263160706, + "learning_rate": 0.00017100785944464247, + "loss": 2.6074, + "step": 24663 + }, + { + "epoch": 0.7313702814103135, + "grad_norm": 0.10204093903303146, + "learning_rate": 0.0001709724300380574, + "loss": 2.6315, + "step": 24664 + }, + { + "epoch": 0.7313999347626249, + "grad_norm": 0.09746991097927094, + "learning_rate": 0.00017093700354506796, + "loss": 2.5997, + "step": 24665 + }, + { + "epoch": 0.7314295881149364, + "grad_norm": 0.09939491003751755, + "learning_rate": 0.00017090157996598783, + "loss": 2.6083, + "step": 24666 + }, + { + "epoch": 0.7314592414672478, + "grad_norm": 0.09280597418546677, + "learning_rate": 0.00017086615930113054, + "loss": 2.5972, + "step": 24667 + }, + { + "epoch": 0.7314888948195594, + "grad_norm": 0.08970250189304352, + "learning_rate": 0.00017083074155081003, + "loss": 2.5978, + "step": 24668 + }, + { + "epoch": 0.7315185481718708, + "grad_norm": 0.09381972253322601, + "learning_rate": 0.00017079532671534004, + "loss": 2.5875, + "step": 24669 + }, + { + "epoch": 0.7315482015241823, + "grad_norm": 0.09552506357431412, + "learning_rate": 0.00017075991479503373, + "loss": 2.5984, + "step": 24670 + }, + { + "epoch": 0.7315778548764937, + "grad_norm": 0.10198148339986801, + "learning_rate": 0.00017072450579020489, + "loss": 2.6463, + "step": 24671 + }, + { + "epoch": 0.7316075082288053, + "grad_norm": 0.09932068735361099, + "learning_rate": 0.00017068909970116702, + "loss": 2.6037, + "step": 24672 + }, + { + "epoch": 0.7316371615811167, + "grad_norm": 0.10180310159921646, + "learning_rate": 0.00017065369652823377, + "loss": 2.5893, + "step": 24673 + }, + { + "epoch": 0.7316668149334282, + "grad_norm": 0.09320838004350662, + "learning_rate": 0.00017061829627171854, + "loss": 2.5803, + "step": 24674 + }, + { + "epoch": 0.7316964682857398, + "grad_norm": 0.10421103239059448, + "learning_rate": 0.00017058289893193484, + "loss": 2.5959, + "step": 24675 + }, + { + "epoch": 0.7317261216380512, + "grad_norm": 0.10002512484788895, + "learning_rate": 0.00017054750450919614, + "loss": 2.6448, + "step": 24676 + }, + { + "epoch": 0.7317557749903627, + "grad_norm": 0.10480018705129623, + "learning_rate": 0.0001705121130038158, + "loss": 2.6276, + "step": 24677 + }, + { + "epoch": 0.7317854283426741, + "grad_norm": 0.10687306523323059, + "learning_rate": 0.00017047672441610729, + "loss": 2.6573, + "step": 24678 + }, + { + "epoch": 0.7318150816949857, + "grad_norm": 0.11033441871404648, + "learning_rate": 0.0001704413387463839, + "loss": 2.5985, + "step": 24679 + }, + { + "epoch": 0.7318447350472971, + "grad_norm": 0.11117173731327057, + "learning_rate": 0.00017040595599495905, + "loss": 2.6082, + "step": 24680 + }, + { + "epoch": 0.7318743883996086, + "grad_norm": 0.10509227216243744, + "learning_rate": 0.00017037057616214608, + "loss": 2.6266, + "step": 24681 + }, + { + "epoch": 0.73190404175192, + "grad_norm": 0.11112568527460098, + "learning_rate": 0.0001703351992482582, + "loss": 2.5844, + "step": 24682 + }, + { + "epoch": 0.7319336951042316, + "grad_norm": 0.10183538496494293, + "learning_rate": 0.00017029982525360864, + "loss": 2.5676, + "step": 24683 + }, + { + "epoch": 0.731963348456543, + "grad_norm": 0.10591519623994827, + "learning_rate": 0.00017026445417851082, + "loss": 2.629, + "step": 24684 + }, + { + "epoch": 0.7319930018088545, + "grad_norm": 0.10007002204656601, + "learning_rate": 0.0001702290860232778, + "loss": 2.6231, + "step": 24685 + }, + { + "epoch": 0.7320226551611659, + "grad_norm": 0.10550642758607864, + "learning_rate": 0.00017019372078822288, + "loss": 2.6207, + "step": 24686 + }, + { + "epoch": 0.7320523085134775, + "grad_norm": 0.10774758458137512, + "learning_rate": 0.00017015835847365913, + "loss": 2.6174, + "step": 24687 + }, + { + "epoch": 0.7320819618657889, + "grad_norm": 0.10629530996084213, + "learning_rate": 0.00017012299907989977, + "loss": 2.6203, + "step": 24688 + }, + { + "epoch": 0.7321116152181004, + "grad_norm": 0.11900345981121063, + "learning_rate": 0.00017008764260725785, + "loss": 2.5867, + "step": 24689 + }, + { + "epoch": 0.7321412685704118, + "grad_norm": 0.10235253721475601, + "learning_rate": 0.00017005228905604648, + "loss": 2.6199, + "step": 24690 + }, + { + "epoch": 0.7321709219227234, + "grad_norm": 0.10212965309619904, + "learning_rate": 0.00017001693842657873, + "loss": 2.5953, + "step": 24691 + }, + { + "epoch": 0.7322005752750348, + "grad_norm": 0.10361117124557495, + "learning_rate": 0.0001699815907191678, + "loss": 2.592, + "step": 24692 + }, + { + "epoch": 0.7322302286273463, + "grad_norm": 0.09494023770093918, + "learning_rate": 0.00016994624593412623, + "loss": 2.6035, + "step": 24693 + }, + { + "epoch": 0.7322598819796577, + "grad_norm": 0.10634443908929825, + "learning_rate": 0.0001699109040717674, + "loss": 2.6056, + "step": 24694 + }, + { + "epoch": 0.7322895353319693, + "grad_norm": 0.10114531964063644, + "learning_rate": 0.00016987556513240422, + "loss": 2.6281, + "step": 24695 + }, + { + "epoch": 0.7323191886842808, + "grad_norm": 0.09281200170516968, + "learning_rate": 0.00016984022911634955, + "loss": 2.5919, + "step": 24696 + }, + { + "epoch": 0.7323488420365922, + "grad_norm": 0.09448248893022537, + "learning_rate": 0.00016980489602391635, + "loss": 2.6193, + "step": 24697 + }, + { + "epoch": 0.7323784953889038, + "grad_norm": 0.09155023843050003, + "learning_rate": 0.00016976956585541746, + "loss": 2.6204, + "step": 24698 + }, + { + "epoch": 0.7324081487412152, + "grad_norm": 0.10303740203380585, + "learning_rate": 0.00016973423861116576, + "loss": 2.5856, + "step": 24699 + }, + { + "epoch": 0.7324378020935267, + "grad_norm": 0.09205310046672821, + "learning_rate": 0.00016969891429147405, + "loss": 2.5781, + "step": 24700 + }, + { + "epoch": 0.7324674554458381, + "grad_norm": 0.10177739709615707, + "learning_rate": 0.00016966359289665518, + "loss": 2.5836, + "step": 24701 + }, + { + "epoch": 0.7324971087981497, + "grad_norm": 0.09165225178003311, + "learning_rate": 0.0001696282744270219, + "loss": 2.5758, + "step": 24702 + }, + { + "epoch": 0.7325267621504611, + "grad_norm": 0.1144130527973175, + "learning_rate": 0.00016959295888288706, + "loss": 2.6138, + "step": 24703 + }, + { + "epoch": 0.7325564155027726, + "grad_norm": 0.0965549498796463, + "learning_rate": 0.00016955764626456316, + "loss": 2.6358, + "step": 24704 + }, + { + "epoch": 0.732586068855084, + "grad_norm": 0.10255277901887894, + "learning_rate": 0.00016952233657236305, + "loss": 2.6137, + "step": 24705 + }, + { + "epoch": 0.7326157222073956, + "grad_norm": 0.10075245797634125, + "learning_rate": 0.00016948702980659912, + "loss": 2.6164, + "step": 24706 + }, + { + "epoch": 0.732645375559707, + "grad_norm": 0.09958535432815552, + "learning_rate": 0.0001694517259675845, + "loss": 2.6066, + "step": 24707 + }, + { + "epoch": 0.7326750289120185, + "grad_norm": 0.09880216419696808, + "learning_rate": 0.0001694164250556316, + "loss": 2.5998, + "step": 24708 + }, + { + "epoch": 0.7327046822643299, + "grad_norm": 0.09234286844730377, + "learning_rate": 0.00016938112707105298, + "loss": 2.6041, + "step": 24709 + }, + { + "epoch": 0.7327343356166415, + "grad_norm": 0.09327922016382217, + "learning_rate": 0.00016934583201416126, + "loss": 2.5883, + "step": 24710 + }, + { + "epoch": 0.7327639889689529, + "grad_norm": 0.09557045996189117, + "learning_rate": 0.0001693105398852689, + "loss": 2.6208, + "step": 24711 + }, + { + "epoch": 0.7327936423212644, + "grad_norm": 0.09106545150279999, + "learning_rate": 0.00016927525068468846, + "loss": 2.5903, + "step": 24712 + }, + { + "epoch": 0.7328232956735758, + "grad_norm": 0.08989416807889938, + "learning_rate": 0.00016923996441273265, + "loss": 2.634, + "step": 24713 + }, + { + "epoch": 0.7328529490258874, + "grad_norm": 0.09647230058908463, + "learning_rate": 0.0001692046810697136, + "loss": 2.601, + "step": 24714 + }, + { + "epoch": 0.7328826023781989, + "grad_norm": 0.09167872369289398, + "learning_rate": 0.00016916940065594383, + "loss": 2.5884, + "step": 24715 + }, + { + "epoch": 0.7329122557305103, + "grad_norm": 0.09715325385332108, + "learning_rate": 0.00016913412317173582, + "loss": 2.5947, + "step": 24716 + }, + { + "epoch": 0.7329419090828219, + "grad_norm": 0.10214217752218246, + "learning_rate": 0.00016909884861740192, + "loss": 2.5496, + "step": 24717 + }, + { + "epoch": 0.7329715624351333, + "grad_norm": 0.09367150068283081, + "learning_rate": 0.00016906357699325453, + "loss": 2.6372, + "step": 24718 + }, + { + "epoch": 0.7330012157874448, + "grad_norm": 0.10186415165662766, + "learning_rate": 0.0001690283082996058, + "loss": 2.5969, + "step": 24719 + }, + { + "epoch": 0.7330308691397562, + "grad_norm": 0.10058518499135971, + "learning_rate": 0.00016899304253676834, + "loss": 2.5948, + "step": 24720 + }, + { + "epoch": 0.7330605224920678, + "grad_norm": 0.09781071543693542, + "learning_rate": 0.0001689577797050544, + "loss": 2.5651, + "step": 24721 + }, + { + "epoch": 0.7330901758443792, + "grad_norm": 0.08780821412801743, + "learning_rate": 0.00016892251980477602, + "loss": 2.5843, + "step": 24722 + }, + { + "epoch": 0.7331198291966907, + "grad_norm": 0.1055026650428772, + "learning_rate": 0.00016888726283624566, + "loss": 2.6072, + "step": 24723 + }, + { + "epoch": 0.7331494825490021, + "grad_norm": 0.10692623257637024, + "learning_rate": 0.0001688520087997755, + "loss": 2.5885, + "step": 24724 + }, + { + "epoch": 0.7331791359013137, + "grad_norm": 0.09111069142818451, + "learning_rate": 0.00016881675769567757, + "loss": 2.5602, + "step": 24725 + }, + { + "epoch": 0.7332087892536251, + "grad_norm": 0.10435773432254791, + "learning_rate": 0.00016878150952426402, + "loss": 2.6286, + "step": 24726 + }, + { + "epoch": 0.7332384426059366, + "grad_norm": 0.10059804469347, + "learning_rate": 0.0001687462642858471, + "loss": 2.6172, + "step": 24727 + }, + { + "epoch": 0.733268095958248, + "grad_norm": 0.10640887171030045, + "learning_rate": 0.00016871102198073883, + "loss": 2.6093, + "step": 24728 + }, + { + "epoch": 0.7332977493105596, + "grad_norm": 0.0993768721818924, + "learning_rate": 0.00016867578260925138, + "loss": 2.5815, + "step": 24729 + }, + { + "epoch": 0.733327402662871, + "grad_norm": 0.10070333629846573, + "learning_rate": 0.00016864054617169672, + "loss": 2.6095, + "step": 24730 + }, + { + "epoch": 0.7333570560151825, + "grad_norm": 0.09565501660108566, + "learning_rate": 0.00016860531266838693, + "loss": 2.6273, + "step": 24731 + }, + { + "epoch": 0.733386709367494, + "grad_norm": 0.1052032932639122, + "learning_rate": 0.00016857008209963375, + "loss": 2.5899, + "step": 24732 + }, + { + "epoch": 0.7334163627198055, + "grad_norm": 0.1079738661646843, + "learning_rate": 0.00016853485446574962, + "loss": 2.5731, + "step": 24733 + }, + { + "epoch": 0.7334460160721169, + "grad_norm": 0.09601528197526932, + "learning_rate": 0.00016849962976704636, + "loss": 2.6247, + "step": 24734 + }, + { + "epoch": 0.7334756694244284, + "grad_norm": 0.10248351842164993, + "learning_rate": 0.0001684644080038356, + "loss": 2.5661, + "step": 24735 + }, + { + "epoch": 0.73350532277674, + "grad_norm": 0.09921514987945557, + "learning_rate": 0.00016842918917642945, + "loss": 2.5949, + "step": 24736 + }, + { + "epoch": 0.7335349761290514, + "grad_norm": 0.09562189877033234, + "learning_rate": 0.0001683939732851398, + "loss": 2.6054, + "step": 24737 + }, + { + "epoch": 0.7335646294813629, + "grad_norm": 0.09471621364355087, + "learning_rate": 0.00016835876033027836, + "loss": 2.5925, + "step": 24738 + }, + { + "epoch": 0.7335942828336743, + "grad_norm": 0.10074007511138916, + "learning_rate": 0.00016832355031215702, + "loss": 2.5908, + "step": 24739 + }, + { + "epoch": 0.7336239361859859, + "grad_norm": 0.0925135463476181, + "learning_rate": 0.0001682883432310876, + "loss": 2.5868, + "step": 24740 + }, + { + "epoch": 0.7336535895382973, + "grad_norm": 0.10763352364301682, + "learning_rate": 0.00016825313908738182, + "loss": 2.6344, + "step": 24741 + }, + { + "epoch": 0.7336832428906088, + "grad_norm": 0.10415516793727875, + "learning_rate": 0.00016821793788135143, + "loss": 2.606, + "step": 24742 + }, + { + "epoch": 0.7337128962429202, + "grad_norm": 0.10885931551456451, + "learning_rate": 0.00016818273961330816, + "loss": 2.5741, + "step": 24743 + }, + { + "epoch": 0.7337425495952318, + "grad_norm": 0.09198271483182907, + "learning_rate": 0.00016814754428356372, + "loss": 2.5729, + "step": 24744 + }, + { + "epoch": 0.7337722029475432, + "grad_norm": 0.09813029319047928, + "learning_rate": 0.00016811235189242968, + "loss": 2.5818, + "step": 24745 + }, + { + "epoch": 0.7338018562998547, + "grad_norm": 0.1026005893945694, + "learning_rate": 0.00016807716244021775, + "loss": 2.586, + "step": 24746 + }, + { + "epoch": 0.7338315096521661, + "grad_norm": 0.09524068236351013, + "learning_rate": 0.0001680419759272396, + "loss": 2.591, + "step": 24747 + }, + { + "epoch": 0.7338611630044777, + "grad_norm": 0.10565607994794846, + "learning_rate": 0.00016800679235380662, + "loss": 2.5915, + "step": 24748 + }, + { + "epoch": 0.7338908163567891, + "grad_norm": 0.10494440793991089, + "learning_rate": 0.0001679716117202305, + "loss": 2.5871, + "step": 24749 + }, + { + "epoch": 0.7339204697091006, + "grad_norm": 0.09679187834262848, + "learning_rate": 0.0001679364340268228, + "loss": 2.5895, + "step": 24750 + }, + { + "epoch": 0.733950123061412, + "grad_norm": 0.10924082249403, + "learning_rate": 0.00016790125927389495, + "loss": 2.5986, + "step": 24751 + }, + { + "epoch": 0.7339797764137236, + "grad_norm": 0.10252982378005981, + "learning_rate": 0.00016786608746175845, + "loss": 2.5685, + "step": 24752 + }, + { + "epoch": 0.734009429766035, + "grad_norm": 0.10530427098274231, + "learning_rate": 0.00016783091859072475, + "loss": 2.6044, + "step": 24753 + }, + { + "epoch": 0.7340390831183465, + "grad_norm": 0.10619303584098816, + "learning_rate": 0.0001677957526611053, + "loss": 2.5809, + "step": 24754 + }, + { + "epoch": 0.734068736470658, + "grad_norm": 0.09467735886573792, + "learning_rate": 0.00016776058967321144, + "loss": 2.5961, + "step": 24755 + }, + { + "epoch": 0.7340983898229695, + "grad_norm": 0.10455280542373657, + "learning_rate": 0.00016772542962735466, + "loss": 2.559, + "step": 24756 + }, + { + "epoch": 0.734128043175281, + "grad_norm": 0.10461879521608353, + "learning_rate": 0.00016769027252384617, + "loss": 2.6319, + "step": 24757 + }, + { + "epoch": 0.7341576965275924, + "grad_norm": 0.10719498991966248, + "learning_rate": 0.00016765511836299742, + "loss": 2.6003, + "step": 24758 + }, + { + "epoch": 0.734187349879904, + "grad_norm": 0.09168305993080139, + "learning_rate": 0.00016761996714511956, + "loss": 2.5852, + "step": 24759 + }, + { + "epoch": 0.7342170032322154, + "grad_norm": 0.10207495838403702, + "learning_rate": 0.00016758481887052401, + "loss": 2.6305, + "step": 24760 + }, + { + "epoch": 0.7342466565845269, + "grad_norm": 0.09106580168008804, + "learning_rate": 0.0001675496735395219, + "loss": 2.5915, + "step": 24761 + }, + { + "epoch": 0.7342763099368383, + "grad_norm": 0.10199364274740219, + "learning_rate": 0.00016751453115242454, + "loss": 2.642, + "step": 24762 + }, + { + "epoch": 0.7343059632891499, + "grad_norm": 0.09241905063390732, + "learning_rate": 0.000167479391709543, + "loss": 2.6269, + "step": 24763 + }, + { + "epoch": 0.7343356166414613, + "grad_norm": 0.09624113142490387, + "learning_rate": 0.0001674442552111886, + "loss": 2.6084, + "step": 24764 + }, + { + "epoch": 0.7343652699937728, + "grad_norm": 0.10596083104610443, + "learning_rate": 0.00016740912165767242, + "loss": 2.623, + "step": 24765 + }, + { + "epoch": 0.7343949233460842, + "grad_norm": 0.09755022823810577, + "learning_rate": 0.0001673739910493055, + "loss": 2.6195, + "step": 24766 + }, + { + "epoch": 0.7344245766983958, + "grad_norm": 0.1000400260090828, + "learning_rate": 0.00016733886338639897, + "loss": 2.6287, + "step": 24767 + }, + { + "epoch": 0.7344542300507072, + "grad_norm": 0.09785234183073044, + "learning_rate": 0.00016730373866926412, + "loss": 2.5755, + "step": 24768 + }, + { + "epoch": 0.7344838834030187, + "grad_norm": 0.10688633471727371, + "learning_rate": 0.0001672686168982116, + "loss": 2.6141, + "step": 24769 + }, + { + "epoch": 0.7345135367553302, + "grad_norm": 0.10821395367383957, + "learning_rate": 0.0001672334980735526, + "loss": 2.5865, + "step": 24770 + }, + { + "epoch": 0.7345431901076417, + "grad_norm": 0.09429295361042023, + "learning_rate": 0.00016719838219559786, + "loss": 2.5707, + "step": 24771 + }, + { + "epoch": 0.7345728434599531, + "grad_norm": 0.09654537588357925, + "learning_rate": 0.0001671632692646588, + "loss": 2.6177, + "step": 24772 + }, + { + "epoch": 0.7346024968122646, + "grad_norm": 0.10263872146606445, + "learning_rate": 0.00016712815928104607, + "loss": 2.6487, + "step": 24773 + }, + { + "epoch": 0.7346321501645761, + "grad_norm": 0.09636469930410385, + "learning_rate": 0.0001670930522450707, + "loss": 2.5812, + "step": 24774 + }, + { + "epoch": 0.7346618035168876, + "grad_norm": 0.10761509835720062, + "learning_rate": 0.00016705794815704346, + "loss": 2.611, + "step": 24775 + }, + { + "epoch": 0.734691456869199, + "grad_norm": 0.09913304448127747, + "learning_rate": 0.0001670228470172752, + "loss": 2.5979, + "step": 24776 + }, + { + "epoch": 0.7347211102215105, + "grad_norm": 0.1030144989490509, + "learning_rate": 0.0001669877488260768, + "loss": 2.6145, + "step": 24777 + }, + { + "epoch": 0.7347507635738221, + "grad_norm": 0.10220405459403992, + "learning_rate": 0.0001669526535837591, + "loss": 2.6163, + "step": 24778 + }, + { + "epoch": 0.7347804169261335, + "grad_norm": 0.09760262817144394, + "learning_rate": 0.00016691756129063296, + "loss": 2.622, + "step": 24779 + }, + { + "epoch": 0.734810070278445, + "grad_norm": 0.09526531398296356, + "learning_rate": 0.0001668824719470088, + "loss": 2.6192, + "step": 24780 + }, + { + "epoch": 0.7348397236307564, + "grad_norm": 0.10456806421279907, + "learning_rate": 0.00016684738555319752, + "loss": 2.5847, + "step": 24781 + }, + { + "epoch": 0.734869376983068, + "grad_norm": 0.09949792921543121, + "learning_rate": 0.00016681230210950981, + "loss": 2.5996, + "step": 24782 + }, + { + "epoch": 0.7348990303353794, + "grad_norm": 0.11852752417325974, + "learning_rate": 0.00016677722161625625, + "loss": 2.604, + "step": 24783 + }, + { + "epoch": 0.7349286836876909, + "grad_norm": 0.09430484473705292, + "learning_rate": 0.00016674214407374772, + "loss": 2.6105, + "step": 24784 + }, + { + "epoch": 0.7349583370400024, + "grad_norm": 0.1083538830280304, + "learning_rate": 0.00016670706948229474, + "loss": 2.6028, + "step": 24785 + }, + { + "epoch": 0.7349879903923139, + "grad_norm": 0.09467224776744843, + "learning_rate": 0.00016667199784220783, + "loss": 2.6231, + "step": 24786 + }, + { + "epoch": 0.7350176437446253, + "grad_norm": 0.10065668821334839, + "learning_rate": 0.00016663692915379764, + "loss": 2.6002, + "step": 24787 + }, + { + "epoch": 0.7350472970969368, + "grad_norm": 0.10300663113594055, + "learning_rate": 0.0001666018634173746, + "loss": 2.6104, + "step": 24788 + }, + { + "epoch": 0.7350769504492483, + "grad_norm": 0.10633477568626404, + "learning_rate": 0.00016656680063324947, + "loss": 2.5959, + "step": 24789 + }, + { + "epoch": 0.7351066038015598, + "grad_norm": 0.09631706774234772, + "learning_rate": 0.00016653174080173238, + "loss": 2.6124, + "step": 24790 + }, + { + "epoch": 0.7351362571538712, + "grad_norm": 0.09905405342578888, + "learning_rate": 0.000166496683923134, + "loss": 2.5881, + "step": 24791 + }, + { + "epoch": 0.7351659105061827, + "grad_norm": 0.0981544777750969, + "learning_rate": 0.00016646162999776478, + "loss": 2.609, + "step": 24792 + }, + { + "epoch": 0.7351955638584942, + "grad_norm": 0.09637168049812317, + "learning_rate": 0.00016642657902593496, + "loss": 2.5858, + "step": 24793 + }, + { + "epoch": 0.7352252172108057, + "grad_norm": 0.09922053664922714, + "learning_rate": 0.0001663915310079551, + "loss": 2.6109, + "step": 24794 + }, + { + "epoch": 0.7352548705631171, + "grad_norm": 0.10880504548549652, + "learning_rate": 0.00016635648594413548, + "loss": 2.5949, + "step": 24795 + }, + { + "epoch": 0.7352845239154286, + "grad_norm": 0.10100492089986801, + "learning_rate": 0.00016632144383478632, + "loss": 2.6138, + "step": 24796 + }, + { + "epoch": 0.7353141772677401, + "grad_norm": 0.09516335278749466, + "learning_rate": 0.00016628640468021816, + "loss": 2.5973, + "step": 24797 + }, + { + "epoch": 0.7353438306200516, + "grad_norm": 0.102542944252491, + "learning_rate": 0.0001662513684807412, + "loss": 2.612, + "step": 24798 + }, + { + "epoch": 0.7353734839723631, + "grad_norm": 0.09751221537590027, + "learning_rate": 0.00016621633523666564, + "loss": 2.6043, + "step": 24799 + }, + { + "epoch": 0.7354031373246745, + "grad_norm": 0.0981590747833252, + "learning_rate": 0.0001661813049483019, + "loss": 2.6156, + "step": 24800 + }, + { + "epoch": 0.7354327906769861, + "grad_norm": 0.09841442853212357, + "learning_rate": 0.00016614627761595986, + "loss": 2.6221, + "step": 24801 + }, + { + "epoch": 0.7354624440292975, + "grad_norm": 0.10948622971773148, + "learning_rate": 0.0001661112532399498, + "loss": 2.6196, + "step": 24802 + }, + { + "epoch": 0.735492097381609, + "grad_norm": 0.09477681666612625, + "learning_rate": 0.0001660762318205819, + "loss": 2.6019, + "step": 24803 + }, + { + "epoch": 0.7355217507339205, + "grad_norm": 0.10131384432315826, + "learning_rate": 0.00016604121335816636, + "loss": 2.5783, + "step": 24804 + }, + { + "epoch": 0.735551404086232, + "grad_norm": 0.09997236728668213, + "learning_rate": 0.00016600619785301312, + "loss": 2.5871, + "step": 24805 + }, + { + "epoch": 0.7355810574385434, + "grad_norm": 0.1031009703874588, + "learning_rate": 0.00016597118530543238, + "loss": 2.6292, + "step": 24806 + }, + { + "epoch": 0.7356107107908549, + "grad_norm": 0.103024423122406, + "learning_rate": 0.00016593617571573406, + "loss": 2.5833, + "step": 24807 + }, + { + "epoch": 0.7356403641431664, + "grad_norm": 0.10346704721450806, + "learning_rate": 0.00016590116908422832, + "loss": 2.6168, + "step": 24808 + }, + { + "epoch": 0.7356700174954779, + "grad_norm": 0.09549148380756378, + "learning_rate": 0.00016586616541122484, + "loss": 2.5895, + "step": 24809 + }, + { + "epoch": 0.7356996708477893, + "grad_norm": 0.09620842337608337, + "learning_rate": 0.0001658311646970342, + "loss": 2.5895, + "step": 24810 + }, + { + "epoch": 0.7357293242001008, + "grad_norm": 0.10332469642162323, + "learning_rate": 0.00016579616694196574, + "loss": 2.5893, + "step": 24811 + }, + { + "epoch": 0.7357589775524123, + "grad_norm": 0.09646700322628021, + "learning_rate": 0.00016576117214632964, + "loss": 2.6395, + "step": 24812 + }, + { + "epoch": 0.7357886309047238, + "grad_norm": 0.09979858994483948, + "learning_rate": 0.00016572618031043574, + "loss": 2.5955, + "step": 24813 + }, + { + "epoch": 0.7358182842570352, + "grad_norm": 0.111024409532547, + "learning_rate": 0.00016569119143459388, + "loss": 2.5811, + "step": 24814 + }, + { + "epoch": 0.7358479376093467, + "grad_norm": 0.10485979914665222, + "learning_rate": 0.00016565620551911385, + "loss": 2.6092, + "step": 24815 + }, + { + "epoch": 0.7358775909616582, + "grad_norm": 0.09368210285902023, + "learning_rate": 0.00016562122256430557, + "loss": 2.602, + "step": 24816 + }, + { + "epoch": 0.7359072443139697, + "grad_norm": 0.10237415134906769, + "learning_rate": 0.00016558624257047871, + "loss": 2.5501, + "step": 24817 + }, + { + "epoch": 0.7359368976662811, + "grad_norm": 0.10358203947544098, + "learning_rate": 0.00016555126553794314, + "loss": 2.5808, + "step": 24818 + }, + { + "epoch": 0.7359665510185927, + "grad_norm": 0.10253152996301651, + "learning_rate": 0.00016551629146700848, + "loss": 2.6178, + "step": 24819 + }, + { + "epoch": 0.7359962043709042, + "grad_norm": 0.09673295170068741, + "learning_rate": 0.00016548132035798445, + "loss": 2.5861, + "step": 24820 + }, + { + "epoch": 0.7360258577232156, + "grad_norm": 0.1100500151515007, + "learning_rate": 0.00016544635221118077, + "loss": 2.6051, + "step": 24821 + }, + { + "epoch": 0.7360555110755271, + "grad_norm": 0.09236160665750504, + "learning_rate": 0.00016541138702690707, + "loss": 2.591, + "step": 24822 + }, + { + "epoch": 0.7360851644278386, + "grad_norm": 0.09254305064678192, + "learning_rate": 0.00016537642480547298, + "loss": 2.6061, + "step": 24823 + }, + { + "epoch": 0.7361148177801501, + "grad_norm": 0.09528575837612152, + "learning_rate": 0.0001653414655471881, + "loss": 2.5999, + "step": 24824 + }, + { + "epoch": 0.7361444711324615, + "grad_norm": 0.09563209116458893, + "learning_rate": 0.00016530650925236195, + "loss": 2.5971, + "step": 24825 + }, + { + "epoch": 0.736174124484773, + "grad_norm": 0.09340905398130417, + "learning_rate": 0.00016527155592130412, + "loss": 2.5828, + "step": 24826 + }, + { + "epoch": 0.7362037778370845, + "grad_norm": 0.10902687907218933, + "learning_rate": 0.00016523660555432413, + "loss": 2.5896, + "step": 24827 + }, + { + "epoch": 0.736233431189396, + "grad_norm": 0.08743040263652802, + "learning_rate": 0.00016520165815173143, + "loss": 2.6024, + "step": 24828 + }, + { + "epoch": 0.7362630845417074, + "grad_norm": 0.11000880599021912, + "learning_rate": 0.00016516671371383552, + "loss": 2.5971, + "step": 24829 + }, + { + "epoch": 0.736292737894019, + "grad_norm": 0.09104909002780914, + "learning_rate": 0.00016513177224094583, + "loss": 2.6095, + "step": 24830 + }, + { + "epoch": 0.7363223912463304, + "grad_norm": 0.09643898159265518, + "learning_rate": 0.0001650968337333718, + "loss": 2.5819, + "step": 24831 + }, + { + "epoch": 0.7363520445986419, + "grad_norm": 0.09472738206386566, + "learning_rate": 0.0001650618981914228, + "loss": 2.5965, + "step": 24832 + }, + { + "epoch": 0.7363816979509533, + "grad_norm": 0.10554249584674835, + "learning_rate": 0.00016502696561540814, + "loss": 2.6081, + "step": 24833 + }, + { + "epoch": 0.7364113513032648, + "grad_norm": 0.1065455749630928, + "learning_rate": 0.0001649920360056374, + "loss": 2.5877, + "step": 24834 + }, + { + "epoch": 0.7364410046555763, + "grad_norm": 0.11094958335161209, + "learning_rate": 0.00016495710936241938, + "loss": 2.5947, + "step": 24835 + }, + { + "epoch": 0.7364706580078878, + "grad_norm": 0.09642555564641953, + "learning_rate": 0.00016492218568606377, + "loss": 2.6186, + "step": 24836 + }, + { + "epoch": 0.7365003113601992, + "grad_norm": 0.09935589879751205, + "learning_rate": 0.00016488726497687972, + "loss": 2.6109, + "step": 24837 + }, + { + "epoch": 0.7365299647125108, + "grad_norm": 0.10592536628246307, + "learning_rate": 0.0001648523472351765, + "loss": 2.6085, + "step": 24838 + }, + { + "epoch": 0.7365596180648222, + "grad_norm": 0.09177607297897339, + "learning_rate": 0.00016481743246126324, + "loss": 2.6166, + "step": 24839 + }, + { + "epoch": 0.7365892714171337, + "grad_norm": 0.11548636853694916, + "learning_rate": 0.00016478252065544918, + "loss": 2.577, + "step": 24840 + }, + { + "epoch": 0.7366189247694452, + "grad_norm": 0.09180721640586853, + "learning_rate": 0.00016474761181804344, + "loss": 2.6113, + "step": 24841 + }, + { + "epoch": 0.7366485781217567, + "grad_norm": 0.10472463071346283, + "learning_rate": 0.00016471270594935512, + "loss": 2.6021, + "step": 24842 + }, + { + "epoch": 0.7366782314740682, + "grad_norm": 0.09897184371948242, + "learning_rate": 0.00016467780304969338, + "loss": 2.6158, + "step": 24843 + }, + { + "epoch": 0.7367078848263796, + "grad_norm": 0.08534671366214752, + "learning_rate": 0.0001646429031193672, + "loss": 2.6125, + "step": 24844 + }, + { + "epoch": 0.7367375381786911, + "grad_norm": 0.10484156757593155, + "learning_rate": 0.00016460800615868587, + "loss": 2.6188, + "step": 24845 + }, + { + "epoch": 0.7367671915310026, + "grad_norm": 0.10565579682588577, + "learning_rate": 0.000164573112167958, + "loss": 2.586, + "step": 24846 + }, + { + "epoch": 0.7367968448833141, + "grad_norm": 0.09215737134218216, + "learning_rate": 0.00016453822114749283, + "loss": 2.5791, + "step": 24847 + }, + { + "epoch": 0.7368264982356255, + "grad_norm": 0.09588003158569336, + "learning_rate": 0.00016450333309759918, + "loss": 2.6101, + "step": 24848 + }, + { + "epoch": 0.736856151587937, + "grad_norm": 0.09758903086185455, + "learning_rate": 0.00016446844801858623, + "loss": 2.6422, + "step": 24849 + }, + { + "epoch": 0.7368858049402485, + "grad_norm": 0.09076331555843353, + "learning_rate": 0.00016443356591076274, + "loss": 2.6102, + "step": 24850 + }, + { + "epoch": 0.73691545829256, + "grad_norm": 0.09291369467973709, + "learning_rate": 0.00016439868677443765, + "loss": 2.5772, + "step": 24851 + }, + { + "epoch": 0.7369451116448714, + "grad_norm": 0.09134644269943237, + "learning_rate": 0.0001643638106099198, + "loss": 2.5842, + "step": 24852 + }, + { + "epoch": 0.736974764997183, + "grad_norm": 0.09463109076023102, + "learning_rate": 0.00016432893741751798, + "loss": 2.5758, + "step": 24853 + }, + { + "epoch": 0.7370044183494944, + "grad_norm": 0.09245338290929794, + "learning_rate": 0.0001642940671975411, + "loss": 2.5972, + "step": 24854 + }, + { + "epoch": 0.7370340717018059, + "grad_norm": 0.09459618479013443, + "learning_rate": 0.00016425919995029798, + "loss": 2.5755, + "step": 24855 + }, + { + "epoch": 0.7370637250541173, + "grad_norm": 0.1046128198504448, + "learning_rate": 0.0001642243356760971, + "loss": 2.6009, + "step": 24856 + }, + { + "epoch": 0.7370933784064289, + "grad_norm": 0.09456895291805267, + "learning_rate": 0.00016418947437524739, + "loss": 2.6268, + "step": 24857 + }, + { + "epoch": 0.7371230317587403, + "grad_norm": 0.09073346853256226, + "learning_rate": 0.00016415461604805752, + "loss": 2.6078, + "step": 24858 + }, + { + "epoch": 0.7371526851110518, + "grad_norm": 0.10172818601131439, + "learning_rate": 0.0001641197606948362, + "loss": 2.6503, + "step": 24859 + }, + { + "epoch": 0.7371823384633632, + "grad_norm": 0.09565118700265884, + "learning_rate": 0.00016408490831589206, + "loss": 2.6427, + "step": 24860 + }, + { + "epoch": 0.7372119918156748, + "grad_norm": 0.09969563037157059, + "learning_rate": 0.0001640500589115335, + "loss": 2.6085, + "step": 24861 + }, + { + "epoch": 0.7372416451679863, + "grad_norm": 0.1064130961894989, + "learning_rate": 0.00016401521248206953, + "loss": 2.6015, + "step": 24862 + }, + { + "epoch": 0.7372712985202977, + "grad_norm": 0.09590321779251099, + "learning_rate": 0.00016398036902780854, + "loss": 2.6093, + "step": 24863 + }, + { + "epoch": 0.7373009518726092, + "grad_norm": 0.10242735594511032, + "learning_rate": 0.00016394552854905904, + "loss": 2.5917, + "step": 24864 + }, + { + "epoch": 0.7373306052249207, + "grad_norm": 0.09256383776664734, + "learning_rate": 0.0001639106910461297, + "loss": 2.5523, + "step": 24865 + }, + { + "epoch": 0.7373602585772322, + "grad_norm": 0.0938350185751915, + "learning_rate": 0.00016387585651932878, + "loss": 2.6296, + "step": 24866 + }, + { + "epoch": 0.7373899119295436, + "grad_norm": 0.09652668982744217, + "learning_rate": 0.0001638410249689648, + "loss": 2.5758, + "step": 24867 + }, + { + "epoch": 0.7374195652818551, + "grad_norm": 0.09652245789766312, + "learning_rate": 0.00016380619639534629, + "loss": 2.6215, + "step": 24868 + }, + { + "epoch": 0.7374492186341666, + "grad_norm": 0.10639980435371399, + "learning_rate": 0.00016377137079878157, + "loss": 2.5936, + "step": 24869 + }, + { + "epoch": 0.7374788719864781, + "grad_norm": 0.09086759388446808, + "learning_rate": 0.00016373654817957906, + "loss": 2.6056, + "step": 24870 + }, + { + "epoch": 0.7375085253387895, + "grad_norm": 0.10019226372241974, + "learning_rate": 0.00016370172853804715, + "loss": 2.6147, + "step": 24871 + }, + { + "epoch": 0.737538178691101, + "grad_norm": 0.08937818557024002, + "learning_rate": 0.00016366691187449418, + "loss": 2.6079, + "step": 24872 + }, + { + "epoch": 0.7375678320434125, + "grad_norm": 0.10455070436000824, + "learning_rate": 0.00016363209818922843, + "loss": 2.5917, + "step": 24873 + }, + { + "epoch": 0.737597485395724, + "grad_norm": 0.09758283942937851, + "learning_rate": 0.00016359728748255802, + "loss": 2.6195, + "step": 24874 + }, + { + "epoch": 0.7376271387480354, + "grad_norm": 0.09090423583984375, + "learning_rate": 0.00016356247975479155, + "loss": 2.6175, + "step": 24875 + }, + { + "epoch": 0.737656792100347, + "grad_norm": 0.10138329118490219, + "learning_rate": 0.00016352767500623722, + "loss": 2.5818, + "step": 24876 + }, + { + "epoch": 0.7376864454526584, + "grad_norm": 0.09576119482517242, + "learning_rate": 0.0001634928732372029, + "loss": 2.597, + "step": 24877 + }, + { + "epoch": 0.7377160988049699, + "grad_norm": 0.09057163447141647, + "learning_rate": 0.00016345807444799698, + "loss": 2.623, + "step": 24878 + }, + { + "epoch": 0.7377457521572813, + "grad_norm": 0.09403111785650253, + "learning_rate": 0.00016342327863892757, + "loss": 2.6038, + "step": 24879 + }, + { + "epoch": 0.7377754055095929, + "grad_norm": 0.09540514647960663, + "learning_rate": 0.0001633884858103028, + "loss": 2.6154, + "step": 24880 + }, + { + "epoch": 0.7378050588619043, + "grad_norm": 0.09362024068832397, + "learning_rate": 0.00016335369596243076, + "loss": 2.5754, + "step": 24881 + }, + { + "epoch": 0.7378347122142158, + "grad_norm": 0.11314421147108078, + "learning_rate": 0.00016331890909561953, + "loss": 2.6377, + "step": 24882 + }, + { + "epoch": 0.7378643655665273, + "grad_norm": 0.10136084258556366, + "learning_rate": 0.00016328412521017716, + "loss": 2.5939, + "step": 24883 + }, + { + "epoch": 0.7378940189188388, + "grad_norm": 0.09668823331594467, + "learning_rate": 0.00016324934430641164, + "loss": 2.5775, + "step": 24884 + }, + { + "epoch": 0.7379236722711503, + "grad_norm": 0.1017700806260109, + "learning_rate": 0.000163214566384631, + "loss": 2.567, + "step": 24885 + }, + { + "epoch": 0.7379533256234617, + "grad_norm": 0.09135463088750839, + "learning_rate": 0.00016317979144514318, + "loss": 2.5929, + "step": 24886 + }, + { + "epoch": 0.7379829789757733, + "grad_norm": 0.0977071151137352, + "learning_rate": 0.0001631450194882561, + "loss": 2.5855, + "step": 24887 + }, + { + "epoch": 0.7380126323280847, + "grad_norm": 0.10564279556274414, + "learning_rate": 0.0001631102505142777, + "loss": 2.6063, + "step": 24888 + }, + { + "epoch": 0.7380422856803962, + "grad_norm": 0.0948372632265091, + "learning_rate": 0.00016307548452351584, + "loss": 2.6233, + "step": 24889 + }, + { + "epoch": 0.7380719390327076, + "grad_norm": 0.11470253765583038, + "learning_rate": 0.0001630407215162784, + "loss": 2.6349, + "step": 24890 + }, + { + "epoch": 0.7381015923850192, + "grad_norm": 0.11176978051662445, + "learning_rate": 0.00016300596149287328, + "loss": 2.5971, + "step": 24891 + }, + { + "epoch": 0.7381312457373306, + "grad_norm": 0.10065045207738876, + "learning_rate": 0.00016297120445360813, + "loss": 2.5779, + "step": 24892 + }, + { + "epoch": 0.7381608990896421, + "grad_norm": 0.1029774621129036, + "learning_rate": 0.00016293645039879085, + "loss": 2.5763, + "step": 24893 + }, + { + "epoch": 0.7381905524419535, + "grad_norm": 0.10415317118167877, + "learning_rate": 0.00016290169932872917, + "loss": 2.602, + "step": 24894 + }, + { + "epoch": 0.7382202057942651, + "grad_norm": 0.09087124466896057, + "learning_rate": 0.0001628669512437308, + "loss": 2.6308, + "step": 24895 + }, + { + "epoch": 0.7382498591465765, + "grad_norm": 0.09897398948669434, + "learning_rate": 0.00016283220614410343, + "loss": 2.6186, + "step": 24896 + }, + { + "epoch": 0.738279512498888, + "grad_norm": 0.09248702228069305, + "learning_rate": 0.00016279746403015478, + "loss": 2.6348, + "step": 24897 + }, + { + "epoch": 0.7383091658511994, + "grad_norm": 0.09394366294145584, + "learning_rate": 0.00016276272490219242, + "loss": 2.6009, + "step": 24898 + }, + { + "epoch": 0.738338819203511, + "grad_norm": 0.10166043788194656, + "learning_rate": 0.00016272798876052402, + "loss": 2.6192, + "step": 24899 + }, + { + "epoch": 0.7383684725558224, + "grad_norm": 0.09412944316864014, + "learning_rate": 0.00016269325560545718, + "loss": 2.5879, + "step": 24900 + }, + { + "epoch": 0.7383981259081339, + "grad_norm": 0.09948890656232834, + "learning_rate": 0.00016265852543729948, + "loss": 2.5798, + "step": 24901 + }, + { + "epoch": 0.7384277792604453, + "grad_norm": 0.10506144165992737, + "learning_rate": 0.00016262379825635843, + "loss": 2.6192, + "step": 24902 + }, + { + "epoch": 0.7384574326127569, + "grad_norm": 0.0968441516160965, + "learning_rate": 0.00016258907406294156, + "loss": 2.6208, + "step": 24903 + }, + { + "epoch": 0.7384870859650684, + "grad_norm": 0.09564191102981567, + "learning_rate": 0.00016255435285735637, + "loss": 2.5733, + "step": 24904 + }, + { + "epoch": 0.7385167393173798, + "grad_norm": 0.09238765388727188, + "learning_rate": 0.00016251963463991025, + "loss": 2.5742, + "step": 24905 + }, + { + "epoch": 0.7385463926696914, + "grad_norm": 0.10709523409605026, + "learning_rate": 0.00016248491941091075, + "loss": 2.5786, + "step": 24906 + }, + { + "epoch": 0.7385760460220028, + "grad_norm": 0.10816512256860733, + "learning_rate": 0.00016245020717066522, + "loss": 2.5928, + "step": 24907 + }, + { + "epoch": 0.7386056993743143, + "grad_norm": 0.11137592792510986, + "learning_rate": 0.00016241549791948102, + "loss": 2.6279, + "step": 24908 + }, + { + "epoch": 0.7386353527266257, + "grad_norm": 0.11103065311908722, + "learning_rate": 0.0001623807916576655, + "loss": 2.6088, + "step": 24909 + }, + { + "epoch": 0.7386650060789373, + "grad_norm": 0.10850365459918976, + "learning_rate": 0.00016234608838552628, + "loss": 2.6226, + "step": 24910 + }, + { + "epoch": 0.7386946594312487, + "grad_norm": 0.11834505945444107, + "learning_rate": 0.00016231138810337016, + "loss": 2.6138, + "step": 24911 + }, + { + "epoch": 0.7387243127835602, + "grad_norm": 0.10671160370111465, + "learning_rate": 0.0001622766908115047, + "loss": 2.5893, + "step": 24912 + }, + { + "epoch": 0.7387539661358716, + "grad_norm": 0.10350020974874496, + "learning_rate": 0.0001622419965102369, + "loss": 2.5913, + "step": 24913 + }, + { + "epoch": 0.7387836194881832, + "grad_norm": 0.0955464318394661, + "learning_rate": 0.00016220730519987442, + "loss": 2.638, + "step": 24914 + }, + { + "epoch": 0.7388132728404946, + "grad_norm": 0.11527597159147263, + "learning_rate": 0.00016217261688072421, + "loss": 2.6229, + "step": 24915 + }, + { + "epoch": 0.7388429261928061, + "grad_norm": 0.10238654166460037, + "learning_rate": 0.00016213793155309348, + "loss": 2.5882, + "step": 24916 + }, + { + "epoch": 0.7388725795451175, + "grad_norm": 0.1107466071844101, + "learning_rate": 0.00016210324921728936, + "loss": 2.5839, + "step": 24917 + }, + { + "epoch": 0.7389022328974291, + "grad_norm": 0.099739670753479, + "learning_rate": 0.00016206856987361896, + "loss": 2.6216, + "step": 24918 + }, + { + "epoch": 0.7389318862497405, + "grad_norm": 0.10971595346927643, + "learning_rate": 0.00016203389352238934, + "loss": 2.583, + "step": 24919 + }, + { + "epoch": 0.738961539602052, + "grad_norm": 0.11959565430879593, + "learning_rate": 0.0001619992201639076, + "loss": 2.6402, + "step": 24920 + }, + { + "epoch": 0.7389911929543634, + "grad_norm": 0.10890167206525803, + "learning_rate": 0.00016196454979848103, + "loss": 2.5823, + "step": 24921 + }, + { + "epoch": 0.739020846306675, + "grad_norm": 0.10097701102495193, + "learning_rate": 0.00016192988242641615, + "loss": 2.6092, + "step": 24922 + }, + { + "epoch": 0.7390504996589865, + "grad_norm": 0.1010248214006424, + "learning_rate": 0.00016189521804802027, + "loss": 2.5898, + "step": 24923 + }, + { + "epoch": 0.7390801530112979, + "grad_norm": 0.10313834995031357, + "learning_rate": 0.00016186055666360022, + "loss": 2.5979, + "step": 24924 + }, + { + "epoch": 0.7391098063636095, + "grad_norm": 0.09257183969020844, + "learning_rate": 0.00016182589827346295, + "loss": 2.6006, + "step": 24925 + }, + { + "epoch": 0.7391394597159209, + "grad_norm": 0.10083898901939392, + "learning_rate": 0.0001617912428779153, + "loss": 2.5967, + "step": 24926 + }, + { + "epoch": 0.7391691130682324, + "grad_norm": 0.10157926380634308, + "learning_rate": 0.00016175659047726437, + "loss": 2.6133, + "step": 24927 + }, + { + "epoch": 0.7391987664205438, + "grad_norm": 0.09434132277965546, + "learning_rate": 0.00016172194107181686, + "loss": 2.6002, + "step": 24928 + }, + { + "epoch": 0.7392284197728554, + "grad_norm": 0.1130734533071518, + "learning_rate": 0.0001616872946618797, + "loss": 2.5806, + "step": 24929 + }, + { + "epoch": 0.7392580731251668, + "grad_norm": 0.09287697821855545, + "learning_rate": 0.00016165265124775958, + "loss": 2.5871, + "step": 24930 + }, + { + "epoch": 0.7392877264774783, + "grad_norm": 0.09991087764501572, + "learning_rate": 0.00016161801082976347, + "loss": 2.5791, + "step": 24931 + }, + { + "epoch": 0.7393173798297897, + "grad_norm": 0.09525524824857712, + "learning_rate": 0.00016158337340819778, + "loss": 2.6055, + "step": 24932 + }, + { + "epoch": 0.7393470331821013, + "grad_norm": 0.09777656197547913, + "learning_rate": 0.00016154873898336942, + "loss": 2.5948, + "step": 24933 + }, + { + "epoch": 0.7393766865344127, + "grad_norm": 0.10145357251167297, + "learning_rate": 0.0001615141075555851, + "loss": 2.5869, + "step": 24934 + }, + { + "epoch": 0.7394063398867242, + "grad_norm": 0.09817913919687271, + "learning_rate": 0.00016147947912515144, + "loss": 2.6002, + "step": 24935 + }, + { + "epoch": 0.7394359932390356, + "grad_norm": 0.10286382585763931, + "learning_rate": 0.00016144485369237505, + "loss": 2.571, + "step": 24936 + }, + { + "epoch": 0.7394656465913472, + "grad_norm": 0.09583757817745209, + "learning_rate": 0.00016141023125756265, + "loss": 2.5927, + "step": 24937 + }, + { + "epoch": 0.7394952999436586, + "grad_norm": 0.10113821178674698, + "learning_rate": 0.00016137561182102077, + "loss": 2.6196, + "step": 24938 + }, + { + "epoch": 0.7395249532959701, + "grad_norm": 0.1045682430267334, + "learning_rate": 0.0001613409953830558, + "loss": 2.6241, + "step": 24939 + }, + { + "epoch": 0.7395546066482815, + "grad_norm": 0.09030642360448837, + "learning_rate": 0.00016130638194397458, + "loss": 2.5723, + "step": 24940 + }, + { + "epoch": 0.7395842600005931, + "grad_norm": 0.10110843926668167, + "learning_rate": 0.00016127177150408368, + "loss": 2.599, + "step": 24941 + }, + { + "epoch": 0.7396139133529045, + "grad_norm": 0.09782278537750244, + "learning_rate": 0.0001612371640636892, + "loss": 2.5801, + "step": 24942 + }, + { + "epoch": 0.739643566705216, + "grad_norm": 0.09548010677099228, + "learning_rate": 0.00016120255962309783, + "loss": 2.6076, + "step": 24943 + }, + { + "epoch": 0.7396732200575276, + "grad_norm": 0.10368813574314117, + "learning_rate": 0.00016116795818261586, + "loss": 2.5883, + "step": 24944 + }, + { + "epoch": 0.739702873409839, + "grad_norm": 0.09256669878959656, + "learning_rate": 0.00016113335974254984, + "loss": 2.5879, + "step": 24945 + }, + { + "epoch": 0.7397325267621505, + "grad_norm": 0.10244810581207275, + "learning_rate": 0.00016109876430320607, + "loss": 2.5948, + "step": 24946 + }, + { + "epoch": 0.7397621801144619, + "grad_norm": 0.09406080842018127, + "learning_rate": 0.0001610641718648909, + "loss": 2.6315, + "step": 24947 + }, + { + "epoch": 0.7397918334667735, + "grad_norm": 0.09812169522047043, + "learning_rate": 0.0001610295824279107, + "loss": 2.6137, + "step": 24948 + }, + { + "epoch": 0.7398214868190849, + "grad_norm": 0.09151647984981537, + "learning_rate": 0.00016099499599257173, + "loss": 2.5817, + "step": 24949 + }, + { + "epoch": 0.7398511401713964, + "grad_norm": 0.10005592554807663, + "learning_rate": 0.00016096041255918026, + "loss": 2.6068, + "step": 24950 + }, + { + "epoch": 0.7398807935237078, + "grad_norm": 0.09470351785421371, + "learning_rate": 0.00016092583212804252, + "loss": 2.5864, + "step": 24951 + }, + { + "epoch": 0.7399104468760194, + "grad_norm": 0.09081991016864777, + "learning_rate": 0.00016089125469946475, + "loss": 2.5517, + "step": 24952 + }, + { + "epoch": 0.7399401002283308, + "grad_norm": 0.09982243180274963, + "learning_rate": 0.00016085668027375312, + "loss": 2.5852, + "step": 24953 + }, + { + "epoch": 0.7399697535806423, + "grad_norm": 0.0944405347108841, + "learning_rate": 0.00016082210885121374, + "loss": 2.609, + "step": 24954 + }, + { + "epoch": 0.7399994069329537, + "grad_norm": 0.11214911192655563, + "learning_rate": 0.00016078754043215288, + "loss": 2.6026, + "step": 24955 + }, + { + "epoch": 0.7400290602852653, + "grad_norm": 0.09839203953742981, + "learning_rate": 0.00016075297501687652, + "loss": 2.5714, + "step": 24956 + }, + { + "epoch": 0.7400587136375767, + "grad_norm": 0.1038087010383606, + "learning_rate": 0.00016071841260569087, + "loss": 2.6482, + "step": 24957 + }, + { + "epoch": 0.7400883669898882, + "grad_norm": 0.1136384829878807, + "learning_rate": 0.00016068385319890183, + "loss": 2.5731, + "step": 24958 + }, + { + "epoch": 0.7401180203421996, + "grad_norm": 0.09555795043706894, + "learning_rate": 0.00016064929679681557, + "loss": 2.5842, + "step": 24959 + }, + { + "epoch": 0.7401476736945112, + "grad_norm": 0.09415651112794876, + "learning_rate": 0.00016061474339973804, + "loss": 2.6074, + "step": 24960 + }, + { + "epoch": 0.7401773270468226, + "grad_norm": 0.09755009412765503, + "learning_rate": 0.00016058019300797517, + "loss": 2.5836, + "step": 24961 + }, + { + "epoch": 0.7402069803991341, + "grad_norm": 0.10936586558818817, + "learning_rate": 0.000160545645621833, + "loss": 2.5799, + "step": 24962 + }, + { + "epoch": 0.7402366337514455, + "grad_norm": 0.09594611078500748, + "learning_rate": 0.0001605111012416174, + "loss": 2.5922, + "step": 24963 + }, + { + "epoch": 0.7402662871037571, + "grad_norm": 0.11814404278993607, + "learning_rate": 0.00016047655986763421, + "loss": 2.5782, + "step": 24964 + }, + { + "epoch": 0.7402959404560686, + "grad_norm": 0.09823847562074661, + "learning_rate": 0.00016044202150018938, + "loss": 2.5952, + "step": 24965 + }, + { + "epoch": 0.74032559380838, + "grad_norm": 0.11377815157175064, + "learning_rate": 0.0001604074861395888, + "loss": 2.6161, + "step": 24966 + }, + { + "epoch": 0.7403552471606916, + "grad_norm": 0.09116487950086594, + "learning_rate": 0.00016037295378613814, + "loss": 2.6097, + "step": 24967 + }, + { + "epoch": 0.740384900513003, + "grad_norm": 0.1057807058095932, + "learning_rate": 0.00016033842444014335, + "loss": 2.6203, + "step": 24968 + }, + { + "epoch": 0.7404145538653145, + "grad_norm": 0.09776125848293304, + "learning_rate": 0.00016030389810191005, + "loss": 2.6042, + "step": 24969 + }, + { + "epoch": 0.7404442072176259, + "grad_norm": 0.10445229709148407, + "learning_rate": 0.000160269374771744, + "loss": 2.6086, + "step": 24970 + }, + { + "epoch": 0.7404738605699375, + "grad_norm": 0.09843797981739044, + "learning_rate": 0.00016023485444995107, + "loss": 2.6098, + "step": 24971 + }, + { + "epoch": 0.7405035139222489, + "grad_norm": 0.09846772253513336, + "learning_rate": 0.00016020033713683675, + "loss": 2.585, + "step": 24972 + }, + { + "epoch": 0.7405331672745604, + "grad_norm": 0.09492038935422897, + "learning_rate": 0.00016016582283270682, + "loss": 2.5639, + "step": 24973 + }, + { + "epoch": 0.7405628206268718, + "grad_norm": 0.0951746255159378, + "learning_rate": 0.0001601313115378668, + "loss": 2.6076, + "step": 24974 + }, + { + "epoch": 0.7405924739791834, + "grad_norm": 0.10396720468997955, + "learning_rate": 0.00016009680325262238, + "loss": 2.6189, + "step": 24975 + }, + { + "epoch": 0.7406221273314948, + "grad_norm": 0.0879400297999382, + "learning_rate": 0.00016006229797727928, + "loss": 2.5664, + "step": 24976 + }, + { + "epoch": 0.7406517806838063, + "grad_norm": 0.10808905959129333, + "learning_rate": 0.00016002779571214256, + "loss": 2.6121, + "step": 24977 + }, + { + "epoch": 0.7406814340361177, + "grad_norm": 0.0911564826965332, + "learning_rate": 0.0001599932964575182, + "loss": 2.6082, + "step": 24978 + }, + { + "epoch": 0.7407110873884293, + "grad_norm": 0.11528617888689041, + "learning_rate": 0.00015995880021371157, + "loss": 2.6089, + "step": 24979 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.10186813771724701, + "learning_rate": 0.00015992430698102812, + "loss": 2.5831, + "step": 24980 + }, + { + "epoch": 0.7407703940930522, + "grad_norm": 0.09795864671468735, + "learning_rate": 0.0001598898167597733, + "loss": 2.6241, + "step": 24981 + }, + { + "epoch": 0.7408000474453637, + "grad_norm": 0.11661425232887268, + "learning_rate": 0.00015985532955025263, + "loss": 2.6004, + "step": 24982 + }, + { + "epoch": 0.7408297007976752, + "grad_norm": 0.1126611977815628, + "learning_rate": 0.00015982084535277132, + "loss": 2.5584, + "step": 24983 + }, + { + "epoch": 0.7408593541499866, + "grad_norm": 0.10556669533252716, + "learning_rate": 0.0001597863641676348, + "loss": 2.6382, + "step": 24984 + }, + { + "epoch": 0.7408890075022981, + "grad_norm": 0.0996391773223877, + "learning_rate": 0.00015975188599514845, + "loss": 2.6298, + "step": 24985 + }, + { + "epoch": 0.7409186608546097, + "grad_norm": 0.10906536877155304, + "learning_rate": 0.0001597174108356177, + "loss": 2.5943, + "step": 24986 + }, + { + "epoch": 0.7409483142069211, + "grad_norm": 0.1022399365901947, + "learning_rate": 0.00015968293868934752, + "loss": 2.6279, + "step": 24987 + }, + { + "epoch": 0.7409779675592326, + "grad_norm": 0.09967714548110962, + "learning_rate": 0.00015964846955664335, + "loss": 2.5751, + "step": 24988 + }, + { + "epoch": 0.741007620911544, + "grad_norm": 0.10370303690433502, + "learning_rate": 0.0001596140034378104, + "loss": 2.5983, + "step": 24989 + }, + { + "epoch": 0.7410372742638556, + "grad_norm": 0.10043773055076599, + "learning_rate": 0.0001595795403331537, + "loss": 2.5969, + "step": 24990 + }, + { + "epoch": 0.741066927616167, + "grad_norm": 0.0960029736161232, + "learning_rate": 0.00015954508024297877, + "loss": 2.6253, + "step": 24991 + }, + { + "epoch": 0.7410965809684785, + "grad_norm": 0.09951420873403549, + "learning_rate": 0.00015951062316759063, + "loss": 2.6022, + "step": 24992 + }, + { + "epoch": 0.74112623432079, + "grad_norm": 0.09618350118398666, + "learning_rate": 0.00015947616910729433, + "loss": 2.6166, + "step": 24993 + }, + { + "epoch": 0.7411558876731015, + "grad_norm": 0.0968291163444519, + "learning_rate": 0.000159441718062395, + "loss": 2.5953, + "step": 24994 + }, + { + "epoch": 0.7411855410254129, + "grad_norm": 0.10651401430368423, + "learning_rate": 0.0001594072700331977, + "loss": 2.5951, + "step": 24995 + }, + { + "epoch": 0.7412151943777244, + "grad_norm": 0.09613034874200821, + "learning_rate": 0.0001593728250200075, + "loss": 2.5834, + "step": 24996 + }, + { + "epoch": 0.7412448477300358, + "grad_norm": 0.10962014645338058, + "learning_rate": 0.0001593383830231296, + "loss": 2.6153, + "step": 24997 + }, + { + "epoch": 0.7412745010823474, + "grad_norm": 0.10065247118473053, + "learning_rate": 0.00015930394404286858, + "loss": 2.6027, + "step": 24998 + }, + { + "epoch": 0.7413041544346588, + "grad_norm": 0.10872069001197815, + "learning_rate": 0.00015926950807952967, + "loss": 2.5791, + "step": 24999 + }, + { + "epoch": 0.7413338077869703, + "grad_norm": 0.10363122075796127, + "learning_rate": 0.00015923507513341766, + "loss": 2.588, + "step": 25000 + }, + { + "epoch": 0.7413634611392818, + "grad_norm": 0.10608656704425812, + "learning_rate": 0.00015920064520483763, + "loss": 2.5961, + "step": 25001 + }, + { + "epoch": 0.7413931144915933, + "grad_norm": 0.10106157511472702, + "learning_rate": 0.0001591662182940944, + "loss": 2.6109, + "step": 25002 + }, + { + "epoch": 0.7414227678439047, + "grad_norm": 0.10698466747999191, + "learning_rate": 0.0001591317944014926, + "loss": 2.6341, + "step": 25003 + }, + { + "epoch": 0.7414524211962162, + "grad_norm": 0.10432606935501099, + "learning_rate": 0.00015909737352733745, + "loss": 2.5924, + "step": 25004 + }, + { + "epoch": 0.7414820745485277, + "grad_norm": 0.10520718991756439, + "learning_rate": 0.0001590629556719336, + "loss": 2.575, + "step": 25005 + }, + { + "epoch": 0.7415117279008392, + "grad_norm": 0.11072839051485062, + "learning_rate": 0.0001590285408355857, + "loss": 2.5905, + "step": 25006 + }, + { + "epoch": 0.7415413812531507, + "grad_norm": 0.11004508286714554, + "learning_rate": 0.00015899412901859884, + "loss": 2.5773, + "step": 25007 + }, + { + "epoch": 0.7415710346054621, + "grad_norm": 0.11234258860349655, + "learning_rate": 0.0001589597202212773, + "loss": 2.5789, + "step": 25008 + }, + { + "epoch": 0.7416006879577737, + "grad_norm": 0.11923807859420776, + "learning_rate": 0.000158925314443926, + "loss": 2.6174, + "step": 25009 + }, + { + "epoch": 0.7416303413100851, + "grad_norm": 0.10102763772010803, + "learning_rate": 0.0001588909116868496, + "loss": 2.6121, + "step": 25010 + }, + { + "epoch": 0.7416599946623966, + "grad_norm": 0.1002821996808052, + "learning_rate": 0.00015885651195035271, + "loss": 2.6182, + "step": 25011 + }, + { + "epoch": 0.741689648014708, + "grad_norm": 0.0949004516005516, + "learning_rate": 0.00015882211523474, + "loss": 2.6027, + "step": 25012 + }, + { + "epoch": 0.7417193013670196, + "grad_norm": 0.09937185049057007, + "learning_rate": 0.00015878772154031596, + "loss": 2.6183, + "step": 25013 + }, + { + "epoch": 0.741748954719331, + "grad_norm": 0.08900534361600876, + "learning_rate": 0.00015875333086738525, + "loss": 2.5868, + "step": 25014 + }, + { + "epoch": 0.7417786080716425, + "grad_norm": 0.09759057313203812, + "learning_rate": 0.00015871894321625236, + "loss": 2.6072, + "step": 25015 + }, + { + "epoch": 0.741808261423954, + "grad_norm": 0.10122871398925781, + "learning_rate": 0.00015868455858722164, + "loss": 2.6226, + "step": 25016 + }, + { + "epoch": 0.7418379147762655, + "grad_norm": 0.09562693536281586, + "learning_rate": 0.0001586501769805979, + "loss": 2.5807, + "step": 25017 + }, + { + "epoch": 0.7418675681285769, + "grad_norm": 0.0968097373843193, + "learning_rate": 0.0001586157983966856, + "loss": 2.5786, + "step": 25018 + }, + { + "epoch": 0.7418972214808884, + "grad_norm": 0.10669219493865967, + "learning_rate": 0.00015858142283578885, + "loss": 2.586, + "step": 25019 + }, + { + "epoch": 0.7419268748331999, + "grad_norm": 0.08813948929309845, + "learning_rate": 0.0001585470502982122, + "loss": 2.6383, + "step": 25020 + }, + { + "epoch": 0.7419565281855114, + "grad_norm": 0.10952139645814896, + "learning_rate": 0.00015851268078426002, + "loss": 2.6101, + "step": 25021 + }, + { + "epoch": 0.7419861815378228, + "grad_norm": 0.09450080990791321, + "learning_rate": 0.0001584783142942367, + "loss": 2.6339, + "step": 25022 + }, + { + "epoch": 0.7420158348901343, + "grad_norm": 0.09850428253412247, + "learning_rate": 0.00015844395082844647, + "loss": 2.63, + "step": 25023 + }, + { + "epoch": 0.7420454882424458, + "grad_norm": 0.10071700066328049, + "learning_rate": 0.00015840959038719376, + "loss": 2.6071, + "step": 25024 + }, + { + "epoch": 0.7420751415947573, + "grad_norm": 0.09285008907318115, + "learning_rate": 0.0001583752329707827, + "loss": 2.5946, + "step": 25025 + }, + { + "epoch": 0.7421047949470687, + "grad_norm": 0.09642775356769562, + "learning_rate": 0.00015834087857951763, + "loss": 2.5967, + "step": 25026 + }, + { + "epoch": 0.7421344482993802, + "grad_norm": 0.0968208909034729, + "learning_rate": 0.0001583065272137027, + "loss": 2.6102, + "step": 25027 + }, + { + "epoch": 0.7421641016516918, + "grad_norm": 0.0949186235666275, + "learning_rate": 0.0001582721788736421, + "loss": 2.6063, + "step": 25028 + }, + { + "epoch": 0.7421937550040032, + "grad_norm": 0.09208818525075912, + "learning_rate": 0.00015823783355964005, + "loss": 2.5647, + "step": 25029 + }, + { + "epoch": 0.7422234083563147, + "grad_norm": 0.10245377570390701, + "learning_rate": 0.0001582034912720006, + "loss": 2.6164, + "step": 25030 + }, + { + "epoch": 0.7422530617086261, + "grad_norm": 0.09826844185590744, + "learning_rate": 0.000158169152011028, + "loss": 2.617, + "step": 25031 + }, + { + "epoch": 0.7422827150609377, + "grad_norm": 0.09032483398914337, + "learning_rate": 0.00015813481577702616, + "loss": 2.5778, + "step": 25032 + }, + { + "epoch": 0.7423123684132491, + "grad_norm": 0.09974588453769684, + "learning_rate": 0.0001581004825702992, + "loss": 2.6131, + "step": 25033 + }, + { + "epoch": 0.7423420217655606, + "grad_norm": 0.0954645499587059, + "learning_rate": 0.00015806615239115119, + "loss": 2.5789, + "step": 25034 + }, + { + "epoch": 0.742371675117872, + "grad_norm": 0.10882586240768433, + "learning_rate": 0.00015803182523988608, + "loss": 2.5933, + "step": 25035 + }, + { + "epoch": 0.7424013284701836, + "grad_norm": 0.09295953065156937, + "learning_rate": 0.00015799750111680782, + "loss": 2.6118, + "step": 25036 + }, + { + "epoch": 0.742430981822495, + "grad_norm": 0.10139995813369751, + "learning_rate": 0.00015796318002222044, + "loss": 2.6119, + "step": 25037 + }, + { + "epoch": 0.7424606351748065, + "grad_norm": 0.09676681458950043, + "learning_rate": 0.0001579288619564278, + "loss": 2.5824, + "step": 25038 + }, + { + "epoch": 0.742490288527118, + "grad_norm": 0.09844029694795609, + "learning_rate": 0.00015789454691973382, + "loss": 2.5999, + "step": 25039 + }, + { + "epoch": 0.7425199418794295, + "grad_norm": 0.08990093320608139, + "learning_rate": 0.0001578602349124423, + "loss": 2.6073, + "step": 25040 + }, + { + "epoch": 0.7425495952317409, + "grad_norm": 0.10132165998220444, + "learning_rate": 0.00015782592593485735, + "loss": 2.6024, + "step": 25041 + }, + { + "epoch": 0.7425792485840524, + "grad_norm": 0.09619655460119247, + "learning_rate": 0.0001577916199872822, + "loss": 2.6006, + "step": 25042 + }, + { + "epoch": 0.7426089019363639, + "grad_norm": 0.09849156439304352, + "learning_rate": 0.0001577573170700211, + "loss": 2.5973, + "step": 25043 + }, + { + "epoch": 0.7426385552886754, + "grad_norm": 0.10696852952241898, + "learning_rate": 0.0001577230171833778, + "loss": 2.5866, + "step": 25044 + }, + { + "epoch": 0.7426682086409868, + "grad_norm": 0.08918100595474243, + "learning_rate": 0.00015768872032765585, + "loss": 2.6023, + "step": 25045 + }, + { + "epoch": 0.7426978619932983, + "grad_norm": 0.1038745567202568, + "learning_rate": 0.0001576544265031591, + "loss": 2.6236, + "step": 25046 + }, + { + "epoch": 0.7427275153456098, + "grad_norm": 0.09470730274915695, + "learning_rate": 0.00015762013571019118, + "loss": 2.6212, + "step": 25047 + }, + { + "epoch": 0.7427571686979213, + "grad_norm": 0.09922925382852554, + "learning_rate": 0.00015758584794905566, + "loss": 2.607, + "step": 25048 + }, + { + "epoch": 0.7427868220502328, + "grad_norm": 0.09961622208356857, + "learning_rate": 0.00015755156322005626, + "loss": 2.5914, + "step": 25049 + }, + { + "epoch": 0.7428164754025443, + "grad_norm": 0.09512800723314285, + "learning_rate": 0.0001575172815234966, + "loss": 2.6372, + "step": 25050 + }, + { + "epoch": 0.7428461287548558, + "grad_norm": 0.11563431471586227, + "learning_rate": 0.00015748300285968014, + "loss": 2.6366, + "step": 25051 + }, + { + "epoch": 0.7428757821071672, + "grad_norm": 0.10222694277763367, + "learning_rate": 0.00015744872722891064, + "loss": 2.5987, + "step": 25052 + }, + { + "epoch": 0.7429054354594787, + "grad_norm": 0.10047438740730286, + "learning_rate": 0.0001574144546314913, + "loss": 2.5454, + "step": 25053 + }, + { + "epoch": 0.7429350888117902, + "grad_norm": 0.10448676347732544, + "learning_rate": 0.00015738018506772578, + "loss": 2.6463, + "step": 25054 + }, + { + "epoch": 0.7429647421641017, + "grad_norm": 0.12176439166069031, + "learning_rate": 0.00015734591853791742, + "loss": 2.6284, + "step": 25055 + }, + { + "epoch": 0.7429943955164131, + "grad_norm": 0.10050773620605469, + "learning_rate": 0.00015731165504236983, + "loss": 2.5868, + "step": 25056 + }, + { + "epoch": 0.7430240488687246, + "grad_norm": 0.10078781843185425, + "learning_rate": 0.0001572773945813864, + "loss": 2.5999, + "step": 25057 + }, + { + "epoch": 0.7430537022210361, + "grad_norm": 0.09673506766557693, + "learning_rate": 0.0001572431371552705, + "loss": 2.5892, + "step": 25058 + }, + { + "epoch": 0.7430833555733476, + "grad_norm": 0.1160486489534378, + "learning_rate": 0.00015720888276432544, + "loss": 2.5804, + "step": 25059 + }, + { + "epoch": 0.743113008925659, + "grad_norm": 0.10966081917285919, + "learning_rate": 0.0001571746314088545, + "loss": 2.5933, + "step": 25060 + }, + { + "epoch": 0.7431426622779705, + "grad_norm": 0.09337366372346878, + "learning_rate": 0.00015714038308916112, + "loss": 2.6064, + "step": 25061 + }, + { + "epoch": 0.743172315630282, + "grad_norm": 0.10372699797153473, + "learning_rate": 0.00015710613780554867, + "loss": 2.591, + "step": 25062 + }, + { + "epoch": 0.7432019689825935, + "grad_norm": 0.10004125535488129, + "learning_rate": 0.00015707189555832002, + "loss": 2.5884, + "step": 25063 + }, + { + "epoch": 0.7432316223349049, + "grad_norm": 0.09598252177238464, + "learning_rate": 0.00015703765634777862, + "loss": 2.5923, + "step": 25064 + }, + { + "epoch": 0.7432612756872164, + "grad_norm": 0.09858720749616623, + "learning_rate": 0.0001570034201742276, + "loss": 2.5914, + "step": 25065 + }, + { + "epoch": 0.7432909290395279, + "grad_norm": 0.09813113510608673, + "learning_rate": 0.0001569691870379702, + "loss": 2.589, + "step": 25066 + }, + { + "epoch": 0.7433205823918394, + "grad_norm": 0.09943898767232895, + "learning_rate": 0.00015693495693930954, + "loss": 2.6318, + "step": 25067 + }, + { + "epoch": 0.7433502357441508, + "grad_norm": 0.09555903822183609, + "learning_rate": 0.00015690072987854854, + "loss": 2.6053, + "step": 25068 + }, + { + "epoch": 0.7433798890964624, + "grad_norm": 0.09095390141010284, + "learning_rate": 0.00015686650585599067, + "loss": 2.5749, + "step": 25069 + }, + { + "epoch": 0.7434095424487739, + "grad_norm": 0.09602586179971695, + "learning_rate": 0.00015683228487193873, + "loss": 2.6028, + "step": 25070 + }, + { + "epoch": 0.7434391958010853, + "grad_norm": 0.08889502286911011, + "learning_rate": 0.00015679806692669586, + "loss": 2.5902, + "step": 25071 + }, + { + "epoch": 0.7434688491533968, + "grad_norm": 0.11076441407203674, + "learning_rate": 0.00015676385202056497, + "loss": 2.6268, + "step": 25072 + }, + { + "epoch": 0.7434985025057083, + "grad_norm": 0.0897030308842659, + "learning_rate": 0.00015672964015384926, + "loss": 2.5909, + "step": 25073 + }, + { + "epoch": 0.7435281558580198, + "grad_norm": 0.10543588548898697, + "learning_rate": 0.00015669543132685137, + "loss": 2.5786, + "step": 25074 + }, + { + "epoch": 0.7435578092103312, + "grad_norm": 0.0924147367477417, + "learning_rate": 0.0001566612255398744, + "loss": 2.6255, + "step": 25075 + }, + { + "epoch": 0.7435874625626427, + "grad_norm": 0.09834664314985275, + "learning_rate": 0.00015662702279322112, + "loss": 2.5831, + "step": 25076 + }, + { + "epoch": 0.7436171159149542, + "grad_norm": 0.09122440963983536, + "learning_rate": 0.0001565928230871946, + "loss": 2.5891, + "step": 25077 + }, + { + "epoch": 0.7436467692672657, + "grad_norm": 0.09886529296636581, + "learning_rate": 0.00015655862642209746, + "loss": 2.5877, + "step": 25078 + }, + { + "epoch": 0.7436764226195771, + "grad_norm": 0.09296098351478577, + "learning_rate": 0.0001565244327982327, + "loss": 2.597, + "step": 25079 + }, + { + "epoch": 0.7437060759718886, + "grad_norm": 0.10273288190364838, + "learning_rate": 0.00015649024221590303, + "loss": 2.6013, + "step": 25080 + }, + { + "epoch": 0.7437357293242001, + "grad_norm": 0.10251971334218979, + "learning_rate": 0.00015645605467541108, + "loss": 2.5727, + "step": 25081 + }, + { + "epoch": 0.7437653826765116, + "grad_norm": 0.09264436364173889, + "learning_rate": 0.0001564218701770599, + "loss": 2.6022, + "step": 25082 + }, + { + "epoch": 0.743795036028823, + "grad_norm": 0.10319145023822784, + "learning_rate": 0.00015638768872115218, + "loss": 2.6154, + "step": 25083 + }, + { + "epoch": 0.7438246893811346, + "grad_norm": 0.10341480374336243, + "learning_rate": 0.00015635351030799028, + "loss": 2.6146, + "step": 25084 + }, + { + "epoch": 0.743854342733446, + "grad_norm": 0.09476401656866074, + "learning_rate": 0.00015631933493787703, + "loss": 2.5756, + "step": 25085 + }, + { + "epoch": 0.7438839960857575, + "grad_norm": 0.10010645538568497, + "learning_rate": 0.0001562851626111151, + "loss": 2.6146, + "step": 25086 + }, + { + "epoch": 0.7439136494380689, + "grad_norm": 0.09131492674350739, + "learning_rate": 0.000156250993328007, + "loss": 2.5891, + "step": 25087 + }, + { + "epoch": 0.7439433027903805, + "grad_norm": 0.09956564754247665, + "learning_rate": 0.00015621682708885542, + "loss": 2.6023, + "step": 25088 + }, + { + "epoch": 0.7439729561426919, + "grad_norm": 0.09223173558712006, + "learning_rate": 0.0001561826638939628, + "loss": 2.5787, + "step": 25089 + }, + { + "epoch": 0.7440026094950034, + "grad_norm": 0.09996706992387772, + "learning_rate": 0.0001561485037436317, + "loss": 2.6171, + "step": 25090 + }, + { + "epoch": 0.7440322628473149, + "grad_norm": 0.1043437123298645, + "learning_rate": 0.00015611434663816465, + "loss": 2.6078, + "step": 25091 + }, + { + "epoch": 0.7440619161996264, + "grad_norm": 0.09297288954257965, + "learning_rate": 0.0001560801925778641, + "loss": 2.605, + "step": 25092 + }, + { + "epoch": 0.7440915695519379, + "grad_norm": 0.09675529599189758, + "learning_rate": 0.0001560460415630325, + "loss": 2.5989, + "step": 25093 + }, + { + "epoch": 0.7441212229042493, + "grad_norm": 0.09033486992120743, + "learning_rate": 0.00015601189359397215, + "loss": 2.5791, + "step": 25094 + }, + { + "epoch": 0.7441508762565608, + "grad_norm": 0.10228560119867325, + "learning_rate": 0.00015597774867098557, + "loss": 2.6299, + "step": 25095 + }, + { + "epoch": 0.7441805296088723, + "grad_norm": 0.09958535432815552, + "learning_rate": 0.00015594360679437508, + "loss": 2.6075, + "step": 25096 + }, + { + "epoch": 0.7442101829611838, + "grad_norm": 0.11673784255981445, + "learning_rate": 0.00015590946796444305, + "loss": 2.599, + "step": 25097 + }, + { + "epoch": 0.7442398363134952, + "grad_norm": 0.09674523025751114, + "learning_rate": 0.00015587533218149169, + "loss": 2.5839, + "step": 25098 + }, + { + "epoch": 0.7442694896658067, + "grad_norm": 0.1164899468421936, + "learning_rate": 0.00015584119944582337, + "loss": 2.6014, + "step": 25099 + }, + { + "epoch": 0.7442991430181182, + "grad_norm": 0.10478085279464722, + "learning_rate": 0.0001558070697577403, + "loss": 2.6134, + "step": 25100 + }, + { + "epoch": 0.7443287963704297, + "grad_norm": 0.11717286705970764, + "learning_rate": 0.00015577294311754463, + "loss": 2.62, + "step": 25101 + }, + { + "epoch": 0.7443584497227411, + "grad_norm": 0.10514816641807556, + "learning_rate": 0.0001557388195255387, + "loss": 2.5889, + "step": 25102 + }, + { + "epoch": 0.7443881030750527, + "grad_norm": 0.1127871498465538, + "learning_rate": 0.0001557046989820246, + "loss": 2.6213, + "step": 25103 + }, + { + "epoch": 0.7444177564273641, + "grad_norm": 0.11578749865293503, + "learning_rate": 0.00015567058148730452, + "loss": 2.5756, + "step": 25104 + }, + { + "epoch": 0.7444474097796756, + "grad_norm": 0.10190816223621368, + "learning_rate": 0.0001556364670416805, + "loss": 2.624, + "step": 25105 + }, + { + "epoch": 0.744477063131987, + "grad_norm": 0.11024721711874008, + "learning_rate": 0.00015560235564545473, + "loss": 2.6007, + "step": 25106 + }, + { + "epoch": 0.7445067164842986, + "grad_norm": 0.11294061690568924, + "learning_rate": 0.00015556824729892916, + "loss": 2.5938, + "step": 25107 + }, + { + "epoch": 0.74453636983661, + "grad_norm": 0.1005033552646637, + "learning_rate": 0.0001555341420024059, + "loss": 2.6321, + "step": 25108 + }, + { + "epoch": 0.7445660231889215, + "grad_norm": 0.10678461939096451, + "learning_rate": 0.00015550003975618693, + "loss": 2.593, + "step": 25109 + }, + { + "epoch": 0.7445956765412329, + "grad_norm": 0.09998514503240585, + "learning_rate": 0.00015546594056057423, + "loss": 2.6035, + "step": 25110 + }, + { + "epoch": 0.7446253298935445, + "grad_norm": 0.10429299622774124, + "learning_rate": 0.00015543184441586978, + "loss": 2.6192, + "step": 25111 + }, + { + "epoch": 0.744654983245856, + "grad_norm": 0.1068437471985817, + "learning_rate": 0.00015539775132237548, + "loss": 2.5895, + "step": 25112 + }, + { + "epoch": 0.7446846365981674, + "grad_norm": 0.09235512465238571, + "learning_rate": 0.00015536366128039331, + "loss": 2.6009, + "step": 25113 + }, + { + "epoch": 0.744714289950479, + "grad_norm": 0.11212705075740814, + "learning_rate": 0.000155329574290225, + "loss": 2.5921, + "step": 25114 + }, + { + "epoch": 0.7447439433027904, + "grad_norm": 0.09748509526252747, + "learning_rate": 0.0001552954903521725, + "loss": 2.6106, + "step": 25115 + }, + { + "epoch": 0.7447735966551019, + "grad_norm": 0.10292953252792358, + "learning_rate": 0.0001552614094665376, + "loss": 2.5817, + "step": 25116 + }, + { + "epoch": 0.7448032500074133, + "grad_norm": 0.09430093318223953, + "learning_rate": 0.00015522733163362223, + "loss": 2.6159, + "step": 25117 + }, + { + "epoch": 0.7448329033597249, + "grad_norm": 0.10859105736017227, + "learning_rate": 0.00015519325685372788, + "loss": 2.5567, + "step": 25118 + }, + { + "epoch": 0.7448625567120363, + "grad_norm": 0.10024742782115936, + "learning_rate": 0.00015515918512715643, + "loss": 2.5795, + "step": 25119 + }, + { + "epoch": 0.7448922100643478, + "grad_norm": 0.11168175935745239, + "learning_rate": 0.0001551251164542094, + "loss": 2.6241, + "step": 25120 + }, + { + "epoch": 0.7449218634166592, + "grad_norm": 0.10182105004787445, + "learning_rate": 0.00015509105083518886, + "loss": 2.5813, + "step": 25121 + }, + { + "epoch": 0.7449515167689708, + "grad_norm": 0.10312584042549133, + "learning_rate": 0.00015505698827039628, + "loss": 2.606, + "step": 25122 + }, + { + "epoch": 0.7449811701212822, + "grad_norm": 0.10080575942993164, + "learning_rate": 0.0001550229287601332, + "loss": 2.5477, + "step": 25123 + }, + { + "epoch": 0.7450108234735937, + "grad_norm": 0.09826524555683136, + "learning_rate": 0.00015498887230470137, + "loss": 2.5867, + "step": 25124 + }, + { + "epoch": 0.7450404768259051, + "grad_norm": 0.09651283919811249, + "learning_rate": 0.00015495481890440228, + "loss": 2.5948, + "step": 25125 + }, + { + "epoch": 0.7450701301782167, + "grad_norm": 0.09906784445047379, + "learning_rate": 0.00015492076855953745, + "loss": 2.6175, + "step": 25126 + }, + { + "epoch": 0.7450997835305281, + "grad_norm": 0.0930991992354393, + "learning_rate": 0.0001548867212704085, + "loss": 2.6175, + "step": 25127 + }, + { + "epoch": 0.7451294368828396, + "grad_norm": 0.10632404685020447, + "learning_rate": 0.00015485267703731703, + "loss": 2.5698, + "step": 25128 + }, + { + "epoch": 0.745159090235151, + "grad_norm": 0.08892618119716644, + "learning_rate": 0.00015481863586056416, + "loss": 2.5678, + "step": 25129 + }, + { + "epoch": 0.7451887435874626, + "grad_norm": 0.12124145030975342, + "learning_rate": 0.0001547845977404515, + "loss": 2.6117, + "step": 25130 + }, + { + "epoch": 0.745218396939774, + "grad_norm": 0.12230043113231659, + "learning_rate": 0.00015475056267728055, + "loss": 2.622, + "step": 25131 + }, + { + "epoch": 0.7452480502920855, + "grad_norm": 0.0951424390077591, + "learning_rate": 0.00015471653067135262, + "loss": 2.5819, + "step": 25132 + }, + { + "epoch": 0.745277703644397, + "grad_norm": 0.10995042324066162, + "learning_rate": 0.00015468250172296887, + "loss": 2.6242, + "step": 25133 + }, + { + "epoch": 0.7453073569967085, + "grad_norm": 0.09199021011590958, + "learning_rate": 0.00015464847583243103, + "loss": 2.6044, + "step": 25134 + }, + { + "epoch": 0.74533701034902, + "grad_norm": 0.098944291472435, + "learning_rate": 0.00015461445300004017, + "loss": 2.6042, + "step": 25135 + }, + { + "epoch": 0.7453666637013314, + "grad_norm": 0.10122720897197723, + "learning_rate": 0.00015458043322609766, + "loss": 2.6277, + "step": 25136 + }, + { + "epoch": 0.745396317053643, + "grad_norm": 0.10065612196922302, + "learning_rate": 0.00015454641651090469, + "loss": 2.6163, + "step": 25137 + }, + { + "epoch": 0.7454259704059544, + "grad_norm": 0.09864596277475357, + "learning_rate": 0.00015451240285476264, + "loss": 2.6163, + "step": 25138 + }, + { + "epoch": 0.7454556237582659, + "grad_norm": 0.09106110781431198, + "learning_rate": 0.00015447839225797244, + "loss": 2.5589, + "step": 25139 + }, + { + "epoch": 0.7454852771105773, + "grad_norm": 0.09585372358560562, + "learning_rate": 0.00015444438472083538, + "loss": 2.599, + "step": 25140 + }, + { + "epoch": 0.7455149304628889, + "grad_norm": 0.10154429823160172, + "learning_rate": 0.00015441038024365263, + "loss": 2.6057, + "step": 25141 + }, + { + "epoch": 0.7455445838152003, + "grad_norm": 0.09695930778980255, + "learning_rate": 0.00015437637882672524, + "loss": 2.6076, + "step": 25142 + }, + { + "epoch": 0.7455742371675118, + "grad_norm": 0.10411178320646286, + "learning_rate": 0.00015434238047035438, + "loss": 2.6443, + "step": 25143 + }, + { + "epoch": 0.7456038905198232, + "grad_norm": 0.10771634429693222, + "learning_rate": 0.00015430838517484102, + "loss": 2.6071, + "step": 25144 + }, + { + "epoch": 0.7456335438721348, + "grad_norm": 0.1006590947508812, + "learning_rate": 0.00015427439294048628, + "loss": 2.5663, + "step": 25145 + }, + { + "epoch": 0.7456631972244462, + "grad_norm": 0.10312867909669876, + "learning_rate": 0.00015424040376759096, + "loss": 2.5934, + "step": 25146 + }, + { + "epoch": 0.7456928505767577, + "grad_norm": 0.09868995100259781, + "learning_rate": 0.00015420641765645637, + "loss": 2.5784, + "step": 25147 + }, + { + "epoch": 0.7457225039290691, + "grad_norm": 0.10360296815633774, + "learning_rate": 0.00015417243460738334, + "loss": 2.6067, + "step": 25148 + }, + { + "epoch": 0.7457521572813807, + "grad_norm": 0.10261813551187515, + "learning_rate": 0.00015413845462067287, + "loss": 2.5905, + "step": 25149 + }, + { + "epoch": 0.7457818106336921, + "grad_norm": 0.09322627633810043, + "learning_rate": 0.00015410447769662562, + "loss": 2.591, + "step": 25150 + }, + { + "epoch": 0.7458114639860036, + "grad_norm": 0.10197743028402328, + "learning_rate": 0.00015407050383554262, + "loss": 2.5798, + "step": 25151 + }, + { + "epoch": 0.7458411173383152, + "grad_norm": 0.09424705058336258, + "learning_rate": 0.00015403653303772464, + "loss": 2.6035, + "step": 25152 + }, + { + "epoch": 0.7458707706906266, + "grad_norm": 0.1091613695025444, + "learning_rate": 0.00015400256530347255, + "loss": 2.6028, + "step": 25153 + }, + { + "epoch": 0.7459004240429381, + "grad_norm": 0.08713605999946594, + "learning_rate": 0.00015396860063308714, + "loss": 2.5722, + "step": 25154 + }, + { + "epoch": 0.7459300773952495, + "grad_norm": 0.10457886755466461, + "learning_rate": 0.00015393463902686917, + "loss": 2.5714, + "step": 25155 + }, + { + "epoch": 0.7459597307475611, + "grad_norm": 0.09538010507822037, + "learning_rate": 0.00015390068048511934, + "loss": 2.5964, + "step": 25156 + }, + { + "epoch": 0.7459893840998725, + "grad_norm": 0.10560247302055359, + "learning_rate": 0.00015386672500813847, + "loss": 2.5655, + "step": 25157 + }, + { + "epoch": 0.746019037452184, + "grad_norm": 0.10667448490858078, + "learning_rate": 0.00015383277259622697, + "loss": 2.5893, + "step": 25158 + }, + { + "epoch": 0.7460486908044954, + "grad_norm": 0.1025291383266449, + "learning_rate": 0.00015379882324968597, + "loss": 2.6011, + "step": 25159 + }, + { + "epoch": 0.746078344156807, + "grad_norm": 0.10560543090105057, + "learning_rate": 0.00015376487696881568, + "loss": 2.6159, + "step": 25160 + }, + { + "epoch": 0.7461079975091184, + "grad_norm": 0.101901575922966, + "learning_rate": 0.00015373093375391683, + "loss": 2.597, + "step": 25161 + }, + { + "epoch": 0.7461376508614299, + "grad_norm": 0.09687581658363342, + "learning_rate": 0.00015369699360529, + "loss": 2.6253, + "step": 25162 + }, + { + "epoch": 0.7461673042137413, + "grad_norm": 0.11684955656528473, + "learning_rate": 0.00015366305652323575, + "loss": 2.5799, + "step": 25163 + }, + { + "epoch": 0.7461969575660529, + "grad_norm": 0.09384405612945557, + "learning_rate": 0.00015362912250805455, + "loss": 2.6133, + "step": 25164 + }, + { + "epoch": 0.7462266109183643, + "grad_norm": 0.10645195096731186, + "learning_rate": 0.00015359519156004698, + "loss": 2.5962, + "step": 25165 + }, + { + "epoch": 0.7462562642706758, + "grad_norm": 0.09625536948442459, + "learning_rate": 0.0001535612636795134, + "loss": 2.5653, + "step": 25166 + }, + { + "epoch": 0.7462859176229872, + "grad_norm": 0.11438170075416565, + "learning_rate": 0.0001535273388667543, + "loss": 2.6078, + "step": 25167 + }, + { + "epoch": 0.7463155709752988, + "grad_norm": 0.09317310154438019, + "learning_rate": 0.0001534934171220701, + "loss": 2.5556, + "step": 25168 + }, + { + "epoch": 0.7463452243276102, + "grad_norm": 0.10426334291696548, + "learning_rate": 0.00015345949844576117, + "loss": 2.5985, + "step": 25169 + }, + { + "epoch": 0.7463748776799217, + "grad_norm": 0.106275275349617, + "learning_rate": 0.00015342558283812786, + "loss": 2.5773, + "step": 25170 + }, + { + "epoch": 0.7464045310322331, + "grad_norm": 0.10107424855232239, + "learning_rate": 0.00015339167029947048, + "loss": 2.552, + "step": 25171 + }, + { + "epoch": 0.7464341843845447, + "grad_norm": 0.09809713810682297, + "learning_rate": 0.0001533577608300894, + "loss": 2.5841, + "step": 25172 + }, + { + "epoch": 0.7464638377368562, + "grad_norm": 0.08876460045576096, + "learning_rate": 0.0001533238544302848, + "loss": 2.568, + "step": 25173 + }, + { + "epoch": 0.7464934910891676, + "grad_norm": 0.09149426221847534, + "learning_rate": 0.00015328995110035698, + "loss": 2.5623, + "step": 25174 + }, + { + "epoch": 0.7465231444414792, + "grad_norm": 0.09558196365833282, + "learning_rate": 0.00015325605084060616, + "loss": 2.6073, + "step": 25175 + }, + { + "epoch": 0.7465527977937906, + "grad_norm": 0.09536617994308472, + "learning_rate": 0.00015322215365133257, + "loss": 2.5998, + "step": 25176 + }, + { + "epoch": 0.7465824511461021, + "grad_norm": 0.11001887917518616, + "learning_rate": 0.00015318825953283626, + "loss": 2.5846, + "step": 25177 + }, + { + "epoch": 0.7466121044984135, + "grad_norm": 0.09599456936120987, + "learning_rate": 0.00015315436848541752, + "loss": 2.5933, + "step": 25178 + }, + { + "epoch": 0.7466417578507251, + "grad_norm": 0.09718845784664154, + "learning_rate": 0.00015312048050937632, + "loss": 2.5763, + "step": 25179 + }, + { + "epoch": 0.7466714112030365, + "grad_norm": 0.0940728560090065, + "learning_rate": 0.00015308659560501288, + "loss": 2.6048, + "step": 25180 + }, + { + "epoch": 0.746701064555348, + "grad_norm": 0.099184051156044, + "learning_rate": 0.0001530527137726271, + "loss": 2.6101, + "step": 25181 + }, + { + "epoch": 0.7467307179076594, + "grad_norm": 0.09869246184825897, + "learning_rate": 0.0001530188350125191, + "loss": 2.597, + "step": 25182 + }, + { + "epoch": 0.746760371259971, + "grad_norm": 0.10434078425168991, + "learning_rate": 0.00015298495932498907, + "loss": 2.5938, + "step": 25183 + }, + { + "epoch": 0.7467900246122824, + "grad_norm": 0.09734732657670975, + "learning_rate": 0.00015295108671033647, + "loss": 2.6029, + "step": 25184 + }, + { + "epoch": 0.7468196779645939, + "grad_norm": 0.09732318669557571, + "learning_rate": 0.00015291721716886175, + "loss": 2.5769, + "step": 25185 + }, + { + "epoch": 0.7468493313169053, + "grad_norm": 0.09576363116502762, + "learning_rate": 0.00015288335070086462, + "loss": 2.5817, + "step": 25186 + }, + { + "epoch": 0.7468789846692169, + "grad_norm": 0.09935975819826126, + "learning_rate": 0.000152849487306645, + "loss": 2.5336, + "step": 25187 + }, + { + "epoch": 0.7469086380215283, + "grad_norm": 0.09461464732885361, + "learning_rate": 0.00015281562698650275, + "loss": 2.6143, + "step": 25188 + }, + { + "epoch": 0.7469382913738398, + "grad_norm": 0.1058618575334549, + "learning_rate": 0.0001527817697407377, + "loss": 2.6039, + "step": 25189 + }, + { + "epoch": 0.7469679447261512, + "grad_norm": 0.10946246981620789, + "learning_rate": 0.00015274791556964967, + "loss": 2.5968, + "step": 25190 + }, + { + "epoch": 0.7469975980784628, + "grad_norm": 0.08627047389745712, + "learning_rate": 0.00015271406447353852, + "loss": 2.5928, + "step": 25191 + }, + { + "epoch": 0.7470272514307742, + "grad_norm": 0.10039889067411423, + "learning_rate": 0.00015268021645270391, + "loss": 2.5828, + "step": 25192 + }, + { + "epoch": 0.7470569047830857, + "grad_norm": 0.09735259413719177, + "learning_rate": 0.00015264637150744575, + "loss": 2.6217, + "step": 25193 + }, + { + "epoch": 0.7470865581353973, + "grad_norm": 0.09824465960264206, + "learning_rate": 0.00015261252963806344, + "loss": 2.5727, + "step": 25194 + }, + { + "epoch": 0.7471162114877087, + "grad_norm": 0.10069890320301056, + "learning_rate": 0.00015257869084485677, + "loss": 2.5951, + "step": 25195 + }, + { + "epoch": 0.7471458648400202, + "grad_norm": 0.09285929799079895, + "learning_rate": 0.0001525448551281255, + "loss": 2.5933, + "step": 25196 + }, + { + "epoch": 0.7471755181923316, + "grad_norm": 0.09909925609827042, + "learning_rate": 0.00015251102248816894, + "loss": 2.5569, + "step": 25197 + }, + { + "epoch": 0.7472051715446432, + "grad_norm": 0.09785602241754532, + "learning_rate": 0.0001524771929252871, + "loss": 2.5976, + "step": 25198 + }, + { + "epoch": 0.7472348248969546, + "grad_norm": 0.10559670627117157, + "learning_rate": 0.00015244336643977942, + "loss": 2.5787, + "step": 25199 + }, + { + "epoch": 0.7472644782492661, + "grad_norm": 0.10409879684448242, + "learning_rate": 0.00015240954303194536, + "loss": 2.6015, + "step": 25200 + }, + { + "epoch": 0.7472941316015775, + "grad_norm": 0.0984412431716919, + "learning_rate": 0.00015237572270208443, + "loss": 2.572, + "step": 25201 + }, + { + "epoch": 0.7473237849538891, + "grad_norm": 0.10254097729921341, + "learning_rate": 0.0001523419054504962, + "loss": 2.594, + "step": 25202 + }, + { + "epoch": 0.7473534383062005, + "grad_norm": 0.10824020206928253, + "learning_rate": 0.00015230809127748002, + "loss": 2.5952, + "step": 25203 + }, + { + "epoch": 0.747383091658512, + "grad_norm": 0.09314490854740143, + "learning_rate": 0.00015227428018333556, + "loss": 2.588, + "step": 25204 + }, + { + "epoch": 0.7474127450108234, + "grad_norm": 0.11265583336353302, + "learning_rate": 0.00015224047216836186, + "loss": 2.5933, + "step": 25205 + }, + { + "epoch": 0.747442398363135, + "grad_norm": 0.09652938693761826, + "learning_rate": 0.00015220666723285848, + "loss": 2.592, + "step": 25206 + }, + { + "epoch": 0.7474720517154464, + "grad_norm": 0.11020023375749588, + "learning_rate": 0.00015217286537712478, + "loss": 2.59, + "step": 25207 + }, + { + "epoch": 0.7475017050677579, + "grad_norm": 0.09509775787591934, + "learning_rate": 0.00015213906660146005, + "loss": 2.5961, + "step": 25208 + }, + { + "epoch": 0.7475313584200693, + "grad_norm": 0.09814666956663132, + "learning_rate": 0.00015210527090616356, + "loss": 2.5919, + "step": 25209 + }, + { + "epoch": 0.7475610117723809, + "grad_norm": 0.09894295036792755, + "learning_rate": 0.00015207147829153446, + "loss": 2.6125, + "step": 25210 + }, + { + "epoch": 0.7475906651246923, + "grad_norm": 0.10131292790174484, + "learning_rate": 0.00015203768875787232, + "loss": 2.5859, + "step": 25211 + }, + { + "epoch": 0.7476203184770038, + "grad_norm": 0.10486181080341339, + "learning_rate": 0.00015200390230547616, + "loss": 2.5673, + "step": 25212 + }, + { + "epoch": 0.7476499718293153, + "grad_norm": 0.10062523931264877, + "learning_rate": 0.0001519701189346452, + "loss": 2.5775, + "step": 25213 + }, + { + "epoch": 0.7476796251816268, + "grad_norm": 0.11127379536628723, + "learning_rate": 0.00015193633864567875, + "loss": 2.5688, + "step": 25214 + }, + { + "epoch": 0.7477092785339383, + "grad_norm": 0.108993761241436, + "learning_rate": 0.00015190256143887554, + "loss": 2.5705, + "step": 25215 + }, + { + "epoch": 0.7477389318862497, + "grad_norm": 0.10554473847150803, + "learning_rate": 0.00015186878731453495, + "loss": 2.6078, + "step": 25216 + }, + { + "epoch": 0.7477685852385613, + "grad_norm": 0.09767917543649673, + "learning_rate": 0.00015183501627295598, + "loss": 2.5993, + "step": 25217 + }, + { + "epoch": 0.7477982385908727, + "grad_norm": 0.09018096327781677, + "learning_rate": 0.00015180124831443774, + "loss": 2.6115, + "step": 25218 + }, + { + "epoch": 0.7478278919431842, + "grad_norm": 0.09415055066347122, + "learning_rate": 0.00015176748343927922, + "loss": 2.5936, + "step": 25219 + }, + { + "epoch": 0.7478575452954956, + "grad_norm": 0.09854955971240997, + "learning_rate": 0.00015173372164777933, + "loss": 2.5614, + "step": 25220 + }, + { + "epoch": 0.7478871986478072, + "grad_norm": 0.09454606473445892, + "learning_rate": 0.0001516999629402372, + "loss": 2.5777, + "step": 25221 + }, + { + "epoch": 0.7479168520001186, + "grad_norm": 0.11242947727441788, + "learning_rate": 0.00015166620731695165, + "loss": 2.6106, + "step": 25222 + }, + { + "epoch": 0.7479465053524301, + "grad_norm": 0.11031479388475418, + "learning_rate": 0.00015163245477822142, + "loss": 2.5852, + "step": 25223 + }, + { + "epoch": 0.7479761587047415, + "grad_norm": 0.09089823067188263, + "learning_rate": 0.00015159870532434577, + "loss": 2.5736, + "step": 25224 + }, + { + "epoch": 0.7480058120570531, + "grad_norm": 0.11143859475851059, + "learning_rate": 0.00015156495895562362, + "loss": 2.5841, + "step": 25225 + }, + { + "epoch": 0.7480354654093645, + "grad_norm": 0.0958549827337265, + "learning_rate": 0.00015153121567235335, + "loss": 2.5775, + "step": 25226 + }, + { + "epoch": 0.748065118761676, + "grad_norm": 0.09886838495731354, + "learning_rate": 0.00015149747547483401, + "loss": 2.6064, + "step": 25227 + }, + { + "epoch": 0.7480947721139874, + "grad_norm": 0.10633521527051926, + "learning_rate": 0.0001514637383633643, + "loss": 2.603, + "step": 25228 + }, + { + "epoch": 0.748124425466299, + "grad_norm": 0.08998887985944748, + "learning_rate": 0.00015143000433824307, + "loss": 2.6038, + "step": 25229 + }, + { + "epoch": 0.7481540788186104, + "grad_norm": 0.11078353971242905, + "learning_rate": 0.0001513962733997689, + "loss": 2.5987, + "step": 25230 + }, + { + "epoch": 0.7481837321709219, + "grad_norm": 0.10073788464069366, + "learning_rate": 0.00015136254554824063, + "loss": 2.6067, + "step": 25231 + }, + { + "epoch": 0.7482133855232334, + "grad_norm": 0.10685540735721588, + "learning_rate": 0.00015132882078395683, + "loss": 2.6127, + "step": 25232 + }, + { + "epoch": 0.7482430388755449, + "grad_norm": 0.09841912984848022, + "learning_rate": 0.00015129509910721616, + "loss": 2.6032, + "step": 25233 + }, + { + "epoch": 0.7482726922278563, + "grad_norm": 0.10905061662197113, + "learning_rate": 0.00015126138051831727, + "loss": 2.6124, + "step": 25234 + }, + { + "epoch": 0.7483023455801678, + "grad_norm": 0.10160988569259644, + "learning_rate": 0.00015122766501755874, + "loss": 2.6279, + "step": 25235 + }, + { + "epoch": 0.7483319989324794, + "grad_norm": 0.10932467132806778, + "learning_rate": 0.00015119395260523912, + "loss": 2.6003, + "step": 25236 + }, + { + "epoch": 0.7483616522847908, + "grad_norm": 0.09650439769029617, + "learning_rate": 0.00015116024328165685, + "loss": 2.5607, + "step": 25237 + }, + { + "epoch": 0.7483913056371023, + "grad_norm": 0.11091045290231705, + "learning_rate": 0.00015112653704711055, + "loss": 2.6085, + "step": 25238 + }, + { + "epoch": 0.7484209589894137, + "grad_norm": 0.09199647605419159, + "learning_rate": 0.0001510928339018986, + "loss": 2.5966, + "step": 25239 + }, + { + "epoch": 0.7484506123417253, + "grad_norm": 0.09659085422754288, + "learning_rate": 0.00015105913384631952, + "loss": 2.6041, + "step": 25240 + }, + { + "epoch": 0.7484802656940367, + "grad_norm": 0.09045685827732086, + "learning_rate": 0.00015102543688067172, + "loss": 2.5931, + "step": 25241 + }, + { + "epoch": 0.7485099190463482, + "grad_norm": 0.08982520550489426, + "learning_rate": 0.0001509917430052536, + "loss": 2.5882, + "step": 25242 + }, + { + "epoch": 0.7485395723986596, + "grad_norm": 0.09291709214448929, + "learning_rate": 0.00015095805222036345, + "loss": 2.6289, + "step": 25243 + }, + { + "epoch": 0.7485692257509712, + "grad_norm": 0.09339882433414459, + "learning_rate": 0.00015092436452629964, + "loss": 2.6037, + "step": 25244 + }, + { + "epoch": 0.7485988791032826, + "grad_norm": 0.08829180151224136, + "learning_rate": 0.00015089067992336057, + "loss": 2.5946, + "step": 25245 + }, + { + "epoch": 0.7486285324555941, + "grad_norm": 0.08729123324155807, + "learning_rate": 0.0001508569984118444, + "loss": 2.5892, + "step": 25246 + }, + { + "epoch": 0.7486581858079056, + "grad_norm": 0.08960961550474167, + "learning_rate": 0.00015082331999204945, + "loss": 2.6139, + "step": 25247 + }, + { + "epoch": 0.7486878391602171, + "grad_norm": 0.08586475998163223, + "learning_rate": 0.00015078964466427397, + "loss": 2.5836, + "step": 25248 + }, + { + "epoch": 0.7487174925125285, + "grad_norm": 0.08980953693389893, + "learning_rate": 0.0001507559724288161, + "loss": 2.5913, + "step": 25249 + }, + { + "epoch": 0.74874714586484, + "grad_norm": 0.08874659985303879, + "learning_rate": 0.00015072230328597408, + "loss": 2.6069, + "step": 25250 + }, + { + "epoch": 0.7487767992171515, + "grad_norm": 0.08993866294622421, + "learning_rate": 0.00015068863723604597, + "loss": 2.5879, + "step": 25251 + }, + { + "epoch": 0.748806452569463, + "grad_norm": 0.09454996138811111, + "learning_rate": 0.00015065497427932994, + "loss": 2.6005, + "step": 25252 + }, + { + "epoch": 0.7488361059217744, + "grad_norm": 0.09504088014364243, + "learning_rate": 0.00015062131441612408, + "loss": 2.6173, + "step": 25253 + }, + { + "epoch": 0.7488657592740859, + "grad_norm": 0.11031309515237808, + "learning_rate": 0.00015058765764672645, + "loss": 2.5894, + "step": 25254 + }, + { + "epoch": 0.7488954126263974, + "grad_norm": 0.08932528644800186, + "learning_rate": 0.0001505540039714351, + "loss": 2.5619, + "step": 25255 + }, + { + "epoch": 0.7489250659787089, + "grad_norm": 0.1250959187746048, + "learning_rate": 0.00015052035339054804, + "loss": 2.6163, + "step": 25256 + }, + { + "epoch": 0.7489547193310204, + "grad_norm": 0.09494350105524063, + "learning_rate": 0.0001504867059043632, + "loss": 2.6047, + "step": 25257 + }, + { + "epoch": 0.7489843726833318, + "grad_norm": 0.09896485507488251, + "learning_rate": 0.0001504530615131786, + "loss": 2.5889, + "step": 25258 + }, + { + "epoch": 0.7490140260356434, + "grad_norm": 0.10454487055540085, + "learning_rate": 0.00015041942021729228, + "loss": 2.5907, + "step": 25259 + }, + { + "epoch": 0.7490436793879548, + "grad_norm": 0.10890236496925354, + "learning_rate": 0.00015038578201700186, + "loss": 2.6093, + "step": 25260 + }, + { + "epoch": 0.7490733327402663, + "grad_norm": 0.10448035597801208, + "learning_rate": 0.00015035214691260534, + "loss": 2.5558, + "step": 25261 + }, + { + "epoch": 0.7491029860925777, + "grad_norm": 0.09429508447647095, + "learning_rate": 0.0001503185149044004, + "loss": 2.5975, + "step": 25262 + }, + { + "epoch": 0.7491326394448893, + "grad_norm": 0.10659915953874588, + "learning_rate": 0.00015028488599268524, + "loss": 2.6136, + "step": 25263 + }, + { + "epoch": 0.7491622927972007, + "grad_norm": 0.09841465950012207, + "learning_rate": 0.0001502512601777574, + "loss": 2.5635, + "step": 25264 + }, + { + "epoch": 0.7491919461495122, + "grad_norm": 0.10253030061721802, + "learning_rate": 0.00015021763745991468, + "loss": 2.6165, + "step": 25265 + }, + { + "epoch": 0.7492215995018237, + "grad_norm": 0.09655728191137314, + "learning_rate": 0.00015018401783945483, + "loss": 2.5618, + "step": 25266 + }, + { + "epoch": 0.7492512528541352, + "grad_norm": 0.0901445671916008, + "learning_rate": 0.00015015040131667557, + "loss": 2.5696, + "step": 25267 + }, + { + "epoch": 0.7492809062064466, + "grad_norm": 0.09059661626815796, + "learning_rate": 0.00015011678789187454, + "loss": 2.5728, + "step": 25268 + }, + { + "epoch": 0.7493105595587581, + "grad_norm": 0.09685096889734268, + "learning_rate": 0.00015008317756534957, + "loss": 2.6144, + "step": 25269 + }, + { + "epoch": 0.7493402129110696, + "grad_norm": 0.1005183681845665, + "learning_rate": 0.00015004957033739797, + "loss": 2.5903, + "step": 25270 + }, + { + "epoch": 0.7493698662633811, + "grad_norm": 0.09100538492202759, + "learning_rate": 0.0001500159662083175, + "loss": 2.6315, + "step": 25271 + }, + { + "epoch": 0.7493995196156925, + "grad_norm": 0.10426381230354309, + "learning_rate": 0.0001499823651784057, + "loss": 2.585, + "step": 25272 + }, + { + "epoch": 0.749429172968004, + "grad_norm": 0.08882053941488266, + "learning_rate": 0.00014994876724796013, + "loss": 2.5506, + "step": 25273 + }, + { + "epoch": 0.7494588263203155, + "grad_norm": 0.10002505034208298, + "learning_rate": 0.00014991517241727832, + "loss": 2.579, + "step": 25274 + }, + { + "epoch": 0.749488479672627, + "grad_norm": 0.09095529466867447, + "learning_rate": 0.00014988158068665757, + "loss": 2.5972, + "step": 25275 + }, + { + "epoch": 0.7495181330249384, + "grad_norm": 0.10128232836723328, + "learning_rate": 0.00014984799205639576, + "loss": 2.596, + "step": 25276 + }, + { + "epoch": 0.74954778637725, + "grad_norm": 0.10102375596761703, + "learning_rate": 0.00014981440652679003, + "loss": 2.6519, + "step": 25277 + }, + { + "epoch": 0.7495774397295615, + "grad_norm": 0.09550876170396805, + "learning_rate": 0.0001497808240981378, + "loss": 2.5922, + "step": 25278 + }, + { + "epoch": 0.7496070930818729, + "grad_norm": 0.0992896631360054, + "learning_rate": 0.00014974724477073654, + "loss": 2.6188, + "step": 25279 + }, + { + "epoch": 0.7496367464341844, + "grad_norm": 0.09548045694828033, + "learning_rate": 0.00014971366854488373, + "loss": 2.5872, + "step": 25280 + }, + { + "epoch": 0.7496663997864959, + "grad_norm": 0.08626090735197067, + "learning_rate": 0.0001496800954208763, + "loss": 2.6, + "step": 25281 + }, + { + "epoch": 0.7496960531388074, + "grad_norm": 0.09498404711484909, + "learning_rate": 0.00014964652539901176, + "loss": 2.6129, + "step": 25282 + }, + { + "epoch": 0.7497257064911188, + "grad_norm": 0.09850598871707916, + "learning_rate": 0.0001496129584795874, + "loss": 2.6041, + "step": 25283 + }, + { + "epoch": 0.7497553598434303, + "grad_norm": 0.09003061056137085, + "learning_rate": 0.00014957939466290045, + "loss": 2.574, + "step": 25284 + }, + { + "epoch": 0.7497850131957418, + "grad_norm": 0.09238868206739426, + "learning_rate": 0.00014954583394924804, + "loss": 2.6076, + "step": 25285 + }, + { + "epoch": 0.7498146665480533, + "grad_norm": 0.09857959300279617, + "learning_rate": 0.00014951227633892744, + "loss": 2.6026, + "step": 25286 + }, + { + "epoch": 0.7498443199003647, + "grad_norm": 0.09837985038757324, + "learning_rate": 0.00014947872183223586, + "loss": 2.5862, + "step": 25287 + }, + { + "epoch": 0.7498739732526762, + "grad_norm": 0.10236508399248123, + "learning_rate": 0.00014944517042947015, + "loss": 2.5931, + "step": 25288 + }, + { + "epoch": 0.7499036266049877, + "grad_norm": 0.0923645868897438, + "learning_rate": 0.00014941162213092778, + "loss": 2.6135, + "step": 25289 + }, + { + "epoch": 0.7499332799572992, + "grad_norm": 0.10383764654397964, + "learning_rate": 0.00014937807693690585, + "loss": 2.5717, + "step": 25290 + }, + { + "epoch": 0.7499629333096106, + "grad_norm": 0.10922138392925262, + "learning_rate": 0.0001493445348477011, + "loss": 2.5923, + "step": 25291 + }, + { + "epoch": 0.7499925866619221, + "grad_norm": 0.09115950763225555, + "learning_rate": 0.0001493109958636107, + "loss": 2.5896, + "step": 25292 + }, + { + "epoch": 0.7500222400142336, + "grad_norm": 0.09990448504686356, + "learning_rate": 0.0001492774599849316, + "loss": 2.5931, + "step": 25293 + }, + { + "epoch": 0.7500518933665451, + "grad_norm": 0.09706759452819824, + "learning_rate": 0.00014924392721196071, + "loss": 2.5663, + "step": 25294 + }, + { + "epoch": 0.7500815467188565, + "grad_norm": 0.1092100441455841, + "learning_rate": 0.00014921039754499515, + "loss": 2.6683, + "step": 25295 + }, + { + "epoch": 0.750111200071168, + "grad_norm": 0.09423243254423141, + "learning_rate": 0.0001491768709843317, + "loss": 2.5844, + "step": 25296 + }, + { + "epoch": 0.7501408534234795, + "grad_norm": 0.09965541958808899, + "learning_rate": 0.00014914334753026725, + "loss": 2.6244, + "step": 25297 + }, + { + "epoch": 0.750170506775791, + "grad_norm": 0.09903246909379959, + "learning_rate": 0.00014910982718309874, + "loss": 2.5846, + "step": 25298 + }, + { + "epoch": 0.7502001601281025, + "grad_norm": 0.0958503931760788, + "learning_rate": 0.00014907630994312295, + "loss": 2.5563, + "step": 25299 + }, + { + "epoch": 0.750229813480414, + "grad_norm": 0.10493969917297363, + "learning_rate": 0.00014904279581063663, + "loss": 2.5743, + "step": 25300 + }, + { + "epoch": 0.7502594668327255, + "grad_norm": 0.09831276535987854, + "learning_rate": 0.00014900928478593656, + "loss": 2.6169, + "step": 25301 + }, + { + "epoch": 0.7502891201850369, + "grad_norm": 0.09679894149303436, + "learning_rate": 0.00014897577686931956, + "loss": 2.5958, + "step": 25302 + }, + { + "epoch": 0.7503187735373484, + "grad_norm": 0.1058657094836235, + "learning_rate": 0.0001489422720610823, + "loss": 2.5903, + "step": 25303 + }, + { + "epoch": 0.7503484268896599, + "grad_norm": 0.09189621359109879, + "learning_rate": 0.0001489087703615215, + "loss": 2.5646, + "step": 25304 + }, + { + "epoch": 0.7503780802419714, + "grad_norm": 0.10495732724666595, + "learning_rate": 0.0001488752717709338, + "loss": 2.5736, + "step": 25305 + }, + { + "epoch": 0.7504077335942828, + "grad_norm": 0.09666085988283157, + "learning_rate": 0.00014884177628961582, + "loss": 2.5703, + "step": 25306 + }, + { + "epoch": 0.7504373869465943, + "grad_norm": 0.10181022435426712, + "learning_rate": 0.00014880828391786416, + "loss": 2.5981, + "step": 25307 + }, + { + "epoch": 0.7504670402989058, + "grad_norm": 0.10143633931875229, + "learning_rate": 0.00014877479465597544, + "loss": 2.5825, + "step": 25308 + }, + { + "epoch": 0.7504966936512173, + "grad_norm": 0.09674756228923798, + "learning_rate": 0.00014874130850424622, + "loss": 2.5548, + "step": 25309 + }, + { + "epoch": 0.7505263470035287, + "grad_norm": 0.09306923300027847, + "learning_rate": 0.00014870782546297302, + "loss": 2.5883, + "step": 25310 + }, + { + "epoch": 0.7505560003558402, + "grad_norm": 0.08407130092382431, + "learning_rate": 0.0001486743455324523, + "loss": 2.6536, + "step": 25311 + }, + { + "epoch": 0.7505856537081517, + "grad_norm": 0.1112465038895607, + "learning_rate": 0.00014864086871298055, + "loss": 2.6282, + "step": 25312 + }, + { + "epoch": 0.7506153070604632, + "grad_norm": 0.0879666656255722, + "learning_rate": 0.00014860739500485427, + "loss": 2.6162, + "step": 25313 + }, + { + "epoch": 0.7506449604127746, + "grad_norm": 0.10245667397975922, + "learning_rate": 0.00014857392440836975, + "loss": 2.6295, + "step": 25314 + }, + { + "epoch": 0.7506746137650862, + "grad_norm": 0.09233152866363525, + "learning_rate": 0.0001485404569238235, + "loss": 2.5922, + "step": 25315 + }, + { + "epoch": 0.7507042671173976, + "grad_norm": 0.09575916826725006, + "learning_rate": 0.00014850699255151178, + "loss": 2.6, + "step": 25316 + }, + { + "epoch": 0.7507339204697091, + "grad_norm": 0.09649801254272461, + "learning_rate": 0.00014847353129173103, + "loss": 2.6034, + "step": 25317 + }, + { + "epoch": 0.7507635738220205, + "grad_norm": 0.09706231951713562, + "learning_rate": 0.0001484400731447775, + "loss": 2.5906, + "step": 25318 + }, + { + "epoch": 0.7507932271743321, + "grad_norm": 0.10340717434883118, + "learning_rate": 0.00014840661811094742, + "loss": 2.5501, + "step": 25319 + }, + { + "epoch": 0.7508228805266436, + "grad_norm": 0.09800375998020172, + "learning_rate": 0.00014837316619053715, + "loss": 2.5859, + "step": 25320 + }, + { + "epoch": 0.750852533878955, + "grad_norm": 0.10102839767932892, + "learning_rate": 0.00014833971738384278, + "loss": 2.6173, + "step": 25321 + }, + { + "epoch": 0.7508821872312665, + "grad_norm": 0.10597419738769531, + "learning_rate": 0.00014830627169116063, + "loss": 2.6005, + "step": 25322 + }, + { + "epoch": 0.750911840583578, + "grad_norm": 0.09036269038915634, + "learning_rate": 0.00014827282911278677, + "loss": 2.5832, + "step": 25323 + }, + { + "epoch": 0.7509414939358895, + "grad_norm": 0.09388062357902527, + "learning_rate": 0.00014823938964901739, + "loss": 2.594, + "step": 25324 + }, + { + "epoch": 0.7509711472882009, + "grad_norm": 0.10043171048164368, + "learning_rate": 0.00014820595330014875, + "loss": 2.6077, + "step": 25325 + }, + { + "epoch": 0.7510008006405124, + "grad_norm": 0.0941598191857338, + "learning_rate": 0.00014817252006647664, + "loss": 2.579, + "step": 25326 + }, + { + "epoch": 0.7510304539928239, + "grad_norm": 0.10322199016809464, + "learning_rate": 0.00014813908994829711, + "loss": 2.5848, + "step": 25327 + }, + { + "epoch": 0.7510601073451354, + "grad_norm": 0.0961594507098198, + "learning_rate": 0.00014810566294590644, + "loss": 2.613, + "step": 25328 + }, + { + "epoch": 0.7510897606974468, + "grad_norm": 0.1048402339220047, + "learning_rate": 0.00014807223905960055, + "loss": 2.6153, + "step": 25329 + }, + { + "epoch": 0.7511194140497583, + "grad_norm": 0.09073807299137115, + "learning_rate": 0.0001480388182896754, + "loss": 2.6207, + "step": 25330 + }, + { + "epoch": 0.7511490674020698, + "grad_norm": 0.10718527436256409, + "learning_rate": 0.0001480054006364269, + "loss": 2.5675, + "step": 25331 + }, + { + "epoch": 0.7511787207543813, + "grad_norm": 0.09952108561992645, + "learning_rate": 0.00014797198610015105, + "loss": 2.5783, + "step": 25332 + }, + { + "epoch": 0.7512083741066927, + "grad_norm": 0.10658607631921768, + "learning_rate": 0.00014793857468114362, + "loss": 2.573, + "step": 25333 + }, + { + "epoch": 0.7512380274590043, + "grad_norm": 0.09943132847547531, + "learning_rate": 0.00014790516637970064, + "loss": 2.6015, + "step": 25334 + }, + { + "epoch": 0.7512676808113157, + "grad_norm": 0.10034585744142532, + "learning_rate": 0.00014787176119611796, + "loss": 2.5767, + "step": 25335 + }, + { + "epoch": 0.7512973341636272, + "grad_norm": 0.10557665675878525, + "learning_rate": 0.00014783835913069105, + "loss": 2.6261, + "step": 25336 + }, + { + "epoch": 0.7513269875159386, + "grad_norm": 0.10193585604429245, + "learning_rate": 0.00014780496018371598, + "loss": 2.5988, + "step": 25337 + }, + { + "epoch": 0.7513566408682502, + "grad_norm": 0.09877356886863708, + "learning_rate": 0.00014777156435548844, + "loss": 2.614, + "step": 25338 + }, + { + "epoch": 0.7513862942205616, + "grad_norm": 0.09828561544418335, + "learning_rate": 0.00014773817164630398, + "loss": 2.5992, + "step": 25339 + }, + { + "epoch": 0.7514159475728731, + "grad_norm": 0.1055857390165329, + "learning_rate": 0.00014770478205645865, + "loss": 2.5869, + "step": 25340 + }, + { + "epoch": 0.7514456009251846, + "grad_norm": 0.09208188951015472, + "learning_rate": 0.000147671395586248, + "loss": 2.6066, + "step": 25341 + }, + { + "epoch": 0.7514752542774961, + "grad_norm": 0.10406952351331711, + "learning_rate": 0.00014763801223596751, + "loss": 2.6035, + "step": 25342 + }, + { + "epoch": 0.7515049076298076, + "grad_norm": 0.09317358583211899, + "learning_rate": 0.00014760463200591295, + "loss": 2.6415, + "step": 25343 + }, + { + "epoch": 0.751534560982119, + "grad_norm": 0.10219195485115051, + "learning_rate": 0.0001475712548963799, + "loss": 2.6287, + "step": 25344 + }, + { + "epoch": 0.7515642143344305, + "grad_norm": 0.09759571403265, + "learning_rate": 0.00014753788090766395, + "loss": 2.6013, + "step": 25345 + }, + { + "epoch": 0.751593867686742, + "grad_norm": 0.09914153069257736, + "learning_rate": 0.0001475045100400605, + "loss": 2.5635, + "step": 25346 + }, + { + "epoch": 0.7516235210390535, + "grad_norm": 0.09709671139717102, + "learning_rate": 0.00014747114229386504, + "loss": 2.5955, + "step": 25347 + }, + { + "epoch": 0.7516531743913649, + "grad_norm": 0.10430341958999634, + "learning_rate": 0.00014743777766937322, + "loss": 2.6089, + "step": 25348 + }, + { + "epoch": 0.7516828277436765, + "grad_norm": 0.10674595087766647, + "learning_rate": 0.00014740441616688032, + "loss": 2.601, + "step": 25349 + }, + { + "epoch": 0.7517124810959879, + "grad_norm": 0.1007767841219902, + "learning_rate": 0.00014737105778668186, + "loss": 2.5964, + "step": 25350 + }, + { + "epoch": 0.7517421344482994, + "grad_norm": 0.09966287761926651, + "learning_rate": 0.00014733770252907318, + "loss": 2.6203, + "step": 25351 + }, + { + "epoch": 0.7517717878006108, + "grad_norm": 0.11048810184001923, + "learning_rate": 0.0001473043503943496, + "loss": 2.6352, + "step": 25352 + }, + { + "epoch": 0.7518014411529224, + "grad_norm": 0.09771115332841873, + "learning_rate": 0.00014727100138280664, + "loss": 2.5856, + "step": 25353 + }, + { + "epoch": 0.7518310945052338, + "grad_norm": 0.09692700207233429, + "learning_rate": 0.0001472376554947395, + "loss": 2.5669, + "step": 25354 + }, + { + "epoch": 0.7518607478575453, + "grad_norm": 0.10196175426244736, + "learning_rate": 0.00014720431273044344, + "loss": 2.6224, + "step": 25355 + }, + { + "epoch": 0.7518904012098567, + "grad_norm": 0.09876891225576401, + "learning_rate": 0.000147170973090214, + "loss": 2.5812, + "step": 25356 + }, + { + "epoch": 0.7519200545621683, + "grad_norm": 0.10189881175756454, + "learning_rate": 0.00014713763657434593, + "loss": 2.5949, + "step": 25357 + }, + { + "epoch": 0.7519497079144797, + "grad_norm": 0.09797447919845581, + "learning_rate": 0.00014710430318313466, + "loss": 2.6115, + "step": 25358 + }, + { + "epoch": 0.7519793612667912, + "grad_norm": 0.10803961008787155, + "learning_rate": 0.00014707097291687539, + "loss": 2.6079, + "step": 25359 + }, + { + "epoch": 0.7520090146191027, + "grad_norm": 0.09650211781263351, + "learning_rate": 0.0001470376457758632, + "loss": 2.6033, + "step": 25360 + }, + { + "epoch": 0.7520386679714142, + "grad_norm": 0.0989980548620224, + "learning_rate": 0.00014700432176039324, + "loss": 2.6265, + "step": 25361 + }, + { + "epoch": 0.7520683213237257, + "grad_norm": 0.10129048675298691, + "learning_rate": 0.00014697100087076066, + "loss": 2.5974, + "step": 25362 + }, + { + "epoch": 0.7520979746760371, + "grad_norm": 0.09291054308414459, + "learning_rate": 0.0001469376831072604, + "loss": 2.6044, + "step": 25363 + }, + { + "epoch": 0.7521276280283486, + "grad_norm": 0.11047995835542679, + "learning_rate": 0.00014690436847018758, + "loss": 2.6244, + "step": 25364 + }, + { + "epoch": 0.7521572813806601, + "grad_norm": 0.10315318405628204, + "learning_rate": 0.000146871056959837, + "loss": 2.6333, + "step": 25365 + }, + { + "epoch": 0.7521869347329716, + "grad_norm": 0.09509551525115967, + "learning_rate": 0.00014683774857650417, + "loss": 2.6252, + "step": 25366 + }, + { + "epoch": 0.752216588085283, + "grad_norm": 0.10003877431154251, + "learning_rate": 0.00014680444332048354, + "loss": 2.6205, + "step": 25367 + }, + { + "epoch": 0.7522462414375946, + "grad_norm": 0.09639193117618561, + "learning_rate": 0.00014677114119207018, + "loss": 2.5782, + "step": 25368 + }, + { + "epoch": 0.752275894789906, + "grad_norm": 0.10376134514808655, + "learning_rate": 0.000146737842191559, + "loss": 2.6099, + "step": 25369 + }, + { + "epoch": 0.7523055481422175, + "grad_norm": 0.10196021944284439, + "learning_rate": 0.0001467045463192449, + "loss": 2.6087, + "step": 25370 + }, + { + "epoch": 0.7523352014945289, + "grad_norm": 0.09449424594640732, + "learning_rate": 0.00014667125357542267, + "loss": 2.5928, + "step": 25371 + }, + { + "epoch": 0.7523648548468405, + "grad_norm": 0.09778359532356262, + "learning_rate": 0.0001466379639603871, + "loss": 2.6062, + "step": 25372 + }, + { + "epoch": 0.7523945081991519, + "grad_norm": 0.09860401600599289, + "learning_rate": 0.00014660467747443302, + "loss": 2.6256, + "step": 25373 + }, + { + "epoch": 0.7524241615514634, + "grad_norm": 0.09302442520856857, + "learning_rate": 0.0001465713941178552, + "loss": 2.5851, + "step": 25374 + }, + { + "epoch": 0.7524538149037748, + "grad_norm": 0.09783956408500671, + "learning_rate": 0.00014653811389094833, + "loss": 2.5858, + "step": 25375 + }, + { + "epoch": 0.7524834682560864, + "grad_norm": 0.09248930960893631, + "learning_rate": 0.00014650483679400713, + "loss": 2.6231, + "step": 25376 + }, + { + "epoch": 0.7525131216083978, + "grad_norm": 0.0913204774260521, + "learning_rate": 0.00014647156282732632, + "loss": 2.6073, + "step": 25377 + }, + { + "epoch": 0.7525427749607093, + "grad_norm": 0.08942323178052902, + "learning_rate": 0.00014643829199120046, + "loss": 2.6081, + "step": 25378 + }, + { + "epoch": 0.7525724283130207, + "grad_norm": 0.09844674915075302, + "learning_rate": 0.00014640502428592422, + "loss": 2.5877, + "step": 25379 + }, + { + "epoch": 0.7526020816653323, + "grad_norm": 0.0912671610713005, + "learning_rate": 0.00014637175971179219, + "loss": 2.5984, + "step": 25380 + }, + { + "epoch": 0.7526317350176438, + "grad_norm": 0.10085266828536987, + "learning_rate": 0.00014633849826909895, + "loss": 2.6374, + "step": 25381 + }, + { + "epoch": 0.7526613883699552, + "grad_norm": 0.10822383314371109, + "learning_rate": 0.000146305239958139, + "loss": 2.5774, + "step": 25382 + }, + { + "epoch": 0.7526910417222668, + "grad_norm": 0.0956464633345604, + "learning_rate": 0.00014627198477920684, + "loss": 2.5937, + "step": 25383 + }, + { + "epoch": 0.7527206950745782, + "grad_norm": 0.09981915354728699, + "learning_rate": 0.00014623873273259696, + "loss": 2.5939, + "step": 25384 + }, + { + "epoch": 0.7527503484268897, + "grad_norm": 0.09528873860836029, + "learning_rate": 0.0001462054838186039, + "loss": 2.6069, + "step": 25385 + }, + { + "epoch": 0.7527800017792011, + "grad_norm": 0.1037270724773407, + "learning_rate": 0.0001461722380375219, + "loss": 2.6106, + "step": 25386 + }, + { + "epoch": 0.7528096551315127, + "grad_norm": 0.09001407027244568, + "learning_rate": 0.00014613899538964555, + "loss": 2.6196, + "step": 25387 + }, + { + "epoch": 0.7528393084838241, + "grad_norm": 0.09836653620004654, + "learning_rate": 0.00014610575587526909, + "loss": 2.5739, + "step": 25388 + }, + { + "epoch": 0.7528689618361356, + "grad_norm": 0.0934203490614891, + "learning_rate": 0.00014607251949468693, + "loss": 2.6006, + "step": 25389 + }, + { + "epoch": 0.752898615188447, + "grad_norm": 0.0928676500916481, + "learning_rate": 0.0001460392862481935, + "loss": 2.5901, + "step": 25390 + }, + { + "epoch": 0.7529282685407586, + "grad_norm": 0.10310368984937668, + "learning_rate": 0.00014600605613608264, + "loss": 2.5971, + "step": 25391 + }, + { + "epoch": 0.75295792189307, + "grad_norm": 0.09029872715473175, + "learning_rate": 0.00014597282915864907, + "loss": 2.5924, + "step": 25392 + }, + { + "epoch": 0.7529875752453815, + "grad_norm": 0.09861495345830917, + "learning_rate": 0.0001459396053161869, + "loss": 2.5739, + "step": 25393 + }, + { + "epoch": 0.7530172285976929, + "grad_norm": 0.09742332994937897, + "learning_rate": 0.00014590638460899025, + "loss": 2.6013, + "step": 25394 + }, + { + "epoch": 0.7530468819500045, + "grad_norm": 0.1037479117512703, + "learning_rate": 0.00014587316703735338, + "loss": 2.5903, + "step": 25395 + }, + { + "epoch": 0.7530765353023159, + "grad_norm": 0.10354439169168472, + "learning_rate": 0.0001458399526015704, + "loss": 2.6169, + "step": 25396 + }, + { + "epoch": 0.7531061886546274, + "grad_norm": 0.09981260448694229, + "learning_rate": 0.0001458067413019354, + "loss": 2.5975, + "step": 25397 + }, + { + "epoch": 0.7531358420069388, + "grad_norm": 0.09132877737283707, + "learning_rate": 0.00014577353313874252, + "loss": 2.6387, + "step": 25398 + }, + { + "epoch": 0.7531654953592504, + "grad_norm": 0.09473533183336258, + "learning_rate": 0.00014574032811228582, + "loss": 2.5823, + "step": 25399 + }, + { + "epoch": 0.7531951487115618, + "grad_norm": 0.09204939007759094, + "learning_rate": 0.0001457071262228593, + "loss": 2.5794, + "step": 25400 + }, + { + "epoch": 0.7532248020638733, + "grad_norm": 0.09564459323883057, + "learning_rate": 0.00014567392747075715, + "loss": 2.6275, + "step": 25401 + }, + { + "epoch": 0.7532544554161849, + "grad_norm": 0.09378065913915634, + "learning_rate": 0.00014564073185627303, + "loss": 2.5766, + "step": 25402 + }, + { + "epoch": 0.7532841087684963, + "grad_norm": 0.09355396777391434, + "learning_rate": 0.00014560753937970105, + "loss": 2.6332, + "step": 25403 + }, + { + "epoch": 0.7533137621208078, + "grad_norm": 0.09800974279642105, + "learning_rate": 0.00014557435004133501, + "loss": 2.603, + "step": 25404 + }, + { + "epoch": 0.7533434154731192, + "grad_norm": 0.08621308952569962, + "learning_rate": 0.0001455411638414691, + "loss": 2.6388, + "step": 25405 + }, + { + "epoch": 0.7533730688254308, + "grad_norm": 0.10632119327783585, + "learning_rate": 0.00014550798078039702, + "loss": 2.5841, + "step": 25406 + }, + { + "epoch": 0.7534027221777422, + "grad_norm": 0.08485155552625656, + "learning_rate": 0.00014547480085841258, + "loss": 2.566, + "step": 25407 + }, + { + "epoch": 0.7534323755300537, + "grad_norm": 0.08702410757541656, + "learning_rate": 0.0001454416240758097, + "loss": 2.6167, + "step": 25408 + }, + { + "epoch": 0.7534620288823651, + "grad_norm": 0.09972666203975677, + "learning_rate": 0.00014540845043288202, + "loss": 2.5817, + "step": 25409 + }, + { + "epoch": 0.7534916822346767, + "grad_norm": 0.10236255079507828, + "learning_rate": 0.0001453752799299234, + "loss": 2.5874, + "step": 25410 + }, + { + "epoch": 0.7535213355869881, + "grad_norm": 0.09227844327688217, + "learning_rate": 0.00014534211256722775, + "loss": 2.6078, + "step": 25411 + }, + { + "epoch": 0.7535509889392996, + "grad_norm": 0.09645093232393265, + "learning_rate": 0.00014530894834508836, + "loss": 2.5845, + "step": 25412 + }, + { + "epoch": 0.753580642291611, + "grad_norm": 0.11275888234376907, + "learning_rate": 0.00014527578726379915, + "loss": 2.6379, + "step": 25413 + }, + { + "epoch": 0.7536102956439226, + "grad_norm": 0.10157691687345505, + "learning_rate": 0.00014524262932365367, + "loss": 2.5846, + "step": 25414 + }, + { + "epoch": 0.753639948996234, + "grad_norm": 0.09956391900777817, + "learning_rate": 0.00014520947452494566, + "loss": 2.6045, + "step": 25415 + }, + { + "epoch": 0.7536696023485455, + "grad_norm": 0.0997825562953949, + "learning_rate": 0.00014517632286796861, + "loss": 2.5995, + "step": 25416 + }, + { + "epoch": 0.7536992557008569, + "grad_norm": 0.08274253457784653, + "learning_rate": 0.000145143174353016, + "loss": 2.5905, + "step": 25417 + }, + { + "epoch": 0.7537289090531685, + "grad_norm": 0.1047942042350769, + "learning_rate": 0.00014511002898038168, + "loss": 2.6098, + "step": 25418 + }, + { + "epoch": 0.7537585624054799, + "grad_norm": 0.09688066691160202, + "learning_rate": 0.0001450768867503589, + "loss": 2.5993, + "step": 25419 + }, + { + "epoch": 0.7537882157577914, + "grad_norm": 0.09964017570018768, + "learning_rate": 0.00014504374766324124, + "loss": 2.5989, + "step": 25420 + }, + { + "epoch": 0.7538178691101028, + "grad_norm": 0.09116896241903305, + "learning_rate": 0.00014501061171932227, + "loss": 2.602, + "step": 25421 + }, + { + "epoch": 0.7538475224624144, + "grad_norm": 0.09200751781463623, + "learning_rate": 0.00014497747891889512, + "loss": 2.597, + "step": 25422 + }, + { + "epoch": 0.7538771758147259, + "grad_norm": 0.09803009033203125, + "learning_rate": 0.00014494434926225336, + "loss": 2.6061, + "step": 25423 + }, + { + "epoch": 0.7539068291670373, + "grad_norm": 0.09645649790763855, + "learning_rate": 0.0001449112227496903, + "loss": 2.6058, + "step": 25424 + }, + { + "epoch": 0.7539364825193489, + "grad_norm": 0.09255602210760117, + "learning_rate": 0.00014487809938149932, + "loss": 2.5817, + "step": 25425 + }, + { + "epoch": 0.7539661358716603, + "grad_norm": 0.09243062883615494, + "learning_rate": 0.00014484497915797373, + "loss": 2.5837, + "step": 25426 + }, + { + "epoch": 0.7539957892239718, + "grad_norm": 0.0881362184882164, + "learning_rate": 0.00014481186207940678, + "loss": 2.5878, + "step": 25427 + }, + { + "epoch": 0.7540254425762832, + "grad_norm": 0.0909448191523552, + "learning_rate": 0.00014477874814609182, + "loss": 2.5848, + "step": 25428 + }, + { + "epoch": 0.7540550959285948, + "grad_norm": 0.0903245359659195, + "learning_rate": 0.00014474563735832196, + "loss": 2.5904, + "step": 25429 + }, + { + "epoch": 0.7540847492809062, + "grad_norm": 0.10402342677116394, + "learning_rate": 0.00014471252971639032, + "loss": 2.6093, + "step": 25430 + }, + { + "epoch": 0.7541144026332177, + "grad_norm": 0.08912607282400131, + "learning_rate": 0.00014467942522059036, + "loss": 2.5943, + "step": 25431 + }, + { + "epoch": 0.7541440559855291, + "grad_norm": 0.10272686183452606, + "learning_rate": 0.00014464632387121528, + "loss": 2.5776, + "step": 25432 + }, + { + "epoch": 0.7541737093378407, + "grad_norm": 0.09843132644891739, + "learning_rate": 0.00014461322566855777, + "loss": 2.6194, + "step": 25433 + }, + { + "epoch": 0.7542033626901521, + "grad_norm": 0.10612457245588303, + "learning_rate": 0.0001445801306129112, + "loss": 2.6242, + "step": 25434 + }, + { + "epoch": 0.7542330160424636, + "grad_norm": 0.0931023508310318, + "learning_rate": 0.0001445470387045686, + "loss": 2.5762, + "step": 25435 + }, + { + "epoch": 0.754262669394775, + "grad_norm": 0.09823086857795715, + "learning_rate": 0.00014451394994382293, + "loss": 2.6087, + "step": 25436 + }, + { + "epoch": 0.7542923227470866, + "grad_norm": 0.10172372311353683, + "learning_rate": 0.0001444808643309673, + "loss": 2.5901, + "step": 25437 + }, + { + "epoch": 0.754321976099398, + "grad_norm": 0.10376991331577301, + "learning_rate": 0.0001444477818662946, + "loss": 2.5816, + "step": 25438 + }, + { + "epoch": 0.7543516294517095, + "grad_norm": 0.09956216812133789, + "learning_rate": 0.00014441470255009787, + "loss": 2.584, + "step": 25439 + }, + { + "epoch": 0.754381282804021, + "grad_norm": 0.09669575095176697, + "learning_rate": 0.00014438162638266995, + "loss": 2.5688, + "step": 25440 + }, + { + "epoch": 0.7544109361563325, + "grad_norm": 0.08970178663730621, + "learning_rate": 0.00014434855336430374, + "loss": 2.5801, + "step": 25441 + }, + { + "epoch": 0.7544405895086439, + "grad_norm": 0.10051030665636063, + "learning_rate": 0.00014431548349529217, + "loss": 2.5841, + "step": 25442 + }, + { + "epoch": 0.7544702428609554, + "grad_norm": 0.09701458364725113, + "learning_rate": 0.00014428241677592806, + "loss": 2.5893, + "step": 25443 + }, + { + "epoch": 0.754499896213267, + "grad_norm": 0.09763050824403763, + "learning_rate": 0.00014424935320650419, + "loss": 2.5912, + "step": 25444 + }, + { + "epoch": 0.7545295495655784, + "grad_norm": 0.10301917046308517, + "learning_rate": 0.00014421629278731334, + "loss": 2.609, + "step": 25445 + }, + { + "epoch": 0.7545592029178899, + "grad_norm": 0.09145627915859222, + "learning_rate": 0.00014418323551864832, + "loss": 2.5927, + "step": 25446 + }, + { + "epoch": 0.7545888562702013, + "grad_norm": 0.10518616437911987, + "learning_rate": 0.00014415018140080183, + "loss": 2.5647, + "step": 25447 + }, + { + "epoch": 0.7546185096225129, + "grad_norm": 0.10218816995620728, + "learning_rate": 0.00014411713043406655, + "loss": 2.5559, + "step": 25448 + }, + { + "epoch": 0.7546481629748243, + "grad_norm": 0.10216739773750305, + "learning_rate": 0.00014408408261873517, + "loss": 2.6002, + "step": 25449 + }, + { + "epoch": 0.7546778163271358, + "grad_norm": 0.10834003984928131, + "learning_rate": 0.00014405103795510032, + "loss": 2.6051, + "step": 25450 + }, + { + "epoch": 0.7547074696794472, + "grad_norm": 0.10240782797336578, + "learning_rate": 0.00014401799644345466, + "loss": 2.5847, + "step": 25451 + }, + { + "epoch": 0.7547371230317588, + "grad_norm": 0.10261264443397522, + "learning_rate": 0.00014398495808409068, + "loss": 2.601, + "step": 25452 + }, + { + "epoch": 0.7547667763840702, + "grad_norm": 0.09436756372451782, + "learning_rate": 0.00014395192287730107, + "loss": 2.6004, + "step": 25453 + }, + { + "epoch": 0.7547964297363817, + "grad_norm": 0.10220323503017426, + "learning_rate": 0.00014391889082337827, + "loss": 2.6119, + "step": 25454 + }, + { + "epoch": 0.7548260830886931, + "grad_norm": 0.09609510004520416, + "learning_rate": 0.00014388586192261483, + "loss": 2.6009, + "step": 25455 + }, + { + "epoch": 0.7548557364410047, + "grad_norm": 0.09593429416418076, + "learning_rate": 0.0001438528361753032, + "loss": 2.5778, + "step": 25456 + }, + { + "epoch": 0.7548853897933161, + "grad_norm": 0.09540308266878128, + "learning_rate": 0.00014381981358173578, + "loss": 2.595, + "step": 25457 + }, + { + "epoch": 0.7549150431456276, + "grad_norm": 0.09244538843631744, + "learning_rate": 0.00014378679414220514, + "loss": 2.5887, + "step": 25458 + }, + { + "epoch": 0.754944696497939, + "grad_norm": 0.09353740513324738, + "learning_rate": 0.00014375377785700354, + "loss": 2.6038, + "step": 25459 + }, + { + "epoch": 0.7549743498502506, + "grad_norm": 0.1135692149400711, + "learning_rate": 0.00014372076472642337, + "loss": 2.6192, + "step": 25460 + }, + { + "epoch": 0.755004003202562, + "grad_norm": 0.0936872735619545, + "learning_rate": 0.00014368775475075702, + "loss": 2.5793, + "step": 25461 + }, + { + "epoch": 0.7550336565548735, + "grad_norm": 0.10823957622051239, + "learning_rate": 0.0001436547479302967, + "loss": 2.5822, + "step": 25462 + }, + { + "epoch": 0.755063309907185, + "grad_norm": 0.10034497082233429, + "learning_rate": 0.0001436217442653348, + "loss": 2.5864, + "step": 25463 + }, + { + "epoch": 0.7550929632594965, + "grad_norm": 0.09749431163072586, + "learning_rate": 0.00014358874375616353, + "loss": 2.5775, + "step": 25464 + }, + { + "epoch": 0.755122616611808, + "grad_norm": 0.09499523788690567, + "learning_rate": 0.0001435557464030751, + "loss": 2.5842, + "step": 25465 + }, + { + "epoch": 0.7551522699641194, + "grad_norm": 0.09195561707019806, + "learning_rate": 0.0001435227522063619, + "loss": 2.5708, + "step": 25466 + }, + { + "epoch": 0.755181923316431, + "grad_norm": 0.10705116391181946, + "learning_rate": 0.00014348976116631575, + "loss": 2.5818, + "step": 25467 + }, + { + "epoch": 0.7552115766687424, + "grad_norm": 0.09779825806617737, + "learning_rate": 0.00014345677328322893, + "loss": 2.6126, + "step": 25468 + }, + { + "epoch": 0.7552412300210539, + "grad_norm": 0.11060589551925659, + "learning_rate": 0.0001434237885573934, + "loss": 2.5945, + "step": 25469 + }, + { + "epoch": 0.7552708833733653, + "grad_norm": 0.1055692732334137, + "learning_rate": 0.00014339080698910168, + "loss": 2.5796, + "step": 25470 + }, + { + "epoch": 0.7553005367256769, + "grad_norm": 0.10620418190956116, + "learning_rate": 0.0001433578285786455, + "loss": 2.6092, + "step": 25471 + }, + { + "epoch": 0.7553301900779883, + "grad_norm": 0.09817415475845337, + "learning_rate": 0.000143324853326317, + "loss": 2.5871, + "step": 25472 + }, + { + "epoch": 0.7553598434302998, + "grad_norm": 0.11355914175510406, + "learning_rate": 0.0001432918812324081, + "loss": 2.5861, + "step": 25473 + }, + { + "epoch": 0.7553894967826112, + "grad_norm": 0.10284591466188431, + "learning_rate": 0.0001432589122972109, + "loss": 2.5956, + "step": 25474 + }, + { + "epoch": 0.7554191501349228, + "grad_norm": 0.09892759472131729, + "learning_rate": 0.00014322594652101716, + "loss": 2.6032, + "step": 25475 + }, + { + "epoch": 0.7554488034872342, + "grad_norm": 0.09563393890857697, + "learning_rate": 0.000143192983904119, + "loss": 2.5992, + "step": 25476 + }, + { + "epoch": 0.7554784568395457, + "grad_norm": 0.10312259942293167, + "learning_rate": 0.00014316002444680833, + "loss": 2.6155, + "step": 25477 + }, + { + "epoch": 0.7555081101918572, + "grad_norm": 0.10883844643831253, + "learning_rate": 0.00014312706814937677, + "loss": 2.5849, + "step": 25478 + }, + { + "epoch": 0.7555377635441687, + "grad_norm": 0.08863400667905807, + "learning_rate": 0.00014309411501211623, + "loss": 2.5523, + "step": 25479 + }, + { + "epoch": 0.7555674168964801, + "grad_norm": 0.10076094418764114, + "learning_rate": 0.00014306116503531857, + "loss": 2.6388, + "step": 25480 + }, + { + "epoch": 0.7555970702487916, + "grad_norm": 0.10231608152389526, + "learning_rate": 0.0001430282182192756, + "loss": 2.6092, + "step": 25481 + }, + { + "epoch": 0.7556267236011031, + "grad_norm": 0.08792220801115036, + "learning_rate": 0.0001429952745642788, + "loss": 2.5916, + "step": 25482 + }, + { + "epoch": 0.7556563769534146, + "grad_norm": 0.10082919150590897, + "learning_rate": 0.00014296233407062032, + "loss": 2.6139, + "step": 25483 + }, + { + "epoch": 0.755686030305726, + "grad_norm": 0.08999331295490265, + "learning_rate": 0.00014292939673859169, + "loss": 2.59, + "step": 25484 + }, + { + "epoch": 0.7557156836580375, + "grad_norm": 0.09146998077630997, + "learning_rate": 0.0001428964625684845, + "loss": 2.6033, + "step": 25485 + }, + { + "epoch": 0.7557453370103491, + "grad_norm": 0.09547338634729385, + "learning_rate": 0.00014286353156059046, + "loss": 2.5695, + "step": 25486 + }, + { + "epoch": 0.7557749903626605, + "grad_norm": 0.09379488974809647, + "learning_rate": 0.0001428306037152013, + "loss": 2.5924, + "step": 25487 + }, + { + "epoch": 0.755804643714972, + "grad_norm": 0.09854873269796371, + "learning_rate": 0.00014279767903260825, + "loss": 2.5941, + "step": 25488 + }, + { + "epoch": 0.7558342970672834, + "grad_norm": 0.08737541735172272, + "learning_rate": 0.00014276475751310314, + "loss": 2.5792, + "step": 25489 + }, + { + "epoch": 0.755863950419595, + "grad_norm": 0.09497088193893433, + "learning_rate": 0.00014273183915697736, + "loss": 2.6026, + "step": 25490 + }, + { + "epoch": 0.7558936037719064, + "grad_norm": 0.08868733048439026, + "learning_rate": 0.0001426989239645225, + "loss": 2.5807, + "step": 25491 + }, + { + "epoch": 0.7559232571242179, + "grad_norm": 0.10321952402591705, + "learning_rate": 0.00014266601193602997, + "loss": 2.5377, + "step": 25492 + }, + { + "epoch": 0.7559529104765293, + "grad_norm": 0.09149210900068283, + "learning_rate": 0.00014263310307179128, + "loss": 2.5918, + "step": 25493 + }, + { + "epoch": 0.7559825638288409, + "grad_norm": 0.10247785598039627, + "learning_rate": 0.00014260019737209777, + "loss": 2.617, + "step": 25494 + }, + { + "epoch": 0.7560122171811523, + "grad_norm": 0.09809235483407974, + "learning_rate": 0.0001425672948372407, + "loss": 2.6214, + "step": 25495 + }, + { + "epoch": 0.7560418705334638, + "grad_norm": 0.0890641063451767, + "learning_rate": 0.00014253439546751178, + "loss": 2.5955, + "step": 25496 + }, + { + "epoch": 0.7560715238857753, + "grad_norm": 0.09623704105615616, + "learning_rate": 0.0001425014992632021, + "loss": 2.588, + "step": 25497 + }, + { + "epoch": 0.7561011772380868, + "grad_norm": 0.09735686331987381, + "learning_rate": 0.00014246860622460318, + "loss": 2.6033, + "step": 25498 + }, + { + "epoch": 0.7561308305903982, + "grad_norm": 0.09877163171768188, + "learning_rate": 0.00014243571635200598, + "loss": 2.587, + "step": 25499 + }, + { + "epoch": 0.7561604839427097, + "grad_norm": 0.09828633069992065, + "learning_rate": 0.00014240282964570188, + "loss": 2.6033, + "step": 25500 + }, + { + "epoch": 0.7561901372950212, + "grad_norm": 0.10862350463867188, + "learning_rate": 0.0001423699461059821, + "loss": 2.6082, + "step": 25501 + }, + { + "epoch": 0.7562197906473327, + "grad_norm": 0.094425730407238, + "learning_rate": 0.00014233706573313788, + "loss": 2.616, + "step": 25502 + }, + { + "epoch": 0.7562494439996441, + "grad_norm": 0.11202002316713333, + "learning_rate": 0.00014230418852746024, + "loss": 2.5647, + "step": 25503 + }, + { + "epoch": 0.7562790973519556, + "grad_norm": 0.1010655015707016, + "learning_rate": 0.00014227131448924047, + "loss": 2.6341, + "step": 25504 + }, + { + "epoch": 0.7563087507042671, + "grad_norm": 0.10757365077733994, + "learning_rate": 0.00014223844361876964, + "loss": 2.6015, + "step": 25505 + }, + { + "epoch": 0.7563384040565786, + "grad_norm": 0.11441381275653839, + "learning_rate": 0.00014220557591633875, + "loss": 2.599, + "step": 25506 + }, + { + "epoch": 0.7563680574088901, + "grad_norm": 0.09270355850458145, + "learning_rate": 0.00014217271138223893, + "loss": 2.6062, + "step": 25507 + }, + { + "epoch": 0.7563977107612015, + "grad_norm": 0.1150112971663475, + "learning_rate": 0.0001421398500167611, + "loss": 2.5923, + "step": 25508 + }, + { + "epoch": 0.7564273641135131, + "grad_norm": 0.0877443179488182, + "learning_rate": 0.00014210699182019642, + "loss": 2.58, + "step": 25509 + }, + { + "epoch": 0.7564570174658245, + "grad_norm": 0.1115962490439415, + "learning_rate": 0.0001420741367928357, + "loss": 2.5997, + "step": 25510 + }, + { + "epoch": 0.756486670818136, + "grad_norm": 0.09755799919366837, + "learning_rate": 0.00014204128493497, + "loss": 2.5614, + "step": 25511 + }, + { + "epoch": 0.7565163241704475, + "grad_norm": 0.09746671468019485, + "learning_rate": 0.0001420084362468901, + "loss": 2.584, + "step": 25512 + }, + { + "epoch": 0.756545977522759, + "grad_norm": 0.10930673778057098, + "learning_rate": 0.00014197559072888694, + "loss": 2.5782, + "step": 25513 + }, + { + "epoch": 0.7565756308750704, + "grad_norm": 0.09081760793924332, + "learning_rate": 0.00014194274838125144, + "loss": 2.5811, + "step": 25514 + }, + { + "epoch": 0.7566052842273819, + "grad_norm": 0.10056430846452713, + "learning_rate": 0.00014190990920427433, + "loss": 2.5902, + "step": 25515 + }, + { + "epoch": 0.7566349375796934, + "grad_norm": 0.09496521949768066, + "learning_rate": 0.0001418770731982464, + "loss": 2.5898, + "step": 25516 + }, + { + "epoch": 0.7566645909320049, + "grad_norm": 0.10831715166568756, + "learning_rate": 0.00014184424036345849, + "loss": 2.6442, + "step": 25517 + }, + { + "epoch": 0.7566942442843163, + "grad_norm": 0.08732583373785019, + "learning_rate": 0.00014181141070020132, + "loss": 2.5973, + "step": 25518 + }, + { + "epoch": 0.7567238976366278, + "grad_norm": 0.10772816836833954, + "learning_rate": 0.0001417785842087656, + "loss": 2.5864, + "step": 25519 + }, + { + "epoch": 0.7567535509889393, + "grad_norm": 0.08758878707885742, + "learning_rate": 0.00014174576088944195, + "loss": 2.5881, + "step": 25520 + }, + { + "epoch": 0.7567832043412508, + "grad_norm": 0.11126261949539185, + "learning_rate": 0.0001417129407425211, + "loss": 2.5818, + "step": 25521 + }, + { + "epoch": 0.7568128576935622, + "grad_norm": 0.10024050623178482, + "learning_rate": 0.0001416801237682937, + "loss": 2.6301, + "step": 25522 + }, + { + "epoch": 0.7568425110458737, + "grad_norm": 0.10604259371757507, + "learning_rate": 0.00014164730996705027, + "loss": 2.5563, + "step": 25523 + }, + { + "epoch": 0.7568721643981852, + "grad_norm": 0.1026151031255722, + "learning_rate": 0.00014161449933908143, + "loss": 2.5887, + "step": 25524 + }, + { + "epoch": 0.7569018177504967, + "grad_norm": 0.09181836247444153, + "learning_rate": 0.00014158169188467772, + "loss": 2.6136, + "step": 25525 + }, + { + "epoch": 0.7569314711028081, + "grad_norm": 0.11098400503396988, + "learning_rate": 0.0001415488876041297, + "loss": 2.6053, + "step": 25526 + }, + { + "epoch": 0.7569611244551196, + "grad_norm": 0.09208580106496811, + "learning_rate": 0.00014151608649772774, + "loss": 2.6126, + "step": 25527 + }, + { + "epoch": 0.7569907778074312, + "grad_norm": 0.10074347257614136, + "learning_rate": 0.00014148328856576238, + "loss": 2.6042, + "step": 25528 + }, + { + "epoch": 0.7570204311597426, + "grad_norm": 0.0895785540342331, + "learning_rate": 0.00014145049380852404, + "loss": 2.5919, + "step": 25529 + }, + { + "epoch": 0.7570500845120541, + "grad_norm": 0.10304614156484604, + "learning_rate": 0.0001414177022263031, + "loss": 2.5861, + "step": 25530 + }, + { + "epoch": 0.7570797378643656, + "grad_norm": 0.0956297218799591, + "learning_rate": 0.00014138491381939, + "loss": 2.5536, + "step": 25531 + }, + { + "epoch": 0.7571093912166771, + "grad_norm": 0.09532621502876282, + "learning_rate": 0.00014135212858807516, + "loss": 2.6128, + "step": 25532 + }, + { + "epoch": 0.7571390445689885, + "grad_norm": 0.10941287875175476, + "learning_rate": 0.00014131934653264854, + "loss": 2.5947, + "step": 25533 + }, + { + "epoch": 0.7571686979213, + "grad_norm": 0.0911460816860199, + "learning_rate": 0.00014128656765340075, + "loss": 2.5971, + "step": 25534 + }, + { + "epoch": 0.7571983512736115, + "grad_norm": 0.10814712941646576, + "learning_rate": 0.00014125379195062204, + "loss": 2.5891, + "step": 25535 + }, + { + "epoch": 0.757228004625923, + "grad_norm": 0.09765780717134476, + "learning_rate": 0.00014122101942460252, + "loss": 2.5816, + "step": 25536 + }, + { + "epoch": 0.7572576579782344, + "grad_norm": 0.09504131972789764, + "learning_rate": 0.00014118825007563253, + "loss": 2.6034, + "step": 25537 + }, + { + "epoch": 0.7572873113305459, + "grad_norm": 0.09375189989805222, + "learning_rate": 0.00014115548390400206, + "loss": 2.5635, + "step": 25538 + }, + { + "epoch": 0.7573169646828574, + "grad_norm": 0.09587301313877106, + "learning_rate": 0.0001411227209100015, + "loss": 2.5932, + "step": 25539 + }, + { + "epoch": 0.7573466180351689, + "grad_norm": 0.09834787994623184, + "learning_rate": 0.00014108996109392076, + "loss": 2.6012, + "step": 25540 + }, + { + "epoch": 0.7573762713874803, + "grad_norm": 0.09798354655504227, + "learning_rate": 0.0001410572044560501, + "loss": 2.5856, + "step": 25541 + }, + { + "epoch": 0.7574059247397918, + "grad_norm": 0.09446779638528824, + "learning_rate": 0.00014102445099667955, + "loss": 2.5792, + "step": 25542 + }, + { + "epoch": 0.7574355780921033, + "grad_norm": 0.08467139303684235, + "learning_rate": 0.000140991700716099, + "loss": 2.5785, + "step": 25543 + }, + { + "epoch": 0.7574652314444148, + "grad_norm": 0.09796177595853806, + "learning_rate": 0.00014095895361459858, + "loss": 2.61, + "step": 25544 + }, + { + "epoch": 0.7574948847967262, + "grad_norm": 0.08807095885276794, + "learning_rate": 0.0001409262096924683, + "loss": 2.5883, + "step": 25545 + }, + { + "epoch": 0.7575245381490378, + "grad_norm": 0.10199938714504242, + "learning_rate": 0.00014089346894999782, + "loss": 2.6223, + "step": 25546 + }, + { + "epoch": 0.7575541915013492, + "grad_norm": 0.0867963656783104, + "learning_rate": 0.00014086073138747752, + "loss": 2.5604, + "step": 25547 + }, + { + "epoch": 0.7575838448536607, + "grad_norm": 0.10262267291545868, + "learning_rate": 0.00014082799700519704, + "loss": 2.5994, + "step": 25548 + }, + { + "epoch": 0.7576134982059722, + "grad_norm": 0.0977199375629425, + "learning_rate": 0.00014079526580344637, + "loss": 2.607, + "step": 25549 + }, + { + "epoch": 0.7576431515582837, + "grad_norm": 0.11252973973751068, + "learning_rate": 0.00014076253778251525, + "loss": 2.5589, + "step": 25550 + }, + { + "epoch": 0.7576728049105952, + "grad_norm": 0.09800590574741364, + "learning_rate": 0.0001407298129426935, + "loss": 2.5952, + "step": 25551 + }, + { + "epoch": 0.7577024582629066, + "grad_norm": 0.1009431779384613, + "learning_rate": 0.00014069709128427095, + "loss": 2.5812, + "step": 25552 + }, + { + "epoch": 0.7577321116152181, + "grad_norm": 0.09800563007593155, + "learning_rate": 0.00014066437280753748, + "loss": 2.6173, + "step": 25553 + }, + { + "epoch": 0.7577617649675296, + "grad_norm": 0.1120246946811676, + "learning_rate": 0.0001406316575127825, + "loss": 2.5969, + "step": 25554 + }, + { + "epoch": 0.7577914183198411, + "grad_norm": 0.09867819398641586, + "learning_rate": 0.0001405989454002959, + "loss": 2.5588, + "step": 25555 + }, + { + "epoch": 0.7578210716721525, + "grad_norm": 0.09504152089357376, + "learning_rate": 0.00014056623647036725, + "loss": 2.5836, + "step": 25556 + }, + { + "epoch": 0.757850725024464, + "grad_norm": 0.09931877255439758, + "learning_rate": 0.00014053353072328634, + "loss": 2.5437, + "step": 25557 + }, + { + "epoch": 0.7578803783767755, + "grad_norm": 0.08632998913526535, + "learning_rate": 0.00014050082815934272, + "loss": 2.5975, + "step": 25558 + }, + { + "epoch": 0.757910031729087, + "grad_norm": 0.10257825255393982, + "learning_rate": 0.0001404681287788258, + "loss": 2.627, + "step": 25559 + }, + { + "epoch": 0.7579396850813984, + "grad_norm": 0.0913541242480278, + "learning_rate": 0.00014043543258202552, + "loss": 2.5942, + "step": 25560 + }, + { + "epoch": 0.75796933843371, + "grad_norm": 0.09444966912269592, + "learning_rate": 0.00014040273956923116, + "loss": 2.5995, + "step": 25561 + }, + { + "epoch": 0.7579989917860214, + "grad_norm": 0.08711542934179306, + "learning_rate": 0.00014037004974073224, + "loss": 2.5873, + "step": 25562 + }, + { + "epoch": 0.7580286451383329, + "grad_norm": 0.10047847032546997, + "learning_rate": 0.00014033736309681844, + "loss": 2.5717, + "step": 25563 + }, + { + "epoch": 0.7580582984906443, + "grad_norm": 0.10097483545541763, + "learning_rate": 0.00014030467963777887, + "loss": 2.6097, + "step": 25564 + }, + { + "epoch": 0.7580879518429559, + "grad_norm": 0.09416288882493973, + "learning_rate": 0.0001402719993639031, + "loss": 2.6412, + "step": 25565 + }, + { + "epoch": 0.7581176051952673, + "grad_norm": 0.09719210118055344, + "learning_rate": 0.00014023932227548054, + "loss": 2.5973, + "step": 25566 + }, + { + "epoch": 0.7581472585475788, + "grad_norm": 0.09301465004682541, + "learning_rate": 0.0001402066483728005, + "loss": 2.5966, + "step": 25567 + }, + { + "epoch": 0.7581769118998903, + "grad_norm": 0.09078968316316605, + "learning_rate": 0.00014017397765615235, + "loss": 2.6168, + "step": 25568 + }, + { + "epoch": 0.7582065652522018, + "grad_norm": 0.09697729349136353, + "learning_rate": 0.00014014131012582542, + "loss": 2.6127, + "step": 25569 + }, + { + "epoch": 0.7582362186045133, + "grad_norm": 0.09559577703475952, + "learning_rate": 0.00014010864578210897, + "loss": 2.6516, + "step": 25570 + }, + { + "epoch": 0.7582658719568247, + "grad_norm": 0.09280065447092056, + "learning_rate": 0.0001400759846252922, + "loss": 2.564, + "step": 25571 + }, + { + "epoch": 0.7582955253091362, + "grad_norm": 0.10457008332014084, + "learning_rate": 0.00014004332665566423, + "loss": 2.6229, + "step": 25572 + }, + { + "epoch": 0.7583251786614477, + "grad_norm": 0.10557955503463745, + "learning_rate": 0.00014001067187351452, + "loss": 2.5808, + "step": 25573 + }, + { + "epoch": 0.7583548320137592, + "grad_norm": 0.10477275401353836, + "learning_rate": 0.00013997802027913226, + "loss": 2.5962, + "step": 25574 + }, + { + "epoch": 0.7583844853660706, + "grad_norm": 0.10541300475597382, + "learning_rate": 0.00013994537187280633, + "loss": 2.6162, + "step": 25575 + }, + { + "epoch": 0.7584141387183821, + "grad_norm": 0.10354090481996536, + "learning_rate": 0.00013991272665482584, + "loss": 2.584, + "step": 25576 + }, + { + "epoch": 0.7584437920706936, + "grad_norm": 0.10607561469078064, + "learning_rate": 0.00013988008462548, + "loss": 2.6111, + "step": 25577 + }, + { + "epoch": 0.7584734454230051, + "grad_norm": 0.10034139454364777, + "learning_rate": 0.00013984744578505787, + "loss": 2.5928, + "step": 25578 + }, + { + "epoch": 0.7585030987753165, + "grad_norm": 0.09965527057647705, + "learning_rate": 0.0001398148101338484, + "loss": 2.549, + "step": 25579 + }, + { + "epoch": 0.758532752127628, + "grad_norm": 0.09232592582702637, + "learning_rate": 0.0001397821776721406, + "loss": 2.5718, + "step": 25580 + }, + { + "epoch": 0.7585624054799395, + "grad_norm": 0.0987962931394577, + "learning_rate": 0.00013974954840022342, + "loss": 2.5865, + "step": 25581 + }, + { + "epoch": 0.758592058832251, + "grad_norm": 0.09142671525478363, + "learning_rate": 0.00013971692231838585, + "loss": 2.6094, + "step": 25582 + }, + { + "epoch": 0.7586217121845624, + "grad_norm": 0.09733670204877853, + "learning_rate": 0.0001396842994269168, + "loss": 2.635, + "step": 25583 + }, + { + "epoch": 0.758651365536874, + "grad_norm": 0.09455058723688126, + "learning_rate": 0.0001396516797261051, + "loss": 2.5855, + "step": 25584 + }, + { + "epoch": 0.7586810188891854, + "grad_norm": 0.10013869404792786, + "learning_rate": 0.00013961906321623962, + "loss": 2.5938, + "step": 25585 + }, + { + "epoch": 0.7587106722414969, + "grad_norm": 0.11875324696302414, + "learning_rate": 0.0001395864498976092, + "loss": 2.6106, + "step": 25586 + }, + { + "epoch": 0.7587403255938083, + "grad_norm": 0.1004515066742897, + "learning_rate": 0.00013955383977050267, + "loss": 2.5941, + "step": 25587 + }, + { + "epoch": 0.7587699789461199, + "grad_norm": 0.10707533359527588, + "learning_rate": 0.00013952123283520872, + "loss": 2.5628, + "step": 25588 + }, + { + "epoch": 0.7587996322984314, + "grad_norm": 0.10143743455410004, + "learning_rate": 0.00013948862909201614, + "loss": 2.6007, + "step": 25589 + }, + { + "epoch": 0.7588292856507428, + "grad_norm": 0.10244982689619064, + "learning_rate": 0.00013945602854121365, + "loss": 2.5809, + "step": 25590 + }, + { + "epoch": 0.7588589390030543, + "grad_norm": 0.09717128425836563, + "learning_rate": 0.00013942343118308987, + "loss": 2.5723, + "step": 25591 + }, + { + "epoch": 0.7588885923553658, + "grad_norm": 0.1037726029753685, + "learning_rate": 0.00013939083701793354, + "loss": 2.6032, + "step": 25592 + }, + { + "epoch": 0.7589182457076773, + "grad_norm": 0.09451860189437866, + "learning_rate": 0.0001393582460460332, + "loss": 2.5987, + "step": 25593 + }, + { + "epoch": 0.7589478990599887, + "grad_norm": 0.10498914122581482, + "learning_rate": 0.00013932565826767752, + "loss": 2.5837, + "step": 25594 + }, + { + "epoch": 0.7589775524123002, + "grad_norm": 0.10908568650484085, + "learning_rate": 0.0001392930736831551, + "loss": 2.6011, + "step": 25595 + }, + { + "epoch": 0.7590072057646117, + "grad_norm": 0.0965314581990242, + "learning_rate": 0.00013926049229275435, + "loss": 2.604, + "step": 25596 + }, + { + "epoch": 0.7590368591169232, + "grad_norm": 0.10283756256103516, + "learning_rate": 0.000139227914096764, + "loss": 2.6095, + "step": 25597 + }, + { + "epoch": 0.7590665124692346, + "grad_norm": 0.09712749719619751, + "learning_rate": 0.00013919533909547216, + "loss": 2.5706, + "step": 25598 + }, + { + "epoch": 0.7590961658215462, + "grad_norm": 0.10354380309581757, + "learning_rate": 0.00013916276728916766, + "loss": 2.5464, + "step": 25599 + }, + { + "epoch": 0.7591258191738576, + "grad_norm": 0.09913424402475357, + "learning_rate": 0.00013913019867813875, + "loss": 2.5927, + "step": 25600 + }, + { + "epoch": 0.7591554725261691, + "grad_norm": 0.09828124195337296, + "learning_rate": 0.00013909763326267388, + "loss": 2.6273, + "step": 25601 + }, + { + "epoch": 0.7591851258784805, + "grad_norm": 0.10199066996574402, + "learning_rate": 0.00013906507104306142, + "loss": 2.5707, + "step": 25602 + }, + { + "epoch": 0.7592147792307921, + "grad_norm": 0.102933369576931, + "learning_rate": 0.00013903251201958976, + "loss": 2.5584, + "step": 25603 + }, + { + "epoch": 0.7592444325831035, + "grad_norm": 0.09971506148576736, + "learning_rate": 0.00013899995619254713, + "loss": 2.5764, + "step": 25604 + }, + { + "epoch": 0.759274085935415, + "grad_norm": 0.09966486692428589, + "learning_rate": 0.00013896740356222187, + "loss": 2.5904, + "step": 25605 + }, + { + "epoch": 0.7593037392877264, + "grad_norm": 0.10039729624986649, + "learning_rate": 0.00013893485412890216, + "loss": 2.5396, + "step": 25606 + }, + { + "epoch": 0.759333392640038, + "grad_norm": 0.104727603495121, + "learning_rate": 0.0001389023078928764, + "loss": 2.6156, + "step": 25607 + }, + { + "epoch": 0.7593630459923494, + "grad_norm": 0.0925939679145813, + "learning_rate": 0.00013886976485443276, + "loss": 2.6325, + "step": 25608 + }, + { + "epoch": 0.7593926993446609, + "grad_norm": 0.10830862820148468, + "learning_rate": 0.00013883722501385922, + "loss": 2.6352, + "step": 25609 + }, + { + "epoch": 0.7594223526969724, + "grad_norm": 0.09003773331642151, + "learning_rate": 0.000138804688371444, + "loss": 2.6034, + "step": 25610 + }, + { + "epoch": 0.7594520060492839, + "grad_norm": 0.10719913989305496, + "learning_rate": 0.00013877215492747512, + "loss": 2.606, + "step": 25611 + }, + { + "epoch": 0.7594816594015954, + "grad_norm": 0.09279369562864304, + "learning_rate": 0.000138739624682241, + "loss": 2.6058, + "step": 25612 + }, + { + "epoch": 0.7595113127539068, + "grad_norm": 0.09870711714029312, + "learning_rate": 0.0001387070976360295, + "loss": 2.5523, + "step": 25613 + }, + { + "epoch": 0.7595409661062184, + "grad_norm": 0.09281452000141144, + "learning_rate": 0.00013867457378912863, + "loss": 2.5886, + "step": 25614 + }, + { + "epoch": 0.7595706194585298, + "grad_norm": 0.10532062500715256, + "learning_rate": 0.0001386420531418265, + "loss": 2.6376, + "step": 25615 + }, + { + "epoch": 0.7596002728108413, + "grad_norm": 0.10131282359361649, + "learning_rate": 0.00013860953569441094, + "loss": 2.5825, + "step": 25616 + }, + { + "epoch": 0.7596299261631527, + "grad_norm": 0.09708256274461746, + "learning_rate": 0.00013857702144717005, + "loss": 2.5931, + "step": 25617 + }, + { + "epoch": 0.7596595795154643, + "grad_norm": 0.10155980288982391, + "learning_rate": 0.00013854451040039173, + "loss": 2.6339, + "step": 25618 + }, + { + "epoch": 0.7596892328677757, + "grad_norm": 0.09041018038988113, + "learning_rate": 0.00013851200255436373, + "loss": 2.6054, + "step": 25619 + }, + { + "epoch": 0.7597188862200872, + "grad_norm": 0.09811408072710037, + "learning_rate": 0.00013847949790937397, + "loss": 2.5895, + "step": 25620 + }, + { + "epoch": 0.7597485395723986, + "grad_norm": 0.09715403616428375, + "learning_rate": 0.00013844699646571034, + "loss": 2.6039, + "step": 25621 + }, + { + "epoch": 0.7597781929247102, + "grad_norm": 0.09966786950826645, + "learning_rate": 0.00013841449822366058, + "loss": 2.6279, + "step": 25622 + }, + { + "epoch": 0.7598078462770216, + "grad_norm": 0.0895020067691803, + "learning_rate": 0.00013838200318351258, + "loss": 2.5861, + "step": 25623 + }, + { + "epoch": 0.7598374996293331, + "grad_norm": 0.09969017654657364, + "learning_rate": 0.0001383495113455538, + "loss": 2.5813, + "step": 25624 + }, + { + "epoch": 0.7598671529816445, + "grad_norm": 0.09064140170812607, + "learning_rate": 0.0001383170227100723, + "loss": 2.5973, + "step": 25625 + }, + { + "epoch": 0.7598968063339561, + "grad_norm": 0.09501317888498306, + "learning_rate": 0.00013828453727735568, + "loss": 2.6024, + "step": 25626 + }, + { + "epoch": 0.7599264596862675, + "grad_norm": 0.08742289990186691, + "learning_rate": 0.00013825205504769156, + "loss": 2.5963, + "step": 25627 + }, + { + "epoch": 0.759956113038579, + "grad_norm": 0.09772026538848877, + "learning_rate": 0.0001382195760213676, + "loss": 2.5811, + "step": 25628 + }, + { + "epoch": 0.7599857663908904, + "grad_norm": 0.09080366045236588, + "learning_rate": 0.00013818710019867153, + "loss": 2.5933, + "step": 25629 + }, + { + "epoch": 0.760015419743202, + "grad_norm": 0.08647750318050385, + "learning_rate": 0.00013815462757989062, + "loss": 2.5898, + "step": 25630 + }, + { + "epoch": 0.7600450730955135, + "grad_norm": 0.09061233699321747, + "learning_rate": 0.00013812215816531265, + "loss": 2.5738, + "step": 25631 + }, + { + "epoch": 0.7600747264478249, + "grad_norm": 0.08582844585180283, + "learning_rate": 0.00013808969195522504, + "loss": 2.571, + "step": 25632 + }, + { + "epoch": 0.7601043798001365, + "grad_norm": 0.09252694249153137, + "learning_rate": 0.00013805722894991534, + "loss": 2.5721, + "step": 25633 + }, + { + "epoch": 0.7601340331524479, + "grad_norm": 0.09273646026849747, + "learning_rate": 0.000138024769149671, + "loss": 2.6027, + "step": 25634 + }, + { + "epoch": 0.7601636865047594, + "grad_norm": 0.0863938182592392, + "learning_rate": 0.00013799231255477945, + "loss": 2.5966, + "step": 25635 + }, + { + "epoch": 0.7601933398570708, + "grad_norm": 0.09444107860326767, + "learning_rate": 0.00013795985916552816, + "loss": 2.6177, + "step": 25636 + }, + { + "epoch": 0.7602229932093824, + "grad_norm": 0.0959455594420433, + "learning_rate": 0.00013792740898220423, + "loss": 2.6145, + "step": 25637 + }, + { + "epoch": 0.7602526465616938, + "grad_norm": 0.09667948633432388, + "learning_rate": 0.0001378949620050955, + "loss": 2.6145, + "step": 25638 + }, + { + "epoch": 0.7602822999140053, + "grad_norm": 0.09552402049303055, + "learning_rate": 0.00013786251823448908, + "loss": 2.5706, + "step": 25639 + }, + { + "epoch": 0.7603119532663167, + "grad_norm": 0.0856458991765976, + "learning_rate": 0.00013783007767067214, + "loss": 2.5878, + "step": 25640 + }, + { + "epoch": 0.7603416066186283, + "grad_norm": 0.10697807371616364, + "learning_rate": 0.000137797640313932, + "loss": 2.6217, + "step": 25641 + }, + { + "epoch": 0.7603712599709397, + "grad_norm": 0.09534647315740585, + "learning_rate": 0.00013776520616455595, + "loss": 2.5993, + "step": 25642 + }, + { + "epoch": 0.7604009133232512, + "grad_norm": 0.09411637485027313, + "learning_rate": 0.0001377327752228311, + "loss": 2.6023, + "step": 25643 + }, + { + "epoch": 0.7604305666755626, + "grad_norm": 0.0898883268237114, + "learning_rate": 0.00013770034748904482, + "loss": 2.6098, + "step": 25644 + }, + { + "epoch": 0.7604602200278742, + "grad_norm": 0.10788065195083618, + "learning_rate": 0.00013766792296348408, + "loss": 2.6343, + "step": 25645 + }, + { + "epoch": 0.7604898733801856, + "grad_norm": 0.09972704946994781, + "learning_rate": 0.00013763550164643613, + "loss": 2.5895, + "step": 25646 + }, + { + "epoch": 0.7605195267324971, + "grad_norm": 0.10547450929880142, + "learning_rate": 0.00013760308353818795, + "loss": 2.5878, + "step": 25647 + }, + { + "epoch": 0.7605491800848085, + "grad_norm": 0.09971068054437637, + "learning_rate": 0.0001375706686390267, + "loss": 2.6159, + "step": 25648 + }, + { + "epoch": 0.7605788334371201, + "grad_norm": 0.09650196135044098, + "learning_rate": 0.00013753825694923938, + "loss": 2.6258, + "step": 25649 + }, + { + "epoch": 0.7606084867894315, + "grad_norm": 0.10548727214336395, + "learning_rate": 0.000137505848469113, + "loss": 2.5682, + "step": 25650 + }, + { + "epoch": 0.760638140141743, + "grad_norm": 0.0950339064002037, + "learning_rate": 0.00013747344319893457, + "loss": 2.598, + "step": 25651 + }, + { + "epoch": 0.7606677934940546, + "grad_norm": 0.09630871564149857, + "learning_rate": 0.00013744104113899103, + "loss": 2.5551, + "step": 25652 + }, + { + "epoch": 0.760697446846366, + "grad_norm": 0.10518965125083923, + "learning_rate": 0.0001374086422895693, + "loss": 2.6127, + "step": 25653 + }, + { + "epoch": 0.7607271001986775, + "grad_norm": 0.09692855179309845, + "learning_rate": 0.00013737624665095626, + "loss": 2.5832, + "step": 25654 + }, + { + "epoch": 0.7607567535509889, + "grad_norm": 0.09030207991600037, + "learning_rate": 0.0001373438542234388, + "loss": 2.6006, + "step": 25655 + }, + { + "epoch": 0.7607864069033005, + "grad_norm": 0.10366376489400864, + "learning_rate": 0.00013731146500730378, + "loss": 2.5896, + "step": 25656 + }, + { + "epoch": 0.7608160602556119, + "grad_norm": 0.08664492517709732, + "learning_rate": 0.00013727907900283804, + "loss": 2.5743, + "step": 25657 + }, + { + "epoch": 0.7608457136079234, + "grad_norm": 0.10412893444299698, + "learning_rate": 0.00013724669621032826, + "loss": 2.5917, + "step": 25658 + }, + { + "epoch": 0.7608753669602348, + "grad_norm": 0.09939440339803696, + "learning_rate": 0.00013721431663006123, + "loss": 2.6167, + "step": 25659 + }, + { + "epoch": 0.7609050203125464, + "grad_norm": 0.09402456134557724, + "learning_rate": 0.00013718194026232373, + "loss": 2.5832, + "step": 25660 + }, + { + "epoch": 0.7609346736648578, + "grad_norm": 0.10782720148563385, + "learning_rate": 0.0001371495671074024, + "loss": 2.5966, + "step": 25661 + }, + { + "epoch": 0.7609643270171693, + "grad_norm": 0.09647002816200256, + "learning_rate": 0.00013711719716558396, + "loss": 2.5855, + "step": 25662 + }, + { + "epoch": 0.7609939803694807, + "grad_norm": 0.09916424751281738, + "learning_rate": 0.00013708483043715504, + "loss": 2.6144, + "step": 25663 + }, + { + "epoch": 0.7610236337217923, + "grad_norm": 0.09855108708143234, + "learning_rate": 0.0001370524669224022, + "loss": 2.597, + "step": 25664 + }, + { + "epoch": 0.7610532870741037, + "grad_norm": 0.10349318385124207, + "learning_rate": 0.00013702010662161213, + "loss": 2.5869, + "step": 25665 + }, + { + "epoch": 0.7610829404264152, + "grad_norm": 0.0989750474691391, + "learning_rate": 0.00013698774953507125, + "loss": 2.5598, + "step": 25666 + }, + { + "epoch": 0.7611125937787266, + "grad_norm": 0.10800624638795853, + "learning_rate": 0.00013695539566306619, + "loss": 2.5742, + "step": 25667 + }, + { + "epoch": 0.7611422471310382, + "grad_norm": 0.10067728161811829, + "learning_rate": 0.00013692304500588344, + "loss": 2.6099, + "step": 25668 + }, + { + "epoch": 0.7611719004833496, + "grad_norm": 0.09567779302597046, + "learning_rate": 0.0001368906975638094, + "loss": 2.6054, + "step": 25669 + }, + { + "epoch": 0.7612015538356611, + "grad_norm": 0.09683188050985336, + "learning_rate": 0.0001368583533371306, + "loss": 2.5874, + "step": 25670 + }, + { + "epoch": 0.7612312071879725, + "grad_norm": 0.0943194255232811, + "learning_rate": 0.00013682601232613335, + "loss": 2.587, + "step": 25671 + }, + { + "epoch": 0.7612608605402841, + "grad_norm": 0.09974221885204315, + "learning_rate": 0.00013679367453110414, + "loss": 2.5922, + "step": 25672 + }, + { + "epoch": 0.7612905138925956, + "grad_norm": 0.09388276934623718, + "learning_rate": 0.00013676133995232947, + "loss": 2.5965, + "step": 25673 + }, + { + "epoch": 0.761320167244907, + "grad_norm": 0.09790647029876709, + "learning_rate": 0.00013672900859009528, + "loss": 2.6259, + "step": 25674 + }, + { + "epoch": 0.7613498205972186, + "grad_norm": 0.1003052294254303, + "learning_rate": 0.00013669668044468807, + "loss": 2.6045, + "step": 25675 + }, + { + "epoch": 0.76137947394953, + "grad_norm": 0.09390489012002945, + "learning_rate": 0.000136664355516394, + "loss": 2.6023, + "step": 25676 + }, + { + "epoch": 0.7614091273018415, + "grad_norm": 0.09495073556900024, + "learning_rate": 0.0001366320338054996, + "loss": 2.5737, + "step": 25677 + }, + { + "epoch": 0.7614387806541529, + "grad_norm": 0.0904708281159401, + "learning_rate": 0.00013659971531229087, + "loss": 2.6136, + "step": 25678 + }, + { + "epoch": 0.7614684340064645, + "grad_norm": 0.09828969091176987, + "learning_rate": 0.00013656740003705403, + "loss": 2.5618, + "step": 25679 + }, + { + "epoch": 0.7614980873587759, + "grad_norm": 0.09718242287635803, + "learning_rate": 0.0001365350879800753, + "loss": 2.5847, + "step": 25680 + }, + { + "epoch": 0.7615277407110874, + "grad_norm": 0.09546143561601639, + "learning_rate": 0.00013650277914164073, + "loss": 2.5965, + "step": 25681 + }, + { + "epoch": 0.7615573940633988, + "grad_norm": 0.0935322493314743, + "learning_rate": 0.0001364704735220364, + "loss": 2.6013, + "step": 25682 + }, + { + "epoch": 0.7615870474157104, + "grad_norm": 0.09568389505147934, + "learning_rate": 0.00013643817112154845, + "loss": 2.5654, + "step": 25683 + }, + { + "epoch": 0.7616167007680218, + "grad_norm": 0.10426458716392517, + "learning_rate": 0.00013640587194046306, + "loss": 2.6288, + "step": 25684 + }, + { + "epoch": 0.7616463541203333, + "grad_norm": 0.09148314595222473, + "learning_rate": 0.00013637357597906592, + "loss": 2.59, + "step": 25685 + }, + { + "epoch": 0.7616760074726447, + "grad_norm": 0.09325046092271805, + "learning_rate": 0.00013634128323764322, + "loss": 2.5834, + "step": 25686 + }, + { + "epoch": 0.7617056608249563, + "grad_norm": 0.10045385360717773, + "learning_rate": 0.00013630899371648087, + "loss": 2.6015, + "step": 25687 + }, + { + "epoch": 0.7617353141772677, + "grad_norm": 0.09918421506881714, + "learning_rate": 0.00013627670741586472, + "loss": 2.6187, + "step": 25688 + }, + { + "epoch": 0.7617649675295792, + "grad_norm": 0.11103449016809464, + "learning_rate": 0.0001362444243360807, + "loss": 2.6125, + "step": 25689 + }, + { + "epoch": 0.7617946208818906, + "grad_norm": 0.09812137484550476, + "learning_rate": 0.00013621214447741487, + "loss": 2.5763, + "step": 25690 + }, + { + "epoch": 0.7618242742342022, + "grad_norm": 0.10485585033893585, + "learning_rate": 0.00013617986784015296, + "loss": 2.5575, + "step": 25691 + }, + { + "epoch": 0.7618539275865136, + "grad_norm": 0.09329402446746826, + "learning_rate": 0.00013614759442458075, + "loss": 2.5386, + "step": 25692 + }, + { + "epoch": 0.7618835809388251, + "grad_norm": 0.10692956298589706, + "learning_rate": 0.00013611532423098404, + "loss": 2.6077, + "step": 25693 + }, + { + "epoch": 0.7619132342911367, + "grad_norm": 0.09242938458919525, + "learning_rate": 0.00013608305725964877, + "loss": 2.6122, + "step": 25694 + }, + { + "epoch": 0.7619428876434481, + "grad_norm": 0.09443622827529907, + "learning_rate": 0.0001360507935108603, + "loss": 2.5936, + "step": 25695 + }, + { + "epoch": 0.7619725409957596, + "grad_norm": 0.10541632771492004, + "learning_rate": 0.0001360185329849045, + "loss": 2.5837, + "step": 25696 + }, + { + "epoch": 0.762002194348071, + "grad_norm": 0.0958947092294693, + "learning_rate": 0.00013598627568206718, + "loss": 2.5728, + "step": 25697 + }, + { + "epoch": 0.7620318477003826, + "grad_norm": 0.10148649662733078, + "learning_rate": 0.00013595402160263375, + "loss": 2.5803, + "step": 25698 + }, + { + "epoch": 0.762061501052694, + "grad_norm": 0.09724048525094986, + "learning_rate": 0.00013592177074689, + "loss": 2.607, + "step": 25699 + }, + { + "epoch": 0.7620911544050055, + "grad_norm": 0.09193932265043259, + "learning_rate": 0.0001358895231151215, + "loss": 2.6085, + "step": 25700 + }, + { + "epoch": 0.7621208077573169, + "grad_norm": 0.10577544569969177, + "learning_rate": 0.00013585727870761354, + "loss": 2.6058, + "step": 25701 + }, + { + "epoch": 0.7621504611096285, + "grad_norm": 0.09919694811105728, + "learning_rate": 0.0001358250375246521, + "loss": 2.613, + "step": 25702 + }, + { + "epoch": 0.7621801144619399, + "grad_norm": 0.10184714943170547, + "learning_rate": 0.00013579279956652245, + "loss": 2.5909, + "step": 25703 + }, + { + "epoch": 0.7622097678142514, + "grad_norm": 0.10070673376321793, + "learning_rate": 0.00013576056483351006, + "loss": 2.6006, + "step": 25704 + }, + { + "epoch": 0.7622394211665628, + "grad_norm": 0.10803896933794022, + "learning_rate": 0.00013572833332590057, + "loss": 2.5994, + "step": 25705 + }, + { + "epoch": 0.7622690745188744, + "grad_norm": 0.1114998608827591, + "learning_rate": 0.000135696105043979, + "loss": 2.5983, + "step": 25706 + }, + { + "epoch": 0.7622987278711858, + "grad_norm": 0.10140139609575272, + "learning_rate": 0.000135663879988031, + "loss": 2.5898, + "step": 25707 + }, + { + "epoch": 0.7623283812234973, + "grad_norm": 0.1068907305598259, + "learning_rate": 0.0001356316581583419, + "loss": 2.5778, + "step": 25708 + }, + { + "epoch": 0.7623580345758088, + "grad_norm": 0.10096932202577591, + "learning_rate": 0.00013559943955519693, + "loss": 2.6111, + "step": 25709 + }, + { + "epoch": 0.7623876879281203, + "grad_norm": 0.09605418145656586, + "learning_rate": 0.00013556722417888155, + "loss": 2.589, + "step": 25710 + }, + { + "epoch": 0.7624173412804317, + "grad_norm": 0.0996207520365715, + "learning_rate": 0.00013553501202968093, + "loss": 2.6121, + "step": 25711 + }, + { + "epoch": 0.7624469946327432, + "grad_norm": 0.09334301948547363, + "learning_rate": 0.00013550280310788032, + "loss": 2.5679, + "step": 25712 + }, + { + "epoch": 0.7624766479850547, + "grad_norm": 0.09522250294685364, + "learning_rate": 0.000135470597413765, + "loss": 2.5609, + "step": 25713 + }, + { + "epoch": 0.7625063013373662, + "grad_norm": 0.09185358881950378, + "learning_rate": 0.0001354383949476199, + "loss": 2.6307, + "step": 25714 + }, + { + "epoch": 0.7625359546896777, + "grad_norm": 0.09720878303050995, + "learning_rate": 0.00013540619570973073, + "loss": 2.5884, + "step": 25715 + }, + { + "epoch": 0.7625656080419891, + "grad_norm": 0.09785249829292297, + "learning_rate": 0.00013537399970038212, + "loss": 2.5793, + "step": 25716 + }, + { + "epoch": 0.7625952613943007, + "grad_norm": 0.10531359910964966, + "learning_rate": 0.0001353418069198593, + "loss": 2.6115, + "step": 25717 + }, + { + "epoch": 0.7626249147466121, + "grad_norm": 0.09977183490991592, + "learning_rate": 0.00013530961736844737, + "loss": 2.6154, + "step": 25718 + }, + { + "epoch": 0.7626545680989236, + "grad_norm": 0.09526227414608002, + "learning_rate": 0.00013527743104643143, + "loss": 2.6, + "step": 25719 + }, + { + "epoch": 0.762684221451235, + "grad_norm": 0.09958319365978241, + "learning_rate": 0.00013524524795409642, + "loss": 2.5842, + "step": 25720 + }, + { + "epoch": 0.7627138748035466, + "grad_norm": 0.09172375500202179, + "learning_rate": 0.00013521306809172735, + "loss": 2.5646, + "step": 25721 + }, + { + "epoch": 0.762743528155858, + "grad_norm": 0.09618372470140457, + "learning_rate": 0.00013518089145960916, + "loss": 2.6025, + "step": 25722 + }, + { + "epoch": 0.7627731815081695, + "grad_norm": 0.08900943398475647, + "learning_rate": 0.0001351487180580268, + "loss": 2.5909, + "step": 25723 + }, + { + "epoch": 0.762802834860481, + "grad_norm": 0.09665991365909576, + "learning_rate": 0.0001351165478872652, + "loss": 2.5933, + "step": 25724 + }, + { + "epoch": 0.7628324882127925, + "grad_norm": 0.09091460704803467, + "learning_rate": 0.00013508438094760917, + "loss": 2.5829, + "step": 25725 + }, + { + "epoch": 0.7628621415651039, + "grad_norm": 0.09093324095010757, + "learning_rate": 0.00013505221723934357, + "loss": 2.5842, + "step": 25726 + }, + { + "epoch": 0.7628917949174154, + "grad_norm": 0.09103640913963318, + "learning_rate": 0.00013502005676275326, + "loss": 2.6112, + "step": 25727 + }, + { + "epoch": 0.7629214482697269, + "grad_norm": 0.0928162932395935, + "learning_rate": 0.000134987899518123, + "loss": 2.6076, + "step": 25728 + }, + { + "epoch": 0.7629511016220384, + "grad_norm": 0.08923760801553726, + "learning_rate": 0.00013495574550573752, + "loss": 2.6026, + "step": 25729 + }, + { + "epoch": 0.7629807549743498, + "grad_norm": 0.08921737223863602, + "learning_rate": 0.00013492359472588156, + "loss": 2.6146, + "step": 25730 + }, + { + "epoch": 0.7630104083266613, + "grad_norm": 0.08577663451433182, + "learning_rate": 0.00013489144717883988, + "loss": 2.5715, + "step": 25731 + }, + { + "epoch": 0.7630400616789728, + "grad_norm": 0.08663561195135117, + "learning_rate": 0.00013485930286489707, + "loss": 2.5741, + "step": 25732 + }, + { + "epoch": 0.7630697150312843, + "grad_norm": 0.08998093008995056, + "learning_rate": 0.0001348271617843378, + "loss": 2.6084, + "step": 25733 + }, + { + "epoch": 0.7630993683835957, + "grad_norm": 0.08434022217988968, + "learning_rate": 0.00013479502393744675, + "loss": 2.5919, + "step": 25734 + }, + { + "epoch": 0.7631290217359072, + "grad_norm": 0.09417209029197693, + "learning_rate": 0.00013476288932450837, + "loss": 2.6157, + "step": 25735 + }, + { + "epoch": 0.7631586750882188, + "grad_norm": 0.08685165643692017, + "learning_rate": 0.00013473075794580736, + "loss": 2.5764, + "step": 25736 + }, + { + "epoch": 0.7631883284405302, + "grad_norm": 0.09141869097948074, + "learning_rate": 0.00013469862980162816, + "loss": 2.6243, + "step": 25737 + }, + { + "epoch": 0.7632179817928417, + "grad_norm": 0.09338401257991791, + "learning_rate": 0.00013466650489225528, + "loss": 2.5755, + "step": 25738 + }, + { + "epoch": 0.7632476351451531, + "grad_norm": 0.0872858315706253, + "learning_rate": 0.0001346343832179734, + "loss": 2.5739, + "step": 25739 + }, + { + "epoch": 0.7632772884974647, + "grad_norm": 0.08784137666225433, + "learning_rate": 0.00013460226477906645, + "loss": 2.6106, + "step": 25740 + }, + { + "epoch": 0.7633069418497761, + "grad_norm": 0.0938955545425415, + "learning_rate": 0.00013457014957581932, + "loss": 2.6013, + "step": 25741 + }, + { + "epoch": 0.7633365952020876, + "grad_norm": 0.08868542313575745, + "learning_rate": 0.00013453803760851623, + "loss": 2.5978, + "step": 25742 + }, + { + "epoch": 0.763366248554399, + "grad_norm": 0.10352500528097153, + "learning_rate": 0.00013450592887744156, + "loss": 2.6113, + "step": 25743 + }, + { + "epoch": 0.7633959019067106, + "grad_norm": 0.09816765040159225, + "learning_rate": 0.00013447382338287962, + "loss": 2.598, + "step": 25744 + }, + { + "epoch": 0.763425555259022, + "grad_norm": 0.09865560382604599, + "learning_rate": 0.0001344417211251147, + "loss": 2.579, + "step": 25745 + }, + { + "epoch": 0.7634552086113335, + "grad_norm": 0.09285545349121094, + "learning_rate": 0.0001344096221044311, + "loss": 2.5957, + "step": 25746 + }, + { + "epoch": 0.763484861963645, + "grad_norm": 0.08892437070608139, + "learning_rate": 0.00013437752632111305, + "loss": 2.6067, + "step": 25747 + }, + { + "epoch": 0.7635145153159565, + "grad_norm": 0.09728725254535675, + "learning_rate": 0.00013434543377544472, + "loss": 2.62, + "step": 25748 + }, + { + "epoch": 0.7635441686682679, + "grad_norm": 0.09124484658241272, + "learning_rate": 0.00013431334446771054, + "loss": 2.5939, + "step": 25749 + }, + { + "epoch": 0.7635738220205794, + "grad_norm": 0.10314402729272842, + "learning_rate": 0.00013428125839819432, + "loss": 2.5965, + "step": 25750 + }, + { + "epoch": 0.7636034753728909, + "grad_norm": 0.10196644067764282, + "learning_rate": 0.0001342491755671803, + "loss": 2.6011, + "step": 25751 + }, + { + "epoch": 0.7636331287252024, + "grad_norm": 0.09815644472837448, + "learning_rate": 0.00013421709597495263, + "loss": 2.5875, + "step": 25752 + }, + { + "epoch": 0.7636627820775138, + "grad_norm": 0.10419659316539764, + "learning_rate": 0.00013418501962179525, + "loss": 2.5732, + "step": 25753 + }, + { + "epoch": 0.7636924354298253, + "grad_norm": 0.08969123661518097, + "learning_rate": 0.00013415294650799236, + "loss": 2.5533, + "step": 25754 + }, + { + "epoch": 0.7637220887821368, + "grad_norm": 0.11073058843612671, + "learning_rate": 0.000134120876633828, + "loss": 2.6134, + "step": 25755 + }, + { + "epoch": 0.7637517421344483, + "grad_norm": 0.08915773779153824, + "learning_rate": 0.00013408880999958605, + "loss": 2.5998, + "step": 25756 + }, + { + "epoch": 0.7637813954867598, + "grad_norm": 0.09643904119729996, + "learning_rate": 0.00013405674660555057, + "loss": 2.6099, + "step": 25757 + }, + { + "epoch": 0.7638110488390712, + "grad_norm": 0.09176681190729141, + "learning_rate": 0.00013402468645200532, + "loss": 2.5847, + "step": 25758 + }, + { + "epoch": 0.7638407021913828, + "grad_norm": 0.08984004706144333, + "learning_rate": 0.0001339926295392343, + "loss": 2.6055, + "step": 25759 + }, + { + "epoch": 0.7638703555436942, + "grad_norm": 0.1910281926393509, + "learning_rate": 0.00013396057586752158, + "loss": 2.5805, + "step": 25760 + }, + { + "epoch": 0.7639000088960057, + "grad_norm": 0.09443648904561996, + "learning_rate": 0.0001339285254371506, + "loss": 2.5852, + "step": 25761 + }, + { + "epoch": 0.7639296622483172, + "grad_norm": 0.09870414435863495, + "learning_rate": 0.00013389647824840534, + "loss": 2.5983, + "step": 25762 + }, + { + "epoch": 0.7639593156006287, + "grad_norm": 0.09625400602817535, + "learning_rate": 0.00013386443430156965, + "loss": 2.6257, + "step": 25763 + }, + { + "epoch": 0.7639889689529401, + "grad_norm": 0.10368455946445465, + "learning_rate": 0.00013383239359692723, + "loss": 2.5953, + "step": 25764 + }, + { + "epoch": 0.7640186223052516, + "grad_norm": 0.09138840436935425, + "learning_rate": 0.00013380035613476182, + "loss": 2.6075, + "step": 25765 + }, + { + "epoch": 0.7640482756575631, + "grad_norm": 0.10362590104341507, + "learning_rate": 0.00013376832191535693, + "loss": 2.5863, + "step": 25766 + }, + { + "epoch": 0.7640779290098746, + "grad_norm": 0.08546342700719833, + "learning_rate": 0.00013373629093899658, + "loss": 2.5981, + "step": 25767 + }, + { + "epoch": 0.764107582362186, + "grad_norm": 0.11193933337926865, + "learning_rate": 0.00013370426320596425, + "loss": 2.5517, + "step": 25768 + }, + { + "epoch": 0.7641372357144975, + "grad_norm": 0.09119438380002975, + "learning_rate": 0.00013367223871654354, + "loss": 2.5706, + "step": 25769 + }, + { + "epoch": 0.764166889066809, + "grad_norm": 0.11521993577480316, + "learning_rate": 0.00013364021747101816, + "loss": 2.6005, + "step": 25770 + }, + { + "epoch": 0.7641965424191205, + "grad_norm": 0.09291141480207443, + "learning_rate": 0.00013360819946967145, + "loss": 2.6032, + "step": 25771 + }, + { + "epoch": 0.7642261957714319, + "grad_norm": 0.11265994608402252, + "learning_rate": 0.00013357618471278697, + "loss": 2.5941, + "step": 25772 + }, + { + "epoch": 0.7642558491237434, + "grad_norm": 0.10255741328001022, + "learning_rate": 0.00013354417320064833, + "loss": 2.6044, + "step": 25773 + }, + { + "epoch": 0.7642855024760549, + "grad_norm": 0.10032708942890167, + "learning_rate": 0.00013351216493353885, + "loss": 2.5832, + "step": 25774 + }, + { + "epoch": 0.7643151558283664, + "grad_norm": 0.11103907972574234, + "learning_rate": 0.0001334801599117421, + "loss": 2.6169, + "step": 25775 + }, + { + "epoch": 0.7643448091806779, + "grad_norm": 0.09488816559314728, + "learning_rate": 0.00013344815813554146, + "loss": 2.5972, + "step": 25776 + }, + { + "epoch": 0.7643744625329894, + "grad_norm": 0.10355420410633087, + "learning_rate": 0.0001334161596052203, + "loss": 2.6004, + "step": 25777 + }, + { + "epoch": 0.7644041158853009, + "grad_norm": 0.10202305018901825, + "learning_rate": 0.00013338416432106192, + "loss": 2.5845, + "step": 25778 + }, + { + "epoch": 0.7644337692376123, + "grad_norm": 0.09777908027172089, + "learning_rate": 0.0001333521722833496, + "loss": 2.6248, + "step": 25779 + }, + { + "epoch": 0.7644634225899238, + "grad_norm": 0.09851936995983124, + "learning_rate": 0.00013332018349236684, + "loss": 2.5489, + "step": 25780 + }, + { + "epoch": 0.7644930759422353, + "grad_norm": 0.09674744307994843, + "learning_rate": 0.00013328819794839697, + "loss": 2.5952, + "step": 25781 + }, + { + "epoch": 0.7645227292945468, + "grad_norm": 0.0865863561630249, + "learning_rate": 0.0001332562156517229, + "loss": 2.5639, + "step": 25782 + }, + { + "epoch": 0.7645523826468582, + "grad_norm": 0.09943459182977676, + "learning_rate": 0.000133224236602628, + "loss": 2.582, + "step": 25783 + }, + { + "epoch": 0.7645820359991697, + "grad_norm": 0.09094934910535812, + "learning_rate": 0.00013319226080139545, + "loss": 2.5861, + "step": 25784 + }, + { + "epoch": 0.7646116893514812, + "grad_norm": 0.09643014520406723, + "learning_rate": 0.00013316028824830834, + "loss": 2.5969, + "step": 25785 + }, + { + "epoch": 0.7646413427037927, + "grad_norm": 0.08827192336320877, + "learning_rate": 0.0001331283189436499, + "loss": 2.5894, + "step": 25786 + }, + { + "epoch": 0.7646709960561041, + "grad_norm": 0.0944424420595169, + "learning_rate": 0.0001330963528877031, + "loss": 2.5731, + "step": 25787 + }, + { + "epoch": 0.7647006494084156, + "grad_norm": 0.11201126128435135, + "learning_rate": 0.00013306439008075116, + "loss": 2.6013, + "step": 25788 + }, + { + "epoch": 0.7647303027607271, + "grad_norm": 0.09815938770771027, + "learning_rate": 0.00013303243052307696, + "loss": 2.5462, + "step": 25789 + }, + { + "epoch": 0.7647599561130386, + "grad_norm": 0.10047735273838043, + "learning_rate": 0.00013300047421496364, + "loss": 2.5833, + "step": 25790 + }, + { + "epoch": 0.76478960946535, + "grad_norm": 0.09615666419267654, + "learning_rate": 0.00013296852115669404, + "loss": 2.5798, + "step": 25791 + }, + { + "epoch": 0.7648192628176615, + "grad_norm": 0.09839171916246414, + "learning_rate": 0.00013293657134855124, + "loss": 2.5651, + "step": 25792 + }, + { + "epoch": 0.764848916169973, + "grad_norm": 0.0961202085018158, + "learning_rate": 0.0001329046247908181, + "loss": 2.6142, + "step": 25793 + }, + { + "epoch": 0.7648785695222845, + "grad_norm": 0.09671784937381744, + "learning_rate": 0.0001328726814837775, + "loss": 2.5335, + "step": 25794 + }, + { + "epoch": 0.7649082228745959, + "grad_norm": 0.09449388086795807, + "learning_rate": 0.00013284074142771237, + "loss": 2.5639, + "step": 25795 + }, + { + "epoch": 0.7649378762269075, + "grad_norm": 0.08942389488220215, + "learning_rate": 0.00013280880462290546, + "loss": 2.6051, + "step": 25796 + }, + { + "epoch": 0.764967529579219, + "grad_norm": 0.10138463228940964, + "learning_rate": 0.0001327768710696396, + "loss": 2.5765, + "step": 25797 + }, + { + "epoch": 0.7649971829315304, + "grad_norm": 0.09268036484718323, + "learning_rate": 0.0001327449407681976, + "loss": 2.6237, + "step": 25798 + }, + { + "epoch": 0.7650268362838419, + "grad_norm": 0.09433168172836304, + "learning_rate": 0.00013271301371886213, + "loss": 2.5667, + "step": 25799 + }, + { + "epoch": 0.7650564896361534, + "grad_norm": 0.09625695645809174, + "learning_rate": 0.00013268108992191602, + "loss": 2.5522, + "step": 25800 + }, + { + "epoch": 0.7650861429884649, + "grad_norm": 0.0925755426287651, + "learning_rate": 0.00013264916937764194, + "loss": 2.5517, + "step": 25801 + }, + { + "epoch": 0.7651157963407763, + "grad_norm": 0.0935850664973259, + "learning_rate": 0.00013261725208632246, + "loss": 2.6491, + "step": 25802 + }, + { + "epoch": 0.7651454496930878, + "grad_norm": 0.09728420525789261, + "learning_rate": 0.0001325853380482403, + "loss": 2.6296, + "step": 25803 + }, + { + "epoch": 0.7651751030453993, + "grad_norm": 0.0922752320766449, + "learning_rate": 0.00013255342726367804, + "loss": 2.5838, + "step": 25804 + }, + { + "epoch": 0.7652047563977108, + "grad_norm": 0.0921110138297081, + "learning_rate": 0.00013252151973291827, + "loss": 2.5741, + "step": 25805 + }, + { + "epoch": 0.7652344097500222, + "grad_norm": 0.08708476275205612, + "learning_rate": 0.0001324896154562435, + "loss": 2.5751, + "step": 25806 + }, + { + "epoch": 0.7652640631023337, + "grad_norm": 0.1015588566660881, + "learning_rate": 0.00013245771443393621, + "loss": 2.5383, + "step": 25807 + }, + { + "epoch": 0.7652937164546452, + "grad_norm": 0.08609307557344437, + "learning_rate": 0.00013242581666627902, + "loss": 2.5644, + "step": 25808 + }, + { + "epoch": 0.7653233698069567, + "grad_norm": 0.09451628476381302, + "learning_rate": 0.00013239392215355427, + "loss": 2.5851, + "step": 25809 + }, + { + "epoch": 0.7653530231592681, + "grad_norm": 0.08812329173088074, + "learning_rate": 0.00013236203089604448, + "loss": 2.5437, + "step": 25810 + }, + { + "epoch": 0.7653826765115797, + "grad_norm": 0.08945559710264206, + "learning_rate": 0.000132330142894032, + "loss": 2.6085, + "step": 25811 + }, + { + "epoch": 0.7654123298638911, + "grad_norm": 0.0927952453494072, + "learning_rate": 0.0001322982581477992, + "loss": 2.5935, + "step": 25812 + }, + { + "epoch": 0.7654419832162026, + "grad_norm": 0.09001491963863373, + "learning_rate": 0.00013226637665762848, + "loss": 2.5807, + "step": 25813 + }, + { + "epoch": 0.765471636568514, + "grad_norm": 0.09744247049093246, + "learning_rate": 0.00013223449842380208, + "loss": 2.5924, + "step": 25814 + }, + { + "epoch": 0.7655012899208256, + "grad_norm": 0.0971987321972847, + "learning_rate": 0.0001322026234466025, + "loss": 2.6115, + "step": 25815 + }, + { + "epoch": 0.765530943273137, + "grad_norm": 0.08871659636497498, + "learning_rate": 0.00013217075172631165, + "loss": 2.5889, + "step": 25816 + }, + { + "epoch": 0.7655605966254485, + "grad_norm": 0.0957762748003006, + "learning_rate": 0.00013213888326321193, + "loss": 2.6086, + "step": 25817 + }, + { + "epoch": 0.76559024997776, + "grad_norm": 0.09338114410638809, + "learning_rate": 0.00013210701805758542, + "loss": 2.5596, + "step": 25818 + }, + { + "epoch": 0.7656199033300715, + "grad_norm": 0.09298235177993774, + "learning_rate": 0.00013207515610971448, + "loss": 2.5796, + "step": 25819 + }, + { + "epoch": 0.765649556682383, + "grad_norm": 0.10276540368795395, + "learning_rate": 0.00013204329741988124, + "loss": 2.5745, + "step": 25820 + }, + { + "epoch": 0.7656792100346944, + "grad_norm": 0.09388978034257889, + "learning_rate": 0.00013201144198836777, + "loss": 2.6394, + "step": 25821 + }, + { + "epoch": 0.765708863387006, + "grad_norm": 0.09680122882127762, + "learning_rate": 0.00013197958981545616, + "loss": 2.5585, + "step": 25822 + }, + { + "epoch": 0.7657385167393174, + "grad_norm": 0.10078904777765274, + "learning_rate": 0.00013194774090142841, + "loss": 2.6062, + "step": 25823 + }, + { + "epoch": 0.7657681700916289, + "grad_norm": 0.09322766214609146, + "learning_rate": 0.0001319158952465666, + "loss": 2.5868, + "step": 25824 + }, + { + "epoch": 0.7657978234439403, + "grad_norm": 0.09389737248420715, + "learning_rate": 0.0001318840528511529, + "loss": 2.5767, + "step": 25825 + }, + { + "epoch": 0.7658274767962518, + "grad_norm": 0.09721337258815765, + "learning_rate": 0.00013185221371546892, + "loss": 2.5688, + "step": 25826 + }, + { + "epoch": 0.7658571301485633, + "grad_norm": 0.092539943754673, + "learning_rate": 0.00013182037783979677, + "loss": 2.5874, + "step": 25827 + }, + { + "epoch": 0.7658867835008748, + "grad_norm": 0.09781581908464432, + "learning_rate": 0.0001317885452244184, + "loss": 2.5904, + "step": 25828 + }, + { + "epoch": 0.7659164368531862, + "grad_norm": 0.10174063593149185, + "learning_rate": 0.00013175671586961564, + "loss": 2.5981, + "step": 25829 + }, + { + "epoch": 0.7659460902054978, + "grad_norm": 0.10116244107484818, + "learning_rate": 0.00013172488977567038, + "loss": 2.627, + "step": 25830 + }, + { + "epoch": 0.7659757435578092, + "grad_norm": 0.09591229259967804, + "learning_rate": 0.00013169306694286426, + "loss": 2.5831, + "step": 25831 + }, + { + "epoch": 0.7660053969101207, + "grad_norm": 0.100828617811203, + "learning_rate": 0.00013166124737147943, + "loss": 2.6032, + "step": 25832 + }, + { + "epoch": 0.7660350502624321, + "grad_norm": 0.09959596395492554, + "learning_rate": 0.00013162943106179747, + "loss": 2.588, + "step": 25833 + }, + { + "epoch": 0.7660647036147437, + "grad_norm": 0.10732419043779373, + "learning_rate": 0.00013159761801410014, + "loss": 2.5705, + "step": 25834 + }, + { + "epoch": 0.7660943569670551, + "grad_norm": 0.09551636129617691, + "learning_rate": 0.00013156580822866915, + "loss": 2.6004, + "step": 25835 + }, + { + "epoch": 0.7661240103193666, + "grad_norm": 0.09114720672369003, + "learning_rate": 0.00013153400170578627, + "loss": 2.5761, + "step": 25836 + }, + { + "epoch": 0.766153663671678, + "grad_norm": 0.10328216105699539, + "learning_rate": 0.00013150219844573297, + "loss": 2.6328, + "step": 25837 + }, + { + "epoch": 0.7661833170239896, + "grad_norm": 0.10205014050006866, + "learning_rate": 0.00013147039844879087, + "loss": 2.5791, + "step": 25838 + }, + { + "epoch": 0.7662129703763011, + "grad_norm": 0.10917104035615921, + "learning_rate": 0.00013143860171524176, + "loss": 2.5702, + "step": 25839 + }, + { + "epoch": 0.7662426237286125, + "grad_norm": 0.09400307387113571, + "learning_rate": 0.000131406808245367, + "loss": 2.605, + "step": 25840 + }, + { + "epoch": 0.766272277080924, + "grad_norm": 0.11057417839765549, + "learning_rate": 0.00013137501803944823, + "loss": 2.5495, + "step": 25841 + }, + { + "epoch": 0.7663019304332355, + "grad_norm": 0.095144122838974, + "learning_rate": 0.00013134323109776697, + "loss": 2.5871, + "step": 25842 + }, + { + "epoch": 0.766331583785547, + "grad_norm": 0.10406845808029175, + "learning_rate": 0.0001313114474206047, + "loss": 2.5935, + "step": 25843 + }, + { + "epoch": 0.7663612371378584, + "grad_norm": 0.10275667160749435, + "learning_rate": 0.00013127966700824268, + "loss": 2.5953, + "step": 25844 + }, + { + "epoch": 0.76639089049017, + "grad_norm": 0.09586338698863983, + "learning_rate": 0.00013124788986096264, + "loss": 2.6106, + "step": 25845 + }, + { + "epoch": 0.7664205438424814, + "grad_norm": 0.10007169097661972, + "learning_rate": 0.00013121611597904597, + "loss": 2.5835, + "step": 25846 + }, + { + "epoch": 0.7664501971947929, + "grad_norm": 0.096560999751091, + "learning_rate": 0.00013118434536277374, + "loss": 2.6119, + "step": 25847 + }, + { + "epoch": 0.7664798505471043, + "grad_norm": 0.09639056026935577, + "learning_rate": 0.00013115257801242747, + "loss": 2.6109, + "step": 25848 + }, + { + "epoch": 0.7665095038994159, + "grad_norm": 0.09560315310955048, + "learning_rate": 0.00013112081392828846, + "loss": 2.5942, + "step": 25849 + }, + { + "epoch": 0.7665391572517273, + "grad_norm": 0.09856634587049484, + "learning_rate": 0.0001310890531106379, + "loss": 2.6006, + "step": 25850 + }, + { + "epoch": 0.7665688106040388, + "grad_norm": 0.10090003162622452, + "learning_rate": 0.0001310572955597571, + "loss": 2.5826, + "step": 25851 + }, + { + "epoch": 0.7665984639563502, + "grad_norm": 0.09348130226135254, + "learning_rate": 0.00013102554127592732, + "loss": 2.5885, + "step": 25852 + }, + { + "epoch": 0.7666281173086618, + "grad_norm": 0.08944227546453476, + "learning_rate": 0.00013099379025942966, + "loss": 2.6039, + "step": 25853 + }, + { + "epoch": 0.7666577706609732, + "grad_norm": 0.09469113498926163, + "learning_rate": 0.00013096204251054533, + "loss": 2.5554, + "step": 25854 + }, + { + "epoch": 0.7666874240132847, + "grad_norm": 0.09616129845380783, + "learning_rate": 0.00013093029802955548, + "loss": 2.6006, + "step": 25855 + }, + { + "epoch": 0.7667170773655961, + "grad_norm": 0.09241517633199692, + "learning_rate": 0.00013089855681674119, + "loss": 2.5828, + "step": 25856 + }, + { + "epoch": 0.7667467307179077, + "grad_norm": 0.09131629765033722, + "learning_rate": 0.00013086681887238355, + "loss": 2.5889, + "step": 25857 + }, + { + "epoch": 0.7667763840702191, + "grad_norm": 0.10868015885353088, + "learning_rate": 0.00013083508419676354, + "loss": 2.6465, + "step": 25858 + }, + { + "epoch": 0.7668060374225306, + "grad_norm": 0.10072921216487885, + "learning_rate": 0.00013080335279016226, + "loss": 2.5884, + "step": 25859 + }, + { + "epoch": 0.7668356907748421, + "grad_norm": 0.09832052141427994, + "learning_rate": 0.00013077162465286064, + "loss": 2.5612, + "step": 25860 + }, + { + "epoch": 0.7668653441271536, + "grad_norm": 0.09849842637777328, + "learning_rate": 0.00013073989978513968, + "loss": 2.566, + "step": 25861 + }, + { + "epoch": 0.7668949974794651, + "grad_norm": 0.0958135575056076, + "learning_rate": 0.0001307081781872803, + "loss": 2.6406, + "step": 25862 + }, + { + "epoch": 0.7669246508317765, + "grad_norm": 0.09600520879030228, + "learning_rate": 0.00013067645985956339, + "loss": 2.6098, + "step": 25863 + }, + { + "epoch": 0.766954304184088, + "grad_norm": 0.09325942397117615, + "learning_rate": 0.0001306447448022698, + "loss": 2.5512, + "step": 25864 + }, + { + "epoch": 0.7669839575363995, + "grad_norm": 0.09069176018238068, + "learning_rate": 0.0001306130330156804, + "loss": 2.5905, + "step": 25865 + }, + { + "epoch": 0.767013610888711, + "grad_norm": 0.10070382058620453, + "learning_rate": 0.00013058132450007598, + "loss": 2.6168, + "step": 25866 + }, + { + "epoch": 0.7670432642410224, + "grad_norm": 0.10212574154138565, + "learning_rate": 0.0001305496192557374, + "loss": 2.5804, + "step": 25867 + }, + { + "epoch": 0.767072917593334, + "grad_norm": 0.09293604642152786, + "learning_rate": 0.00013051791728294532, + "loss": 2.5823, + "step": 25868 + }, + { + "epoch": 0.7671025709456454, + "grad_norm": 0.10175148397684097, + "learning_rate": 0.0001304862185819805, + "loss": 2.5859, + "step": 25869 + }, + { + "epoch": 0.7671322242979569, + "grad_norm": 0.0993163138628006, + "learning_rate": 0.0001304545231531236, + "loss": 2.5863, + "step": 25870 + }, + { + "epoch": 0.7671618776502683, + "grad_norm": 0.09842638671398163, + "learning_rate": 0.0001304228309966554, + "loss": 2.5728, + "step": 25871 + }, + { + "epoch": 0.7671915310025799, + "grad_norm": 0.0966937392950058, + "learning_rate": 0.00013039114211285647, + "loss": 2.6295, + "step": 25872 + }, + { + "epoch": 0.7672211843548913, + "grad_norm": 0.10384243726730347, + "learning_rate": 0.00013035945650200737, + "loss": 2.5773, + "step": 25873 + }, + { + "epoch": 0.7672508377072028, + "grad_norm": 0.09722957015037537, + "learning_rate": 0.00013032777416438872, + "loss": 2.5829, + "step": 25874 + }, + { + "epoch": 0.7672804910595142, + "grad_norm": 0.11269977688789368, + "learning_rate": 0.00013029609510028113, + "loss": 2.564, + "step": 25875 + }, + { + "epoch": 0.7673101444118258, + "grad_norm": 0.09499717503786087, + "learning_rate": 0.00013026441930996508, + "loss": 2.5858, + "step": 25876 + }, + { + "epoch": 0.7673397977641372, + "grad_norm": 0.10877417027950287, + "learning_rate": 0.00013023274679372106, + "loss": 2.6058, + "step": 25877 + }, + { + "epoch": 0.7673694511164487, + "grad_norm": 0.09360101073980331, + "learning_rate": 0.00013020107755182947, + "loss": 2.5879, + "step": 25878 + }, + { + "epoch": 0.7673991044687601, + "grad_norm": 0.09560927003622055, + "learning_rate": 0.00013016941158457092, + "loss": 2.5442, + "step": 25879 + }, + { + "epoch": 0.7674287578210717, + "grad_norm": 0.09454882889986038, + "learning_rate": 0.00013013774889222564, + "loss": 2.5736, + "step": 25880 + }, + { + "epoch": 0.7674584111733832, + "grad_norm": 0.09385445713996887, + "learning_rate": 0.00013010608947507425, + "loss": 2.6007, + "step": 25881 + }, + { + "epoch": 0.7674880645256946, + "grad_norm": 0.09797687083482742, + "learning_rate": 0.00013007443333339674, + "loss": 2.5579, + "step": 25882 + }, + { + "epoch": 0.7675177178780062, + "grad_norm": 0.08719931542873383, + "learning_rate": 0.00013004278046747347, + "loss": 2.5732, + "step": 25883 + }, + { + "epoch": 0.7675473712303176, + "grad_norm": 0.09589498490095139, + "learning_rate": 0.00013001113087758508, + "loss": 2.5825, + "step": 25884 + }, + { + "epoch": 0.7675770245826291, + "grad_norm": 0.0822567492723465, + "learning_rate": 0.00012997948456401165, + "loss": 2.5902, + "step": 25885 + }, + { + "epoch": 0.7676066779349405, + "grad_norm": 0.08417986333370209, + "learning_rate": 0.00012994784152703338, + "loss": 2.5785, + "step": 25886 + }, + { + "epoch": 0.7676363312872521, + "grad_norm": 0.09424541890621185, + "learning_rate": 0.00012991620176693041, + "loss": 2.6073, + "step": 25887 + }, + { + "epoch": 0.7676659846395635, + "grad_norm": 0.09225501120090485, + "learning_rate": 0.00012988456528398307, + "loss": 2.5966, + "step": 25888 + }, + { + "epoch": 0.767695637991875, + "grad_norm": 0.0936129167675972, + "learning_rate": 0.00012985293207847144, + "loss": 2.5875, + "step": 25889 + }, + { + "epoch": 0.7677252913441864, + "grad_norm": 0.10255943238735199, + "learning_rate": 0.00012982130215067562, + "loss": 2.5575, + "step": 25890 + }, + { + "epoch": 0.767754944696498, + "grad_norm": 0.09680670499801636, + "learning_rate": 0.00012978967550087588, + "loss": 2.6122, + "step": 25891 + }, + { + "epoch": 0.7677845980488094, + "grad_norm": 0.09901396185159683, + "learning_rate": 0.00012975805212935194, + "loss": 2.5893, + "step": 25892 + }, + { + "epoch": 0.7678142514011209, + "grad_norm": 0.09569739550352097, + "learning_rate": 0.000129726432036384, + "loss": 2.6109, + "step": 25893 + }, + { + "epoch": 0.7678439047534323, + "grad_norm": 0.10312079638242722, + "learning_rate": 0.000129694815222252, + "loss": 2.6001, + "step": 25894 + }, + { + "epoch": 0.7678735581057439, + "grad_norm": 0.10192077606916428, + "learning_rate": 0.00012966320168723594, + "loss": 2.605, + "step": 25895 + }, + { + "epoch": 0.7679032114580553, + "grad_norm": 0.09085716307163239, + "learning_rate": 0.00012963159143161585, + "loss": 2.5886, + "step": 25896 + }, + { + "epoch": 0.7679328648103668, + "grad_norm": 0.0971590131521225, + "learning_rate": 0.00012959998445567168, + "loss": 2.5784, + "step": 25897 + }, + { + "epoch": 0.7679625181626782, + "grad_norm": 0.10981094092130661, + "learning_rate": 0.00012956838075968313, + "loss": 2.6047, + "step": 25898 + }, + { + "epoch": 0.7679921715149898, + "grad_norm": 0.09064200520515442, + "learning_rate": 0.00012953678034393024, + "loss": 2.6103, + "step": 25899 + }, + { + "epoch": 0.7680218248673012, + "grad_norm": 0.10586506128311157, + "learning_rate": 0.0001295051832086927, + "loss": 2.6061, + "step": 25900 + }, + { + "epoch": 0.7680514782196127, + "grad_norm": 0.09432393312454224, + "learning_rate": 0.00012947358935425036, + "loss": 2.6015, + "step": 25901 + }, + { + "epoch": 0.7680811315719243, + "grad_norm": 0.10675682872533798, + "learning_rate": 0.00012944199878088314, + "loss": 2.5768, + "step": 25902 + }, + { + "epoch": 0.7681107849242357, + "grad_norm": 0.09361305087804794, + "learning_rate": 0.00012941041148887045, + "loss": 2.5832, + "step": 25903 + }, + { + "epoch": 0.7681404382765472, + "grad_norm": 0.09656821191310883, + "learning_rate": 0.00012937882747849223, + "loss": 2.5964, + "step": 25904 + }, + { + "epoch": 0.7681700916288586, + "grad_norm": 0.09681472927331924, + "learning_rate": 0.00012934724675002808, + "loss": 2.6037, + "step": 25905 + }, + { + "epoch": 0.7681997449811702, + "grad_norm": 0.0972181186079979, + "learning_rate": 0.00012931566930375764, + "loss": 2.6091, + "step": 25906 + }, + { + "epoch": 0.7682293983334816, + "grad_norm": 0.09552508592605591, + "learning_rate": 0.00012928409513996065, + "loss": 2.6033, + "step": 25907 + }, + { + "epoch": 0.7682590516857931, + "grad_norm": 0.09791076183319092, + "learning_rate": 0.00012925252425891642, + "loss": 2.5935, + "step": 25908 + }, + { + "epoch": 0.7682887050381045, + "grad_norm": 0.09269967675209045, + "learning_rate": 0.00012922095666090494, + "loss": 2.6124, + "step": 25909 + }, + { + "epoch": 0.7683183583904161, + "grad_norm": 0.09754546731710434, + "learning_rate": 0.00012918939234620548, + "loss": 2.6295, + "step": 25910 + }, + { + "epoch": 0.7683480117427275, + "grad_norm": 0.0982690304517746, + "learning_rate": 0.00012915783131509762, + "loss": 2.6113, + "step": 25911 + }, + { + "epoch": 0.768377665095039, + "grad_norm": 0.09610801935195923, + "learning_rate": 0.000129126273567861, + "loss": 2.6148, + "step": 25912 + }, + { + "epoch": 0.7684073184473504, + "grad_norm": 0.1081581562757492, + "learning_rate": 0.00012909471910477465, + "loss": 2.5779, + "step": 25913 + }, + { + "epoch": 0.768436971799662, + "grad_norm": 0.10173916816711426, + "learning_rate": 0.00012906316792611828, + "loss": 2.5999, + "step": 25914 + }, + { + "epoch": 0.7684666251519734, + "grad_norm": 0.09125972539186478, + "learning_rate": 0.00012903162003217121, + "loss": 2.5947, + "step": 25915 + }, + { + "epoch": 0.7684962785042849, + "grad_norm": 0.09973941743373871, + "learning_rate": 0.00012900007542321291, + "loss": 2.5594, + "step": 25916 + }, + { + "epoch": 0.7685259318565963, + "grad_norm": 0.09451921284198761, + "learning_rate": 0.00012896853409952253, + "loss": 2.6204, + "step": 25917 + }, + { + "epoch": 0.7685555852089079, + "grad_norm": 0.09716450423002243, + "learning_rate": 0.0001289369960613795, + "loss": 2.5458, + "step": 25918 + }, + { + "epoch": 0.7685852385612193, + "grad_norm": 0.09942775219678879, + "learning_rate": 0.00012890546130906307, + "loss": 2.5951, + "step": 25919 + }, + { + "epoch": 0.7686148919135308, + "grad_norm": 0.09129282087087631, + "learning_rate": 0.0001288739298428524, + "loss": 2.5992, + "step": 25920 + }, + { + "epoch": 0.7686445452658422, + "grad_norm": 0.0917467474937439, + "learning_rate": 0.00012884240166302668, + "loss": 2.5909, + "step": 25921 + }, + { + "epoch": 0.7686741986181538, + "grad_norm": 0.09132332354784012, + "learning_rate": 0.0001288108767698655, + "loss": 2.5779, + "step": 25922 + }, + { + "epoch": 0.7687038519704653, + "grad_norm": 0.09323173761367798, + "learning_rate": 0.00012877935516364763, + "loss": 2.5917, + "step": 25923 + }, + { + "epoch": 0.7687335053227767, + "grad_norm": 0.09572476893663406, + "learning_rate": 0.00012874783684465224, + "loss": 2.587, + "step": 25924 + }, + { + "epoch": 0.7687631586750883, + "grad_norm": 0.10198386013507843, + "learning_rate": 0.0001287163218131585, + "loss": 2.6176, + "step": 25925 + }, + { + "epoch": 0.7687928120273997, + "grad_norm": 0.09373199939727783, + "learning_rate": 0.00012868481006944545, + "loss": 2.5952, + "step": 25926 + }, + { + "epoch": 0.7688224653797112, + "grad_norm": 0.10531911253929138, + "learning_rate": 0.00012865330161379213, + "loss": 2.5525, + "step": 25927 + }, + { + "epoch": 0.7688521187320226, + "grad_norm": 0.092950738966465, + "learning_rate": 0.00012862179644647752, + "loss": 2.599, + "step": 25928 + }, + { + "epoch": 0.7688817720843342, + "grad_norm": 0.1044817790389061, + "learning_rate": 0.00012859029456778077, + "loss": 2.5935, + "step": 25929 + }, + { + "epoch": 0.7689114254366456, + "grad_norm": 0.10040824115276337, + "learning_rate": 0.0001285587959779806, + "loss": 2.576, + "step": 25930 + }, + { + "epoch": 0.7689410787889571, + "grad_norm": 0.11214238405227661, + "learning_rate": 0.00012852730067735614, + "loss": 2.5718, + "step": 25931 + }, + { + "epoch": 0.7689707321412685, + "grad_norm": 0.08889836817979813, + "learning_rate": 0.0001284958086661861, + "loss": 2.5981, + "step": 25932 + }, + { + "epoch": 0.7690003854935801, + "grad_norm": 0.10693438351154327, + "learning_rate": 0.0001284643199447495, + "loss": 2.583, + "step": 25933 + }, + { + "epoch": 0.7690300388458915, + "grad_norm": 0.09130191057920456, + "learning_rate": 0.0001284328345133251, + "loss": 2.5535, + "step": 25934 + }, + { + "epoch": 0.769059692198203, + "grad_norm": 0.10627863556146622, + "learning_rate": 0.00012840135237219175, + "loss": 2.5783, + "step": 25935 + }, + { + "epoch": 0.7690893455505144, + "grad_norm": 0.09508629143238068, + "learning_rate": 0.00012836987352162822, + "loss": 2.5973, + "step": 25936 + }, + { + "epoch": 0.769118998902826, + "grad_norm": 0.10839364677667618, + "learning_rate": 0.0001283383979619132, + "loss": 2.6056, + "step": 25937 + }, + { + "epoch": 0.7691486522551374, + "grad_norm": 0.09450077265501022, + "learning_rate": 0.00012830692569332547, + "loss": 2.5913, + "step": 25938 + }, + { + "epoch": 0.7691783056074489, + "grad_norm": 0.09796343743801117, + "learning_rate": 0.00012827545671614376, + "loss": 2.5734, + "step": 25939 + }, + { + "epoch": 0.7692079589597604, + "grad_norm": 0.11240578442811966, + "learning_rate": 0.00012824399103064665, + "loss": 2.6077, + "step": 25940 + }, + { + "epoch": 0.7692376123120719, + "grad_norm": 0.09741529822349548, + "learning_rate": 0.00012821252863711282, + "loss": 2.5935, + "step": 25941 + }, + { + "epoch": 0.7692672656643833, + "grad_norm": 0.0956171303987503, + "learning_rate": 0.00012818106953582087, + "loss": 2.5936, + "step": 25942 + }, + { + "epoch": 0.7692969190166948, + "grad_norm": 0.09707506746053696, + "learning_rate": 0.00012814961372704936, + "loss": 2.5865, + "step": 25943 + }, + { + "epoch": 0.7693265723690064, + "grad_norm": 0.09576837718486786, + "learning_rate": 0.00012811816121107683, + "loss": 2.5543, + "step": 25944 + }, + { + "epoch": 0.7693562257213178, + "grad_norm": 0.09498671442270279, + "learning_rate": 0.00012808671198818184, + "loss": 2.5993, + "step": 25945 + }, + { + "epoch": 0.7693858790736293, + "grad_norm": 0.09521960467100143, + "learning_rate": 0.000128055266058643, + "loss": 2.6001, + "step": 25946 + }, + { + "epoch": 0.7694155324259407, + "grad_norm": 0.09570593386888504, + "learning_rate": 0.00012802382342273834, + "loss": 2.6146, + "step": 25947 + }, + { + "epoch": 0.7694451857782523, + "grad_norm": 0.08889525383710861, + "learning_rate": 0.0001279923840807467, + "loss": 2.5821, + "step": 25948 + }, + { + "epoch": 0.7694748391305637, + "grad_norm": 0.09266054630279541, + "learning_rate": 0.00012796094803294632, + "loss": 2.5738, + "step": 25949 + }, + { + "epoch": 0.7695044924828752, + "grad_norm": 0.0897325798869133, + "learning_rate": 0.00012792951527961565, + "loss": 2.5787, + "step": 25950 + }, + { + "epoch": 0.7695341458351866, + "grad_norm": 0.09614262729883194, + "learning_rate": 0.00012789808582103302, + "loss": 2.5967, + "step": 25951 + }, + { + "epoch": 0.7695637991874982, + "grad_norm": 0.10432571917772293, + "learning_rate": 0.00012786665965747662, + "loss": 2.6231, + "step": 25952 + }, + { + "epoch": 0.7695934525398096, + "grad_norm": 0.09879986941814423, + "learning_rate": 0.00012783523678922492, + "loss": 2.5856, + "step": 25953 + }, + { + "epoch": 0.7696231058921211, + "grad_norm": 0.09882039576768875, + "learning_rate": 0.00012780381721655605, + "loss": 2.6096, + "step": 25954 + }, + { + "epoch": 0.7696527592444325, + "grad_norm": 0.09587015211582184, + "learning_rate": 0.00012777240093974824, + "loss": 2.6134, + "step": 25955 + }, + { + "epoch": 0.7696824125967441, + "grad_norm": 0.09201429039239883, + "learning_rate": 0.0001277409879590797, + "loss": 2.595, + "step": 25956 + }, + { + "epoch": 0.7697120659490555, + "grad_norm": 0.09674856066703796, + "learning_rate": 0.00012770957827482876, + "loss": 2.5886, + "step": 25957 + }, + { + "epoch": 0.769741719301367, + "grad_norm": 0.09146098047494888, + "learning_rate": 0.00012767817188727331, + "loss": 2.6005, + "step": 25958 + }, + { + "epoch": 0.7697713726536785, + "grad_norm": 0.09764831513166428, + "learning_rate": 0.00012764676879669151, + "loss": 2.574, + "step": 25959 + }, + { + "epoch": 0.76980102600599, + "grad_norm": 0.09949818253517151, + "learning_rate": 0.00012761536900336135, + "loss": 2.6516, + "step": 25960 + }, + { + "epoch": 0.7698306793583014, + "grad_norm": 0.09489337354898453, + "learning_rate": 0.00012758397250756115, + "loss": 2.5727, + "step": 25961 + }, + { + "epoch": 0.7698603327106129, + "grad_norm": 0.09746049344539642, + "learning_rate": 0.0001275525793095688, + "loss": 2.5871, + "step": 25962 + }, + { + "epoch": 0.7698899860629244, + "grad_norm": 0.09644866734743118, + "learning_rate": 0.00012752118940966234, + "loss": 2.5902, + "step": 25963 + }, + { + "epoch": 0.7699196394152359, + "grad_norm": 0.08978775888681412, + "learning_rate": 0.00012748980280811968, + "loss": 2.5739, + "step": 25964 + }, + { + "epoch": 0.7699492927675474, + "grad_norm": 0.09528569132089615, + "learning_rate": 0.0001274584195052187, + "loss": 2.6139, + "step": 25965 + }, + { + "epoch": 0.7699789461198588, + "grad_norm": 0.09539346396923065, + "learning_rate": 0.0001274270395012374, + "loss": 2.6028, + "step": 25966 + }, + { + "epoch": 0.7700085994721704, + "grad_norm": 0.09033894538879395, + "learning_rate": 0.00012739566279645372, + "loss": 2.6174, + "step": 25967 + }, + { + "epoch": 0.7700382528244818, + "grad_norm": 0.08642852306365967, + "learning_rate": 0.0001273642893911453, + "loss": 2.6176, + "step": 25968 + }, + { + "epoch": 0.7700679061767933, + "grad_norm": 0.09221532195806503, + "learning_rate": 0.00012733291928559005, + "loss": 2.6309, + "step": 25969 + }, + { + "epoch": 0.7700975595291047, + "grad_norm": 0.09644463658332825, + "learning_rate": 0.00012730155248006576, + "loss": 2.5894, + "step": 25970 + }, + { + "epoch": 0.7701272128814163, + "grad_norm": 0.09017125517129898, + "learning_rate": 0.00012727018897485022, + "loss": 2.5736, + "step": 25971 + }, + { + "epoch": 0.7701568662337277, + "grad_norm": 0.09768456965684891, + "learning_rate": 0.00012723882877022107, + "loss": 2.5964, + "step": 25972 + }, + { + "epoch": 0.7701865195860392, + "grad_norm": 0.0970175564289093, + "learning_rate": 0.000127207471866456, + "loss": 2.6116, + "step": 25973 + }, + { + "epoch": 0.7702161729383507, + "grad_norm": 0.09225339442491531, + "learning_rate": 0.00012717611826383284, + "loss": 2.6073, + "step": 25974 + }, + { + "epoch": 0.7702458262906622, + "grad_norm": 0.10171053558588028, + "learning_rate": 0.00012714476796262919, + "loss": 2.5594, + "step": 25975 + }, + { + "epoch": 0.7702754796429736, + "grad_norm": 0.08948320150375366, + "learning_rate": 0.0001271134209631226, + "loss": 2.5885, + "step": 25976 + }, + { + "epoch": 0.7703051329952851, + "grad_norm": 0.10985082387924194, + "learning_rate": 0.00012708207726559067, + "loss": 2.6052, + "step": 25977 + }, + { + "epoch": 0.7703347863475966, + "grad_norm": 0.09172743558883667, + "learning_rate": 0.00012705073687031116, + "loss": 2.6104, + "step": 25978 + }, + { + "epoch": 0.7703644396999081, + "grad_norm": 0.10194612294435501, + "learning_rate": 0.00012701939977756115, + "loss": 2.5916, + "step": 25979 + }, + { + "epoch": 0.7703940930522195, + "grad_norm": 0.08792296051979065, + "learning_rate": 0.00012698806598761847, + "loss": 2.6038, + "step": 25980 + }, + { + "epoch": 0.770423746404531, + "grad_norm": 0.09661957621574402, + "learning_rate": 0.00012695673550076043, + "loss": 2.5794, + "step": 25981 + }, + { + "epoch": 0.7704533997568425, + "grad_norm": 0.09891381114721298, + "learning_rate": 0.00012692540831726456, + "loss": 2.5947, + "step": 25982 + }, + { + "epoch": 0.770483053109154, + "grad_norm": 0.08708017319440842, + "learning_rate": 0.0001268940844374082, + "loss": 2.6022, + "step": 25983 + }, + { + "epoch": 0.7705127064614655, + "grad_norm": 0.09055814146995544, + "learning_rate": 0.00012686276386146878, + "loss": 2.6253, + "step": 25984 + }, + { + "epoch": 0.770542359813777, + "grad_norm": 0.08661549538373947, + "learning_rate": 0.00012683144658972366, + "loss": 2.619, + "step": 25985 + }, + { + "epoch": 0.7705720131660885, + "grad_norm": 0.08644676208496094, + "learning_rate": 0.00012680013262245, + "loss": 2.5903, + "step": 25986 + }, + { + "epoch": 0.7706016665183999, + "grad_norm": 0.0938609167933464, + "learning_rate": 0.00012676882195992528, + "loss": 2.606, + "step": 25987 + }, + { + "epoch": 0.7706313198707114, + "grad_norm": 0.08469964563846588, + "learning_rate": 0.00012673751460242694, + "loss": 2.5796, + "step": 25988 + }, + { + "epoch": 0.7706609732230228, + "grad_norm": 0.09793078154325485, + "learning_rate": 0.0001267062105502318, + "loss": 2.5586, + "step": 25989 + }, + { + "epoch": 0.7706906265753344, + "grad_norm": 0.08299179375171661, + "learning_rate": 0.00012667490980361722, + "loss": 2.5815, + "step": 25990 + }, + { + "epoch": 0.7707202799276458, + "grad_norm": 0.09943201392889023, + "learning_rate": 0.00012664361236286044, + "loss": 2.5908, + "step": 25991 + }, + { + "epoch": 0.7707499332799573, + "grad_norm": 0.09444890171289444, + "learning_rate": 0.00012661231822823853, + "loss": 2.5874, + "step": 25992 + }, + { + "epoch": 0.7707795866322688, + "grad_norm": 0.09781694412231445, + "learning_rate": 0.00012658102740002863, + "loss": 2.5909, + "step": 25993 + }, + { + "epoch": 0.7708092399845803, + "grad_norm": 0.10302849858999252, + "learning_rate": 0.00012654973987850788, + "loss": 2.5747, + "step": 25994 + }, + { + "epoch": 0.7708388933368917, + "grad_norm": 0.08825254440307617, + "learning_rate": 0.00012651845566395326, + "loss": 2.6315, + "step": 25995 + }, + { + "epoch": 0.7708685466892032, + "grad_norm": 0.1023210808634758, + "learning_rate": 0.0001264871747566418, + "loss": 2.5707, + "step": 25996 + }, + { + "epoch": 0.7708982000415147, + "grad_norm": 0.09976787865161896, + "learning_rate": 0.00012645589715685053, + "loss": 2.6127, + "step": 25997 + }, + { + "epoch": 0.7709278533938262, + "grad_norm": 0.08672772347927094, + "learning_rate": 0.0001264246228648564, + "loss": 2.5557, + "step": 25998 + }, + { + "epoch": 0.7709575067461376, + "grad_norm": 0.09926947206258774, + "learning_rate": 0.00012639335188093637, + "loss": 2.6147, + "step": 25999 + }, + { + "epoch": 0.7709871600984491, + "grad_norm": 0.09279894828796387, + "learning_rate": 0.00012636208420536732, + "loss": 2.5954, + "step": 26000 + }, + { + "epoch": 0.7710168134507606, + "grad_norm": 0.08814128488302231, + "learning_rate": 0.00012633081983842614, + "loss": 2.5918, + "step": 26001 + }, + { + "epoch": 0.7710464668030721, + "grad_norm": 0.08720702677965164, + "learning_rate": 0.00012629955878038973, + "loss": 2.587, + "step": 26002 + }, + { + "epoch": 0.7710761201553835, + "grad_norm": 0.09401946514844894, + "learning_rate": 0.00012626830103153487, + "loss": 2.5923, + "step": 26003 + }, + { + "epoch": 0.771105773507695, + "grad_norm": 0.09786242991685867, + "learning_rate": 0.00012623704659213831, + "loss": 2.6078, + "step": 26004 + }, + { + "epoch": 0.7711354268600066, + "grad_norm": 0.09929096698760986, + "learning_rate": 0.0001262057954624769, + "loss": 2.588, + "step": 26005 + }, + { + "epoch": 0.771165080212318, + "grad_norm": 0.09417693316936493, + "learning_rate": 0.00012617454764282733, + "loss": 2.5569, + "step": 26006 + }, + { + "epoch": 0.7711947335646295, + "grad_norm": 0.09426400810480118, + "learning_rate": 0.00012614330313346627, + "loss": 2.599, + "step": 26007 + }, + { + "epoch": 0.771224386916941, + "grad_norm": 0.09442544728517532, + "learning_rate": 0.00012611206193467045, + "loss": 2.5997, + "step": 26008 + }, + { + "epoch": 0.7712540402692525, + "grad_norm": 0.09105467796325684, + "learning_rate": 0.0001260808240467165, + "loss": 2.5675, + "step": 26009 + }, + { + "epoch": 0.7712836936215639, + "grad_norm": 0.10121838003396988, + "learning_rate": 0.00012604958946988104, + "loss": 2.5938, + "step": 26010 + }, + { + "epoch": 0.7713133469738754, + "grad_norm": 0.09199952334165573, + "learning_rate": 0.00012601835820444068, + "loss": 2.5663, + "step": 26011 + }, + { + "epoch": 0.7713430003261869, + "grad_norm": 0.09323422610759735, + "learning_rate": 0.00012598713025067194, + "loss": 2.5684, + "step": 26012 + }, + { + "epoch": 0.7713726536784984, + "grad_norm": 0.09271494299173355, + "learning_rate": 0.00012595590560885133, + "loss": 2.574, + "step": 26013 + }, + { + "epoch": 0.7714023070308098, + "grad_norm": 0.08734945207834244, + "learning_rate": 0.00012592468427925535, + "loss": 2.6031, + "step": 26014 + }, + { + "epoch": 0.7714319603831213, + "grad_norm": 0.0949297845363617, + "learning_rate": 0.00012589346626216058, + "loss": 2.5756, + "step": 26015 + }, + { + "epoch": 0.7714616137354328, + "grad_norm": 0.0883796289563179, + "learning_rate": 0.00012586225155784337, + "loss": 2.6131, + "step": 26016 + }, + { + "epoch": 0.7714912670877443, + "grad_norm": 0.08944221585988998, + "learning_rate": 0.0001258310401665801, + "loss": 2.5827, + "step": 26017 + }, + { + "epoch": 0.7715209204400557, + "grad_norm": 0.08358990401029587, + "learning_rate": 0.0001257998320886472, + "loss": 2.5689, + "step": 26018 + }, + { + "epoch": 0.7715505737923672, + "grad_norm": 0.10131990164518356, + "learning_rate": 0.00012576862732432105, + "loss": 2.5899, + "step": 26019 + }, + { + "epoch": 0.7715802271446787, + "grad_norm": 0.09058944880962372, + "learning_rate": 0.00012573742587387786, + "loss": 2.6172, + "step": 26020 + }, + { + "epoch": 0.7716098804969902, + "grad_norm": 0.09303746372461319, + "learning_rate": 0.00012570622773759412, + "loss": 2.5946, + "step": 26021 + }, + { + "epoch": 0.7716395338493016, + "grad_norm": 0.090597964823246, + "learning_rate": 0.00012567503291574606, + "loss": 2.581, + "step": 26022 + }, + { + "epoch": 0.7716691872016131, + "grad_norm": 0.09601376950740814, + "learning_rate": 0.0001256438414086097, + "loss": 2.558, + "step": 26023 + }, + { + "epoch": 0.7716988405539246, + "grad_norm": 0.08930254727602005, + "learning_rate": 0.00012561265321646138, + "loss": 2.6231, + "step": 26024 + }, + { + "epoch": 0.7717284939062361, + "grad_norm": 0.09746348112821579, + "learning_rate": 0.00012558146833957712, + "loss": 2.6024, + "step": 26025 + }, + { + "epoch": 0.7717581472585476, + "grad_norm": 0.09597495198249817, + "learning_rate": 0.0001255502867782334, + "loss": 2.5851, + "step": 26026 + }, + { + "epoch": 0.771787800610859, + "grad_norm": 0.09355125576257706, + "learning_rate": 0.00012551910853270614, + "loss": 2.5574, + "step": 26027 + }, + { + "epoch": 0.7718174539631706, + "grad_norm": 0.0979561135172844, + "learning_rate": 0.0001254879336032715, + "loss": 2.5875, + "step": 26028 + }, + { + "epoch": 0.771847107315482, + "grad_norm": 0.09151048958301544, + "learning_rate": 0.00012545676199020544, + "loss": 2.5799, + "step": 26029 + }, + { + "epoch": 0.7718767606677935, + "grad_norm": 0.10022345930337906, + "learning_rate": 0.00012542559369378405, + "loss": 2.5981, + "step": 26030 + }, + { + "epoch": 0.771906414020105, + "grad_norm": 0.097829170525074, + "learning_rate": 0.00012539442871428337, + "loss": 2.5849, + "step": 26031 + }, + { + "epoch": 0.7719360673724165, + "grad_norm": 0.0849844440817833, + "learning_rate": 0.00012536326705197927, + "loss": 2.5483, + "step": 26032 + }, + { + "epoch": 0.7719657207247279, + "grad_norm": 0.10804209113121033, + "learning_rate": 0.00012533210870714788, + "loss": 2.589, + "step": 26033 + }, + { + "epoch": 0.7719953740770394, + "grad_norm": 0.10397778451442719, + "learning_rate": 0.0001253009536800649, + "loss": 2.5763, + "step": 26034 + }, + { + "epoch": 0.7720250274293509, + "grad_norm": 0.1030813455581665, + "learning_rate": 0.00012526980197100624, + "loss": 2.5929, + "step": 26035 + }, + { + "epoch": 0.7720546807816624, + "grad_norm": 0.09453469514846802, + "learning_rate": 0.00012523865358024777, + "loss": 2.6192, + "step": 26036 + }, + { + "epoch": 0.7720843341339738, + "grad_norm": 0.09574726223945618, + "learning_rate": 0.00012520750850806544, + "loss": 2.5817, + "step": 26037 + }, + { + "epoch": 0.7721139874862853, + "grad_norm": 0.08612362295389175, + "learning_rate": 0.00012517636675473477, + "loss": 2.5754, + "step": 26038 + }, + { + "epoch": 0.7721436408385968, + "grad_norm": 0.09950307756662369, + "learning_rate": 0.00012514522832053181, + "loss": 2.5737, + "step": 26039 + }, + { + "epoch": 0.7721732941909083, + "grad_norm": 0.09426002204418182, + "learning_rate": 0.00012511409320573226, + "loss": 2.5944, + "step": 26040 + }, + { + "epoch": 0.7722029475432197, + "grad_norm": 0.09146207571029663, + "learning_rate": 0.0001250829614106117, + "loss": 2.5631, + "step": 26041 + }, + { + "epoch": 0.7722326008955313, + "grad_norm": 0.09668361395597458, + "learning_rate": 0.00012505183293544592, + "loss": 2.5759, + "step": 26042 + }, + { + "epoch": 0.7722622542478427, + "grad_norm": 0.0969415232539177, + "learning_rate": 0.00012502070778051068, + "loss": 2.5884, + "step": 26043 + }, + { + "epoch": 0.7722919076001542, + "grad_norm": 0.088418148458004, + "learning_rate": 0.00012498958594608128, + "loss": 2.6006, + "step": 26044 + }, + { + "epoch": 0.7723215609524656, + "grad_norm": 0.09736905246973038, + "learning_rate": 0.00012495846743243344, + "loss": 2.5516, + "step": 26045 + }, + { + "epoch": 0.7723512143047772, + "grad_norm": 0.08926822245121002, + "learning_rate": 0.00012492735223984275, + "loss": 2.5868, + "step": 26046 + }, + { + "epoch": 0.7723808676570887, + "grad_norm": 0.09633973240852356, + "learning_rate": 0.00012489624036858476, + "loss": 2.6195, + "step": 26047 + }, + { + "epoch": 0.7724105210094001, + "grad_norm": 0.08830871433019638, + "learning_rate": 0.00012486513181893488, + "loss": 2.5832, + "step": 26048 + }, + { + "epoch": 0.7724401743617116, + "grad_norm": 0.10083051025867462, + "learning_rate": 0.0001248340265911687, + "loss": 2.5805, + "step": 26049 + }, + { + "epoch": 0.7724698277140231, + "grad_norm": 0.09162482619285583, + "learning_rate": 0.00012480292468556166, + "loss": 2.5928, + "step": 26050 + }, + { + "epoch": 0.7724994810663346, + "grad_norm": 0.10078778117895126, + "learning_rate": 0.00012477182610238892, + "loss": 2.5831, + "step": 26051 + }, + { + "epoch": 0.772529134418646, + "grad_norm": 0.1000332161784172, + "learning_rate": 0.00012474073084192623, + "loss": 2.5891, + "step": 26052 + }, + { + "epoch": 0.7725587877709575, + "grad_norm": 0.09484901279211044, + "learning_rate": 0.00012470963890444874, + "loss": 2.5745, + "step": 26053 + }, + { + "epoch": 0.772588441123269, + "grad_norm": 0.090108722448349, + "learning_rate": 0.000124678550290232, + "loss": 2.5754, + "step": 26054 + }, + { + "epoch": 0.7726180944755805, + "grad_norm": 0.09963596612215042, + "learning_rate": 0.000124647464999551, + "loss": 2.6096, + "step": 26055 + }, + { + "epoch": 0.7726477478278919, + "grad_norm": 0.09655886888504028, + "learning_rate": 0.00012461638303268107, + "loss": 2.5647, + "step": 26056 + }, + { + "epoch": 0.7726774011802034, + "grad_norm": 0.10226784646511078, + "learning_rate": 0.0001245853043898975, + "loss": 2.5929, + "step": 26057 + }, + { + "epoch": 0.7727070545325149, + "grad_norm": 0.09238319844007492, + "learning_rate": 0.00012455422907147557, + "loss": 2.6201, + "step": 26058 + }, + { + "epoch": 0.7727367078848264, + "grad_norm": 0.10370156913995743, + "learning_rate": 0.0001245231570776903, + "loss": 2.5674, + "step": 26059 + }, + { + "epoch": 0.7727663612371378, + "grad_norm": 0.09234899282455444, + "learning_rate": 0.000124492088408817, + "loss": 2.6047, + "step": 26060 + }, + { + "epoch": 0.7727960145894494, + "grad_norm": 0.09450571984052658, + "learning_rate": 0.00012446102306513064, + "loss": 2.6286, + "step": 26061 + }, + { + "epoch": 0.7728256679417608, + "grad_norm": 0.0950852483510971, + "learning_rate": 0.0001244299610469064, + "loss": 2.5833, + "step": 26062 + }, + { + "epoch": 0.7728553212940723, + "grad_norm": 0.09512469917535782, + "learning_rate": 0.00012439890235441936, + "loss": 2.5827, + "step": 26063 + }, + { + "epoch": 0.7728849746463837, + "grad_norm": 0.09234264492988586, + "learning_rate": 0.0001243678469879445, + "loss": 2.586, + "step": 26064 + }, + { + "epoch": 0.7729146279986953, + "grad_norm": 0.11809320747852325, + "learning_rate": 0.0001243367949477568, + "loss": 2.5807, + "step": 26065 + }, + { + "epoch": 0.7729442813510067, + "grad_norm": 0.09202506393194199, + "learning_rate": 0.00012430574623413133, + "loss": 2.5884, + "step": 26066 + }, + { + "epoch": 0.7729739347033182, + "grad_norm": 0.10246835649013519, + "learning_rate": 0.00012427470084734287, + "loss": 2.6149, + "step": 26067 + }, + { + "epoch": 0.7730035880556297, + "grad_norm": 0.08942904323339462, + "learning_rate": 0.00012424365878766646, + "loss": 2.562, + "step": 26068 + }, + { + "epoch": 0.7730332414079412, + "grad_norm": 0.10518813133239746, + "learning_rate": 0.00012421262005537698, + "loss": 2.5786, + "step": 26069 + }, + { + "epoch": 0.7730628947602527, + "grad_norm": 0.08752822875976562, + "learning_rate": 0.00012418158465074924, + "loss": 2.5646, + "step": 26070 + }, + { + "epoch": 0.7730925481125641, + "grad_norm": 0.0980384424328804, + "learning_rate": 0.00012415055257405805, + "loss": 2.5587, + "step": 26071 + }, + { + "epoch": 0.7731222014648756, + "grad_norm": 0.09657172113656998, + "learning_rate": 0.00012411952382557827, + "loss": 2.5878, + "step": 26072 + }, + { + "epoch": 0.7731518548171871, + "grad_norm": 0.10238012671470642, + "learning_rate": 0.0001240884984055846, + "loss": 2.6263, + "step": 26073 + }, + { + "epoch": 0.7731815081694986, + "grad_norm": 0.08792460709810257, + "learning_rate": 0.00012405747631435182, + "loss": 2.5824, + "step": 26074 + }, + { + "epoch": 0.77321116152181, + "grad_norm": 0.11065114289522171, + "learning_rate": 0.0001240264575521546, + "loss": 2.5894, + "step": 26075 + }, + { + "epoch": 0.7732408148741216, + "grad_norm": 0.09598295390605927, + "learning_rate": 0.00012399544211926766, + "loss": 2.5746, + "step": 26076 + }, + { + "epoch": 0.773270468226433, + "grad_norm": 0.08874332159757614, + "learning_rate": 0.00012396443001596558, + "loss": 2.6068, + "step": 26077 + }, + { + "epoch": 0.7733001215787445, + "grad_norm": 0.10935080796480179, + "learning_rate": 0.00012393342124252305, + "loss": 2.6401, + "step": 26078 + }, + { + "epoch": 0.7733297749310559, + "grad_norm": 0.08172935992479324, + "learning_rate": 0.00012390241579921462, + "loss": 2.5978, + "step": 26079 + }, + { + "epoch": 0.7733594282833675, + "grad_norm": 0.1030644103884697, + "learning_rate": 0.00012387141368631484, + "loss": 2.6159, + "step": 26080 + }, + { + "epoch": 0.7733890816356789, + "grad_norm": 0.085399329662323, + "learning_rate": 0.00012384041490409825, + "loss": 2.5553, + "step": 26081 + }, + { + "epoch": 0.7734187349879904, + "grad_norm": 0.09905800223350525, + "learning_rate": 0.00012380941945283936, + "loss": 2.5948, + "step": 26082 + }, + { + "epoch": 0.7734483883403018, + "grad_norm": 0.09197629988193512, + "learning_rate": 0.00012377842733281264, + "loss": 2.5816, + "step": 26083 + }, + { + "epoch": 0.7734780416926134, + "grad_norm": 0.10676850378513336, + "learning_rate": 0.00012374743854429255, + "loss": 2.5951, + "step": 26084 + }, + { + "epoch": 0.7735076950449248, + "grad_norm": 0.08570516854524612, + "learning_rate": 0.0001237164530875534, + "loss": 2.586, + "step": 26085 + }, + { + "epoch": 0.7735373483972363, + "grad_norm": 0.10373332351446152, + "learning_rate": 0.0001236854709628697, + "loss": 2.577, + "step": 26086 + }, + { + "epoch": 0.7735670017495477, + "grad_norm": 0.09890017658472061, + "learning_rate": 0.00012365449217051572, + "loss": 2.5828, + "step": 26087 + }, + { + "epoch": 0.7735966551018593, + "grad_norm": 0.1019730418920517, + "learning_rate": 0.000123623516710766, + "loss": 2.567, + "step": 26088 + }, + { + "epoch": 0.7736263084541708, + "grad_norm": 0.09744053333997726, + "learning_rate": 0.00012359254458389434, + "loss": 2.5797, + "step": 26089 + }, + { + "epoch": 0.7736559618064822, + "grad_norm": 0.09944215416908264, + "learning_rate": 0.00012356157579017542, + "loss": 2.6049, + "step": 26090 + }, + { + "epoch": 0.7736856151587937, + "grad_norm": 0.10584445297718048, + "learning_rate": 0.0001235306103298834, + "loss": 2.6061, + "step": 26091 + }, + { + "epoch": 0.7737152685111052, + "grad_norm": 0.09225969761610031, + "learning_rate": 0.0001234996482032924, + "loss": 2.6137, + "step": 26092 + }, + { + "epoch": 0.7737449218634167, + "grad_norm": 0.0935797169804573, + "learning_rate": 0.0001234686894106767, + "loss": 2.5956, + "step": 26093 + }, + { + "epoch": 0.7737745752157281, + "grad_norm": 0.09178287535905838, + "learning_rate": 0.00012343773395231034, + "loss": 2.6021, + "step": 26094 + }, + { + "epoch": 0.7738042285680397, + "grad_norm": 0.08730494976043701, + "learning_rate": 0.00012340678182846748, + "loss": 2.616, + "step": 26095 + }, + { + "epoch": 0.7738338819203511, + "grad_norm": 0.09581287950277328, + "learning_rate": 0.00012337583303942222, + "loss": 2.5744, + "step": 26096 + }, + { + "epoch": 0.7738635352726626, + "grad_norm": 0.09037430584430695, + "learning_rate": 0.00012334488758544864, + "loss": 2.5797, + "step": 26097 + }, + { + "epoch": 0.773893188624974, + "grad_norm": 0.08809518814086914, + "learning_rate": 0.00012331394546682083, + "loss": 2.5974, + "step": 26098 + }, + { + "epoch": 0.7739228419772856, + "grad_norm": 0.08738604187965393, + "learning_rate": 0.0001232830066838126, + "loss": 2.6409, + "step": 26099 + }, + { + "epoch": 0.773952495329597, + "grad_norm": 0.09363061189651489, + "learning_rate": 0.00012325207123669796, + "loss": 2.615, + "step": 26100 + }, + { + "epoch": 0.7739821486819085, + "grad_norm": 0.08767169713973999, + "learning_rate": 0.0001232211391257509, + "loss": 2.5998, + "step": 26101 + }, + { + "epoch": 0.7740118020342199, + "grad_norm": 0.09250225871801376, + "learning_rate": 0.00012319021035124518, + "loss": 2.6031, + "step": 26102 + }, + { + "epoch": 0.7740414553865315, + "grad_norm": 0.09590315818786621, + "learning_rate": 0.00012315928491345496, + "loss": 2.5676, + "step": 26103 + }, + { + "epoch": 0.7740711087388429, + "grad_norm": 0.09776513278484344, + "learning_rate": 0.000123128362812654, + "loss": 2.6255, + "step": 26104 + }, + { + "epoch": 0.7741007620911544, + "grad_norm": 0.09218862652778625, + "learning_rate": 0.00012309744404911604, + "loss": 2.5944, + "step": 26105 + }, + { + "epoch": 0.7741304154434658, + "grad_norm": 0.0958472415804863, + "learning_rate": 0.00012306652862311492, + "loss": 2.6153, + "step": 26106 + }, + { + "epoch": 0.7741600687957774, + "grad_norm": 0.09058889746665955, + "learning_rate": 0.0001230356165349244, + "loss": 2.6039, + "step": 26107 + }, + { + "epoch": 0.7741897221480888, + "grad_norm": 0.09928029030561447, + "learning_rate": 0.0001230047077848182, + "loss": 2.6132, + "step": 26108 + }, + { + "epoch": 0.7742193755004003, + "grad_norm": 0.0965145081281662, + "learning_rate": 0.00012297380237307016, + "loss": 2.5641, + "step": 26109 + }, + { + "epoch": 0.7742490288527119, + "grad_norm": 0.10128946602344513, + "learning_rate": 0.00012294290029995364, + "loss": 2.5832, + "step": 26110 + }, + { + "epoch": 0.7742786822050233, + "grad_norm": 0.08755386620759964, + "learning_rate": 0.0001229120015657425, + "loss": 2.5788, + "step": 26111 + }, + { + "epoch": 0.7743083355573348, + "grad_norm": 0.10303396731615067, + "learning_rate": 0.0001228811061707103, + "loss": 2.5959, + "step": 26112 + }, + { + "epoch": 0.7743379889096462, + "grad_norm": 0.09021007269620895, + "learning_rate": 0.00012285021411513063, + "loss": 2.5778, + "step": 26113 + }, + { + "epoch": 0.7743676422619578, + "grad_norm": 0.09540503472089767, + "learning_rate": 0.00012281932539927703, + "loss": 2.582, + "step": 26114 + }, + { + "epoch": 0.7743972956142692, + "grad_norm": 0.09404154121875763, + "learning_rate": 0.00012278844002342292, + "loss": 2.604, + "step": 26115 + }, + { + "epoch": 0.7744269489665807, + "grad_norm": 0.08808837085962296, + "learning_rate": 0.0001227575579878421, + "loss": 2.586, + "step": 26116 + }, + { + "epoch": 0.7744566023188921, + "grad_norm": 0.09867213666439056, + "learning_rate": 0.00012272667929280785, + "loss": 2.5868, + "step": 26117 + }, + { + "epoch": 0.7744862556712037, + "grad_norm": 0.08901111036539078, + "learning_rate": 0.0001226958039385936, + "loss": 2.5869, + "step": 26118 + }, + { + "epoch": 0.7745159090235151, + "grad_norm": 0.09126869589090347, + "learning_rate": 0.0001226649319254729, + "loss": 2.5941, + "step": 26119 + }, + { + "epoch": 0.7745455623758266, + "grad_norm": 0.09289226680994034, + "learning_rate": 0.00012263406325371885, + "loss": 2.5983, + "step": 26120 + }, + { + "epoch": 0.774575215728138, + "grad_norm": 0.09461570531129837, + "learning_rate": 0.00012260319792360496, + "loss": 2.5911, + "step": 26121 + }, + { + "epoch": 0.7746048690804496, + "grad_norm": 0.08676443248987198, + "learning_rate": 0.0001225723359354045, + "loss": 2.5793, + "step": 26122 + }, + { + "epoch": 0.774634522432761, + "grad_norm": 0.0915558710694313, + "learning_rate": 0.0001225414772893908, + "loss": 2.6129, + "step": 26123 + }, + { + "epoch": 0.7746641757850725, + "grad_norm": 0.08777672052383423, + "learning_rate": 0.00012251062198583712, + "loss": 2.5861, + "step": 26124 + }, + { + "epoch": 0.7746938291373839, + "grad_norm": 0.0833565816283226, + "learning_rate": 0.0001224797700250167, + "loss": 2.5807, + "step": 26125 + }, + { + "epoch": 0.7747234824896955, + "grad_norm": 0.08905397355556488, + "learning_rate": 0.0001224489214072027, + "loss": 2.6165, + "step": 26126 + }, + { + "epoch": 0.7747531358420069, + "grad_norm": 0.09597869962453842, + "learning_rate": 0.00012241807613266832, + "loss": 2.6016, + "step": 26127 + }, + { + "epoch": 0.7747827891943184, + "grad_norm": 0.09218937903642654, + "learning_rate": 0.0001223872342016865, + "loss": 2.5744, + "step": 26128 + }, + { + "epoch": 0.7748124425466298, + "grad_norm": 0.09448476135730743, + "learning_rate": 0.00012235639561453072, + "loss": 2.598, + "step": 26129 + }, + { + "epoch": 0.7748420958989414, + "grad_norm": 0.0895644947886467, + "learning_rate": 0.00012232556037147403, + "loss": 2.5757, + "step": 26130 + }, + { + "epoch": 0.7748717492512529, + "grad_norm": 0.09795507043600082, + "learning_rate": 0.00012229472847278918, + "loss": 2.5699, + "step": 26131 + }, + { + "epoch": 0.7749014026035643, + "grad_norm": 0.0890330895781517, + "learning_rate": 0.0001222638999187493, + "loss": 2.5751, + "step": 26132 + }, + { + "epoch": 0.7749310559558759, + "grad_norm": 0.0978214368224144, + "learning_rate": 0.00012223307470962748, + "loss": 2.6026, + "step": 26133 + }, + { + "epoch": 0.7749607093081873, + "grad_norm": 0.09022043645381927, + "learning_rate": 0.00012220225284569658, + "loss": 2.5702, + "step": 26134 + }, + { + "epoch": 0.7749903626604988, + "grad_norm": 0.09692323952913284, + "learning_rate": 0.0001221714343272296, + "loss": 2.5713, + "step": 26135 + }, + { + "epoch": 0.7750200160128102, + "grad_norm": 0.09907464683055878, + "learning_rate": 0.00012214061915449942, + "loss": 2.6173, + "step": 26136 + }, + { + "epoch": 0.7750496693651218, + "grad_norm": 0.10150620341300964, + "learning_rate": 0.00012210980732777892, + "loss": 2.5798, + "step": 26137 + }, + { + "epoch": 0.7750793227174332, + "grad_norm": 0.0904623344540596, + "learning_rate": 0.0001220789988473409, + "loss": 2.5995, + "step": 26138 + }, + { + "epoch": 0.7751089760697447, + "grad_norm": 0.10241842269897461, + "learning_rate": 0.00012204819371345827, + "loss": 2.6145, + "step": 26139 + }, + { + "epoch": 0.7751386294220561, + "grad_norm": 0.08921714127063751, + "learning_rate": 0.00012201739192640376, + "loss": 2.6055, + "step": 26140 + }, + { + "epoch": 0.7751682827743677, + "grad_norm": 0.08907747268676758, + "learning_rate": 0.00012198659348645008, + "loss": 2.5308, + "step": 26141 + }, + { + "epoch": 0.7751979361266791, + "grad_norm": 0.09420355409383774, + "learning_rate": 0.00012195579839387005, + "loss": 2.5897, + "step": 26142 + }, + { + "epoch": 0.7752275894789906, + "grad_norm": 0.09204299002885818, + "learning_rate": 0.0001219250066489363, + "loss": 2.5882, + "step": 26143 + }, + { + "epoch": 0.775257242831302, + "grad_norm": 0.1052904799580574, + "learning_rate": 0.00012189421825192148, + "loss": 2.57, + "step": 26144 + }, + { + "epoch": 0.7752868961836136, + "grad_norm": 0.08875477313995361, + "learning_rate": 0.00012186343320309828, + "loss": 2.597, + "step": 26145 + }, + { + "epoch": 0.775316549535925, + "grad_norm": 0.09519136697053909, + "learning_rate": 0.00012183265150273931, + "loss": 2.5922, + "step": 26146 + }, + { + "epoch": 0.7753462028882365, + "grad_norm": 0.09453843533992767, + "learning_rate": 0.00012180187315111706, + "loss": 2.5848, + "step": 26147 + }, + { + "epoch": 0.775375856240548, + "grad_norm": 0.09576764702796936, + "learning_rate": 0.00012177109814850417, + "loss": 2.6317, + "step": 26148 + }, + { + "epoch": 0.7754055095928595, + "grad_norm": 0.09725040197372437, + "learning_rate": 0.00012174032649517319, + "loss": 2.5899, + "step": 26149 + }, + { + "epoch": 0.7754351629451709, + "grad_norm": 0.09389159083366394, + "learning_rate": 0.00012170955819139645, + "loss": 2.5995, + "step": 26150 + }, + { + "epoch": 0.7754648162974824, + "grad_norm": 0.09372539818286896, + "learning_rate": 0.00012167879323744658, + "loss": 2.557, + "step": 26151 + }, + { + "epoch": 0.775494469649794, + "grad_norm": 0.09718938171863556, + "learning_rate": 0.00012164803163359589, + "loss": 2.585, + "step": 26152 + }, + { + "epoch": 0.7755241230021054, + "grad_norm": 0.08991068601608276, + "learning_rate": 0.00012161727338011696, + "loss": 2.5705, + "step": 26153 + }, + { + "epoch": 0.7755537763544169, + "grad_norm": 0.09781043976545334, + "learning_rate": 0.00012158651847728175, + "loss": 2.5947, + "step": 26154 + }, + { + "epoch": 0.7755834297067283, + "grad_norm": 0.10970546305179596, + "learning_rate": 0.00012155576692536302, + "loss": 2.586, + "step": 26155 + }, + { + "epoch": 0.7756130830590399, + "grad_norm": 0.08704748749732971, + "learning_rate": 0.00012152501872463295, + "loss": 2.5765, + "step": 26156 + }, + { + "epoch": 0.7756427364113513, + "grad_norm": 0.10388471186161041, + "learning_rate": 0.00012149427387536377, + "loss": 2.5772, + "step": 26157 + }, + { + "epoch": 0.7756723897636628, + "grad_norm": 0.09519973397254944, + "learning_rate": 0.00012146353237782782, + "loss": 2.5799, + "step": 26158 + }, + { + "epoch": 0.7757020431159742, + "grad_norm": 0.10088322311639786, + "learning_rate": 0.0001214327942322972, + "loss": 2.6301, + "step": 26159 + }, + { + "epoch": 0.7757316964682858, + "grad_norm": 0.094905786216259, + "learning_rate": 0.0001214020594390442, + "loss": 2.6108, + "step": 26160 + }, + { + "epoch": 0.7757613498205972, + "grad_norm": 0.09409037977457047, + "learning_rate": 0.00012137132799834094, + "loss": 2.5674, + "step": 26161 + }, + { + "epoch": 0.7757910031729087, + "grad_norm": 0.10645609349012375, + "learning_rate": 0.00012134059991045959, + "loss": 2.6107, + "step": 26162 + }, + { + "epoch": 0.7758206565252201, + "grad_norm": 0.09577839076519012, + "learning_rate": 0.00012130987517567221, + "loss": 2.5775, + "step": 26163 + }, + { + "epoch": 0.7758503098775317, + "grad_norm": 0.09830314666032791, + "learning_rate": 0.00012127915379425098, + "loss": 2.5904, + "step": 26164 + }, + { + "epoch": 0.7758799632298431, + "grad_norm": 0.10072734206914902, + "learning_rate": 0.00012124843576646777, + "loss": 2.5778, + "step": 26165 + }, + { + "epoch": 0.7759096165821546, + "grad_norm": 0.09938324987888336, + "learning_rate": 0.00012121772109259465, + "loss": 2.5993, + "step": 26166 + }, + { + "epoch": 0.775939269934466, + "grad_norm": 0.10277006030082703, + "learning_rate": 0.00012118700977290348, + "loss": 2.5675, + "step": 26167 + }, + { + "epoch": 0.7759689232867776, + "grad_norm": 0.09725296497344971, + "learning_rate": 0.00012115630180766651, + "loss": 2.5657, + "step": 26168 + }, + { + "epoch": 0.775998576639089, + "grad_norm": 0.10850250720977783, + "learning_rate": 0.00012112559719715549, + "loss": 2.6151, + "step": 26169 + }, + { + "epoch": 0.7760282299914005, + "grad_norm": 0.09680074453353882, + "learning_rate": 0.00012109489594164236, + "loss": 2.5873, + "step": 26170 + }, + { + "epoch": 0.776057883343712, + "grad_norm": 0.10085788369178772, + "learning_rate": 0.00012106419804139896, + "loss": 2.5837, + "step": 26171 + }, + { + "epoch": 0.7760875366960235, + "grad_norm": 0.09741255640983582, + "learning_rate": 0.00012103350349669717, + "loss": 2.608, + "step": 26172 + }, + { + "epoch": 0.776117190048335, + "grad_norm": 0.09806954115629196, + "learning_rate": 0.00012100281230780868, + "loss": 2.5949, + "step": 26173 + }, + { + "epoch": 0.7761468434006464, + "grad_norm": 0.09190158545970917, + "learning_rate": 0.00012097212447500555, + "loss": 2.5586, + "step": 26174 + }, + { + "epoch": 0.776176496752958, + "grad_norm": 0.09771940857172012, + "learning_rate": 0.00012094143999855916, + "loss": 2.6022, + "step": 26175 + }, + { + "epoch": 0.7762061501052694, + "grad_norm": 0.101962611079216, + "learning_rate": 0.00012091075887874136, + "loss": 2.577, + "step": 26176 + }, + { + "epoch": 0.7762358034575809, + "grad_norm": 0.09734180569648743, + "learning_rate": 0.00012088008111582388, + "loss": 2.6052, + "step": 26177 + }, + { + "epoch": 0.7762654568098923, + "grad_norm": 0.09390915185213089, + "learning_rate": 0.00012084940671007833, + "loss": 2.5862, + "step": 26178 + }, + { + "epoch": 0.7762951101622039, + "grad_norm": 0.09129972755908966, + "learning_rate": 0.00012081873566177642, + "loss": 2.617, + "step": 26179 + }, + { + "epoch": 0.7763247635145153, + "grad_norm": 0.0929982140660286, + "learning_rate": 0.00012078806797118946, + "loss": 2.6035, + "step": 26180 + }, + { + "epoch": 0.7763544168668268, + "grad_norm": 0.09105979651212692, + "learning_rate": 0.00012075740363858945, + "loss": 2.5719, + "step": 26181 + }, + { + "epoch": 0.7763840702191382, + "grad_norm": 0.09087484329938889, + "learning_rate": 0.00012072674266424776, + "loss": 2.5854, + "step": 26182 + }, + { + "epoch": 0.7764137235714498, + "grad_norm": 0.09470297396183014, + "learning_rate": 0.00012069608504843582, + "loss": 2.6016, + "step": 26183 + }, + { + "epoch": 0.7764433769237612, + "grad_norm": 0.0879204049706459, + "learning_rate": 0.00012066543079142517, + "loss": 2.6016, + "step": 26184 + }, + { + "epoch": 0.7764730302760727, + "grad_norm": 0.09639536589384079, + "learning_rate": 0.00012063477989348737, + "loss": 2.5516, + "step": 26185 + }, + { + "epoch": 0.7765026836283841, + "grad_norm": 0.09380588680505753, + "learning_rate": 0.00012060413235489354, + "loss": 2.5911, + "step": 26186 + }, + { + "epoch": 0.7765323369806957, + "grad_norm": 0.08555945754051208, + "learning_rate": 0.00012057348817591529, + "loss": 2.5848, + "step": 26187 + }, + { + "epoch": 0.7765619903330071, + "grad_norm": 0.09481621533632278, + "learning_rate": 0.00012054284735682392, + "loss": 2.5701, + "step": 26188 + }, + { + "epoch": 0.7765916436853186, + "grad_norm": 0.09904637932777405, + "learning_rate": 0.00012051220989789075, + "loss": 2.5799, + "step": 26189 + }, + { + "epoch": 0.77662129703763, + "grad_norm": 0.08839306980371475, + "learning_rate": 0.00012048157579938712, + "loss": 2.5995, + "step": 26190 + }, + { + "epoch": 0.7766509503899416, + "grad_norm": 0.09968768805265427, + "learning_rate": 0.00012045094506158427, + "loss": 2.585, + "step": 26191 + }, + { + "epoch": 0.776680603742253, + "grad_norm": 0.1004595011472702, + "learning_rate": 0.00012042031768475348, + "loss": 2.5801, + "step": 26192 + }, + { + "epoch": 0.7767102570945645, + "grad_norm": 0.09323214739561081, + "learning_rate": 0.00012038969366916574, + "loss": 2.5864, + "step": 26193 + }, + { + "epoch": 0.7767399104468761, + "grad_norm": 0.09523798525333405, + "learning_rate": 0.00012035907301509258, + "loss": 2.5595, + "step": 26194 + }, + { + "epoch": 0.7767695637991875, + "grad_norm": 0.0938558503985405, + "learning_rate": 0.00012032845572280516, + "loss": 2.6208, + "step": 26195 + }, + { + "epoch": 0.776799217151499, + "grad_norm": 0.0974886566400528, + "learning_rate": 0.00012029784179257425, + "loss": 2.6036, + "step": 26196 + }, + { + "epoch": 0.7768288705038104, + "grad_norm": 0.09495237469673157, + "learning_rate": 0.00012026723122467114, + "loss": 2.6105, + "step": 26197 + }, + { + "epoch": 0.776858523856122, + "grad_norm": 0.09355917572975159, + "learning_rate": 0.00012023662401936692, + "loss": 2.6199, + "step": 26198 + }, + { + "epoch": 0.7768881772084334, + "grad_norm": 0.0973830595612526, + "learning_rate": 0.00012020602017693249, + "loss": 2.5665, + "step": 26199 + }, + { + "epoch": 0.7769178305607449, + "grad_norm": 0.08954387158155441, + "learning_rate": 0.00012017541969763901, + "loss": 2.6382, + "step": 26200 + }, + { + "epoch": 0.7769474839130563, + "grad_norm": 0.10115702450275421, + "learning_rate": 0.00012014482258175741, + "loss": 2.5544, + "step": 26201 + }, + { + "epoch": 0.7769771372653679, + "grad_norm": 0.09322478622198105, + "learning_rate": 0.00012011422882955853, + "loss": 2.5647, + "step": 26202 + }, + { + "epoch": 0.7770067906176793, + "grad_norm": 0.10005217790603638, + "learning_rate": 0.00012008363844131343, + "loss": 2.5895, + "step": 26203 + }, + { + "epoch": 0.7770364439699908, + "grad_norm": 0.09211506694555283, + "learning_rate": 0.00012005305141729289, + "loss": 2.5907, + "step": 26204 + }, + { + "epoch": 0.7770660973223023, + "grad_norm": 0.09258479624986649, + "learning_rate": 0.00012002246775776782, + "loss": 2.6009, + "step": 26205 + }, + { + "epoch": 0.7770957506746138, + "grad_norm": 0.09613201022148132, + "learning_rate": 0.00011999188746300899, + "loss": 2.6066, + "step": 26206 + }, + { + "epoch": 0.7771254040269252, + "grad_norm": 0.09907753765583038, + "learning_rate": 0.00011996131053328723, + "loss": 2.5911, + "step": 26207 + }, + { + "epoch": 0.7771550573792367, + "grad_norm": 0.08409300446510315, + "learning_rate": 0.0001199307369688733, + "loss": 2.5827, + "step": 26208 + }, + { + "epoch": 0.7771847107315482, + "grad_norm": 0.09214789420366287, + "learning_rate": 0.0001199001667700379, + "loss": 2.5997, + "step": 26209 + }, + { + "epoch": 0.7772143640838597, + "grad_norm": 0.09158029407262802, + "learning_rate": 0.00011986959993705182, + "loss": 2.569, + "step": 26210 + }, + { + "epoch": 0.7772440174361711, + "grad_norm": 0.08848516643047333, + "learning_rate": 0.00011983903647018568, + "loss": 2.6171, + "step": 26211 + }, + { + "epoch": 0.7772736707884826, + "grad_norm": 0.09194131195545197, + "learning_rate": 0.00011980847636971009, + "loss": 2.5905, + "step": 26212 + }, + { + "epoch": 0.7773033241407942, + "grad_norm": 0.08940250426530838, + "learning_rate": 0.00011977791963589574, + "loss": 2.5943, + "step": 26213 + }, + { + "epoch": 0.7773329774931056, + "grad_norm": 0.09076467901468277, + "learning_rate": 0.00011974736626901312, + "loss": 2.54, + "step": 26214 + }, + { + "epoch": 0.7773626308454171, + "grad_norm": 0.09587797522544861, + "learning_rate": 0.00011971681626933289, + "loss": 2.6006, + "step": 26215 + }, + { + "epoch": 0.7773922841977285, + "grad_norm": 0.0844583734869957, + "learning_rate": 0.00011968626963712554, + "loss": 2.5692, + "step": 26216 + }, + { + "epoch": 0.7774219375500401, + "grad_norm": 0.10469172149896622, + "learning_rate": 0.00011965572637266147, + "loss": 2.5988, + "step": 26217 + }, + { + "epoch": 0.7774515909023515, + "grad_norm": 0.08230181038379669, + "learning_rate": 0.00011962518647621135, + "loss": 2.5713, + "step": 26218 + }, + { + "epoch": 0.777481244254663, + "grad_norm": 0.09896787256002426, + "learning_rate": 0.00011959464994804542, + "loss": 2.6168, + "step": 26219 + }, + { + "epoch": 0.7775108976069744, + "grad_norm": 0.0912972018122673, + "learning_rate": 0.00011956411678843416, + "loss": 2.6157, + "step": 26220 + }, + { + "epoch": 0.777540550959286, + "grad_norm": 0.09050891548395157, + "learning_rate": 0.00011953358699764794, + "loss": 2.5631, + "step": 26221 + }, + { + "epoch": 0.7775702043115974, + "grad_norm": 0.10399783402681351, + "learning_rate": 0.00011950306057595712, + "loss": 2.5527, + "step": 26222 + }, + { + "epoch": 0.7775998576639089, + "grad_norm": 0.08762936294078827, + "learning_rate": 0.00011947253752363202, + "loss": 2.6008, + "step": 26223 + }, + { + "epoch": 0.7776295110162204, + "grad_norm": 0.09591015428304672, + "learning_rate": 0.00011944201784094294, + "loss": 2.5815, + "step": 26224 + }, + { + "epoch": 0.7776591643685319, + "grad_norm": 0.11331041902303696, + "learning_rate": 0.00011941150152816005, + "loss": 2.6542, + "step": 26225 + }, + { + "epoch": 0.7776888177208433, + "grad_norm": 0.09832878410816193, + "learning_rate": 0.00011938098858555368, + "loss": 2.5697, + "step": 26226 + }, + { + "epoch": 0.7777184710731548, + "grad_norm": 0.09343153238296509, + "learning_rate": 0.00011935047901339401, + "loss": 2.5975, + "step": 26227 + }, + { + "epoch": 0.7777481244254663, + "grad_norm": 0.09025020152330399, + "learning_rate": 0.00011931997281195117, + "loss": 2.6156, + "step": 26228 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.09352005273103714, + "learning_rate": 0.00011928946998149543, + "loss": 2.5967, + "step": 26229 + }, + { + "epoch": 0.7778074311300892, + "grad_norm": 0.09348230063915253, + "learning_rate": 0.00011925897052229667, + "loss": 2.5568, + "step": 26230 + }, + { + "epoch": 0.7778370844824007, + "grad_norm": 0.08681105077266693, + "learning_rate": 0.0001192284744346251, + "loss": 2.5766, + "step": 26231 + }, + { + "epoch": 0.7778667378347122, + "grad_norm": 0.0889197513461113, + "learning_rate": 0.00011919798171875057, + "loss": 2.5769, + "step": 26232 + }, + { + "epoch": 0.7778963911870237, + "grad_norm": 0.09232145547866821, + "learning_rate": 0.0001191674923749434, + "loss": 2.566, + "step": 26233 + }, + { + "epoch": 0.7779260445393352, + "grad_norm": 0.0954190269112587, + "learning_rate": 0.00011913700640347346, + "loss": 2.5925, + "step": 26234 + }, + { + "epoch": 0.7779556978916466, + "grad_norm": 0.08355719596147537, + "learning_rate": 0.00011910652380461074, + "loss": 2.6058, + "step": 26235 + }, + { + "epoch": 0.7779853512439582, + "grad_norm": 0.10228892415761948, + "learning_rate": 0.00011907604457862509, + "loss": 2.5748, + "step": 26236 + }, + { + "epoch": 0.7780150045962696, + "grad_norm": 0.09008879959583282, + "learning_rate": 0.00011904556872578649, + "loss": 2.5992, + "step": 26237 + }, + { + "epoch": 0.7780446579485811, + "grad_norm": 0.10171995311975479, + "learning_rate": 0.00011901509624636476, + "loss": 2.5991, + "step": 26238 + }, + { + "epoch": 0.7780743113008926, + "grad_norm": 0.09293865412473679, + "learning_rate": 0.00011898462714062974, + "loss": 2.5786, + "step": 26239 + }, + { + "epoch": 0.7781039646532041, + "grad_norm": 0.10737624764442444, + "learning_rate": 0.00011895416140885135, + "loss": 2.5652, + "step": 26240 + }, + { + "epoch": 0.7781336180055155, + "grad_norm": 0.09332606196403503, + "learning_rate": 0.00011892369905129919, + "loss": 2.5986, + "step": 26241 + }, + { + "epoch": 0.778163271357827, + "grad_norm": 0.11288096010684967, + "learning_rate": 0.00011889324006824304, + "loss": 2.589, + "step": 26242 + }, + { + "epoch": 0.7781929247101385, + "grad_norm": 0.09297959506511688, + "learning_rate": 0.00011886278445995269, + "loss": 2.5712, + "step": 26243 + }, + { + "epoch": 0.77822257806245, + "grad_norm": 0.1018306314945221, + "learning_rate": 0.00011883233222669782, + "loss": 2.5834, + "step": 26244 + }, + { + "epoch": 0.7782522314147614, + "grad_norm": 0.10038542002439499, + "learning_rate": 0.00011880188336874786, + "loss": 2.5696, + "step": 26245 + }, + { + "epoch": 0.7782818847670729, + "grad_norm": 0.10653217881917953, + "learning_rate": 0.00011877143788637285, + "loss": 2.5912, + "step": 26246 + }, + { + "epoch": 0.7783115381193844, + "grad_norm": 0.1087726578116417, + "learning_rate": 0.00011874099577984221, + "loss": 2.5872, + "step": 26247 + }, + { + "epoch": 0.7783411914716959, + "grad_norm": 0.11225034296512604, + "learning_rate": 0.00011871055704942546, + "loss": 2.6264, + "step": 26248 + }, + { + "epoch": 0.7783708448240073, + "grad_norm": 0.10001996159553528, + "learning_rate": 0.00011868012169539222, + "loss": 2.5911, + "step": 26249 + }, + { + "epoch": 0.7784004981763188, + "grad_norm": 0.10799010843038559, + "learning_rate": 0.00011864968971801204, + "loss": 2.5983, + "step": 26250 + }, + { + "epoch": 0.7784301515286303, + "grad_norm": 0.09347078204154968, + "learning_rate": 0.00011861926111755423, + "loss": 2.5717, + "step": 26251 + }, + { + "epoch": 0.7784598048809418, + "grad_norm": 0.10457743704319, + "learning_rate": 0.00011858883589428826, + "loss": 2.5707, + "step": 26252 + }, + { + "epoch": 0.7784894582332532, + "grad_norm": 0.10587893426418304, + "learning_rate": 0.00011855841404848366, + "loss": 2.6026, + "step": 26253 + }, + { + "epoch": 0.7785191115855647, + "grad_norm": 0.10483458638191223, + "learning_rate": 0.00011852799558040978, + "loss": 2.5729, + "step": 26254 + }, + { + "epoch": 0.7785487649378763, + "grad_norm": 0.09343799948692322, + "learning_rate": 0.00011849758049033599, + "loss": 2.5917, + "step": 26255 + }, + { + "epoch": 0.7785784182901877, + "grad_norm": 0.09656060487031937, + "learning_rate": 0.00011846716877853159, + "loss": 2.5812, + "step": 26256 + }, + { + "epoch": 0.7786080716424992, + "grad_norm": 0.10022132098674774, + "learning_rate": 0.00011843676044526575, + "loss": 2.6004, + "step": 26257 + }, + { + "epoch": 0.7786377249948107, + "grad_norm": 0.10087663680315018, + "learning_rate": 0.00011840635549080803, + "loss": 2.5785, + "step": 26258 + }, + { + "epoch": 0.7786673783471222, + "grad_norm": 0.10190358012914658, + "learning_rate": 0.00011837595391542754, + "loss": 2.6023, + "step": 26259 + }, + { + "epoch": 0.7786970316994336, + "grad_norm": 0.09963212162256241, + "learning_rate": 0.00011834555571939348, + "loss": 2.5647, + "step": 26260 + }, + { + "epoch": 0.7787266850517451, + "grad_norm": 0.1143403947353363, + "learning_rate": 0.00011831516090297517, + "loss": 2.5902, + "step": 26261 + }, + { + "epoch": 0.7787563384040566, + "grad_norm": 0.10307958722114563, + "learning_rate": 0.00011828476946644151, + "loss": 2.6084, + "step": 26262 + }, + { + "epoch": 0.7787859917563681, + "grad_norm": 0.09758532047271729, + "learning_rate": 0.00011825438141006173, + "loss": 2.5731, + "step": 26263 + }, + { + "epoch": 0.7788156451086795, + "grad_norm": 0.10105274617671967, + "learning_rate": 0.00011822399673410489, + "loss": 2.5876, + "step": 26264 + }, + { + "epoch": 0.778845298460991, + "grad_norm": 0.09006865322589874, + "learning_rate": 0.0001181936154388401, + "loss": 2.5841, + "step": 26265 + }, + { + "epoch": 0.7788749518133025, + "grad_norm": 0.11666784435510635, + "learning_rate": 0.00011816323752453639, + "loss": 2.6174, + "step": 26266 + }, + { + "epoch": 0.778904605165614, + "grad_norm": 0.09618151187896729, + "learning_rate": 0.00011813286299146281, + "loss": 2.6025, + "step": 26267 + }, + { + "epoch": 0.7789342585179254, + "grad_norm": 0.10022537410259247, + "learning_rate": 0.0001181024918398882, + "loss": 2.585, + "step": 26268 + }, + { + "epoch": 0.778963911870237, + "grad_norm": 0.09744511544704437, + "learning_rate": 0.0001180721240700816, + "loss": 2.5908, + "step": 26269 + }, + { + "epoch": 0.7789935652225484, + "grad_norm": 0.09597273916006088, + "learning_rate": 0.00011804175968231178, + "loss": 2.5651, + "step": 26270 + }, + { + "epoch": 0.7790232185748599, + "grad_norm": 0.10526420176029205, + "learning_rate": 0.00011801139867684801, + "loss": 2.5994, + "step": 26271 + }, + { + "epoch": 0.7790528719271713, + "grad_norm": 0.0932769849896431, + "learning_rate": 0.00011798104105395869, + "loss": 2.5846, + "step": 26272 + }, + { + "epoch": 0.7790825252794829, + "grad_norm": 0.10159170627593994, + "learning_rate": 0.00011795068681391285, + "loss": 2.6077, + "step": 26273 + }, + { + "epoch": 0.7791121786317943, + "grad_norm": 0.09784478694200516, + "learning_rate": 0.00011792033595697926, + "loss": 2.5784, + "step": 26274 + }, + { + "epoch": 0.7791418319841058, + "grad_norm": 0.09288401156663895, + "learning_rate": 0.00011788998848342669, + "loss": 2.6326, + "step": 26275 + }, + { + "epoch": 0.7791714853364173, + "grad_norm": 0.0951070487499237, + "learning_rate": 0.00011785964439352386, + "loss": 2.5816, + "step": 26276 + }, + { + "epoch": 0.7792011386887288, + "grad_norm": 0.08684800565242767, + "learning_rate": 0.00011782930368753947, + "loss": 2.5825, + "step": 26277 + }, + { + "epoch": 0.7792307920410403, + "grad_norm": 0.10011975467205048, + "learning_rate": 0.00011779896636574223, + "loss": 2.5871, + "step": 26278 + }, + { + "epoch": 0.7792604453933517, + "grad_norm": 0.10343282669782639, + "learning_rate": 0.00011776863242840069, + "loss": 2.6011, + "step": 26279 + }, + { + "epoch": 0.7792900987456632, + "grad_norm": 0.09996643662452698, + "learning_rate": 0.00011773830187578354, + "loss": 2.5634, + "step": 26280 + }, + { + "epoch": 0.7793197520979747, + "grad_norm": 0.10168340802192688, + "learning_rate": 0.00011770797470815931, + "loss": 2.5804, + "step": 26281 + }, + { + "epoch": 0.7793494054502862, + "grad_norm": 0.10199567675590515, + "learning_rate": 0.00011767765092579663, + "loss": 2.5937, + "step": 26282 + }, + { + "epoch": 0.7793790588025976, + "grad_norm": 0.10583827644586563, + "learning_rate": 0.00011764733052896392, + "loss": 2.5697, + "step": 26283 + }, + { + "epoch": 0.7794087121549091, + "grad_norm": 0.09493182599544525, + "learning_rate": 0.0001176170135179298, + "loss": 2.6079, + "step": 26284 + }, + { + "epoch": 0.7794383655072206, + "grad_norm": 0.10726877301931381, + "learning_rate": 0.00011758669989296262, + "loss": 2.5998, + "step": 26285 + }, + { + "epoch": 0.7794680188595321, + "grad_norm": 0.09049159288406372, + "learning_rate": 0.00011755638965433085, + "loss": 2.5589, + "step": 26286 + }, + { + "epoch": 0.7794976722118435, + "grad_norm": 0.10482293367385864, + "learning_rate": 0.00011752608280230292, + "loss": 2.569, + "step": 26287 + }, + { + "epoch": 0.779527325564155, + "grad_norm": 0.10264187306165695, + "learning_rate": 0.00011749577933714717, + "loss": 2.6013, + "step": 26288 + }, + { + "epoch": 0.7795569789164665, + "grad_norm": 0.09478239715099335, + "learning_rate": 0.00011746547925913192, + "loss": 2.6021, + "step": 26289 + }, + { + "epoch": 0.779586632268778, + "grad_norm": 0.10706286877393723, + "learning_rate": 0.00011743518256852554, + "loss": 2.5861, + "step": 26290 + }, + { + "epoch": 0.7796162856210894, + "grad_norm": 0.09459244459867477, + "learning_rate": 0.00011740488926559628, + "loss": 2.6222, + "step": 26291 + }, + { + "epoch": 0.779645938973401, + "grad_norm": 0.10464169830083847, + "learning_rate": 0.0001173745993506124, + "loss": 2.5539, + "step": 26292 + }, + { + "epoch": 0.7796755923257124, + "grad_norm": 0.10022375732660294, + "learning_rate": 0.00011734431282384211, + "loss": 2.6196, + "step": 26293 + }, + { + "epoch": 0.7797052456780239, + "grad_norm": 0.09752263128757477, + "learning_rate": 0.0001173140296855536, + "loss": 2.5994, + "step": 26294 + }, + { + "epoch": 0.7797348990303353, + "grad_norm": 0.09815005958080292, + "learning_rate": 0.0001172837499360152, + "loss": 2.5746, + "step": 26295 + }, + { + "epoch": 0.7797645523826469, + "grad_norm": 0.10488712042570114, + "learning_rate": 0.0001172534735754946, + "loss": 2.564, + "step": 26296 + }, + { + "epoch": 0.7797942057349584, + "grad_norm": 0.10129183530807495, + "learning_rate": 0.00011722320060426033, + "loss": 2.6073, + "step": 26297 + }, + { + "epoch": 0.7798238590872698, + "grad_norm": 0.09782321006059647, + "learning_rate": 0.00011719293102258033, + "loss": 2.5218, + "step": 26298 + }, + { + "epoch": 0.7798535124395813, + "grad_norm": 0.10505972057580948, + "learning_rate": 0.00011716266483072263, + "loss": 2.556, + "step": 26299 + }, + { + "epoch": 0.7798831657918928, + "grad_norm": 0.09787788987159729, + "learning_rate": 0.00011713240202895525, + "loss": 2.5572, + "step": 26300 + }, + { + "epoch": 0.7799128191442043, + "grad_norm": 0.0987854078412056, + "learning_rate": 0.00011710214261754621, + "loss": 2.5886, + "step": 26301 + }, + { + "epoch": 0.7799424724965157, + "grad_norm": 0.09190794825553894, + "learning_rate": 0.00011707188659676338, + "loss": 2.5694, + "step": 26302 + }, + { + "epoch": 0.7799721258488272, + "grad_norm": 0.09761999547481537, + "learning_rate": 0.00011704163396687468, + "loss": 2.6261, + "step": 26303 + }, + { + "epoch": 0.7800017792011387, + "grad_norm": 0.09143470972776413, + "learning_rate": 0.0001170113847281481, + "loss": 2.5919, + "step": 26304 + }, + { + "epoch": 0.7800314325534502, + "grad_norm": 0.09157378226518631, + "learning_rate": 0.00011698113888085144, + "loss": 2.5907, + "step": 26305 + }, + { + "epoch": 0.7800610859057616, + "grad_norm": 0.10313642770051956, + "learning_rate": 0.0001169508964252527, + "loss": 2.5729, + "step": 26306 + }, + { + "epoch": 0.7800907392580732, + "grad_norm": 0.08479069918394089, + "learning_rate": 0.00011692065736161933, + "loss": 2.6256, + "step": 26307 + }, + { + "epoch": 0.7801203926103846, + "grad_norm": 0.09837338328361511, + "learning_rate": 0.0001168904216902193, + "loss": 2.6235, + "step": 26308 + }, + { + "epoch": 0.7801500459626961, + "grad_norm": 0.0882907286286354, + "learning_rate": 0.00011686018941132026, + "loss": 2.5644, + "step": 26309 + }, + { + "epoch": 0.7801796993150075, + "grad_norm": 0.08943235129117966, + "learning_rate": 0.00011682996052519007, + "loss": 2.6205, + "step": 26310 + }, + { + "epoch": 0.7802093526673191, + "grad_norm": 0.08982300758361816, + "learning_rate": 0.0001167997350320964, + "loss": 2.6183, + "step": 26311 + }, + { + "epoch": 0.7802390060196305, + "grad_norm": 0.09566687047481537, + "learning_rate": 0.00011676951293230686, + "loss": 2.6016, + "step": 26312 + }, + { + "epoch": 0.780268659371942, + "grad_norm": 0.09351611137390137, + "learning_rate": 0.00011673929422608903, + "loss": 2.6025, + "step": 26313 + }, + { + "epoch": 0.7802983127242534, + "grad_norm": 0.10204397141933441, + "learning_rate": 0.00011670907891371058, + "loss": 2.6119, + "step": 26314 + }, + { + "epoch": 0.780327966076565, + "grad_norm": 0.0839349776506424, + "learning_rate": 0.00011667886699543901, + "loss": 2.5885, + "step": 26315 + }, + { + "epoch": 0.7803576194288764, + "grad_norm": 0.1055663526058197, + "learning_rate": 0.000116648658471542, + "loss": 2.5703, + "step": 26316 + }, + { + "epoch": 0.7803872727811879, + "grad_norm": 0.09651344269514084, + "learning_rate": 0.00011661845334228677, + "loss": 2.5962, + "step": 26317 + }, + { + "epoch": 0.7804169261334994, + "grad_norm": 0.09923838824033737, + "learning_rate": 0.00011658825160794096, + "loss": 2.5785, + "step": 26318 + }, + { + "epoch": 0.7804465794858109, + "grad_norm": 0.09940314292907715, + "learning_rate": 0.00011655805326877195, + "loss": 2.591, + "step": 26319 + }, + { + "epoch": 0.7804762328381224, + "grad_norm": 0.09679178893566132, + "learning_rate": 0.00011652785832504726, + "loss": 2.5592, + "step": 26320 + }, + { + "epoch": 0.7805058861904338, + "grad_norm": 0.09391272813081741, + "learning_rate": 0.00011649766677703416, + "loss": 2.6074, + "step": 26321 + }, + { + "epoch": 0.7805355395427453, + "grad_norm": 0.10799206048250198, + "learning_rate": 0.0001164674786249999, + "loss": 2.5902, + "step": 26322 + }, + { + "epoch": 0.7805651928950568, + "grad_norm": 0.09774719923734665, + "learning_rate": 0.00011643729386921215, + "loss": 2.6133, + "step": 26323 + }, + { + "epoch": 0.7805948462473683, + "grad_norm": 0.10599225014448166, + "learning_rate": 0.00011640711250993796, + "loss": 2.6162, + "step": 26324 + }, + { + "epoch": 0.7806244995996797, + "grad_norm": 0.08666869252920151, + "learning_rate": 0.00011637693454744464, + "loss": 2.6173, + "step": 26325 + }, + { + "epoch": 0.7806541529519913, + "grad_norm": 0.09727732092142105, + "learning_rate": 0.0001163467599819995, + "loss": 2.5617, + "step": 26326 + }, + { + "epoch": 0.7806838063043027, + "grad_norm": 0.09543491899967194, + "learning_rate": 0.00011631658881386959, + "loss": 2.6213, + "step": 26327 + }, + { + "epoch": 0.7807134596566142, + "grad_norm": 0.09878960251808167, + "learning_rate": 0.0001162864210433221, + "loss": 2.5931, + "step": 26328 + }, + { + "epoch": 0.7807431130089256, + "grad_norm": 0.09269534051418304, + "learning_rate": 0.00011625625667062423, + "loss": 2.5731, + "step": 26329 + }, + { + "epoch": 0.7807727663612372, + "grad_norm": 0.10194265097379684, + "learning_rate": 0.00011622609569604308, + "loss": 2.5727, + "step": 26330 + }, + { + "epoch": 0.7808024197135486, + "grad_norm": 0.1040312647819519, + "learning_rate": 0.00011619593811984574, + "loss": 2.5963, + "step": 26331 + }, + { + "epoch": 0.7808320730658601, + "grad_norm": 0.10599590837955475, + "learning_rate": 0.0001161657839422992, + "loss": 2.5864, + "step": 26332 + }, + { + "epoch": 0.7808617264181715, + "grad_norm": 0.09542202204465866, + "learning_rate": 0.0001161356331636706, + "loss": 2.5883, + "step": 26333 + }, + { + "epoch": 0.7808913797704831, + "grad_norm": 0.09544005990028381, + "learning_rate": 0.00011610548578422681, + "loss": 2.5766, + "step": 26334 + }, + { + "epoch": 0.7809210331227945, + "grad_norm": 0.09129434078931808, + "learning_rate": 0.00011607534180423469, + "loss": 2.5889, + "step": 26335 + }, + { + "epoch": 0.780950686475106, + "grad_norm": 0.08786637336015701, + "learning_rate": 0.0001160452012239615, + "loss": 2.5899, + "step": 26336 + }, + { + "epoch": 0.7809803398274174, + "grad_norm": 0.0879465639591217, + "learning_rate": 0.00011601506404367407, + "loss": 2.5643, + "step": 26337 + }, + { + "epoch": 0.781009993179729, + "grad_norm": 0.09878901392221451, + "learning_rate": 0.00011598493026363898, + "loss": 2.6092, + "step": 26338 + }, + { + "epoch": 0.7810396465320405, + "grad_norm": 0.08365967869758606, + "learning_rate": 0.00011595479988412333, + "loss": 2.5721, + "step": 26339 + }, + { + "epoch": 0.7810692998843519, + "grad_norm": 0.09613168984651566, + "learning_rate": 0.00011592467290539377, + "loss": 2.5848, + "step": 26340 + }, + { + "epoch": 0.7810989532366635, + "grad_norm": 0.08822344988584518, + "learning_rate": 0.0001158945493277172, + "loss": 2.5839, + "step": 26341 + }, + { + "epoch": 0.7811286065889749, + "grad_norm": 0.09292473644018173, + "learning_rate": 0.00011586442915136031, + "loss": 2.5883, + "step": 26342 + }, + { + "epoch": 0.7811582599412864, + "grad_norm": 0.08705811947584152, + "learning_rate": 0.00011583431237658986, + "loss": 2.5641, + "step": 26343 + }, + { + "epoch": 0.7811879132935978, + "grad_norm": 0.10094460844993591, + "learning_rate": 0.0001158041990036725, + "loss": 2.605, + "step": 26344 + }, + { + "epoch": 0.7812175666459094, + "grad_norm": 0.10139389336109161, + "learning_rate": 0.0001157740890328749, + "loss": 2.5926, + "step": 26345 + }, + { + "epoch": 0.7812472199982208, + "grad_norm": 0.1029827743768692, + "learning_rate": 0.00011574398246446371, + "loss": 2.5976, + "step": 26346 + }, + { + "epoch": 0.7812768733505323, + "grad_norm": 0.09612222015857697, + "learning_rate": 0.00011571387929870547, + "loss": 2.6005, + "step": 26347 + }, + { + "epoch": 0.7813065267028437, + "grad_norm": 0.1020478829741478, + "learning_rate": 0.00011568377953586684, + "loss": 2.5888, + "step": 26348 + }, + { + "epoch": 0.7813361800551553, + "grad_norm": 0.10072310268878937, + "learning_rate": 0.00011565368317621428, + "loss": 2.6121, + "step": 26349 + }, + { + "epoch": 0.7813658334074667, + "grad_norm": 0.09236480295658112, + "learning_rate": 0.00011562359022001434, + "loss": 2.5644, + "step": 26350 + }, + { + "epoch": 0.7813954867597782, + "grad_norm": 0.1032228097319603, + "learning_rate": 0.00011559350066753349, + "loss": 2.6262, + "step": 26351 + }, + { + "epoch": 0.7814251401120896, + "grad_norm": 0.09491331875324249, + "learning_rate": 0.00011556341451903819, + "loss": 2.6056, + "step": 26352 + }, + { + "epoch": 0.7814547934644012, + "grad_norm": 0.10917258262634277, + "learning_rate": 0.0001155333317747948, + "loss": 2.542, + "step": 26353 + }, + { + "epoch": 0.7814844468167126, + "grad_norm": 0.10228203982114792, + "learning_rate": 0.00011550325243506976, + "loss": 2.5749, + "step": 26354 + }, + { + "epoch": 0.7815141001690241, + "grad_norm": 0.10064367949962616, + "learning_rate": 0.00011547317650012945, + "loss": 2.5843, + "step": 26355 + }, + { + "epoch": 0.7815437535213355, + "grad_norm": 0.09116610884666443, + "learning_rate": 0.0001154431039702401, + "loss": 2.5539, + "step": 26356 + }, + { + "epoch": 0.7815734068736471, + "grad_norm": 0.08340926468372345, + "learning_rate": 0.00011541303484566811, + "loss": 2.589, + "step": 26357 + }, + { + "epoch": 0.7816030602259585, + "grad_norm": 0.08879993855953217, + "learning_rate": 0.00011538296912667973, + "loss": 2.6012, + "step": 26358 + }, + { + "epoch": 0.78163271357827, + "grad_norm": 0.09209601581096649, + "learning_rate": 0.00011535290681354116, + "loss": 2.564, + "step": 26359 + }, + { + "epoch": 0.7816623669305816, + "grad_norm": 0.10059552639722824, + "learning_rate": 0.00011532284790651864, + "loss": 2.5992, + "step": 26360 + }, + { + "epoch": 0.781692020282893, + "grad_norm": 0.08803568035364151, + "learning_rate": 0.0001152927924058783, + "loss": 2.5909, + "step": 26361 + }, + { + "epoch": 0.7817216736352045, + "grad_norm": 0.09974164515733719, + "learning_rate": 0.00011526274031188633, + "loss": 2.6184, + "step": 26362 + }, + { + "epoch": 0.7817513269875159, + "grad_norm": 0.08806726336479187, + "learning_rate": 0.00011523269162480887, + "loss": 2.5815, + "step": 26363 + }, + { + "epoch": 0.7817809803398275, + "grad_norm": 0.09492076933383942, + "learning_rate": 0.0001152026463449119, + "loss": 2.5783, + "step": 26364 + }, + { + "epoch": 0.7818106336921389, + "grad_norm": 0.09484458714723587, + "learning_rate": 0.00011517260447246159, + "loss": 2.6011, + "step": 26365 + }, + { + "epoch": 0.7818402870444504, + "grad_norm": 0.095828577876091, + "learning_rate": 0.00011514256600772394, + "loss": 2.5825, + "step": 26366 + }, + { + "epoch": 0.7818699403967618, + "grad_norm": 0.09161258488893509, + "learning_rate": 0.00011511253095096491, + "loss": 2.583, + "step": 26367 + }, + { + "epoch": 0.7818995937490734, + "grad_norm": 0.10411212593317032, + "learning_rate": 0.00011508249930245047, + "loss": 2.5725, + "step": 26368 + }, + { + "epoch": 0.7819292471013848, + "grad_norm": 0.09117703884840012, + "learning_rate": 0.00011505247106244655, + "loss": 2.5712, + "step": 26369 + }, + { + "epoch": 0.7819589004536963, + "grad_norm": 0.09793514013290405, + "learning_rate": 0.0001150224462312191, + "loss": 2.5849, + "step": 26370 + }, + { + "epoch": 0.7819885538060077, + "grad_norm": 0.09504999220371246, + "learning_rate": 0.0001149924248090341, + "loss": 2.5981, + "step": 26371 + }, + { + "epoch": 0.7820182071583193, + "grad_norm": 0.10176366567611694, + "learning_rate": 0.00011496240679615716, + "loss": 2.6172, + "step": 26372 + }, + { + "epoch": 0.7820478605106307, + "grad_norm": 0.10480639338493347, + "learning_rate": 0.00011493239219285418, + "loss": 2.5829, + "step": 26373 + }, + { + "epoch": 0.7820775138629422, + "grad_norm": 0.10090451687574387, + "learning_rate": 0.00011490238099939081, + "loss": 2.5598, + "step": 26374 + }, + { + "epoch": 0.7821071672152536, + "grad_norm": 0.10213375091552734, + "learning_rate": 0.0001148723732160331, + "loss": 2.617, + "step": 26375 + }, + { + "epoch": 0.7821368205675652, + "grad_norm": 0.08441931754350662, + "learning_rate": 0.00011484236884304661, + "loss": 2.5655, + "step": 26376 + }, + { + "epoch": 0.7821664739198766, + "grad_norm": 0.09733965247869492, + "learning_rate": 0.0001148123678806971, + "loss": 2.5817, + "step": 26377 + }, + { + "epoch": 0.7821961272721881, + "grad_norm": 0.09520100057125092, + "learning_rate": 0.00011478237032925015, + "loss": 2.6312, + "step": 26378 + }, + { + "epoch": 0.7822257806244995, + "grad_norm": 0.08638447523117065, + "learning_rate": 0.00011475237618897144, + "loss": 2.5701, + "step": 26379 + }, + { + "epoch": 0.7822554339768111, + "grad_norm": 0.09298041462898254, + "learning_rate": 0.00011472238546012659, + "loss": 2.6041, + "step": 26380 + }, + { + "epoch": 0.7822850873291226, + "grad_norm": 0.09055197983980179, + "learning_rate": 0.00011469239814298115, + "loss": 2.554, + "step": 26381 + }, + { + "epoch": 0.782314740681434, + "grad_norm": 0.09092267602682114, + "learning_rate": 0.00011466241423780077, + "loss": 2.5358, + "step": 26382 + }, + { + "epoch": 0.7823443940337456, + "grad_norm": 0.085915707051754, + "learning_rate": 0.00011463243374485072, + "loss": 2.5714, + "step": 26383 + }, + { + "epoch": 0.782374047386057, + "grad_norm": 0.08327245712280273, + "learning_rate": 0.00011460245666439667, + "loss": 2.6159, + "step": 26384 + }, + { + "epoch": 0.7824037007383685, + "grad_norm": 0.07978479564189911, + "learning_rate": 0.00011457248299670398, + "loss": 2.6259, + "step": 26385 + }, + { + "epoch": 0.7824333540906799, + "grad_norm": 0.09015428274869919, + "learning_rate": 0.00011454251274203808, + "loss": 2.5905, + "step": 26386 + }, + { + "epoch": 0.7824630074429915, + "grad_norm": 0.08257850259542465, + "learning_rate": 0.00011451254590066429, + "loss": 2.6028, + "step": 26387 + }, + { + "epoch": 0.7824926607953029, + "grad_norm": 0.0896509662270546, + "learning_rate": 0.00011448258247284821, + "loss": 2.6112, + "step": 26388 + }, + { + "epoch": 0.7825223141476144, + "grad_norm": 0.08729074895381927, + "learning_rate": 0.00011445262245885502, + "loss": 2.5729, + "step": 26389 + }, + { + "epoch": 0.7825519674999258, + "grad_norm": 0.08536747843027115, + "learning_rate": 0.00011442266585895006, + "loss": 2.551, + "step": 26390 + }, + { + "epoch": 0.7825816208522374, + "grad_norm": 0.0884896069765091, + "learning_rate": 0.00011439271267339851, + "loss": 2.6129, + "step": 26391 + }, + { + "epoch": 0.7826112742045488, + "grad_norm": 0.08274133503437042, + "learning_rate": 0.00011436276290246589, + "loss": 2.5687, + "step": 26392 + }, + { + "epoch": 0.7826409275568603, + "grad_norm": 0.09584993869066238, + "learning_rate": 0.00011433281654641703, + "loss": 2.5887, + "step": 26393 + }, + { + "epoch": 0.7826705809091717, + "grad_norm": 0.08284386247396469, + "learning_rate": 0.00011430287360551733, + "loss": 2.5962, + "step": 26394 + }, + { + "epoch": 0.7827002342614833, + "grad_norm": 0.10275301337242126, + "learning_rate": 0.00011427293408003181, + "loss": 2.6274, + "step": 26395 + }, + { + "epoch": 0.7827298876137947, + "grad_norm": 0.08751016855239868, + "learning_rate": 0.0001142429979702257, + "loss": 2.5825, + "step": 26396 + }, + { + "epoch": 0.7827595409661062, + "grad_norm": 0.09433083981275558, + "learning_rate": 0.0001142130652763641, + "loss": 2.5537, + "step": 26397 + }, + { + "epoch": 0.7827891943184176, + "grad_norm": 0.08943434804677963, + "learning_rate": 0.00011418313599871194, + "loss": 2.609, + "step": 26398 + }, + { + "epoch": 0.7828188476707292, + "grad_norm": 0.10394168645143509, + "learning_rate": 0.00011415321013753444, + "loss": 2.6134, + "step": 26399 + }, + { + "epoch": 0.7828485010230406, + "grad_norm": 0.0892200618982315, + "learning_rate": 0.00011412328769309627, + "loss": 2.5721, + "step": 26400 + }, + { + "epoch": 0.7828781543753521, + "grad_norm": 0.11407013982534409, + "learning_rate": 0.00011409336866566278, + "loss": 2.5799, + "step": 26401 + }, + { + "epoch": 0.7829078077276637, + "grad_norm": 0.09990798681974411, + "learning_rate": 0.00011406345305549893, + "loss": 2.5751, + "step": 26402 + }, + { + "epoch": 0.7829374610799751, + "grad_norm": 0.11239553987979889, + "learning_rate": 0.00011403354086286927, + "loss": 2.5809, + "step": 26403 + }, + { + "epoch": 0.7829671144322866, + "grad_norm": 0.10149583220481873, + "learning_rate": 0.00011400363208803882, + "loss": 2.584, + "step": 26404 + }, + { + "epoch": 0.782996767784598, + "grad_norm": 0.11824250966310501, + "learning_rate": 0.00011397372673127255, + "loss": 2.5981, + "step": 26405 + }, + { + "epoch": 0.7830264211369096, + "grad_norm": 0.10283447057008743, + "learning_rate": 0.00011394382479283511, + "loss": 2.5546, + "step": 26406 + }, + { + "epoch": 0.783056074489221, + "grad_norm": 0.11108362674713135, + "learning_rate": 0.0001139139262729914, + "loss": 2.5738, + "step": 26407 + }, + { + "epoch": 0.7830857278415325, + "grad_norm": 0.09770140796899796, + "learning_rate": 0.00011388403117200619, + "loss": 2.6185, + "step": 26408 + }, + { + "epoch": 0.7831153811938439, + "grad_norm": 0.11405172199010849, + "learning_rate": 0.00011385413949014411, + "loss": 2.5675, + "step": 26409 + }, + { + "epoch": 0.7831450345461555, + "grad_norm": 0.10745226591825485, + "learning_rate": 0.00011382425122766993, + "loss": 2.5753, + "step": 26410 + }, + { + "epoch": 0.7831746878984669, + "grad_norm": 0.10805387794971466, + "learning_rate": 0.00011379436638484824, + "loss": 2.6283, + "step": 26411 + }, + { + "epoch": 0.7832043412507784, + "grad_norm": 0.10160209983587265, + "learning_rate": 0.0001137644849619438, + "loss": 2.5995, + "step": 26412 + }, + { + "epoch": 0.7832339946030898, + "grad_norm": 0.10303410142660141, + "learning_rate": 0.00011373460695922105, + "loss": 2.5756, + "step": 26413 + }, + { + "epoch": 0.7832636479554014, + "grad_norm": 0.09236223995685577, + "learning_rate": 0.00011370473237694473, + "loss": 2.5626, + "step": 26414 + }, + { + "epoch": 0.7832933013077128, + "grad_norm": 0.0931399017572403, + "learning_rate": 0.0001136748612153793, + "loss": 2.5578, + "step": 26415 + }, + { + "epoch": 0.7833229546600243, + "grad_norm": 0.0871608629822731, + "learning_rate": 0.00011364499347478929, + "loss": 2.5789, + "step": 26416 + }, + { + "epoch": 0.7833526080123357, + "grad_norm": 0.10224582254886627, + "learning_rate": 0.00011361512915543914, + "loss": 2.5871, + "step": 26417 + }, + { + "epoch": 0.7833822613646473, + "grad_norm": 0.09777490794658661, + "learning_rate": 0.00011358526825759342, + "loss": 2.5723, + "step": 26418 + }, + { + "epoch": 0.7834119147169587, + "grad_norm": 0.09949961304664612, + "learning_rate": 0.00011355541078151643, + "loss": 2.6054, + "step": 26419 + }, + { + "epoch": 0.7834415680692702, + "grad_norm": 0.09250246733427048, + "learning_rate": 0.00011352555672747262, + "loss": 2.608, + "step": 26420 + }, + { + "epoch": 0.7834712214215818, + "grad_norm": 0.09789558500051498, + "learning_rate": 0.00011349570609572629, + "loss": 2.6114, + "step": 26421 + }, + { + "epoch": 0.7835008747738932, + "grad_norm": 0.0971965417265892, + "learning_rate": 0.00011346585888654187, + "loss": 2.585, + "step": 26422 + }, + { + "epoch": 0.7835305281262047, + "grad_norm": 0.09031590819358826, + "learning_rate": 0.00011343601510018364, + "loss": 2.6031, + "step": 26423 + }, + { + "epoch": 0.7835601814785161, + "grad_norm": 0.09553392976522446, + "learning_rate": 0.00011340617473691578, + "loss": 2.5976, + "step": 26424 + }, + { + "epoch": 0.7835898348308277, + "grad_norm": 0.0903010219335556, + "learning_rate": 0.00011337633779700268, + "loss": 2.5531, + "step": 26425 + }, + { + "epoch": 0.7836194881831391, + "grad_norm": 0.08368315547704697, + "learning_rate": 0.00011334650428070841, + "loss": 2.5797, + "step": 26426 + }, + { + "epoch": 0.7836491415354506, + "grad_norm": 0.10596255213022232, + "learning_rate": 0.00011331667418829728, + "loss": 2.6015, + "step": 26427 + }, + { + "epoch": 0.783678794887762, + "grad_norm": 0.09255722910165787, + "learning_rate": 0.00011328684752003332, + "loss": 2.6052, + "step": 26428 + }, + { + "epoch": 0.7837084482400736, + "grad_norm": 0.09396615624427795, + "learning_rate": 0.00011325702427618068, + "loss": 2.5821, + "step": 26429 + }, + { + "epoch": 0.783738101592385, + "grad_norm": 0.09360744804143906, + "learning_rate": 0.00011322720445700352, + "loss": 2.595, + "step": 26430 + }, + { + "epoch": 0.7837677549446965, + "grad_norm": 0.09774801880121231, + "learning_rate": 0.00011319738806276586, + "loss": 2.6431, + "step": 26431 + }, + { + "epoch": 0.783797408297008, + "grad_norm": 0.08881143480539322, + "learning_rate": 0.0001131675750937317, + "loss": 2.5921, + "step": 26432 + }, + { + "epoch": 0.7838270616493195, + "grad_norm": 0.09535543620586395, + "learning_rate": 0.00011313776555016509, + "loss": 2.5848, + "step": 26433 + }, + { + "epoch": 0.7838567150016309, + "grad_norm": 0.09051349014043808, + "learning_rate": 0.00011310795943232993, + "loss": 2.5711, + "step": 26434 + }, + { + "epoch": 0.7838863683539424, + "grad_norm": 0.09206060320138931, + "learning_rate": 0.0001130781567404902, + "loss": 2.6128, + "step": 26435 + }, + { + "epoch": 0.7839160217062539, + "grad_norm": 0.08298317342996597, + "learning_rate": 0.00011304835747490983, + "loss": 2.6022, + "step": 26436 + }, + { + "epoch": 0.7839456750585654, + "grad_norm": 0.10677877813577652, + "learning_rate": 0.00011301856163585278, + "loss": 2.5914, + "step": 26437 + }, + { + "epoch": 0.7839753284108768, + "grad_norm": 0.08457902073860168, + "learning_rate": 0.0001129887692235827, + "loss": 2.5751, + "step": 26438 + }, + { + "epoch": 0.7840049817631883, + "grad_norm": 0.10027869790792465, + "learning_rate": 0.00011295898023836332, + "loss": 2.5822, + "step": 26439 + }, + { + "epoch": 0.7840346351154998, + "grad_norm": 0.09434185177087784, + "learning_rate": 0.00011292919468045875, + "loss": 2.6074, + "step": 26440 + }, + { + "epoch": 0.7840642884678113, + "grad_norm": 0.08882693946361542, + "learning_rate": 0.00011289941255013264, + "loss": 2.568, + "step": 26441 + }, + { + "epoch": 0.7840939418201228, + "grad_norm": 0.08798566460609436, + "learning_rate": 0.00011286963384764865, + "loss": 2.6044, + "step": 26442 + }, + { + "epoch": 0.7841235951724342, + "grad_norm": 0.092766672372818, + "learning_rate": 0.0001128398585732705, + "loss": 2.5974, + "step": 26443 + }, + { + "epoch": 0.7841532485247458, + "grad_norm": 0.09450829774141312, + "learning_rate": 0.00011281008672726185, + "loss": 2.624, + "step": 26444 + }, + { + "epoch": 0.7841829018770572, + "grad_norm": 0.09689497947692871, + "learning_rate": 0.00011278031830988633, + "loss": 2.556, + "step": 26445 + }, + { + "epoch": 0.7842125552293687, + "grad_norm": 0.08641384541988373, + "learning_rate": 0.0001127505533214076, + "loss": 2.566, + "step": 26446 + }, + { + "epoch": 0.7842422085816801, + "grad_norm": 0.10640137642621994, + "learning_rate": 0.00011272079176208927, + "loss": 2.6128, + "step": 26447 + }, + { + "epoch": 0.7842718619339917, + "grad_norm": 0.09077578783035278, + "learning_rate": 0.00011269103363219474, + "loss": 2.5885, + "step": 26448 + }, + { + "epoch": 0.7843015152863031, + "grad_norm": 0.10119592398405075, + "learning_rate": 0.00011266127893198752, + "loss": 2.5939, + "step": 26449 + }, + { + "epoch": 0.7843311686386146, + "grad_norm": 0.09226120263338089, + "learning_rate": 0.00011263152766173118, + "loss": 2.5967, + "step": 26450 + }, + { + "epoch": 0.784360821990926, + "grad_norm": 0.09185401350259781, + "learning_rate": 0.00011260177982168906, + "loss": 2.5779, + "step": 26451 + }, + { + "epoch": 0.7843904753432376, + "grad_norm": 0.09217140823602676, + "learning_rate": 0.00011257203541212479, + "loss": 2.6009, + "step": 26452 + }, + { + "epoch": 0.784420128695549, + "grad_norm": 0.09565573930740356, + "learning_rate": 0.0001125422944333016, + "loss": 2.5873, + "step": 26453 + }, + { + "epoch": 0.7844497820478605, + "grad_norm": 0.09123252332210541, + "learning_rate": 0.00011251255688548295, + "loss": 2.5566, + "step": 26454 + }, + { + "epoch": 0.784479435400172, + "grad_norm": 0.10082529485225677, + "learning_rate": 0.00011248282276893213, + "loss": 2.5547, + "step": 26455 + }, + { + "epoch": 0.7845090887524835, + "grad_norm": 0.09826134890317917, + "learning_rate": 0.00011245309208391241, + "loss": 2.5926, + "step": 26456 + }, + { + "epoch": 0.7845387421047949, + "grad_norm": 0.10207603126764297, + "learning_rate": 0.00011242336483068704, + "loss": 2.6125, + "step": 26457 + }, + { + "epoch": 0.7845683954571064, + "grad_norm": 0.10594332963228226, + "learning_rate": 0.00011239364100951949, + "loss": 2.5876, + "step": 26458 + }, + { + "epoch": 0.7845980488094179, + "grad_norm": 0.09040133655071259, + "learning_rate": 0.00011236392062067263, + "loss": 2.5961, + "step": 26459 + }, + { + "epoch": 0.7846277021617294, + "grad_norm": 0.09894604980945587, + "learning_rate": 0.00011233420366440977, + "loss": 2.5842, + "step": 26460 + }, + { + "epoch": 0.7846573555140408, + "grad_norm": 0.09368259459733963, + "learning_rate": 0.00011230449014099414, + "loss": 2.5597, + "step": 26461 + }, + { + "epoch": 0.7846870088663523, + "grad_norm": 0.09771422296762466, + "learning_rate": 0.00011227478005068875, + "loss": 2.5401, + "step": 26462 + }, + { + "epoch": 0.7847166622186639, + "grad_norm": 0.09805711358785629, + "learning_rate": 0.00011224507339375672, + "loss": 2.5905, + "step": 26463 + }, + { + "epoch": 0.7847463155709753, + "grad_norm": 0.10061002522706985, + "learning_rate": 0.00011221537017046101, + "loss": 2.5765, + "step": 26464 + }, + { + "epoch": 0.7847759689232868, + "grad_norm": 0.09862156957387924, + "learning_rate": 0.00011218567038106487, + "loss": 2.6253, + "step": 26465 + }, + { + "epoch": 0.7848056222755982, + "grad_norm": 0.09827470779418945, + "learning_rate": 0.00011215597402583122, + "loss": 2.5737, + "step": 26466 + }, + { + "epoch": 0.7848352756279098, + "grad_norm": 0.10040709376335144, + "learning_rate": 0.00011212628110502298, + "loss": 2.5605, + "step": 26467 + }, + { + "epoch": 0.7848649289802212, + "grad_norm": 0.0965903252363205, + "learning_rate": 0.00011209659161890323, + "loss": 2.599, + "step": 26468 + }, + { + "epoch": 0.7848945823325327, + "grad_norm": 0.0927097499370575, + "learning_rate": 0.00011206690556773463, + "loss": 2.5594, + "step": 26469 + }, + { + "epoch": 0.7849242356848442, + "grad_norm": 0.09594602882862091, + "learning_rate": 0.00011203722295178015, + "loss": 2.5779, + "step": 26470 + }, + { + "epoch": 0.7849538890371557, + "grad_norm": 0.08989674597978592, + "learning_rate": 0.00011200754377130268, + "loss": 2.6129, + "step": 26471 + }, + { + "epoch": 0.7849835423894671, + "grad_norm": 0.08889340609312057, + "learning_rate": 0.00011197786802656495, + "loss": 2.5835, + "step": 26472 + }, + { + "epoch": 0.7850131957417786, + "grad_norm": 0.09366258978843689, + "learning_rate": 0.00011194819571782982, + "loss": 2.5878, + "step": 26473 + }, + { + "epoch": 0.7850428490940901, + "grad_norm": 0.09772650897502899, + "learning_rate": 0.00011191852684536008, + "loss": 2.6195, + "step": 26474 + }, + { + "epoch": 0.7850725024464016, + "grad_norm": 0.09189505875110626, + "learning_rate": 0.00011188886140941835, + "loss": 2.6068, + "step": 26475 + }, + { + "epoch": 0.785102155798713, + "grad_norm": 0.10053227096796036, + "learning_rate": 0.00011185919941026739, + "loss": 2.5609, + "step": 26476 + }, + { + "epoch": 0.7851318091510245, + "grad_norm": 0.09097214043140411, + "learning_rate": 0.00011182954084816965, + "loss": 2.5938, + "step": 26477 + }, + { + "epoch": 0.785161462503336, + "grad_norm": 0.09431011229753494, + "learning_rate": 0.00011179988572338818, + "loss": 2.5788, + "step": 26478 + }, + { + "epoch": 0.7851911158556475, + "grad_norm": 0.10484598577022552, + "learning_rate": 0.00011177023403618542, + "loss": 2.6, + "step": 26479 + }, + { + "epoch": 0.7852207692079589, + "grad_norm": 0.0902581512928009, + "learning_rate": 0.00011174058578682378, + "loss": 2.5646, + "step": 26480 + }, + { + "epoch": 0.7852504225602704, + "grad_norm": 0.11916911602020264, + "learning_rate": 0.00011171094097556588, + "loss": 2.5915, + "step": 26481 + }, + { + "epoch": 0.7852800759125819, + "grad_norm": 0.0886375904083252, + "learning_rate": 0.00011168129960267426, + "loss": 2.5928, + "step": 26482 + }, + { + "epoch": 0.7853097292648934, + "grad_norm": 0.10633011907339096, + "learning_rate": 0.00011165166166841139, + "loss": 2.5913, + "step": 26483 + }, + { + "epoch": 0.7853393826172049, + "grad_norm": 0.09110621362924576, + "learning_rate": 0.00011162202717303971, + "loss": 2.5991, + "step": 26484 + }, + { + "epoch": 0.7853690359695163, + "grad_norm": 0.1081119105219841, + "learning_rate": 0.00011159239611682159, + "loss": 2.5566, + "step": 26485 + }, + { + "epoch": 0.7853986893218279, + "grad_norm": 0.09326665103435516, + "learning_rate": 0.00011156276850001956, + "loss": 2.6212, + "step": 26486 + }, + { + "epoch": 0.7854283426741393, + "grad_norm": 0.0969332605600357, + "learning_rate": 0.00011153314432289586, + "loss": 2.6033, + "step": 26487 + }, + { + "epoch": 0.7854579960264508, + "grad_norm": 0.10250787436962128, + "learning_rate": 0.00011150352358571281, + "loss": 2.6035, + "step": 26488 + }, + { + "epoch": 0.7854876493787623, + "grad_norm": 0.09762678295373917, + "learning_rate": 0.00011147390628873278, + "loss": 2.6067, + "step": 26489 + }, + { + "epoch": 0.7855173027310738, + "grad_norm": 0.09368843585252762, + "learning_rate": 0.00011144429243221798, + "loss": 2.5871, + "step": 26490 + }, + { + "epoch": 0.7855469560833852, + "grad_norm": 0.10002956539392471, + "learning_rate": 0.00011141468201643068, + "loss": 2.5879, + "step": 26491 + }, + { + "epoch": 0.7855766094356967, + "grad_norm": 0.09074016660451889, + "learning_rate": 0.00011138507504163303, + "loss": 2.5948, + "step": 26492 + }, + { + "epoch": 0.7856062627880082, + "grad_norm": 0.09170928597450256, + "learning_rate": 0.00011135547150808727, + "loss": 2.5523, + "step": 26493 + }, + { + "epoch": 0.7856359161403197, + "grad_norm": 0.10214176774024963, + "learning_rate": 0.00011132587141605555, + "loss": 2.5737, + "step": 26494 + }, + { + "epoch": 0.7856655694926311, + "grad_norm": 0.1115133985877037, + "learning_rate": 0.00011129627476579996, + "loss": 2.6104, + "step": 26495 + }, + { + "epoch": 0.7856952228449426, + "grad_norm": 0.10193660855293274, + "learning_rate": 0.00011126668155758252, + "loss": 2.603, + "step": 26496 + }, + { + "epoch": 0.7857248761972541, + "grad_norm": 0.10928016155958176, + "learning_rate": 0.00011123709179166535, + "loss": 2.595, + "step": 26497 + }, + { + "epoch": 0.7857545295495656, + "grad_norm": 0.10483477264642715, + "learning_rate": 0.00011120750546831049, + "loss": 2.5998, + "step": 26498 + }, + { + "epoch": 0.785784182901877, + "grad_norm": 0.08975210785865784, + "learning_rate": 0.00011117792258777992, + "loss": 2.6099, + "step": 26499 + }, + { + "epoch": 0.7858138362541885, + "grad_norm": 0.10257670283317566, + "learning_rate": 0.00011114834315033557, + "loss": 2.5632, + "step": 26500 + }, + { + "epoch": 0.7858434896065, + "grad_norm": 0.0928615927696228, + "learning_rate": 0.00011111876715623936, + "loss": 2.5868, + "step": 26501 + }, + { + "epoch": 0.7858731429588115, + "grad_norm": 0.09715503454208374, + "learning_rate": 0.0001110891946057534, + "loss": 2.5609, + "step": 26502 + }, + { + "epoch": 0.7859027963111229, + "grad_norm": 0.08314234763383865, + "learning_rate": 0.00011105962549913906, + "loss": 2.5585, + "step": 26503 + }, + { + "epoch": 0.7859324496634345, + "grad_norm": 0.09394281357526779, + "learning_rate": 0.00011103005983665864, + "loss": 2.5921, + "step": 26504 + }, + { + "epoch": 0.785962103015746, + "grad_norm": 0.09328441321849823, + "learning_rate": 0.00011100049761857378, + "loss": 2.5732, + "step": 26505 + }, + { + "epoch": 0.7859917563680574, + "grad_norm": 0.08553216606378555, + "learning_rate": 0.00011097093884514636, + "loss": 2.5888, + "step": 26506 + }, + { + "epoch": 0.7860214097203689, + "grad_norm": 0.10194293409585953, + "learning_rate": 0.00011094138351663801, + "loss": 2.5784, + "step": 26507 + }, + { + "epoch": 0.7860510630726804, + "grad_norm": 0.0879768580198288, + "learning_rate": 0.00011091183163331048, + "loss": 2.5773, + "step": 26508 + }, + { + "epoch": 0.7860807164249919, + "grad_norm": 0.10294809937477112, + "learning_rate": 0.00011088228319542548, + "loss": 2.5814, + "step": 26509 + }, + { + "epoch": 0.7861103697773033, + "grad_norm": 0.09643787145614624, + "learning_rate": 0.00011085273820324466, + "loss": 2.6122, + "step": 26510 + }, + { + "epoch": 0.7861400231296148, + "grad_norm": 0.10380848497152328, + "learning_rate": 0.00011082319665702962, + "loss": 2.5786, + "step": 26511 + }, + { + "epoch": 0.7861696764819263, + "grad_norm": 0.10260063409805298, + "learning_rate": 0.00011079365855704198, + "loss": 2.5743, + "step": 26512 + }, + { + "epoch": 0.7861993298342378, + "grad_norm": 0.10015936195850372, + "learning_rate": 0.00011076412390354346, + "loss": 2.6169, + "step": 26513 + }, + { + "epoch": 0.7862289831865492, + "grad_norm": 0.09122809022665024, + "learning_rate": 0.00011073459269679532, + "loss": 2.5874, + "step": 26514 + }, + { + "epoch": 0.7862586365388607, + "grad_norm": 0.10036830604076385, + "learning_rate": 0.00011070506493705913, + "loss": 2.5774, + "step": 26515 + }, + { + "epoch": 0.7862882898911722, + "grad_norm": 0.09158866852521896, + "learning_rate": 0.00011067554062459629, + "loss": 2.5853, + "step": 26516 + }, + { + "epoch": 0.7863179432434837, + "grad_norm": 0.10053037106990814, + "learning_rate": 0.00011064601975966848, + "loss": 2.6047, + "step": 26517 + }, + { + "epoch": 0.7863475965957951, + "grad_norm": 0.09224898368120193, + "learning_rate": 0.000110616502342537, + "loss": 2.5664, + "step": 26518 + }, + { + "epoch": 0.7863772499481066, + "grad_norm": 0.08982648700475693, + "learning_rate": 0.00011058698837346326, + "loss": 2.6016, + "step": 26519 + }, + { + "epoch": 0.7864069033004181, + "grad_norm": 0.09409793466329575, + "learning_rate": 0.00011055747785270853, + "loss": 2.595, + "step": 26520 + }, + { + "epoch": 0.7864365566527296, + "grad_norm": 0.08983326703310013, + "learning_rate": 0.00011052797078053423, + "loss": 2.5963, + "step": 26521 + }, + { + "epoch": 0.786466210005041, + "grad_norm": 0.09059928357601166, + "learning_rate": 0.00011049846715720158, + "loss": 2.594, + "step": 26522 + }, + { + "epoch": 0.7864958633573526, + "grad_norm": 0.09116402268409729, + "learning_rate": 0.00011046896698297198, + "loss": 2.6183, + "step": 26523 + }, + { + "epoch": 0.786525516709664, + "grad_norm": 0.09444781392812729, + "learning_rate": 0.00011043947025810647, + "loss": 2.6094, + "step": 26524 + }, + { + "epoch": 0.7865551700619755, + "grad_norm": 0.09755174070596695, + "learning_rate": 0.00011040997698286625, + "loss": 2.6003, + "step": 26525 + }, + { + "epoch": 0.786584823414287, + "grad_norm": 0.0985587015748024, + "learning_rate": 0.00011038048715751258, + "loss": 2.5794, + "step": 26526 + }, + { + "epoch": 0.7866144767665985, + "grad_norm": 0.09334336221218109, + "learning_rate": 0.00011035100078230653, + "loss": 2.6082, + "step": 26527 + }, + { + "epoch": 0.78664413011891, + "grad_norm": 0.09908290952444077, + "learning_rate": 0.00011032151785750932, + "loss": 2.597, + "step": 26528 + }, + { + "epoch": 0.7866737834712214, + "grad_norm": 0.09524184465408325, + "learning_rate": 0.00011029203838338181, + "loss": 2.5703, + "step": 26529 + }, + { + "epoch": 0.7867034368235329, + "grad_norm": 0.10370161384344101, + "learning_rate": 0.00011026256236018528, + "loss": 2.588, + "step": 26530 + }, + { + "epoch": 0.7867330901758444, + "grad_norm": 0.092894047498703, + "learning_rate": 0.0001102330897881807, + "loss": 2.6057, + "step": 26531 + }, + { + "epoch": 0.7867627435281559, + "grad_norm": 0.10126139223575592, + "learning_rate": 0.00011020362066762902, + "loss": 2.5748, + "step": 26532 + }, + { + "epoch": 0.7867923968804673, + "grad_norm": 0.08920752257108688, + "learning_rate": 0.00011017415499879114, + "loss": 2.5605, + "step": 26533 + }, + { + "epoch": 0.7868220502327788, + "grad_norm": 0.09995575249195099, + "learning_rate": 0.00011014469278192817, + "loss": 2.5742, + "step": 26534 + }, + { + "epoch": 0.7868517035850903, + "grad_norm": 0.08762204647064209, + "learning_rate": 0.00011011523401730078, + "loss": 2.5689, + "step": 26535 + }, + { + "epoch": 0.7868813569374018, + "grad_norm": 0.09599903970956802, + "learning_rate": 0.00011008577870516989, + "loss": 2.5533, + "step": 26536 + }, + { + "epoch": 0.7869110102897132, + "grad_norm": 0.09255862981081009, + "learning_rate": 0.00011005632684579636, + "loss": 2.611, + "step": 26537 + }, + { + "epoch": 0.7869406636420248, + "grad_norm": 0.10664625465869904, + "learning_rate": 0.00011002687843944099, + "loss": 2.5702, + "step": 26538 + }, + { + "epoch": 0.7869703169943362, + "grad_norm": 0.09330272674560547, + "learning_rate": 0.00010999743348636454, + "loss": 2.5755, + "step": 26539 + }, + { + "epoch": 0.7869999703466477, + "grad_norm": 0.09770409762859344, + "learning_rate": 0.00010996799198682772, + "loss": 2.5823, + "step": 26540 + }, + { + "epoch": 0.7870296236989591, + "grad_norm": 0.09776467829942703, + "learning_rate": 0.00010993855394109136, + "loss": 2.5899, + "step": 26541 + }, + { + "epoch": 0.7870592770512707, + "grad_norm": 0.09665101766586304, + "learning_rate": 0.00010990911934941588, + "loss": 2.5775, + "step": 26542 + }, + { + "epoch": 0.7870889304035821, + "grad_norm": 0.09852796047925949, + "learning_rate": 0.00010987968821206223, + "loss": 2.5846, + "step": 26543 + }, + { + "epoch": 0.7871185837558936, + "grad_norm": 0.08741287887096405, + "learning_rate": 0.00010985026052929104, + "loss": 2.5939, + "step": 26544 + }, + { + "epoch": 0.787148237108205, + "grad_norm": 0.10970742255449295, + "learning_rate": 0.00010982083630136264, + "loss": 2.6196, + "step": 26545 + }, + { + "epoch": 0.7871778904605166, + "grad_norm": 0.08502481132745743, + "learning_rate": 0.00010979141552853772, + "loss": 2.5776, + "step": 26546 + }, + { + "epoch": 0.7872075438128281, + "grad_norm": 0.09892813116312027, + "learning_rate": 0.0001097619982110768, + "loss": 2.5962, + "step": 26547 + }, + { + "epoch": 0.7872371971651395, + "grad_norm": 0.10126100480556488, + "learning_rate": 0.00010973258434924033, + "loss": 2.5606, + "step": 26548 + }, + { + "epoch": 0.787266850517451, + "grad_norm": 0.09723438322544098, + "learning_rate": 0.00010970317394328883, + "loss": 2.5913, + "step": 26549 + }, + { + "epoch": 0.7872965038697625, + "grad_norm": 0.10280791670084, + "learning_rate": 0.00010967376699348274, + "loss": 2.608, + "step": 26550 + }, + { + "epoch": 0.787326157222074, + "grad_norm": 0.09971220046281815, + "learning_rate": 0.00010964436350008245, + "loss": 2.5899, + "step": 26551 + }, + { + "epoch": 0.7873558105743854, + "grad_norm": 0.09737494587898254, + "learning_rate": 0.00010961496346334826, + "loss": 2.5743, + "step": 26552 + }, + { + "epoch": 0.787385463926697, + "grad_norm": 0.09978033602237701, + "learning_rate": 0.00010958556688354065, + "loss": 2.6135, + "step": 26553 + }, + { + "epoch": 0.7874151172790084, + "grad_norm": 0.09555747359991074, + "learning_rate": 0.0001095561737609198, + "loss": 2.5854, + "step": 26554 + }, + { + "epoch": 0.7874447706313199, + "grad_norm": 0.0988362655043602, + "learning_rate": 0.00010952678409574606, + "loss": 2.5779, + "step": 26555 + }, + { + "epoch": 0.7874744239836313, + "grad_norm": 0.09086796641349792, + "learning_rate": 0.00010949739788827972, + "loss": 2.5754, + "step": 26556 + }, + { + "epoch": 0.7875040773359429, + "grad_norm": 0.09347988665103912, + "learning_rate": 0.00010946801513878091, + "loss": 2.6006, + "step": 26557 + }, + { + "epoch": 0.7875337306882543, + "grad_norm": 0.09304586052894592, + "learning_rate": 0.00010943863584750985, + "loss": 2.5349, + "step": 26558 + }, + { + "epoch": 0.7875633840405658, + "grad_norm": 0.09518790990114212, + "learning_rate": 0.00010940926001472673, + "loss": 2.5893, + "step": 26559 + }, + { + "epoch": 0.7875930373928772, + "grad_norm": 0.09037692844867706, + "learning_rate": 0.00010937988764069167, + "loss": 2.6112, + "step": 26560 + }, + { + "epoch": 0.7876226907451888, + "grad_norm": 0.09480496495962143, + "learning_rate": 0.00010935051872566476, + "loss": 2.6059, + "step": 26561 + }, + { + "epoch": 0.7876523440975002, + "grad_norm": 0.10076173394918442, + "learning_rate": 0.000109321153269906, + "loss": 2.5774, + "step": 26562 + }, + { + "epoch": 0.7876819974498117, + "grad_norm": 0.10232080519199371, + "learning_rate": 0.00010929179127367555, + "loss": 2.6275, + "step": 26563 + }, + { + "epoch": 0.7877116508021231, + "grad_norm": 0.09940958768129349, + "learning_rate": 0.00010926243273723329, + "loss": 2.572, + "step": 26564 + }, + { + "epoch": 0.7877413041544347, + "grad_norm": 0.09180905669927597, + "learning_rate": 0.00010923307766083934, + "loss": 2.5511, + "step": 26565 + }, + { + "epoch": 0.7877709575067461, + "grad_norm": 0.08704401552677155, + "learning_rate": 0.00010920372604475348, + "loss": 2.5793, + "step": 26566 + }, + { + "epoch": 0.7878006108590576, + "grad_norm": 0.10061563551425934, + "learning_rate": 0.00010917437788923578, + "loss": 2.6147, + "step": 26567 + }, + { + "epoch": 0.7878302642113691, + "grad_norm": 0.08458434790372849, + "learning_rate": 0.000109145033194546, + "loss": 2.6271, + "step": 26568 + }, + { + "epoch": 0.7878599175636806, + "grad_norm": 0.09346327930688858, + "learning_rate": 0.00010911569196094406, + "loss": 2.5359, + "step": 26569 + }, + { + "epoch": 0.7878895709159921, + "grad_norm": 0.08857227116823196, + "learning_rate": 0.00010908635418868974, + "loss": 2.5531, + "step": 26570 + }, + { + "epoch": 0.7879192242683035, + "grad_norm": 0.08830087631940842, + "learning_rate": 0.00010905701987804284, + "loss": 2.5728, + "step": 26571 + }, + { + "epoch": 0.787948877620615, + "grad_norm": 0.08704400062561035, + "learning_rate": 0.00010902768902926318, + "loss": 2.6081, + "step": 26572 + }, + { + "epoch": 0.7879785309729265, + "grad_norm": 0.09513429552316666, + "learning_rate": 0.0001089983616426104, + "loss": 2.5824, + "step": 26573 + }, + { + "epoch": 0.788008184325238, + "grad_norm": 0.08401941508054733, + "learning_rate": 0.00010896903771834427, + "loss": 2.5973, + "step": 26574 + }, + { + "epoch": 0.7880378376775494, + "grad_norm": 0.10063360631465912, + "learning_rate": 0.00010893971725672447, + "loss": 2.6089, + "step": 26575 + }, + { + "epoch": 0.788067491029861, + "grad_norm": 0.10304185003042221, + "learning_rate": 0.00010891040025801052, + "loss": 2.565, + "step": 26576 + }, + { + "epoch": 0.7880971443821724, + "grad_norm": 0.09306980669498444, + "learning_rate": 0.00010888108672246217, + "loss": 2.6033, + "step": 26577 + }, + { + "epoch": 0.7881267977344839, + "grad_norm": 0.09230642020702362, + "learning_rate": 0.00010885177665033902, + "loss": 2.6083, + "step": 26578 + }, + { + "epoch": 0.7881564510867953, + "grad_norm": 0.09171627461910248, + "learning_rate": 0.00010882247004190038, + "loss": 2.5616, + "step": 26579 + }, + { + "epoch": 0.7881861044391069, + "grad_norm": 0.10066249966621399, + "learning_rate": 0.00010879316689740598, + "loss": 2.5849, + "step": 26580 + }, + { + "epoch": 0.7882157577914183, + "grad_norm": 0.09199449419975281, + "learning_rate": 0.00010876386721711507, + "loss": 2.5891, + "step": 26581 + }, + { + "epoch": 0.7882454111437298, + "grad_norm": 0.09563568234443665, + "learning_rate": 0.00010873457100128737, + "loss": 2.6143, + "step": 26582 + }, + { + "epoch": 0.7882750644960412, + "grad_norm": 0.09280052781105042, + "learning_rate": 0.00010870527825018222, + "loss": 2.5847, + "step": 26583 + }, + { + "epoch": 0.7883047178483528, + "grad_norm": 0.09179432690143585, + "learning_rate": 0.000108675988964059, + "loss": 2.5628, + "step": 26584 + }, + { + "epoch": 0.7883343712006642, + "grad_norm": 0.09601576626300812, + "learning_rate": 0.0001086467031431771, + "loss": 2.5984, + "step": 26585 + }, + { + "epoch": 0.7883640245529757, + "grad_norm": 0.08479385823011398, + "learning_rate": 0.0001086174207877958, + "loss": 2.553, + "step": 26586 + }, + { + "epoch": 0.7883936779052871, + "grad_norm": 0.09810728579759598, + "learning_rate": 0.0001085881418981744, + "loss": 2.5874, + "step": 26587 + }, + { + "epoch": 0.7884233312575987, + "grad_norm": 0.08669764548540115, + "learning_rate": 0.0001085588664745722, + "loss": 2.5879, + "step": 26588 + }, + { + "epoch": 0.7884529846099102, + "grad_norm": 0.095123790204525, + "learning_rate": 0.00010852959451724859, + "loss": 2.6036, + "step": 26589 + }, + { + "epoch": 0.7884826379622216, + "grad_norm": 0.09495508670806885, + "learning_rate": 0.00010850032602646243, + "loss": 2.5569, + "step": 26590 + }, + { + "epoch": 0.7885122913145332, + "grad_norm": 0.08488892763853073, + "learning_rate": 0.00010847106100247312, + "loss": 2.5909, + "step": 26591 + }, + { + "epoch": 0.7885419446668446, + "grad_norm": 0.08938591182231903, + "learning_rate": 0.00010844179944553973, + "loss": 2.6013, + "step": 26592 + }, + { + "epoch": 0.7885715980191561, + "grad_norm": 0.09212537854909897, + "learning_rate": 0.0001084125413559215, + "loss": 2.6146, + "step": 26593 + }, + { + "epoch": 0.7886012513714675, + "grad_norm": 0.09351964294910431, + "learning_rate": 0.00010838328673387721, + "loss": 2.5656, + "step": 26594 + }, + { + "epoch": 0.7886309047237791, + "grad_norm": 0.0904945358633995, + "learning_rate": 0.00010835403557966627, + "loss": 2.5617, + "step": 26595 + }, + { + "epoch": 0.7886605580760905, + "grad_norm": 0.09282810986042023, + "learning_rate": 0.00010832478789354761, + "loss": 2.5932, + "step": 26596 + }, + { + "epoch": 0.788690211428402, + "grad_norm": 0.09358502179384232, + "learning_rate": 0.00010829554367578021, + "loss": 2.5979, + "step": 26597 + }, + { + "epoch": 0.7887198647807134, + "grad_norm": 0.08510786294937134, + "learning_rate": 0.00010826630292662292, + "loss": 2.5786, + "step": 26598 + }, + { + "epoch": 0.788749518133025, + "grad_norm": 0.09152694791555405, + "learning_rate": 0.00010823706564633495, + "loss": 2.5819, + "step": 26599 + }, + { + "epoch": 0.7887791714853364, + "grad_norm": 0.08588085323572159, + "learning_rate": 0.00010820783183517491, + "loss": 2.6098, + "step": 26600 + }, + { + "epoch": 0.7888088248376479, + "grad_norm": 0.09645593911409378, + "learning_rate": 0.0001081786014934017, + "loss": 2.6265, + "step": 26601 + }, + { + "epoch": 0.7888384781899593, + "grad_norm": 0.08521974831819534, + "learning_rate": 0.00010814937462127428, + "loss": 2.5858, + "step": 26602 + }, + { + "epoch": 0.7888681315422709, + "grad_norm": 0.09114261716604233, + "learning_rate": 0.00010812015121905139, + "loss": 2.5946, + "step": 26603 + }, + { + "epoch": 0.7888977848945823, + "grad_norm": 0.09398552775382996, + "learning_rate": 0.00010809093128699177, + "loss": 2.6027, + "step": 26604 + }, + { + "epoch": 0.7889274382468938, + "grad_norm": 0.08247606456279755, + "learning_rate": 0.00010806171482535431, + "loss": 2.5984, + "step": 26605 + }, + { + "epoch": 0.7889570915992052, + "grad_norm": 0.09874890744686127, + "learning_rate": 0.00010803250183439762, + "loss": 2.5715, + "step": 26606 + }, + { + "epoch": 0.7889867449515168, + "grad_norm": 0.0817548856139183, + "learning_rate": 0.00010800329231438022, + "loss": 2.5793, + "step": 26607 + }, + { + "epoch": 0.7890163983038282, + "grad_norm": 0.09012209624052048, + "learning_rate": 0.0001079740862655611, + "loss": 2.6036, + "step": 26608 + }, + { + "epoch": 0.7890460516561397, + "grad_norm": 0.08935775607824326, + "learning_rate": 0.00010794488368819877, + "loss": 2.6149, + "step": 26609 + }, + { + "epoch": 0.7890757050084513, + "grad_norm": 0.08698929846286774, + "learning_rate": 0.00010791568458255191, + "loss": 2.5561, + "step": 26610 + }, + { + "epoch": 0.7891053583607627, + "grad_norm": 0.08729495108127594, + "learning_rate": 0.00010788648894887887, + "loss": 2.5602, + "step": 26611 + }, + { + "epoch": 0.7891350117130742, + "grad_norm": 0.08726233243942261, + "learning_rate": 0.00010785729678743822, + "loss": 2.5559, + "step": 26612 + }, + { + "epoch": 0.7891646650653856, + "grad_norm": 0.08831393718719482, + "learning_rate": 0.00010782810809848853, + "loss": 2.5882, + "step": 26613 + }, + { + "epoch": 0.7891943184176972, + "grad_norm": 0.08496421575546265, + "learning_rate": 0.00010779892288228826, + "loss": 2.5916, + "step": 26614 + }, + { + "epoch": 0.7892239717700086, + "grad_norm": 0.08819737285375595, + "learning_rate": 0.00010776974113909587, + "loss": 2.5793, + "step": 26615 + }, + { + "epoch": 0.7892536251223201, + "grad_norm": 0.08440075814723969, + "learning_rate": 0.00010774056286916973, + "loss": 2.5965, + "step": 26616 + }, + { + "epoch": 0.7892832784746315, + "grad_norm": 0.0870116725564003, + "learning_rate": 0.0001077113880727682, + "loss": 2.5847, + "step": 26617 + }, + { + "epoch": 0.7893129318269431, + "grad_norm": 0.0909564271569252, + "learning_rate": 0.00010768221675014972, + "loss": 2.5833, + "step": 26618 + }, + { + "epoch": 0.7893425851792545, + "grad_norm": 0.09047701954841614, + "learning_rate": 0.00010765304890157251, + "loss": 2.6049, + "step": 26619 + }, + { + "epoch": 0.789372238531566, + "grad_norm": 0.08635769039392471, + "learning_rate": 0.00010762388452729494, + "loss": 2.5983, + "step": 26620 + }, + { + "epoch": 0.7894018918838774, + "grad_norm": 0.08888866007328033, + "learning_rate": 0.00010759472362757522, + "loss": 2.5349, + "step": 26621 + }, + { + "epoch": 0.789431545236189, + "grad_norm": 0.08537489175796509, + "learning_rate": 0.00010756556620267154, + "loss": 2.5857, + "step": 26622 + }, + { + "epoch": 0.7894611985885004, + "grad_norm": 0.09021252393722534, + "learning_rate": 0.00010753641225284217, + "loss": 2.5966, + "step": 26623 + }, + { + "epoch": 0.7894908519408119, + "grad_norm": 0.08375661075115204, + "learning_rate": 0.00010750726177834519, + "loss": 2.5441, + "step": 26624 + }, + { + "epoch": 0.7895205052931233, + "grad_norm": 0.0898338258266449, + "learning_rate": 0.0001074781147794388, + "loss": 2.5786, + "step": 26625 + }, + { + "epoch": 0.7895501586454349, + "grad_norm": 0.0852724090218544, + "learning_rate": 0.00010744897125638109, + "loss": 2.5618, + "step": 26626 + }, + { + "epoch": 0.7895798119977463, + "grad_norm": 0.08455072343349457, + "learning_rate": 0.00010741983120943011, + "loss": 2.5664, + "step": 26627 + }, + { + "epoch": 0.7896094653500578, + "grad_norm": 0.08750469237565994, + "learning_rate": 0.00010739069463884394, + "loss": 2.5604, + "step": 26628 + }, + { + "epoch": 0.7896391187023694, + "grad_norm": 0.0921759158372879, + "learning_rate": 0.00010736156154488053, + "loss": 2.5854, + "step": 26629 + }, + { + "epoch": 0.7896687720546808, + "grad_norm": 0.09586520493030548, + "learning_rate": 0.0001073324319277979, + "loss": 2.5957, + "step": 26630 + }, + { + "epoch": 0.7896984254069923, + "grad_norm": 0.09375594556331635, + "learning_rate": 0.00010730330578785397, + "loss": 2.572, + "step": 26631 + }, + { + "epoch": 0.7897280787593037, + "grad_norm": 0.08554930984973907, + "learning_rate": 0.00010727418312530668, + "loss": 2.5927, + "step": 26632 + }, + { + "epoch": 0.7897577321116153, + "grad_norm": 0.10012451559305191, + "learning_rate": 0.00010724506394041388, + "loss": 2.5986, + "step": 26633 + }, + { + "epoch": 0.7897873854639267, + "grad_norm": 0.08630572259426117, + "learning_rate": 0.0001072159482334335, + "loss": 2.5746, + "step": 26634 + }, + { + "epoch": 0.7898170388162382, + "grad_norm": 0.10066428035497665, + "learning_rate": 0.00010718683600462332, + "loss": 2.6121, + "step": 26635 + }, + { + "epoch": 0.7898466921685496, + "grad_norm": 0.09054652601480484, + "learning_rate": 0.00010715772725424111, + "loss": 2.5907, + "step": 26636 + }, + { + "epoch": 0.7898763455208612, + "grad_norm": 0.09721598774194717, + "learning_rate": 0.00010712862198254463, + "loss": 2.5826, + "step": 26637 + }, + { + "epoch": 0.7899059988731726, + "grad_norm": 0.09345309436321259, + "learning_rate": 0.00010709952018979169, + "loss": 2.5808, + "step": 26638 + }, + { + "epoch": 0.7899356522254841, + "grad_norm": 0.0992341935634613, + "learning_rate": 0.00010707042187623989, + "loss": 2.5802, + "step": 26639 + }, + { + "epoch": 0.7899653055777955, + "grad_norm": 0.09577979892492294, + "learning_rate": 0.00010704132704214698, + "loss": 2.5897, + "step": 26640 + }, + { + "epoch": 0.7899949589301071, + "grad_norm": 0.09861969947814941, + "learning_rate": 0.00010701223568777058, + "loss": 2.5848, + "step": 26641 + }, + { + "epoch": 0.7900246122824185, + "grad_norm": 0.09622788429260254, + "learning_rate": 0.00010698314781336826, + "loss": 2.5787, + "step": 26642 + }, + { + "epoch": 0.79005426563473, + "grad_norm": 0.0939040407538414, + "learning_rate": 0.00010695406341919766, + "loss": 2.6069, + "step": 26643 + }, + { + "epoch": 0.7900839189870414, + "grad_norm": 0.09398363530635834, + "learning_rate": 0.0001069249825055164, + "loss": 2.6022, + "step": 26644 + }, + { + "epoch": 0.790113572339353, + "grad_norm": 0.08918105065822601, + "learning_rate": 0.00010689590507258162, + "loss": 2.6147, + "step": 26645 + }, + { + "epoch": 0.7901432256916644, + "grad_norm": 0.10254193842411041, + "learning_rate": 0.00010686683112065121, + "loss": 2.5817, + "step": 26646 + }, + { + "epoch": 0.7901728790439759, + "grad_norm": 0.08864642679691315, + "learning_rate": 0.00010683776064998252, + "loss": 2.5834, + "step": 26647 + }, + { + "epoch": 0.7902025323962873, + "grad_norm": 0.10122150927782059, + "learning_rate": 0.00010680869366083295, + "loss": 2.5731, + "step": 26648 + }, + { + "epoch": 0.7902321857485989, + "grad_norm": 0.08342815190553665, + "learning_rate": 0.00010677963015345988, + "loss": 2.5629, + "step": 26649 + }, + { + "epoch": 0.7902618391009104, + "grad_norm": 0.09580608457326889, + "learning_rate": 0.00010675057012812061, + "loss": 2.5682, + "step": 26650 + }, + { + "epoch": 0.7902914924532218, + "grad_norm": 0.09020525962114334, + "learning_rate": 0.00010672151358507265, + "loss": 2.5959, + "step": 26651 + }, + { + "epoch": 0.7903211458055334, + "grad_norm": 0.09688195586204529, + "learning_rate": 0.0001066924605245731, + "loss": 2.5802, + "step": 26652 + }, + { + "epoch": 0.7903507991578448, + "grad_norm": 0.09290153533220291, + "learning_rate": 0.00010666341094687937, + "loss": 2.561, + "step": 26653 + }, + { + "epoch": 0.7903804525101563, + "grad_norm": 0.09905742853879929, + "learning_rate": 0.0001066343648522488, + "loss": 2.5904, + "step": 26654 + }, + { + "epoch": 0.7904101058624677, + "grad_norm": 0.09353265166282654, + "learning_rate": 0.00010660532224093828, + "loss": 2.5428, + "step": 26655 + }, + { + "epoch": 0.7904397592147793, + "grad_norm": 0.0963367372751236, + "learning_rate": 0.00010657628311320517, + "loss": 2.5744, + "step": 26656 + }, + { + "epoch": 0.7904694125670907, + "grad_norm": 0.09199681878089905, + "learning_rate": 0.00010654724746930661, + "loss": 2.5693, + "step": 26657 + }, + { + "epoch": 0.7904990659194022, + "grad_norm": 0.10051363706588745, + "learning_rate": 0.00010651821530949957, + "loss": 2.5754, + "step": 26658 + }, + { + "epoch": 0.7905287192717136, + "grad_norm": 0.10086392611265182, + "learning_rate": 0.0001064891866340414, + "loss": 2.5951, + "step": 26659 + }, + { + "epoch": 0.7905583726240252, + "grad_norm": 0.09175258129835129, + "learning_rate": 0.00010646016144318904, + "loss": 2.5733, + "step": 26660 + }, + { + "epoch": 0.7905880259763366, + "grad_norm": 0.10819565504789352, + "learning_rate": 0.00010643113973719948, + "loss": 2.5765, + "step": 26661 + }, + { + "epoch": 0.7906176793286481, + "grad_norm": 0.09759899228811264, + "learning_rate": 0.00010640212151632977, + "loss": 2.6148, + "step": 26662 + }, + { + "epoch": 0.7906473326809595, + "grad_norm": 0.09273163229227066, + "learning_rate": 0.00010637310678083678, + "loss": 2.5936, + "step": 26663 + }, + { + "epoch": 0.7906769860332711, + "grad_norm": 0.09591040760278702, + "learning_rate": 0.00010634409553097751, + "loss": 2.5718, + "step": 26664 + }, + { + "epoch": 0.7907066393855825, + "grad_norm": 0.09822335839271545, + "learning_rate": 0.00010631508776700905, + "loss": 2.5844, + "step": 26665 + }, + { + "epoch": 0.790736292737894, + "grad_norm": 0.09325319528579712, + "learning_rate": 0.00010628608348918783, + "loss": 2.5927, + "step": 26666 + }, + { + "epoch": 0.7907659460902055, + "grad_norm": 0.10160685330629349, + "learning_rate": 0.00010625708269777096, + "loss": 2.5987, + "step": 26667 + }, + { + "epoch": 0.790795599442517, + "grad_norm": 0.09383542090654373, + "learning_rate": 0.00010622808539301526, + "loss": 2.5482, + "step": 26668 + }, + { + "epoch": 0.7908252527948284, + "grad_norm": 0.09238027036190033, + "learning_rate": 0.00010619909157517738, + "loss": 2.5793, + "step": 26669 + }, + { + "epoch": 0.7908549061471399, + "grad_norm": 0.09422710537910461, + "learning_rate": 0.00010617010124451415, + "loss": 2.5507, + "step": 26670 + }, + { + "epoch": 0.7908845594994515, + "grad_norm": 0.09207925945520401, + "learning_rate": 0.00010614111440128216, + "loss": 2.5806, + "step": 26671 + }, + { + "epoch": 0.7909142128517629, + "grad_norm": 0.08700239658355713, + "learning_rate": 0.00010611213104573836, + "loss": 2.6198, + "step": 26672 + }, + { + "epoch": 0.7909438662040744, + "grad_norm": 0.09296109527349472, + "learning_rate": 0.00010608315117813921, + "loss": 2.5765, + "step": 26673 + }, + { + "epoch": 0.7909735195563858, + "grad_norm": 0.07824783772230148, + "learning_rate": 0.00010605417479874141, + "loss": 2.5666, + "step": 26674 + }, + { + "epoch": 0.7910031729086974, + "grad_norm": 0.09537924081087112, + "learning_rate": 0.00010602520190780157, + "loss": 2.5925, + "step": 26675 + }, + { + "epoch": 0.7910328262610088, + "grad_norm": 0.08504340797662735, + "learning_rate": 0.00010599623250557616, + "loss": 2.5857, + "step": 26676 + }, + { + "epoch": 0.7910624796133203, + "grad_norm": 0.08150454610586166, + "learning_rate": 0.0001059672665923217, + "loss": 2.5715, + "step": 26677 + }, + { + "epoch": 0.7910921329656317, + "grad_norm": 0.09068223088979721, + "learning_rate": 0.00010593830416829469, + "loss": 2.6195, + "step": 26678 + }, + { + "epoch": 0.7911217863179433, + "grad_norm": 0.08573195338249207, + "learning_rate": 0.00010590934523375168, + "loss": 2.5858, + "step": 26679 + }, + { + "epoch": 0.7911514396702547, + "grad_norm": 0.0844913050532341, + "learning_rate": 0.00010588038978894904, + "loss": 2.5996, + "step": 26680 + }, + { + "epoch": 0.7911810930225662, + "grad_norm": 0.09451627731323242, + "learning_rate": 0.0001058514378341432, + "loss": 2.6055, + "step": 26681 + }, + { + "epoch": 0.7912107463748776, + "grad_norm": 0.08476924896240234, + "learning_rate": 0.00010582248936959055, + "loss": 2.6077, + "step": 26682 + }, + { + "epoch": 0.7912403997271892, + "grad_norm": 0.09555498510599136, + "learning_rate": 0.0001057935443955474, + "loss": 2.5941, + "step": 26683 + }, + { + "epoch": 0.7912700530795006, + "grad_norm": 0.08919116109609604, + "learning_rate": 0.00010576460291226997, + "loss": 2.5612, + "step": 26684 + }, + { + "epoch": 0.7912997064318121, + "grad_norm": 0.09231216460466385, + "learning_rate": 0.00010573566492001474, + "loss": 2.5882, + "step": 26685 + }, + { + "epoch": 0.7913293597841236, + "grad_norm": 0.09622536599636078, + "learning_rate": 0.00010570673041903806, + "loss": 2.598, + "step": 26686 + }, + { + "epoch": 0.7913590131364351, + "grad_norm": 0.0907469391822815, + "learning_rate": 0.00010567779940959577, + "loss": 2.6113, + "step": 26687 + }, + { + "epoch": 0.7913886664887465, + "grad_norm": 0.09247057884931564, + "learning_rate": 0.00010564887189194428, + "loss": 2.583, + "step": 26688 + }, + { + "epoch": 0.791418319841058, + "grad_norm": 0.09400110691785812, + "learning_rate": 0.00010561994786633972, + "loss": 2.61, + "step": 26689 + }, + { + "epoch": 0.7914479731933695, + "grad_norm": 0.09645844995975494, + "learning_rate": 0.00010559102733303822, + "loss": 2.5867, + "step": 26690 + }, + { + "epoch": 0.791477626545681, + "grad_norm": 0.08979333192110062, + "learning_rate": 0.0001055621102922959, + "loss": 2.5876, + "step": 26691 + }, + { + "epoch": 0.7915072798979925, + "grad_norm": 0.09113853424787521, + "learning_rate": 0.00010553319674436873, + "loss": 2.6179, + "step": 26692 + }, + { + "epoch": 0.7915369332503039, + "grad_norm": 0.09560345858335495, + "learning_rate": 0.00010550428668951284, + "loss": 2.5684, + "step": 26693 + }, + { + "epoch": 0.7915665866026155, + "grad_norm": 0.0943436473608017, + "learning_rate": 0.00010547538012798424, + "loss": 2.5553, + "step": 26694 + }, + { + "epoch": 0.7915962399549269, + "grad_norm": 0.09159042686223984, + "learning_rate": 0.00010544647706003885, + "loss": 2.5753, + "step": 26695 + }, + { + "epoch": 0.7916258933072384, + "grad_norm": 0.09750336408615112, + "learning_rate": 0.00010541757748593262, + "loss": 2.6022, + "step": 26696 + }, + { + "epoch": 0.7916555466595498, + "grad_norm": 0.09467615187168121, + "learning_rate": 0.00010538868140592145, + "loss": 2.5672, + "step": 26697 + }, + { + "epoch": 0.7916852000118614, + "grad_norm": 0.09104098379611969, + "learning_rate": 0.00010535978882026126, + "loss": 2.5951, + "step": 26698 + }, + { + "epoch": 0.7917148533641728, + "grad_norm": 0.09296675026416779, + "learning_rate": 0.0001053308997292079, + "loss": 2.5703, + "step": 26699 + }, + { + "epoch": 0.7917445067164843, + "grad_norm": 0.0870446115732193, + "learning_rate": 0.00010530201413301716, + "loss": 2.5614, + "step": 26700 + }, + { + "epoch": 0.7917741600687958, + "grad_norm": 0.09226766228675842, + "learning_rate": 0.00010527313203194483, + "loss": 2.6102, + "step": 26701 + }, + { + "epoch": 0.7918038134211073, + "grad_norm": 0.08661293238401413, + "learning_rate": 0.00010524425342624666, + "loss": 2.5955, + "step": 26702 + }, + { + "epoch": 0.7918334667734187, + "grad_norm": 0.09058178216218948, + "learning_rate": 0.0001052153783161784, + "loss": 2.6241, + "step": 26703 + }, + { + "epoch": 0.7918631201257302, + "grad_norm": 0.0870782658457756, + "learning_rate": 0.0001051865067019957, + "loss": 2.5577, + "step": 26704 + }, + { + "epoch": 0.7918927734780417, + "grad_norm": 0.0880296528339386, + "learning_rate": 0.00010515763858395428, + "loss": 2.5851, + "step": 26705 + }, + { + "epoch": 0.7919224268303532, + "grad_norm": 0.09868087619543076, + "learning_rate": 0.00010512877396230969, + "loss": 2.5574, + "step": 26706 + }, + { + "epoch": 0.7919520801826646, + "grad_norm": 0.09031946957111359, + "learning_rate": 0.00010509991283731762, + "loss": 2.5695, + "step": 26707 + }, + { + "epoch": 0.7919817335349761, + "grad_norm": 0.0867503434419632, + "learning_rate": 0.00010507105520923365, + "loss": 2.5638, + "step": 26708 + }, + { + "epoch": 0.7920113868872876, + "grad_norm": 0.09657195955514908, + "learning_rate": 0.00010504220107831336, + "loss": 2.526, + "step": 26709 + }, + { + "epoch": 0.7920410402395991, + "grad_norm": 0.09007521718740463, + "learning_rate": 0.00010501335044481192, + "loss": 2.6061, + "step": 26710 + }, + { + "epoch": 0.7920706935919105, + "grad_norm": 0.09117414802312851, + "learning_rate": 0.00010498450330898518, + "loss": 2.6014, + "step": 26711 + }, + { + "epoch": 0.792100346944222, + "grad_norm": 0.08573728054761887, + "learning_rate": 0.0001049556596710885, + "loss": 2.5523, + "step": 26712 + }, + { + "epoch": 0.7921300002965336, + "grad_norm": 0.08974455296993256, + "learning_rate": 0.00010492681953137723, + "loss": 2.5973, + "step": 26713 + }, + { + "epoch": 0.792159653648845, + "grad_norm": 0.08601979166269302, + "learning_rate": 0.00010489798289010682, + "loss": 2.5762, + "step": 26714 + }, + { + "epoch": 0.7921893070011565, + "grad_norm": 0.09646434336900711, + "learning_rate": 0.00010486914974753253, + "loss": 2.5748, + "step": 26715 + }, + { + "epoch": 0.792218960353468, + "grad_norm": 0.08880047500133514, + "learning_rate": 0.00010484032010390982, + "loss": 2.5728, + "step": 26716 + }, + { + "epoch": 0.7922486137057795, + "grad_norm": 0.08440537750720978, + "learning_rate": 0.00010481149395949386, + "loss": 2.5591, + "step": 26717 + }, + { + "epoch": 0.7922782670580909, + "grad_norm": 0.09673485159873962, + "learning_rate": 0.00010478267131453994, + "loss": 2.61, + "step": 26718 + }, + { + "epoch": 0.7923079204104024, + "grad_norm": 0.0864160880446434, + "learning_rate": 0.00010475385216930333, + "loss": 2.6106, + "step": 26719 + }, + { + "epoch": 0.7923375737627139, + "grad_norm": 0.09636931866407394, + "learning_rate": 0.00010472503652403931, + "loss": 2.6205, + "step": 26720 + }, + { + "epoch": 0.7923672271150254, + "grad_norm": 0.0852445513010025, + "learning_rate": 0.00010469622437900283, + "loss": 2.5753, + "step": 26721 + }, + { + "epoch": 0.7923968804673368, + "grad_norm": 0.0996546596288681, + "learning_rate": 0.0001046674157344491, + "loss": 2.5705, + "step": 26722 + }, + { + "epoch": 0.7924265338196483, + "grad_norm": 0.1056353747844696, + "learning_rate": 0.00010463861059063317, + "loss": 2.6019, + "step": 26723 + }, + { + "epoch": 0.7924561871719598, + "grad_norm": 0.09213141351938248, + "learning_rate": 0.00010460980894781036, + "loss": 2.581, + "step": 26724 + }, + { + "epoch": 0.7924858405242713, + "grad_norm": 0.1003512367606163, + "learning_rate": 0.0001045810108062355, + "loss": 2.5451, + "step": 26725 + }, + { + "epoch": 0.7925154938765827, + "grad_norm": 0.09557946771383286, + "learning_rate": 0.00010455221616616368, + "loss": 2.5928, + "step": 26726 + }, + { + "epoch": 0.7925451472288942, + "grad_norm": 0.10567710548639297, + "learning_rate": 0.0001045234250278499, + "loss": 2.5673, + "step": 26727 + }, + { + "epoch": 0.7925748005812057, + "grad_norm": 0.08959553390741348, + "learning_rate": 0.00010449463739154903, + "loss": 2.5347, + "step": 26728 + }, + { + "epoch": 0.7926044539335172, + "grad_norm": 0.09668854624032974, + "learning_rate": 0.00010446585325751606, + "loss": 2.5575, + "step": 26729 + }, + { + "epoch": 0.7926341072858286, + "grad_norm": 0.10579019039869308, + "learning_rate": 0.00010443707262600599, + "loss": 2.588, + "step": 26730 + }, + { + "epoch": 0.7926637606381401, + "grad_norm": 0.10739944130182266, + "learning_rate": 0.00010440829549727337, + "loss": 2.5751, + "step": 26731 + }, + { + "epoch": 0.7926934139904516, + "grad_norm": 0.0968092679977417, + "learning_rate": 0.00010437952187157323, + "loss": 2.5955, + "step": 26732 + }, + { + "epoch": 0.7927230673427631, + "grad_norm": 0.09322173148393631, + "learning_rate": 0.0001043507517491603, + "loss": 2.589, + "step": 26733 + }, + { + "epoch": 0.7927527206950746, + "grad_norm": 0.09416116774082184, + "learning_rate": 0.0001043219851302894, + "loss": 2.6032, + "step": 26734 + }, + { + "epoch": 0.792782374047386, + "grad_norm": 0.08576472103595734, + "learning_rate": 0.00010429322201521524, + "loss": 2.6071, + "step": 26735 + }, + { + "epoch": 0.7928120273996976, + "grad_norm": 0.09908562898635864, + "learning_rate": 0.00010426446240419235, + "loss": 2.5898, + "step": 26736 + }, + { + "epoch": 0.792841680752009, + "grad_norm": 0.08964598923921585, + "learning_rate": 0.00010423570629747575, + "loss": 2.5514, + "step": 26737 + }, + { + "epoch": 0.7928713341043205, + "grad_norm": 0.08733144402503967, + "learning_rate": 0.00010420695369531991, + "loss": 2.5845, + "step": 26738 + }, + { + "epoch": 0.792900987456632, + "grad_norm": 0.08774039149284363, + "learning_rate": 0.00010417820459797939, + "loss": 2.5886, + "step": 26739 + }, + { + "epoch": 0.7929306408089435, + "grad_norm": 0.09336840361356735, + "learning_rate": 0.00010414945900570883, + "loss": 2.6015, + "step": 26740 + }, + { + "epoch": 0.7929602941612549, + "grad_norm": 0.09125056862831116, + "learning_rate": 0.00010412071691876291, + "loss": 2.5539, + "step": 26741 + }, + { + "epoch": 0.7929899475135664, + "grad_norm": 0.08707376569509506, + "learning_rate": 0.00010409197833739581, + "loss": 2.5824, + "step": 26742 + }, + { + "epoch": 0.7930196008658779, + "grad_norm": 0.08621181547641754, + "learning_rate": 0.00010406324326186223, + "loss": 2.5563, + "step": 26743 + }, + { + "epoch": 0.7930492542181894, + "grad_norm": 0.09570728242397308, + "learning_rate": 0.00010403451169241663, + "loss": 2.5914, + "step": 26744 + }, + { + "epoch": 0.7930789075705008, + "grad_norm": 0.09412623196840286, + "learning_rate": 0.00010400578362931334, + "loss": 2.601, + "step": 26745 + }, + { + "epoch": 0.7931085609228123, + "grad_norm": 0.09142784774303436, + "learning_rate": 0.0001039770590728068, + "loss": 2.5786, + "step": 26746 + }, + { + "epoch": 0.7931382142751238, + "grad_norm": 0.09833481907844543, + "learning_rate": 0.0001039483380231514, + "loss": 2.6009, + "step": 26747 + }, + { + "epoch": 0.7931678676274353, + "grad_norm": 0.09710225462913513, + "learning_rate": 0.0001039196204806015, + "loss": 2.5802, + "step": 26748 + }, + { + "epoch": 0.7931975209797467, + "grad_norm": 0.0940944030880928, + "learning_rate": 0.00010389090644541116, + "loss": 2.5931, + "step": 26749 + }, + { + "epoch": 0.7932271743320582, + "grad_norm": 0.10134148597717285, + "learning_rate": 0.00010386219591783496, + "loss": 2.5917, + "step": 26750 + }, + { + "epoch": 0.7932568276843697, + "grad_norm": 0.09422086179256439, + "learning_rate": 0.00010383348889812716, + "loss": 2.5567, + "step": 26751 + }, + { + "epoch": 0.7932864810366812, + "grad_norm": 0.09654911607503891, + "learning_rate": 0.0001038047853865417, + "loss": 2.5788, + "step": 26752 + }, + { + "epoch": 0.7933161343889926, + "grad_norm": 0.0897589921951294, + "learning_rate": 0.00010377608538333283, + "loss": 2.5641, + "step": 26753 + }, + { + "epoch": 0.7933457877413042, + "grad_norm": 0.09026258438825607, + "learning_rate": 0.00010374738888875478, + "loss": 2.5782, + "step": 26754 + }, + { + "epoch": 0.7933754410936157, + "grad_norm": 0.09676247835159302, + "learning_rate": 0.0001037186959030616, + "loss": 2.5738, + "step": 26755 + }, + { + "epoch": 0.7934050944459271, + "grad_norm": 0.09332282096147537, + "learning_rate": 0.00010369000642650739, + "loss": 2.5535, + "step": 26756 + }, + { + "epoch": 0.7934347477982386, + "grad_norm": 0.09062600880861282, + "learning_rate": 0.00010366132045934618, + "loss": 2.5466, + "step": 26757 + }, + { + "epoch": 0.7934644011505501, + "grad_norm": 0.09528692066669464, + "learning_rate": 0.00010363263800183204, + "loss": 2.6029, + "step": 26758 + }, + { + "epoch": 0.7934940545028616, + "grad_norm": 0.1020427867770195, + "learning_rate": 0.00010360395905421887, + "loss": 2.5887, + "step": 26759 + }, + { + "epoch": 0.793523707855173, + "grad_norm": 0.10192998498678207, + "learning_rate": 0.00010357528361676072, + "loss": 2.5806, + "step": 26760 + }, + { + "epoch": 0.7935533612074845, + "grad_norm": 0.09766867756843567, + "learning_rate": 0.00010354661168971147, + "loss": 2.606, + "step": 26761 + }, + { + "epoch": 0.793583014559796, + "grad_norm": 0.09505084902048111, + "learning_rate": 0.00010351794327332503, + "loss": 2.5793, + "step": 26762 + }, + { + "epoch": 0.7936126679121075, + "grad_norm": 0.08918129652738571, + "learning_rate": 0.00010348927836785527, + "loss": 2.5879, + "step": 26763 + }, + { + "epoch": 0.7936423212644189, + "grad_norm": 0.08989298343658447, + "learning_rate": 0.00010346061697355603, + "loss": 2.5938, + "step": 26764 + }, + { + "epoch": 0.7936719746167304, + "grad_norm": 0.0908818244934082, + "learning_rate": 0.00010343195909068104, + "loss": 2.5638, + "step": 26765 + }, + { + "epoch": 0.7937016279690419, + "grad_norm": 0.0919477641582489, + "learning_rate": 0.00010340330471948417, + "loss": 2.6115, + "step": 26766 + }, + { + "epoch": 0.7937312813213534, + "grad_norm": 0.09424029290676117, + "learning_rate": 0.00010337465386021905, + "loss": 2.5953, + "step": 26767 + }, + { + "epoch": 0.7937609346736648, + "grad_norm": 0.08706816285848618, + "learning_rate": 0.00010334600651313952, + "loss": 2.5593, + "step": 26768 + }, + { + "epoch": 0.7937905880259764, + "grad_norm": 0.09603425860404968, + "learning_rate": 0.00010331736267849912, + "loss": 2.585, + "step": 26769 + }, + { + "epoch": 0.7938202413782878, + "grad_norm": 0.08557851612567902, + "learning_rate": 0.00010328872235655163, + "loss": 2.5637, + "step": 26770 + }, + { + "epoch": 0.7938498947305993, + "grad_norm": 0.0860988199710846, + "learning_rate": 0.00010326008554755057, + "loss": 2.5807, + "step": 26771 + }, + { + "epoch": 0.7938795480829107, + "grad_norm": 0.09106936305761337, + "learning_rate": 0.00010323145225174952, + "loss": 2.5989, + "step": 26772 + }, + { + "epoch": 0.7939092014352223, + "grad_norm": 0.08577632158994675, + "learning_rate": 0.0001032028224694021, + "loss": 2.598, + "step": 26773 + }, + { + "epoch": 0.7939388547875337, + "grad_norm": 0.08846646547317505, + "learning_rate": 0.0001031741962007618, + "loss": 2.5737, + "step": 26774 + }, + { + "epoch": 0.7939685081398452, + "grad_norm": 0.08902278542518616, + "learning_rate": 0.00010314557344608211, + "loss": 2.5662, + "step": 26775 + }, + { + "epoch": 0.7939981614921567, + "grad_norm": 0.0948042944073677, + "learning_rate": 0.00010311695420561645, + "loss": 2.5962, + "step": 26776 + }, + { + "epoch": 0.7940278148444682, + "grad_norm": 0.0904935970902443, + "learning_rate": 0.00010308833847961829, + "loss": 2.59, + "step": 26777 + }, + { + "epoch": 0.7940574681967797, + "grad_norm": 0.09143593162298203, + "learning_rate": 0.00010305972626834103, + "loss": 2.5716, + "step": 26778 + }, + { + "epoch": 0.7940871215490911, + "grad_norm": 0.09270863234996796, + "learning_rate": 0.00010303111757203804, + "loss": 2.5789, + "step": 26779 + }, + { + "epoch": 0.7941167749014026, + "grad_norm": 0.08708366751670837, + "learning_rate": 0.00010300251239096264, + "loss": 2.5568, + "step": 26780 + }, + { + "epoch": 0.7941464282537141, + "grad_norm": 0.08914230763912201, + "learning_rate": 0.00010297391072536816, + "loss": 2.5822, + "step": 26781 + }, + { + "epoch": 0.7941760816060256, + "grad_norm": 0.09194552898406982, + "learning_rate": 0.00010294531257550782, + "loss": 2.5978, + "step": 26782 + }, + { + "epoch": 0.794205734958337, + "grad_norm": 0.09014835953712463, + "learning_rate": 0.00010291671794163487, + "loss": 2.6118, + "step": 26783 + }, + { + "epoch": 0.7942353883106485, + "grad_norm": 0.09160979837179184, + "learning_rate": 0.00010288812682400256, + "loss": 2.5971, + "step": 26784 + }, + { + "epoch": 0.79426504166296, + "grad_norm": 0.09543565660715103, + "learning_rate": 0.00010285953922286406, + "loss": 2.5891, + "step": 26785 + }, + { + "epoch": 0.7942946950152715, + "grad_norm": 0.09368912875652313, + "learning_rate": 0.00010283095513847268, + "loss": 2.5975, + "step": 26786 + }, + { + "epoch": 0.7943243483675829, + "grad_norm": 0.09195569157600403, + "learning_rate": 0.00010280237457108115, + "loss": 2.5922, + "step": 26787 + }, + { + "epoch": 0.7943540017198945, + "grad_norm": 0.09337257593870163, + "learning_rate": 0.00010277379752094268, + "loss": 2.5678, + "step": 26788 + }, + { + "epoch": 0.7943836550722059, + "grad_norm": 0.08743804693222046, + "learning_rate": 0.00010274522398831054, + "loss": 2.6058, + "step": 26789 + }, + { + "epoch": 0.7944133084245174, + "grad_norm": 0.08997130393981934, + "learning_rate": 0.00010271665397343766, + "loss": 2.5872, + "step": 26790 + }, + { + "epoch": 0.7944429617768288, + "grad_norm": 0.09043336659669876, + "learning_rate": 0.00010268808747657698, + "loss": 2.5664, + "step": 26791 + }, + { + "epoch": 0.7944726151291404, + "grad_norm": 0.0856575146317482, + "learning_rate": 0.0001026595244979815, + "loss": 2.5431, + "step": 26792 + }, + { + "epoch": 0.7945022684814518, + "grad_norm": 0.0881839394569397, + "learning_rate": 0.00010263096503790409, + "loss": 2.5777, + "step": 26793 + }, + { + "epoch": 0.7945319218337633, + "grad_norm": 0.0871247798204422, + "learning_rate": 0.00010260240909659773, + "loss": 2.5574, + "step": 26794 + }, + { + "epoch": 0.7945615751860747, + "grad_norm": 0.08392319828271866, + "learning_rate": 0.00010257385667431524, + "loss": 2.5478, + "step": 26795 + }, + { + "epoch": 0.7945912285383863, + "grad_norm": 0.09446536004543304, + "learning_rate": 0.0001025453077713096, + "loss": 2.563, + "step": 26796 + }, + { + "epoch": 0.7946208818906978, + "grad_norm": 0.08426166325807571, + "learning_rate": 0.0001025167623878333, + "loss": 2.6198, + "step": 26797 + }, + { + "epoch": 0.7946505352430092, + "grad_norm": 0.09256281703710556, + "learning_rate": 0.00010248822052413937, + "loss": 2.5779, + "step": 26798 + }, + { + "epoch": 0.7946801885953207, + "grad_norm": 0.08813954889774323, + "learning_rate": 0.00010245968218048046, + "loss": 2.5739, + "step": 26799 + }, + { + "epoch": 0.7947098419476322, + "grad_norm": 0.09240183234214783, + "learning_rate": 0.00010243114735710928, + "loss": 2.5729, + "step": 26800 + }, + { + "epoch": 0.7947394952999437, + "grad_norm": 0.09500223398208618, + "learning_rate": 0.00010240261605427842, + "loss": 2.6031, + "step": 26801 + }, + { + "epoch": 0.7947691486522551, + "grad_norm": 0.09026290476322174, + "learning_rate": 0.00010237408827224076, + "loss": 2.593, + "step": 26802 + }, + { + "epoch": 0.7947988020045667, + "grad_norm": 0.08560929447412491, + "learning_rate": 0.00010234556401124878, + "loss": 2.5707, + "step": 26803 + }, + { + "epoch": 0.7948284553568781, + "grad_norm": 0.09160847961902618, + "learning_rate": 0.00010231704327155517, + "loss": 2.5936, + "step": 26804 + }, + { + "epoch": 0.7948581087091896, + "grad_norm": 0.08619637787342072, + "learning_rate": 0.00010228852605341232, + "loss": 2.5954, + "step": 26805 + }, + { + "epoch": 0.794887762061501, + "grad_norm": 0.09722544252872467, + "learning_rate": 0.00010226001235707299, + "loss": 2.5492, + "step": 26806 + }, + { + "epoch": 0.7949174154138126, + "grad_norm": 0.09787598997354507, + "learning_rate": 0.00010223150218278943, + "loss": 2.6157, + "step": 26807 + }, + { + "epoch": 0.794947068766124, + "grad_norm": 0.09385386854410172, + "learning_rate": 0.00010220299553081414, + "loss": 2.5779, + "step": 26808 + }, + { + "epoch": 0.7949767221184355, + "grad_norm": 0.09039872139692307, + "learning_rate": 0.00010217449240139964, + "loss": 2.5583, + "step": 26809 + }, + { + "epoch": 0.7950063754707469, + "grad_norm": 0.09152981638908386, + "learning_rate": 0.00010214599279479825, + "loss": 2.6057, + "step": 26810 + }, + { + "epoch": 0.7950360288230585, + "grad_norm": 0.09610453248023987, + "learning_rate": 0.00010211749671126241, + "loss": 2.5821, + "step": 26811 + }, + { + "epoch": 0.7950656821753699, + "grad_norm": 0.09401798993349075, + "learning_rate": 0.00010208900415104444, + "loss": 2.6101, + "step": 26812 + }, + { + "epoch": 0.7950953355276814, + "grad_norm": 0.0894935354590416, + "learning_rate": 0.00010206051511439651, + "loss": 2.5536, + "step": 26813 + }, + { + "epoch": 0.7951249888799928, + "grad_norm": 0.09596515446901321, + "learning_rate": 0.00010203202960157116, + "loss": 2.5938, + "step": 26814 + }, + { + "epoch": 0.7951546422323044, + "grad_norm": 0.08770153671503067, + "learning_rate": 0.00010200354761282049, + "loss": 2.6187, + "step": 26815 + }, + { + "epoch": 0.7951842955846158, + "grad_norm": 0.0891166552901268, + "learning_rate": 0.00010197506914839671, + "loss": 2.5705, + "step": 26816 + }, + { + "epoch": 0.7952139489369273, + "grad_norm": 0.09725437313318253, + "learning_rate": 0.00010194659420855218, + "loss": 2.5738, + "step": 26817 + }, + { + "epoch": 0.7952436022892388, + "grad_norm": 0.08927612751722336, + "learning_rate": 0.00010191812279353868, + "loss": 2.5505, + "step": 26818 + }, + { + "epoch": 0.7952732556415503, + "grad_norm": 0.09341835230588913, + "learning_rate": 0.00010188965490360862, + "loss": 2.6106, + "step": 26819 + }, + { + "epoch": 0.7953029089938618, + "grad_norm": 0.08209678530693054, + "learning_rate": 0.00010186119053901393, + "loss": 2.5601, + "step": 26820 + }, + { + "epoch": 0.7953325623461732, + "grad_norm": 0.10343519598245621, + "learning_rate": 0.00010183272970000678, + "loss": 2.5847, + "step": 26821 + }, + { + "epoch": 0.7953622156984848, + "grad_norm": 0.08546917885541916, + "learning_rate": 0.00010180427238683915, + "loss": 2.5851, + "step": 26822 + }, + { + "epoch": 0.7953918690507962, + "grad_norm": 0.09643746167421341, + "learning_rate": 0.00010177581859976304, + "loss": 2.5752, + "step": 26823 + }, + { + "epoch": 0.7954215224031077, + "grad_norm": 0.09315687417984009, + "learning_rate": 0.00010174736833903037, + "loss": 2.5781, + "step": 26824 + }, + { + "epoch": 0.7954511757554191, + "grad_norm": 0.09628729522228241, + "learning_rate": 0.00010171892160489315, + "loss": 2.6238, + "step": 26825 + }, + { + "epoch": 0.7954808291077307, + "grad_norm": 0.08901796489953995, + "learning_rate": 0.00010169047839760309, + "loss": 2.5908, + "step": 26826 + }, + { + "epoch": 0.7955104824600421, + "grad_norm": 0.08631371706724167, + "learning_rate": 0.00010166203871741247, + "loss": 2.6025, + "step": 26827 + }, + { + "epoch": 0.7955401358123536, + "grad_norm": 0.08815660327672958, + "learning_rate": 0.00010163360256457276, + "loss": 2.5804, + "step": 26828 + }, + { + "epoch": 0.795569789164665, + "grad_norm": 0.08818046748638153, + "learning_rate": 0.00010160516993933588, + "loss": 2.5713, + "step": 26829 + }, + { + "epoch": 0.7955994425169766, + "grad_norm": 0.08658827096223831, + "learning_rate": 0.00010157674084195362, + "loss": 2.5694, + "step": 26830 + }, + { + "epoch": 0.795629095869288, + "grad_norm": 0.0876871719956398, + "learning_rate": 0.00010154831527267766, + "loss": 2.5659, + "step": 26831 + }, + { + "epoch": 0.7956587492215995, + "grad_norm": 0.08655359596014023, + "learning_rate": 0.00010151989323175981, + "loss": 2.5902, + "step": 26832 + }, + { + "epoch": 0.7956884025739109, + "grad_norm": 0.09458496421575546, + "learning_rate": 0.00010149147471945169, + "loss": 2.5637, + "step": 26833 + }, + { + "epoch": 0.7957180559262225, + "grad_norm": 0.08689261227846146, + "learning_rate": 0.00010146305973600495, + "loss": 2.5881, + "step": 26834 + }, + { + "epoch": 0.7957477092785339, + "grad_norm": 0.09864405542612076, + "learning_rate": 0.0001014346482816712, + "loss": 2.5929, + "step": 26835 + }, + { + "epoch": 0.7957773626308454, + "grad_norm": 0.08848897367715836, + "learning_rate": 0.0001014062403567021, + "loss": 2.5883, + "step": 26836 + }, + { + "epoch": 0.795807015983157, + "grad_norm": 0.0941709354519844, + "learning_rate": 0.00010137783596134914, + "loss": 2.5932, + "step": 26837 + }, + { + "epoch": 0.7958366693354684, + "grad_norm": 0.0943668782711029, + "learning_rate": 0.00010134943509586386, + "loss": 2.5804, + "step": 26838 + }, + { + "epoch": 0.7958663226877799, + "grad_norm": 0.0895932987332344, + "learning_rate": 0.00010132103776049779, + "loss": 2.6077, + "step": 26839 + }, + { + "epoch": 0.7958959760400913, + "grad_norm": 0.0880148783326149, + "learning_rate": 0.00010129264395550236, + "loss": 2.6038, + "step": 26840 + }, + { + "epoch": 0.7959256293924029, + "grad_norm": 0.09011148661375046, + "learning_rate": 0.00010126425368112896, + "loss": 2.6064, + "step": 26841 + }, + { + "epoch": 0.7959552827447143, + "grad_norm": 0.09047842770814896, + "learning_rate": 0.0001012358669376291, + "loss": 2.5808, + "step": 26842 + }, + { + "epoch": 0.7959849360970258, + "grad_norm": 0.09733635187149048, + "learning_rate": 0.00010120748372525401, + "loss": 2.5886, + "step": 26843 + }, + { + "epoch": 0.7960145894493372, + "grad_norm": 0.09645264595746994, + "learning_rate": 0.00010117910404425512, + "loss": 2.6053, + "step": 26844 + }, + { + "epoch": 0.7960442428016488, + "grad_norm": 0.10476716607809067, + "learning_rate": 0.00010115072789488378, + "loss": 2.6145, + "step": 26845 + }, + { + "epoch": 0.7960738961539602, + "grad_norm": 0.0944371223449707, + "learning_rate": 0.00010112235527739116, + "loss": 2.6075, + "step": 26846 + }, + { + "epoch": 0.7961035495062717, + "grad_norm": 0.10114379227161407, + "learning_rate": 0.00010109398619202853, + "loss": 2.5967, + "step": 26847 + }, + { + "epoch": 0.7961332028585831, + "grad_norm": 0.09403108805418015, + "learning_rate": 0.00010106562063904718, + "loss": 2.5573, + "step": 26848 + }, + { + "epoch": 0.7961628562108947, + "grad_norm": 0.10112915933132172, + "learning_rate": 0.00010103725861869817, + "loss": 2.5605, + "step": 26849 + }, + { + "epoch": 0.7961925095632061, + "grad_norm": 0.09507685154676437, + "learning_rate": 0.00010100890013123277, + "loss": 2.5877, + "step": 26850 + }, + { + "epoch": 0.7962221629155176, + "grad_norm": 0.10165061801671982, + "learning_rate": 0.0001009805451769022, + "loss": 2.6167, + "step": 26851 + }, + { + "epoch": 0.796251816267829, + "grad_norm": 0.09999692440032959, + "learning_rate": 0.00010095219375595704, + "loss": 2.5726, + "step": 26852 + }, + { + "epoch": 0.7962814696201406, + "grad_norm": 0.09219977259635925, + "learning_rate": 0.00010092384586864888, + "loss": 2.5857, + "step": 26853 + }, + { + "epoch": 0.796311122972452, + "grad_norm": 0.0929691419005394, + "learning_rate": 0.00010089550151522859, + "loss": 2.5736, + "step": 26854 + }, + { + "epoch": 0.7963407763247635, + "grad_norm": 0.08860503882169724, + "learning_rate": 0.00010086716069594709, + "loss": 2.5916, + "step": 26855 + }, + { + "epoch": 0.7963704296770749, + "grad_norm": 0.10048021376132965, + "learning_rate": 0.00010083882341105543, + "loss": 2.6015, + "step": 26856 + }, + { + "epoch": 0.7964000830293865, + "grad_norm": 0.08831547200679779, + "learning_rate": 0.00010081048966080448, + "loss": 2.5604, + "step": 26857 + }, + { + "epoch": 0.796429736381698, + "grad_norm": 0.10100527107715607, + "learning_rate": 0.00010078215944544517, + "loss": 2.585, + "step": 26858 + }, + { + "epoch": 0.7964593897340094, + "grad_norm": 0.08397788554430008, + "learning_rate": 0.00010075383276522837, + "loss": 2.6064, + "step": 26859 + }, + { + "epoch": 0.796489043086321, + "grad_norm": 0.1095350906252861, + "learning_rate": 0.00010072550962040494, + "loss": 2.6059, + "step": 26860 + }, + { + "epoch": 0.7965186964386324, + "grad_norm": 0.08947432041168213, + "learning_rate": 0.00010069719001122563, + "loss": 2.6182, + "step": 26861 + }, + { + "epoch": 0.7965483497909439, + "grad_norm": 0.10034286230802536, + "learning_rate": 0.00010066887393794133, + "loss": 2.6023, + "step": 26862 + }, + { + "epoch": 0.7965780031432553, + "grad_norm": 0.08867752552032471, + "learning_rate": 0.00010064056140080263, + "loss": 2.5849, + "step": 26863 + }, + { + "epoch": 0.7966076564955669, + "grad_norm": 0.09343932569026947, + "learning_rate": 0.00010061225240006028, + "loss": 2.5987, + "step": 26864 + }, + { + "epoch": 0.7966373098478783, + "grad_norm": 0.0940699502825737, + "learning_rate": 0.00010058394693596484, + "loss": 2.5532, + "step": 26865 + }, + { + "epoch": 0.7966669632001898, + "grad_norm": 0.10137556493282318, + "learning_rate": 0.00010055564500876729, + "loss": 2.6149, + "step": 26866 + }, + { + "epoch": 0.7966966165525012, + "grad_norm": 0.0898018628358841, + "learning_rate": 0.00010052734661871804, + "loss": 2.5558, + "step": 26867 + }, + { + "epoch": 0.7967262699048128, + "grad_norm": 0.09558790922164917, + "learning_rate": 0.00010049905176606766, + "loss": 2.5948, + "step": 26868 + }, + { + "epoch": 0.7967559232571242, + "grad_norm": 0.08980466425418854, + "learning_rate": 0.00010047076045106679, + "loss": 2.6086, + "step": 26869 + }, + { + "epoch": 0.7967855766094357, + "grad_norm": 0.09379067271947861, + "learning_rate": 0.00010044247267396595, + "loss": 2.5807, + "step": 26870 + }, + { + "epoch": 0.7968152299617471, + "grad_norm": 0.09438717365264893, + "learning_rate": 0.00010041418843501555, + "loss": 2.607, + "step": 26871 + }, + { + "epoch": 0.7968448833140587, + "grad_norm": 0.09321344643831253, + "learning_rate": 0.00010038590773446627, + "loss": 2.6145, + "step": 26872 + }, + { + "epoch": 0.7968745366663701, + "grad_norm": 0.0876213014125824, + "learning_rate": 0.00010035763057256819, + "loss": 2.6021, + "step": 26873 + }, + { + "epoch": 0.7969041900186816, + "grad_norm": 0.08893906325101852, + "learning_rate": 0.0001003293569495719, + "loss": 2.5784, + "step": 26874 + }, + { + "epoch": 0.796933843370993, + "grad_norm": 0.09333903342485428, + "learning_rate": 0.00010030108686572775, + "loss": 2.6193, + "step": 26875 + }, + { + "epoch": 0.7969634967233046, + "grad_norm": 0.08219581842422485, + "learning_rate": 0.00010027282032128615, + "loss": 2.5795, + "step": 26876 + }, + { + "epoch": 0.796993150075616, + "grad_norm": 0.0862005427479744, + "learning_rate": 0.00010024455731649728, + "loss": 2.6026, + "step": 26877 + }, + { + "epoch": 0.7970228034279275, + "grad_norm": 0.09134846180677414, + "learning_rate": 0.00010021629785161135, + "loss": 2.593, + "step": 26878 + }, + { + "epoch": 0.7970524567802391, + "grad_norm": 0.09082725644111633, + "learning_rate": 0.00010018804192687886, + "loss": 2.5988, + "step": 26879 + }, + { + "epoch": 0.7970821101325505, + "grad_norm": 0.08749062567949295, + "learning_rate": 0.00010015978954254984, + "loss": 2.5726, + "step": 26880 + }, + { + "epoch": 0.797111763484862, + "grad_norm": 0.10092639923095703, + "learning_rate": 0.00010013154069887458, + "loss": 2.5903, + "step": 26881 + }, + { + "epoch": 0.7971414168371734, + "grad_norm": 0.09431920200586319, + "learning_rate": 0.0001001032953961033, + "loss": 2.5902, + "step": 26882 + }, + { + "epoch": 0.797171070189485, + "grad_norm": 0.0885922908782959, + "learning_rate": 0.00010007505363448578, + "loss": 2.5782, + "step": 26883 + }, + { + "epoch": 0.7972007235417964, + "grad_norm": 0.09962771087884903, + "learning_rate": 0.00010004681541427236, + "loss": 2.5892, + "step": 26884 + }, + { + "epoch": 0.7972303768941079, + "grad_norm": 0.08968087285757065, + "learning_rate": 0.00010001858073571302, + "loss": 2.6216, + "step": 26885 + }, + { + "epoch": 0.7972600302464193, + "grad_norm": 0.09124978631734848, + "learning_rate": 9.999034959905784e-05, + "loss": 2.6245, + "step": 26886 + }, + { + "epoch": 0.7972896835987309, + "grad_norm": 0.10132432729005814, + "learning_rate": 9.996212200455673e-05, + "loss": 2.5981, + "step": 26887 + }, + { + "epoch": 0.7973193369510423, + "grad_norm": 0.08229582011699677, + "learning_rate": 9.993389795245972e-05, + "loss": 2.5799, + "step": 26888 + }, + { + "epoch": 0.7973489903033538, + "grad_norm": 0.10080339759588242, + "learning_rate": 9.990567744301671e-05, + "loss": 2.5882, + "step": 26889 + }, + { + "epoch": 0.7973786436556652, + "grad_norm": 0.09136644005775452, + "learning_rate": 9.987746047647755e-05, + "loss": 2.5672, + "step": 26890 + }, + { + "epoch": 0.7974082970079768, + "grad_norm": 0.08735425025224686, + "learning_rate": 9.984924705309212e-05, + "loss": 2.563, + "step": 26891 + }, + { + "epoch": 0.7974379503602882, + "grad_norm": 0.09207450598478317, + "learning_rate": 9.982103717311037e-05, + "loss": 2.6212, + "step": 26892 + }, + { + "epoch": 0.7974676037125997, + "grad_norm": 0.0923493355512619, + "learning_rate": 9.979283083678214e-05, + "loss": 2.5586, + "step": 26893 + }, + { + "epoch": 0.7974972570649111, + "grad_norm": 0.07877897471189499, + "learning_rate": 9.976462804435699e-05, + "loss": 2.5751, + "step": 26894 + }, + { + "epoch": 0.7975269104172227, + "grad_norm": 0.09534353762865067, + "learning_rate": 9.973642879608475e-05, + "loss": 2.5861, + "step": 26895 + }, + { + "epoch": 0.7975565637695341, + "grad_norm": 0.08081774413585663, + "learning_rate": 9.970823309221517e-05, + "loss": 2.5709, + "step": 26896 + }, + { + "epoch": 0.7975862171218456, + "grad_norm": 0.09597726166248322, + "learning_rate": 9.968004093299782e-05, + "loss": 2.5853, + "step": 26897 + }, + { + "epoch": 0.797615870474157, + "grad_norm": 0.09633835405111313, + "learning_rate": 9.965185231868245e-05, + "loss": 2.5897, + "step": 26898 + }, + { + "epoch": 0.7976455238264686, + "grad_norm": 0.09016823023557663, + "learning_rate": 9.962366724951872e-05, + "loss": 2.5557, + "step": 26899 + }, + { + "epoch": 0.7976751771787801, + "grad_norm": 0.09653627872467041, + "learning_rate": 9.959548572575606e-05, + "loss": 2.547, + "step": 26900 + }, + { + "epoch": 0.7977048305310915, + "grad_norm": 0.08959219604730606, + "learning_rate": 9.95673077476441e-05, + "loss": 2.6, + "step": 26901 + }, + { + "epoch": 0.7977344838834031, + "grad_norm": 0.09269833564758301, + "learning_rate": 9.953913331543241e-05, + "loss": 2.5798, + "step": 26902 + }, + { + "epoch": 0.7977641372357145, + "grad_norm": 0.09054471552371979, + "learning_rate": 9.951096242937041e-05, + "loss": 2.5596, + "step": 26903 + }, + { + "epoch": 0.797793790588026, + "grad_norm": 0.09978022426366806, + "learning_rate": 9.948279508970754e-05, + "loss": 2.5999, + "step": 26904 + }, + { + "epoch": 0.7978234439403374, + "grad_norm": 0.08947613090276718, + "learning_rate": 9.945463129669336e-05, + "loss": 2.5478, + "step": 26905 + }, + { + "epoch": 0.797853097292649, + "grad_norm": 0.0968366339802742, + "learning_rate": 9.942647105057706e-05, + "loss": 2.5853, + "step": 26906 + }, + { + "epoch": 0.7978827506449604, + "grad_norm": 0.09437555819749832, + "learning_rate": 9.939831435160818e-05, + "loss": 2.5938, + "step": 26907 + }, + { + "epoch": 0.7979124039972719, + "grad_norm": 0.09428133815526962, + "learning_rate": 9.9370161200036e-05, + "loss": 2.5613, + "step": 26908 + }, + { + "epoch": 0.7979420573495833, + "grad_norm": 0.09060286730527878, + "learning_rate": 9.934201159610979e-05, + "loss": 2.5748, + "step": 26909 + }, + { + "epoch": 0.7979717107018949, + "grad_norm": 0.0884523019194603, + "learning_rate": 9.931386554007888e-05, + "loss": 2.5886, + "step": 26910 + }, + { + "epoch": 0.7980013640542063, + "grad_norm": 0.08847801387310028, + "learning_rate": 9.928572303219241e-05, + "loss": 2.5752, + "step": 26911 + }, + { + "epoch": 0.7980310174065178, + "grad_norm": 0.09252879023551941, + "learning_rate": 9.925758407269963e-05, + "loss": 2.6168, + "step": 26912 + }, + { + "epoch": 0.7980606707588292, + "grad_norm": 0.09582153707742691, + "learning_rate": 9.92294486618498e-05, + "loss": 2.5724, + "step": 26913 + }, + { + "epoch": 0.7980903241111408, + "grad_norm": 0.09753040969371796, + "learning_rate": 9.920131679989197e-05, + "loss": 2.5765, + "step": 26914 + }, + { + "epoch": 0.7981199774634522, + "grad_norm": 0.0975419282913208, + "learning_rate": 9.917318848707524e-05, + "loss": 2.5884, + "step": 26915 + }, + { + "epoch": 0.7981496308157637, + "grad_norm": 0.09748780727386475, + "learning_rate": 9.914506372364873e-05, + "loss": 2.6077, + "step": 26916 + }, + { + "epoch": 0.7981792841680752, + "grad_norm": 0.0857422724366188, + "learning_rate": 9.911694250986153e-05, + "loss": 2.5773, + "step": 26917 + }, + { + "epoch": 0.7982089375203867, + "grad_norm": 0.10193321853876114, + "learning_rate": 9.90888248459626e-05, + "loss": 2.5834, + "step": 26918 + }, + { + "epoch": 0.7982385908726981, + "grad_norm": 0.08571518957614899, + "learning_rate": 9.90607107322009e-05, + "loss": 2.5722, + "step": 26919 + }, + { + "epoch": 0.7982682442250096, + "grad_norm": 0.09787256270647049, + "learning_rate": 9.903260016882548e-05, + "loss": 2.5863, + "step": 26920 + }, + { + "epoch": 0.7982978975773212, + "grad_norm": 0.09324627369642258, + "learning_rate": 9.900449315608517e-05, + "loss": 2.5845, + "step": 26921 + }, + { + "epoch": 0.7983275509296326, + "grad_norm": 0.09957670420408249, + "learning_rate": 9.897638969422895e-05, + "loss": 2.5546, + "step": 26922 + }, + { + "epoch": 0.7983572042819441, + "grad_norm": 0.09468167275190353, + "learning_rate": 9.894828978350562e-05, + "loss": 2.5766, + "step": 26923 + }, + { + "epoch": 0.7983868576342555, + "grad_norm": 0.08983191847801208, + "learning_rate": 9.892019342416402e-05, + "loss": 2.5881, + "step": 26924 + }, + { + "epoch": 0.7984165109865671, + "grad_norm": 0.095442034304142, + "learning_rate": 9.889210061645293e-05, + "loss": 2.5992, + "step": 26925 + }, + { + "epoch": 0.7984461643388785, + "grad_norm": 0.09034043550491333, + "learning_rate": 9.886401136062118e-05, + "loss": 2.5726, + "step": 26926 + }, + { + "epoch": 0.79847581769119, + "grad_norm": 0.08593156188726425, + "learning_rate": 9.883592565691752e-05, + "loss": 2.5966, + "step": 26927 + }, + { + "epoch": 0.7985054710435014, + "grad_norm": 0.09918127954006195, + "learning_rate": 9.880784350559052e-05, + "loss": 2.6016, + "step": 26928 + }, + { + "epoch": 0.798535124395813, + "grad_norm": 0.0871984139084816, + "learning_rate": 9.877976490688895e-05, + "loss": 2.587, + "step": 26929 + }, + { + "epoch": 0.7985647777481244, + "grad_norm": 0.09746984392404556, + "learning_rate": 9.875168986106125e-05, + "loss": 2.5675, + "step": 26930 + }, + { + "epoch": 0.7985944311004359, + "grad_norm": 0.09237338602542877, + "learning_rate": 9.872361836835637e-05, + "loss": 2.5919, + "step": 26931 + }, + { + "epoch": 0.7986240844527474, + "grad_norm": 0.08974553644657135, + "learning_rate": 9.869555042902273e-05, + "loss": 2.5665, + "step": 26932 + }, + { + "epoch": 0.7986537378050589, + "grad_norm": 0.0864110067486763, + "learning_rate": 9.866748604330883e-05, + "loss": 2.5594, + "step": 26933 + }, + { + "epoch": 0.7986833911573703, + "grad_norm": 0.08405541628599167, + "learning_rate": 9.863942521146329e-05, + "loss": 2.6175, + "step": 26934 + }, + { + "epoch": 0.7987130445096818, + "grad_norm": 0.097742959856987, + "learning_rate": 9.861136793373449e-05, + "loss": 2.6204, + "step": 26935 + }, + { + "epoch": 0.7987426978619933, + "grad_norm": 0.08873166888952255, + "learning_rate": 9.858331421037093e-05, + "loss": 2.5664, + "step": 26936 + }, + { + "epoch": 0.7987723512143048, + "grad_norm": 0.0965559184551239, + "learning_rate": 9.855526404162107e-05, + "loss": 2.6038, + "step": 26937 + }, + { + "epoch": 0.7988020045666162, + "grad_norm": 0.08371292799711227, + "learning_rate": 9.852721742773336e-05, + "loss": 2.5705, + "step": 26938 + }, + { + "epoch": 0.7988316579189277, + "grad_norm": 0.08672252297401428, + "learning_rate": 9.849917436895589e-05, + "loss": 2.5608, + "step": 26939 + }, + { + "epoch": 0.7988613112712392, + "grad_norm": 0.09095306694507599, + "learning_rate": 9.847113486553715e-05, + "loss": 2.6004, + "step": 26940 + }, + { + "epoch": 0.7988909646235507, + "grad_norm": 0.08620940893888474, + "learning_rate": 9.844309891772546e-05, + "loss": 2.5961, + "step": 26941 + }, + { + "epoch": 0.7989206179758622, + "grad_norm": 0.09075102210044861, + "learning_rate": 9.841506652576904e-05, + "loss": 2.6211, + "step": 26942 + }, + { + "epoch": 0.7989502713281736, + "grad_norm": 0.08179283887147903, + "learning_rate": 9.838703768991603e-05, + "loss": 2.5842, + "step": 26943 + }, + { + "epoch": 0.7989799246804852, + "grad_norm": 0.09051056206226349, + "learning_rate": 9.835901241041484e-05, + "loss": 2.5783, + "step": 26944 + }, + { + "epoch": 0.7990095780327966, + "grad_norm": 0.08291566371917725, + "learning_rate": 9.833099068751355e-05, + "loss": 2.5534, + "step": 26945 + }, + { + "epoch": 0.7990392313851081, + "grad_norm": 0.09068089723587036, + "learning_rate": 9.830297252146025e-05, + "loss": 2.5448, + "step": 26946 + }, + { + "epoch": 0.7990688847374195, + "grad_norm": 0.09366348385810852, + "learning_rate": 9.827495791250313e-05, + "loss": 2.5874, + "step": 26947 + }, + { + "epoch": 0.7990985380897311, + "grad_norm": 0.08865005522966385, + "learning_rate": 9.82469468608903e-05, + "loss": 2.5711, + "step": 26948 + }, + { + "epoch": 0.7991281914420425, + "grad_norm": 0.09395763278007507, + "learning_rate": 9.82189393668696e-05, + "loss": 2.6317, + "step": 26949 + }, + { + "epoch": 0.799157844794354, + "grad_norm": 0.09087812900543213, + "learning_rate": 9.819093543068919e-05, + "loss": 2.5895, + "step": 26950 + }, + { + "epoch": 0.7991874981466655, + "grad_norm": 0.09614872187376022, + "learning_rate": 9.8162935052597e-05, + "loss": 2.556, + "step": 26951 + }, + { + "epoch": 0.799217151498977, + "grad_norm": 0.09316176176071167, + "learning_rate": 9.813493823284098e-05, + "loss": 2.5854, + "step": 26952 + }, + { + "epoch": 0.7992468048512884, + "grad_norm": 0.08455819636583328, + "learning_rate": 9.810694497166906e-05, + "loss": 2.5812, + "step": 26953 + }, + { + "epoch": 0.7992764582035999, + "grad_norm": 0.09694216400384903, + "learning_rate": 9.807895526932914e-05, + "loss": 2.591, + "step": 26954 + }, + { + "epoch": 0.7993061115559114, + "grad_norm": 0.09311330318450928, + "learning_rate": 9.805096912606904e-05, + "loss": 2.5897, + "step": 26955 + }, + { + "epoch": 0.7993357649082229, + "grad_norm": 0.09178756922483444, + "learning_rate": 9.802298654213648e-05, + "loss": 2.5559, + "step": 26956 + }, + { + "epoch": 0.7993654182605343, + "grad_norm": 0.08615200221538544, + "learning_rate": 9.799500751777952e-05, + "loss": 2.5793, + "step": 26957 + }, + { + "epoch": 0.7993950716128458, + "grad_norm": 0.09594275802373886, + "learning_rate": 9.796703205324575e-05, + "loss": 2.5519, + "step": 26958 + }, + { + "epoch": 0.7994247249651573, + "grad_norm": 0.08393137156963348, + "learning_rate": 9.793906014878306e-05, + "loss": 2.573, + "step": 26959 + }, + { + "epoch": 0.7994543783174688, + "grad_norm": 0.1014331504702568, + "learning_rate": 9.791109180463886e-05, + "loss": 2.5954, + "step": 26960 + }, + { + "epoch": 0.7994840316697802, + "grad_norm": 0.08798511326313019, + "learning_rate": 9.788312702106094e-05, + "loss": 2.6008, + "step": 26961 + }, + { + "epoch": 0.7995136850220917, + "grad_norm": 0.0872904360294342, + "learning_rate": 9.785516579829701e-05, + "loss": 2.5609, + "step": 26962 + }, + { + "epoch": 0.7995433383744033, + "grad_norm": 0.0937185138463974, + "learning_rate": 9.782720813659457e-05, + "loss": 2.5894, + "step": 26963 + }, + { + "epoch": 0.7995729917267147, + "grad_norm": 0.08906880021095276, + "learning_rate": 9.779925403620127e-05, + "loss": 2.5992, + "step": 26964 + }, + { + "epoch": 0.7996026450790262, + "grad_norm": 0.08025503158569336, + "learning_rate": 9.777130349736458e-05, + "loss": 2.5966, + "step": 26965 + }, + { + "epoch": 0.7996322984313377, + "grad_norm": 0.08953811973333359, + "learning_rate": 9.774335652033206e-05, + "loss": 2.6392, + "step": 26966 + }, + { + "epoch": 0.7996619517836492, + "grad_norm": 0.08641606569290161, + "learning_rate": 9.771541310535115e-05, + "loss": 2.5498, + "step": 26967 + }, + { + "epoch": 0.7996916051359606, + "grad_norm": 0.08585576713085175, + "learning_rate": 9.768747325266935e-05, + "loss": 2.5969, + "step": 26968 + }, + { + "epoch": 0.7997212584882721, + "grad_norm": 0.093529112637043, + "learning_rate": 9.765953696253399e-05, + "loss": 2.549, + "step": 26969 + }, + { + "epoch": 0.7997509118405836, + "grad_norm": 0.09000305831432343, + "learning_rate": 9.763160423519247e-05, + "loss": 2.5827, + "step": 26970 + }, + { + "epoch": 0.7997805651928951, + "grad_norm": 0.08487260341644287, + "learning_rate": 9.760367507089218e-05, + "loss": 2.6004, + "step": 26971 + }, + { + "epoch": 0.7998102185452065, + "grad_norm": 0.09426689893007278, + "learning_rate": 9.757574946988046e-05, + "loss": 2.5732, + "step": 26972 + }, + { + "epoch": 0.799839871897518, + "grad_norm": 0.08880240470170975, + "learning_rate": 9.754782743240453e-05, + "loss": 2.6079, + "step": 26973 + }, + { + "epoch": 0.7998695252498295, + "grad_norm": 0.08592958748340607, + "learning_rate": 9.751990895871166e-05, + "loss": 2.583, + "step": 26974 + }, + { + "epoch": 0.799899178602141, + "grad_norm": 0.08821424096822739, + "learning_rate": 9.749199404904907e-05, + "loss": 2.5754, + "step": 26975 + }, + { + "epoch": 0.7999288319544524, + "grad_norm": 0.08682826906442642, + "learning_rate": 9.746408270366397e-05, + "loss": 2.5966, + "step": 26976 + }, + { + "epoch": 0.799958485306764, + "grad_norm": 0.08342903107404709, + "learning_rate": 9.743617492280349e-05, + "loss": 2.6004, + "step": 26977 + }, + { + "epoch": 0.7999881386590754, + "grad_norm": 0.09269682317972183, + "learning_rate": 9.740827070671482e-05, + "loss": 2.5862, + "step": 26978 + }, + { + "epoch": 0.8000177920113869, + "grad_norm": 0.10020403563976288, + "learning_rate": 9.738037005564499e-05, + "loss": 2.5454, + "step": 26979 + }, + { + "epoch": 0.8000474453636983, + "grad_norm": 0.09644003957509995, + "learning_rate": 9.735247296984112e-05, + "loss": 2.6056, + "step": 26980 + }, + { + "epoch": 0.8000770987160098, + "grad_norm": 0.10469695180654526, + "learning_rate": 9.73245794495502e-05, + "loss": 2.5927, + "step": 26981 + }, + { + "epoch": 0.8001067520683213, + "grad_norm": 0.09779437631368637, + "learning_rate": 9.729668949501924e-05, + "loss": 2.6141, + "step": 26982 + }, + { + "epoch": 0.8001364054206328, + "grad_norm": 0.1019892692565918, + "learning_rate": 9.726880310649522e-05, + "loss": 2.5665, + "step": 26983 + }, + { + "epoch": 0.8001660587729443, + "grad_norm": 0.10163471102714539, + "learning_rate": 9.724092028422504e-05, + "loss": 2.5828, + "step": 26984 + }, + { + "epoch": 0.8001957121252558, + "grad_norm": 0.0980323925614357, + "learning_rate": 9.721304102845569e-05, + "loss": 2.5782, + "step": 26985 + }, + { + "epoch": 0.8002253654775673, + "grad_norm": 0.100263312458992, + "learning_rate": 9.7185165339434e-05, + "loss": 2.6003, + "step": 26986 + }, + { + "epoch": 0.8002550188298787, + "grad_norm": 0.0981621965765953, + "learning_rate": 9.71572932174068e-05, + "loss": 2.5833, + "step": 26987 + }, + { + "epoch": 0.8002846721821902, + "grad_norm": 0.11430928856134415, + "learning_rate": 9.712942466262093e-05, + "loss": 2.5728, + "step": 26988 + }, + { + "epoch": 0.8003143255345017, + "grad_norm": 0.09818508476018906, + "learning_rate": 9.710155967532314e-05, + "loss": 2.5973, + "step": 26989 + }, + { + "epoch": 0.8003439788868132, + "grad_norm": 0.09383606910705566, + "learning_rate": 9.707369825576023e-05, + "loss": 2.5746, + "step": 26990 + }, + { + "epoch": 0.8003736322391246, + "grad_norm": 0.09445401281118393, + "learning_rate": 9.704584040417885e-05, + "loss": 2.5929, + "step": 26991 + }, + { + "epoch": 0.8004032855914361, + "grad_norm": 0.09540625661611557, + "learning_rate": 9.701798612082569e-05, + "loss": 2.5946, + "step": 26992 + }, + { + "epoch": 0.8004329389437476, + "grad_norm": 0.09836099296808243, + "learning_rate": 9.699013540594765e-05, + "loss": 2.5961, + "step": 26993 + }, + { + "epoch": 0.8004625922960591, + "grad_norm": 0.09306103736162186, + "learning_rate": 9.69622882597908e-05, + "loss": 2.6052, + "step": 26994 + }, + { + "epoch": 0.8004922456483705, + "grad_norm": 0.10570573061704636, + "learning_rate": 9.693444468260221e-05, + "loss": 2.5947, + "step": 26995 + }, + { + "epoch": 0.800521899000682, + "grad_norm": 0.09666068851947784, + "learning_rate": 9.69066046746283e-05, + "loss": 2.6088, + "step": 26996 + }, + { + "epoch": 0.8005515523529935, + "grad_norm": 0.10020473599433899, + "learning_rate": 9.687876823611564e-05, + "loss": 2.5942, + "step": 26997 + }, + { + "epoch": 0.800581205705305, + "grad_norm": 0.12149304896593094, + "learning_rate": 9.685093536731066e-05, + "loss": 2.5845, + "step": 26998 + }, + { + "epoch": 0.8006108590576164, + "grad_norm": 0.09542202204465866, + "learning_rate": 9.682310606845979e-05, + "loss": 2.6062, + "step": 26999 + }, + { + "epoch": 0.800640512409928, + "grad_norm": 0.09155400842428207, + "learning_rate": 9.67952803398096e-05, + "loss": 2.5823, + "step": 27000 + }, + { + "epoch": 0.8006701657622394, + "grad_norm": 0.1102442741394043, + "learning_rate": 9.676745818160637e-05, + "loss": 2.5663, + "step": 27001 + }, + { + "epoch": 0.8006998191145509, + "grad_norm": 0.0908118411898613, + "learning_rate": 9.673963959409654e-05, + "loss": 2.5776, + "step": 27002 + }, + { + "epoch": 0.8007294724668623, + "grad_norm": 0.09471476823091507, + "learning_rate": 9.671182457752653e-05, + "loss": 2.5855, + "step": 27003 + }, + { + "epoch": 0.8007591258191739, + "grad_norm": 0.09891090542078018, + "learning_rate": 9.668401313214237e-05, + "loss": 2.583, + "step": 27004 + }, + { + "epoch": 0.8007887791714854, + "grad_norm": 0.09551790356636047, + "learning_rate": 9.665620525819058e-05, + "loss": 2.577, + "step": 27005 + }, + { + "epoch": 0.8008184325237968, + "grad_norm": 0.09224822372198105, + "learning_rate": 9.662840095591724e-05, + "loss": 2.5961, + "step": 27006 + }, + { + "epoch": 0.8008480858761083, + "grad_norm": 0.0851936861872673, + "learning_rate": 9.66006002255686e-05, + "loss": 2.5557, + "step": 27007 + }, + { + "epoch": 0.8008777392284198, + "grad_norm": 0.08699764311313629, + "learning_rate": 9.657280306739097e-05, + "loss": 2.596, + "step": 27008 + }, + { + "epoch": 0.8009073925807313, + "grad_norm": 0.09053058922290802, + "learning_rate": 9.654500948163042e-05, + "loss": 2.5872, + "step": 27009 + }, + { + "epoch": 0.8009370459330427, + "grad_norm": 0.08340919017791748, + "learning_rate": 9.651721946853304e-05, + "loss": 2.5888, + "step": 27010 + }, + { + "epoch": 0.8009666992853542, + "grad_norm": 0.0949312373995781, + "learning_rate": 9.648943302834501e-05, + "loss": 2.5827, + "step": 27011 + }, + { + "epoch": 0.8009963526376657, + "grad_norm": 0.08468140661716461, + "learning_rate": 9.646165016131225e-05, + "loss": 2.5976, + "step": 27012 + }, + { + "epoch": 0.8010260059899772, + "grad_norm": 0.10661768913269043, + "learning_rate": 9.643387086768086e-05, + "loss": 2.5947, + "step": 27013 + }, + { + "epoch": 0.8010556593422886, + "grad_norm": 0.08983120322227478, + "learning_rate": 9.640609514769694e-05, + "loss": 2.5843, + "step": 27014 + }, + { + "epoch": 0.8010853126946001, + "grad_norm": 0.10722316056489944, + "learning_rate": 9.63783230016062e-05, + "loss": 2.5847, + "step": 27015 + }, + { + "epoch": 0.8011149660469116, + "grad_norm": 0.09534044563770294, + "learning_rate": 9.635055442965468e-05, + "loss": 2.6124, + "step": 27016 + }, + { + "epoch": 0.8011446193992231, + "grad_norm": 0.09986543655395508, + "learning_rate": 9.632278943208833e-05, + "loss": 2.5948, + "step": 27017 + }, + { + "epoch": 0.8011742727515345, + "grad_norm": 0.1027185246348381, + "learning_rate": 9.629502800915291e-05, + "loss": 2.5897, + "step": 27018 + }, + { + "epoch": 0.801203926103846, + "grad_norm": 0.10611782222986221, + "learning_rate": 9.626727016109437e-05, + "loss": 2.607, + "step": 27019 + }, + { + "epoch": 0.8012335794561575, + "grad_norm": 0.09555840492248535, + "learning_rate": 9.623951588815827e-05, + "loss": 2.5834, + "step": 27020 + }, + { + "epoch": 0.801263232808469, + "grad_norm": 0.10211940109729767, + "learning_rate": 9.621176519059072e-05, + "loss": 2.5724, + "step": 27021 + }, + { + "epoch": 0.8012928861607804, + "grad_norm": 0.09638356417417526, + "learning_rate": 9.61840180686373e-05, + "loss": 2.5776, + "step": 27022 + }, + { + "epoch": 0.801322539513092, + "grad_norm": 0.09403219819068909, + "learning_rate": 9.615627452254371e-05, + "loss": 2.5729, + "step": 27023 + }, + { + "epoch": 0.8013521928654034, + "grad_norm": 0.09376164525747299, + "learning_rate": 9.612853455255577e-05, + "loss": 2.5989, + "step": 27024 + }, + { + "epoch": 0.8013818462177149, + "grad_norm": 0.09946247190237045, + "learning_rate": 9.610079815891882e-05, + "loss": 2.6153, + "step": 27025 + }, + { + "epoch": 0.8014114995700264, + "grad_norm": 0.10022345930337906, + "learning_rate": 9.607306534187865e-05, + "loss": 2.6003, + "step": 27026 + }, + { + "epoch": 0.8014411529223379, + "grad_norm": 0.09184622764587402, + "learning_rate": 9.604533610168081e-05, + "loss": 2.5888, + "step": 27027 + }, + { + "epoch": 0.8014708062746494, + "grad_norm": 0.09699689596891403, + "learning_rate": 9.601761043857088e-05, + "loss": 2.5812, + "step": 27028 + }, + { + "epoch": 0.8015004596269608, + "grad_norm": 0.0959629938006401, + "learning_rate": 9.598988835279431e-05, + "loss": 2.5691, + "step": 27029 + }, + { + "epoch": 0.8015301129792723, + "grad_norm": 0.09097835421562195, + "learning_rate": 9.596216984459665e-05, + "loss": 2.5893, + "step": 27030 + }, + { + "epoch": 0.8015597663315838, + "grad_norm": 0.08944027125835419, + "learning_rate": 9.593445491422331e-05, + "loss": 2.5708, + "step": 27031 + }, + { + "epoch": 0.8015894196838953, + "grad_norm": 0.09600473940372467, + "learning_rate": 9.590674356191975e-05, + "loss": 2.6012, + "step": 27032 + }, + { + "epoch": 0.8016190730362067, + "grad_norm": 0.09852162003517151, + "learning_rate": 9.587903578793122e-05, + "loss": 2.6283, + "step": 27033 + }, + { + "epoch": 0.8016487263885183, + "grad_norm": 0.0913759171962738, + "learning_rate": 9.585133159250331e-05, + "loss": 2.575, + "step": 27034 + }, + { + "epoch": 0.8016783797408297, + "grad_norm": 0.09234514832496643, + "learning_rate": 9.582363097588137e-05, + "loss": 2.5845, + "step": 27035 + }, + { + "epoch": 0.8017080330931412, + "grad_norm": 0.09536304324865341, + "learning_rate": 9.579593393831044e-05, + "loss": 2.6027, + "step": 27036 + }, + { + "epoch": 0.8017376864454526, + "grad_norm": 0.09086097031831741, + "learning_rate": 9.576824048003585e-05, + "loss": 2.613, + "step": 27037 + }, + { + "epoch": 0.8017673397977642, + "grad_norm": 0.0898490697145462, + "learning_rate": 9.574055060130287e-05, + "loss": 2.5821, + "step": 27038 + }, + { + "epoch": 0.8017969931500756, + "grad_norm": 0.09793564677238464, + "learning_rate": 9.571286430235676e-05, + "loss": 2.5597, + "step": 27039 + }, + { + "epoch": 0.8018266465023871, + "grad_norm": 0.0897483378648758, + "learning_rate": 9.568518158344258e-05, + "loss": 2.6016, + "step": 27040 + }, + { + "epoch": 0.8018562998546985, + "grad_norm": 0.09637056291103363, + "learning_rate": 9.565750244480554e-05, + "loss": 2.5873, + "step": 27041 + }, + { + "epoch": 0.8018859532070101, + "grad_norm": 0.08653878420591354, + "learning_rate": 9.56298268866907e-05, + "loss": 2.5981, + "step": 27042 + }, + { + "epoch": 0.8019156065593215, + "grad_norm": 0.0943775400519371, + "learning_rate": 9.56021549093432e-05, + "loss": 2.6176, + "step": 27043 + }, + { + "epoch": 0.801945259911633, + "grad_norm": 0.0869893953204155, + "learning_rate": 9.557448651300798e-05, + "loss": 2.6096, + "step": 27044 + }, + { + "epoch": 0.8019749132639445, + "grad_norm": 0.08859409391880035, + "learning_rate": 9.554682169793011e-05, + "loss": 2.5741, + "step": 27045 + }, + { + "epoch": 0.802004566616256, + "grad_norm": 0.09021271765232086, + "learning_rate": 9.55191604643546e-05, + "loss": 2.5756, + "step": 27046 + }, + { + "epoch": 0.8020342199685675, + "grad_norm": 0.08888377249240875, + "learning_rate": 9.549150281252633e-05, + "loss": 2.5756, + "step": 27047 + }, + { + "epoch": 0.8020638733208789, + "grad_norm": 0.08574145287275314, + "learning_rate": 9.54638487426902e-05, + "loss": 2.553, + "step": 27048 + }, + { + "epoch": 0.8020935266731904, + "grad_norm": 0.09372267872095108, + "learning_rate": 9.54361982550912e-05, + "loss": 2.5946, + "step": 27049 + }, + { + "epoch": 0.8021231800255019, + "grad_norm": 0.09269193559885025, + "learning_rate": 9.540855134997406e-05, + "loss": 2.5716, + "step": 27050 + }, + { + "epoch": 0.8021528333778134, + "grad_norm": 0.09730628877878189, + "learning_rate": 9.538090802758365e-05, + "loss": 2.5626, + "step": 27051 + }, + { + "epoch": 0.8021824867301248, + "grad_norm": 0.08997853845357895, + "learning_rate": 9.535326828816471e-05, + "loss": 2.5749, + "step": 27052 + }, + { + "epoch": 0.8022121400824364, + "grad_norm": 0.08910167217254639, + "learning_rate": 9.53256321319621e-05, + "loss": 2.5776, + "step": 27053 + }, + { + "epoch": 0.8022417934347478, + "grad_norm": 0.08915834873914719, + "learning_rate": 9.529799955922042e-05, + "loss": 2.5932, + "step": 27054 + }, + { + "epoch": 0.8022714467870593, + "grad_norm": 0.0863100066781044, + "learning_rate": 9.527037057018446e-05, + "loss": 2.5804, + "step": 27055 + }, + { + "epoch": 0.8023011001393707, + "grad_norm": 0.08923178166151047, + "learning_rate": 9.524274516509885e-05, + "loss": 2.5646, + "step": 27056 + }, + { + "epoch": 0.8023307534916823, + "grad_norm": 0.0926373153924942, + "learning_rate": 9.521512334420818e-05, + "loss": 2.6047, + "step": 27057 + }, + { + "epoch": 0.8023604068439937, + "grad_norm": 0.08816570043563843, + "learning_rate": 9.51875051077572e-05, + "loss": 2.595, + "step": 27058 + }, + { + "epoch": 0.8023900601963052, + "grad_norm": 0.08298400789499283, + "learning_rate": 9.51598904559901e-05, + "loss": 2.5803, + "step": 27059 + }, + { + "epoch": 0.8024197135486166, + "grad_norm": 0.0852050706744194, + "learning_rate": 9.513227938915181e-05, + "loss": 2.5294, + "step": 27060 + }, + { + "epoch": 0.8024493669009282, + "grad_norm": 0.08476211130619049, + "learning_rate": 9.510467190748667e-05, + "loss": 2.5808, + "step": 27061 + }, + { + "epoch": 0.8024790202532396, + "grad_norm": 0.08738941699266434, + "learning_rate": 9.507706801123916e-05, + "loss": 2.5517, + "step": 27062 + }, + { + "epoch": 0.8025086736055511, + "grad_norm": 0.0857289656996727, + "learning_rate": 9.50494677006537e-05, + "loss": 2.5684, + "step": 27063 + }, + { + "epoch": 0.8025383269578625, + "grad_norm": 0.0815010666847229, + "learning_rate": 9.50218709759747e-05, + "loss": 2.602, + "step": 27064 + }, + { + "epoch": 0.8025679803101741, + "grad_norm": 0.08587811887264252, + "learning_rate": 9.499427783744658e-05, + "loss": 2.6055, + "step": 27065 + }, + { + "epoch": 0.8025976336624856, + "grad_norm": 0.08704786002635956, + "learning_rate": 9.496668828531363e-05, + "loss": 2.5731, + "step": 27066 + }, + { + "epoch": 0.802627287014797, + "grad_norm": 0.09674934297800064, + "learning_rate": 9.493910231982017e-05, + "loss": 2.5832, + "step": 27067 + }, + { + "epoch": 0.8026569403671086, + "grad_norm": 0.09756288677453995, + "learning_rate": 9.49115199412105e-05, + "loss": 2.5623, + "step": 27068 + }, + { + "epoch": 0.80268659371942, + "grad_norm": 0.08730878680944443, + "learning_rate": 9.488394114972898e-05, + "loss": 2.5804, + "step": 27069 + }, + { + "epoch": 0.8027162470717315, + "grad_norm": 0.09357418119907379, + "learning_rate": 9.485636594561958e-05, + "loss": 2.5781, + "step": 27070 + }, + { + "epoch": 0.8027459004240429, + "grad_norm": 0.09133600443601608, + "learning_rate": 9.482879432912661e-05, + "loss": 2.5867, + "step": 27071 + }, + { + "epoch": 0.8027755537763545, + "grad_norm": 0.09858182072639465, + "learning_rate": 9.48012263004941e-05, + "loss": 2.6119, + "step": 27072 + }, + { + "epoch": 0.8028052071286659, + "grad_norm": 0.09229011088609695, + "learning_rate": 9.477366185996634e-05, + "loss": 2.5546, + "step": 27073 + }, + { + "epoch": 0.8028348604809774, + "grad_norm": 0.09340010583400726, + "learning_rate": 9.474610100778741e-05, + "loss": 2.572, + "step": 27074 + }, + { + "epoch": 0.8028645138332888, + "grad_norm": 0.08940932154655457, + "learning_rate": 9.47185437442013e-05, + "loss": 2.557, + "step": 27075 + }, + { + "epoch": 0.8028941671856004, + "grad_norm": 0.10083437711000443, + "learning_rate": 9.469099006945203e-05, + "loss": 2.5551, + "step": 27076 + }, + { + "epoch": 0.8029238205379118, + "grad_norm": 0.08918929845094681, + "learning_rate": 9.466343998378368e-05, + "loss": 2.5646, + "step": 27077 + }, + { + "epoch": 0.8029534738902233, + "grad_norm": 0.0890466570854187, + "learning_rate": 9.463589348744011e-05, + "loss": 2.5512, + "step": 27078 + }, + { + "epoch": 0.8029831272425347, + "grad_norm": 0.08651445060968399, + "learning_rate": 9.460835058066541e-05, + "loss": 2.5549, + "step": 27079 + }, + { + "epoch": 0.8030127805948463, + "grad_norm": 0.08931753039360046, + "learning_rate": 9.458081126370322e-05, + "loss": 2.5673, + "step": 27080 + }, + { + "epoch": 0.8030424339471577, + "grad_norm": 0.09363649040460587, + "learning_rate": 9.45532755367975e-05, + "loss": 2.5626, + "step": 27081 + }, + { + "epoch": 0.8030720872994692, + "grad_norm": 0.08650125563144684, + "learning_rate": 9.452574340019216e-05, + "loss": 2.5912, + "step": 27082 + }, + { + "epoch": 0.8031017406517806, + "grad_norm": 0.08952878415584564, + "learning_rate": 9.449821485413096e-05, + "loss": 2.5894, + "step": 27083 + }, + { + "epoch": 0.8031313940040922, + "grad_norm": 0.09135980904102325, + "learning_rate": 9.447068989885766e-05, + "loss": 2.5493, + "step": 27084 + }, + { + "epoch": 0.8031610473564036, + "grad_norm": 0.093921959400177, + "learning_rate": 9.444316853461587e-05, + "loss": 2.5762, + "step": 27085 + }, + { + "epoch": 0.8031907007087151, + "grad_norm": 0.10828738659620285, + "learning_rate": 9.44156507616496e-05, + "loss": 2.5898, + "step": 27086 + }, + { + "epoch": 0.8032203540610267, + "grad_norm": 0.09178660064935684, + "learning_rate": 9.438813658020234e-05, + "loss": 2.5798, + "step": 27087 + }, + { + "epoch": 0.8032500074133381, + "grad_norm": 0.09651584178209305, + "learning_rate": 9.436062599051776e-05, + "loss": 2.5794, + "step": 27088 + }, + { + "epoch": 0.8032796607656496, + "grad_norm": 0.0962563306093216, + "learning_rate": 9.433311899283942e-05, + "loss": 2.5781, + "step": 27089 + }, + { + "epoch": 0.803309314117961, + "grad_norm": 0.10739199817180634, + "learning_rate": 9.430561558741114e-05, + "loss": 2.5885, + "step": 27090 + }, + { + "epoch": 0.8033389674702726, + "grad_norm": 0.08670210093259811, + "learning_rate": 9.427811577447609e-05, + "loss": 2.554, + "step": 27091 + }, + { + "epoch": 0.803368620822584, + "grad_norm": 0.10641878843307495, + "learning_rate": 9.4250619554278e-05, + "loss": 2.584, + "step": 27092 + }, + { + "epoch": 0.8033982741748955, + "grad_norm": 0.08946084976196289, + "learning_rate": 9.422312692706032e-05, + "loss": 2.5496, + "step": 27093 + }, + { + "epoch": 0.8034279275272069, + "grad_norm": 0.09984536468982697, + "learning_rate": 9.419563789306645e-05, + "loss": 2.624, + "step": 27094 + }, + { + "epoch": 0.8034575808795185, + "grad_norm": 0.09778881818056107, + "learning_rate": 9.41681524525399e-05, + "loss": 2.5313, + "step": 27095 + }, + { + "epoch": 0.8034872342318299, + "grad_norm": 0.09437068551778793, + "learning_rate": 9.4140670605724e-05, + "loss": 2.5873, + "step": 27096 + }, + { + "epoch": 0.8035168875841414, + "grad_norm": 0.09118622541427612, + "learning_rate": 9.411319235286219e-05, + "loss": 2.5632, + "step": 27097 + }, + { + "epoch": 0.8035465409364528, + "grad_norm": 0.09449398517608643, + "learning_rate": 9.408571769419755e-05, + "loss": 2.5458, + "step": 27098 + }, + { + "epoch": 0.8035761942887644, + "grad_norm": 0.10052735358476639, + "learning_rate": 9.40582466299737e-05, + "loss": 2.5966, + "step": 27099 + }, + { + "epoch": 0.8036058476410758, + "grad_norm": 0.09154751896858215, + "learning_rate": 9.403077916043384e-05, + "loss": 2.5974, + "step": 27100 + }, + { + "epoch": 0.8036355009933873, + "grad_norm": 0.10018084943294525, + "learning_rate": 9.400331528582101e-05, + "loss": 2.5803, + "step": 27101 + }, + { + "epoch": 0.8036651543456987, + "grad_norm": 0.09222970902919769, + "learning_rate": 9.397585500637856e-05, + "loss": 2.5608, + "step": 27102 + }, + { + "epoch": 0.8036948076980103, + "grad_norm": 0.09596208482980728, + "learning_rate": 9.394839832234958e-05, + "loss": 2.5734, + "step": 27103 + }, + { + "epoch": 0.8037244610503217, + "grad_norm": 0.09126629680395126, + "learning_rate": 9.392094523397721e-05, + "loss": 2.5859, + "step": 27104 + }, + { + "epoch": 0.8037541144026332, + "grad_norm": 0.09684117138385773, + "learning_rate": 9.389349574150457e-05, + "loss": 2.5859, + "step": 27105 + }, + { + "epoch": 0.8037837677549446, + "grad_norm": 0.0910513699054718, + "learning_rate": 9.386604984517477e-05, + "loss": 2.6009, + "step": 27106 + }, + { + "epoch": 0.8038134211072562, + "grad_norm": 0.09148532897233963, + "learning_rate": 9.383860754523076e-05, + "loss": 2.5844, + "step": 27107 + }, + { + "epoch": 0.8038430744595677, + "grad_norm": 0.08587662875652313, + "learning_rate": 9.38111688419156e-05, + "loss": 2.5582, + "step": 27108 + }, + { + "epoch": 0.8038727278118791, + "grad_norm": 0.0918385237455368, + "learning_rate": 9.378373373547233e-05, + "loss": 2.5926, + "step": 27109 + }, + { + "epoch": 0.8039023811641907, + "grad_norm": 0.08651844412088394, + "learning_rate": 9.375630222614373e-05, + "loss": 2.5842, + "step": 27110 + }, + { + "epoch": 0.8039320345165021, + "grad_norm": 0.0863281786441803, + "learning_rate": 9.372887431417288e-05, + "loss": 2.5932, + "step": 27111 + }, + { + "epoch": 0.8039616878688136, + "grad_norm": 0.09227024763822556, + "learning_rate": 9.370144999980257e-05, + "loss": 2.5644, + "step": 27112 + }, + { + "epoch": 0.803991341221125, + "grad_norm": 0.09503967314958572, + "learning_rate": 9.367402928327562e-05, + "loss": 2.5891, + "step": 27113 + }, + { + "epoch": 0.8040209945734366, + "grad_norm": 0.08547239005565643, + "learning_rate": 9.364661216483494e-05, + "loss": 2.5466, + "step": 27114 + }, + { + "epoch": 0.804050647925748, + "grad_norm": 0.09921309351921082, + "learning_rate": 9.361919864472317e-05, + "loss": 2.5391, + "step": 27115 + }, + { + "epoch": 0.8040803012780595, + "grad_norm": 0.08816312998533249, + "learning_rate": 9.359178872318325e-05, + "loss": 2.5819, + "step": 27116 + }, + { + "epoch": 0.8041099546303709, + "grad_norm": 0.0888349786400795, + "learning_rate": 9.356438240045778e-05, + "loss": 2.5614, + "step": 27117 + }, + { + "epoch": 0.8041396079826825, + "grad_norm": 0.09541597217321396, + "learning_rate": 9.353697967678942e-05, + "loss": 2.6066, + "step": 27118 + }, + { + "epoch": 0.8041692613349939, + "grad_norm": 0.10326118767261505, + "learning_rate": 9.350958055242093e-05, + "loss": 2.5751, + "step": 27119 + }, + { + "epoch": 0.8041989146873054, + "grad_norm": 0.08500013500452042, + "learning_rate": 9.348218502759482e-05, + "loss": 2.5645, + "step": 27120 + }, + { + "epoch": 0.8042285680396168, + "grad_norm": 0.09579519182443619, + "learning_rate": 9.345479310255378e-05, + "loss": 2.6098, + "step": 27121 + }, + { + "epoch": 0.8042582213919284, + "grad_norm": 0.09862758964300156, + "learning_rate": 9.34274047775403e-05, + "loss": 2.5782, + "step": 27122 + }, + { + "epoch": 0.8042878747442398, + "grad_norm": 0.09244141727685928, + "learning_rate": 9.340002005279697e-05, + "loss": 2.6141, + "step": 27123 + }, + { + "epoch": 0.8043175280965513, + "grad_norm": 0.09580905735492706, + "learning_rate": 9.337263892856624e-05, + "loss": 2.5679, + "step": 27124 + }, + { + "epoch": 0.8043471814488627, + "grad_norm": 0.09545421600341797, + "learning_rate": 9.334526140509059e-05, + "loss": 2.5939, + "step": 27125 + }, + { + "epoch": 0.8043768348011743, + "grad_norm": 0.0909583568572998, + "learning_rate": 9.331788748261244e-05, + "loss": 2.5728, + "step": 27126 + }, + { + "epoch": 0.8044064881534857, + "grad_norm": 0.09961525350809097, + "learning_rate": 9.329051716137421e-05, + "loss": 2.5739, + "step": 27127 + }, + { + "epoch": 0.8044361415057972, + "grad_norm": 0.09822148829698563, + "learning_rate": 9.326315044161826e-05, + "loss": 2.5988, + "step": 27128 + }, + { + "epoch": 0.8044657948581088, + "grad_norm": 0.08813940733671188, + "learning_rate": 9.323578732358695e-05, + "loss": 2.5836, + "step": 27129 + }, + { + "epoch": 0.8044954482104202, + "grad_norm": 0.08949162811040878, + "learning_rate": 9.320842780752253e-05, + "loss": 2.5988, + "step": 27130 + }, + { + "epoch": 0.8045251015627317, + "grad_norm": 0.09118891507387161, + "learning_rate": 9.318107189366737e-05, + "loss": 2.5739, + "step": 27131 + }, + { + "epoch": 0.8045547549150431, + "grad_norm": 0.09246401488780975, + "learning_rate": 9.31537195822636e-05, + "loss": 2.5744, + "step": 27132 + }, + { + "epoch": 0.8045844082673547, + "grad_norm": 0.0941959097981453, + "learning_rate": 9.312637087355347e-05, + "loss": 2.5927, + "step": 27133 + }, + { + "epoch": 0.8046140616196661, + "grad_norm": 0.09467649459838867, + "learning_rate": 9.309902576777929e-05, + "loss": 2.578, + "step": 27134 + }, + { + "epoch": 0.8046437149719776, + "grad_norm": 0.09961020946502686, + "learning_rate": 9.307168426518297e-05, + "loss": 2.5635, + "step": 27135 + }, + { + "epoch": 0.804673368324289, + "grad_norm": 0.09992209076881409, + "learning_rate": 9.304434636600673e-05, + "loss": 2.5595, + "step": 27136 + }, + { + "epoch": 0.8047030216766006, + "grad_norm": 0.09398413449525833, + "learning_rate": 9.301701207049251e-05, + "loss": 2.5763, + "step": 27137 + }, + { + "epoch": 0.804732675028912, + "grad_norm": 0.08953659981489182, + "learning_rate": 9.298968137888264e-05, + "loss": 2.5926, + "step": 27138 + }, + { + "epoch": 0.8047623283812235, + "grad_norm": 0.0922987312078476, + "learning_rate": 9.296235429141903e-05, + "loss": 2.6286, + "step": 27139 + }, + { + "epoch": 0.804791981733535, + "grad_norm": 0.09846623986959457, + "learning_rate": 9.293503080834365e-05, + "loss": 2.5699, + "step": 27140 + }, + { + "epoch": 0.8048216350858465, + "grad_norm": 0.08412421494722366, + "learning_rate": 9.290771092989842e-05, + "loss": 2.5831, + "step": 27141 + }, + { + "epoch": 0.8048512884381579, + "grad_norm": 0.09174469113349915, + "learning_rate": 9.288039465632526e-05, + "loss": 2.5885, + "step": 27142 + }, + { + "epoch": 0.8048809417904694, + "grad_norm": 0.08836282044649124, + "learning_rate": 9.285308198786612e-05, + "loss": 2.6052, + "step": 27143 + }, + { + "epoch": 0.8049105951427808, + "grad_norm": 0.09750781953334808, + "learning_rate": 9.28257729247628e-05, + "loss": 2.6012, + "step": 27144 + }, + { + "epoch": 0.8049402484950924, + "grad_norm": 0.08278724551200867, + "learning_rate": 9.279846746725729e-05, + "loss": 2.6037, + "step": 27145 + }, + { + "epoch": 0.8049699018474038, + "grad_norm": 0.09449601918458939, + "learning_rate": 9.277116561559113e-05, + "loss": 2.6026, + "step": 27146 + }, + { + "epoch": 0.8049995551997153, + "grad_norm": 0.0911278948187828, + "learning_rate": 9.274386737000617e-05, + "loss": 2.5564, + "step": 27147 + }, + { + "epoch": 0.8050292085520268, + "grad_norm": 0.09236674010753632, + "learning_rate": 9.271657273074419e-05, + "loss": 2.5788, + "step": 27148 + }, + { + "epoch": 0.8050588619043383, + "grad_norm": 0.09501232951879501, + "learning_rate": 9.268928169804685e-05, + "loss": 2.6059, + "step": 27149 + }, + { + "epoch": 0.8050885152566498, + "grad_norm": 0.09494705498218536, + "learning_rate": 9.266199427215576e-05, + "loss": 2.584, + "step": 27150 + }, + { + "epoch": 0.8051181686089612, + "grad_norm": 0.10055917501449585, + "learning_rate": 9.263471045331274e-05, + "loss": 2.5774, + "step": 27151 + }, + { + "epoch": 0.8051478219612728, + "grad_norm": 0.09583573043346405, + "learning_rate": 9.260743024175921e-05, + "loss": 2.5664, + "step": 27152 + }, + { + "epoch": 0.8051774753135842, + "grad_norm": 0.09415699541568756, + "learning_rate": 9.258015363773692e-05, + "loss": 2.5737, + "step": 27153 + }, + { + "epoch": 0.8052071286658957, + "grad_norm": 0.09758693724870682, + "learning_rate": 9.25528806414872e-05, + "loss": 2.5641, + "step": 27154 + }, + { + "epoch": 0.8052367820182071, + "grad_norm": 0.10470956563949585, + "learning_rate": 9.252561125325187e-05, + "loss": 2.6024, + "step": 27155 + }, + { + "epoch": 0.8052664353705187, + "grad_norm": 0.10030511766672134, + "learning_rate": 9.249834547327201e-05, + "loss": 2.5783, + "step": 27156 + }, + { + "epoch": 0.8052960887228301, + "grad_norm": 0.1034388318657875, + "learning_rate": 9.247108330178927e-05, + "loss": 2.5855, + "step": 27157 + }, + { + "epoch": 0.8053257420751416, + "grad_norm": 0.09241942316293716, + "learning_rate": 9.244382473904505e-05, + "loss": 2.601, + "step": 27158 + }, + { + "epoch": 0.805355395427453, + "grad_norm": 0.09486813098192215, + "learning_rate": 9.241656978528073e-05, + "loss": 2.5807, + "step": 27159 + }, + { + "epoch": 0.8053850487797646, + "grad_norm": 0.09488891065120697, + "learning_rate": 9.238931844073762e-05, + "loss": 2.5676, + "step": 27160 + }, + { + "epoch": 0.805414702132076, + "grad_norm": 0.10344209522008896, + "learning_rate": 9.236207070565705e-05, + "loss": 2.6004, + "step": 27161 + }, + { + "epoch": 0.8054443554843875, + "grad_norm": 0.09248095005750656, + "learning_rate": 9.233482658028031e-05, + "loss": 2.6014, + "step": 27162 + }, + { + "epoch": 0.805474008836699, + "grad_norm": 0.09468291699886322, + "learning_rate": 9.230758606484857e-05, + "loss": 2.581, + "step": 27163 + }, + { + "epoch": 0.8055036621890105, + "grad_norm": 0.08505712449550629, + "learning_rate": 9.228034915960321e-05, + "loss": 2.59, + "step": 27164 + }, + { + "epoch": 0.8055333155413219, + "grad_norm": 0.09264800697565079, + "learning_rate": 9.22531158647854e-05, + "loss": 2.5873, + "step": 27165 + }, + { + "epoch": 0.8055629688936334, + "grad_norm": 0.08612655848264694, + "learning_rate": 9.222588618063632e-05, + "loss": 2.5374, + "step": 27166 + }, + { + "epoch": 0.8055926222459449, + "grad_norm": 0.09453478455543518, + "learning_rate": 9.219866010739691e-05, + "loss": 2.5582, + "step": 27167 + }, + { + "epoch": 0.8056222755982564, + "grad_norm": 0.09499230980873108, + "learning_rate": 9.217143764530834e-05, + "loss": 2.5375, + "step": 27168 + }, + { + "epoch": 0.8056519289505678, + "grad_norm": 0.0926516056060791, + "learning_rate": 9.214421879461172e-05, + "loss": 2.6164, + "step": 27169 + }, + { + "epoch": 0.8056815823028793, + "grad_norm": 0.08568290621042252, + "learning_rate": 9.211700355554803e-05, + "loss": 2.5735, + "step": 27170 + }, + { + "epoch": 0.8057112356551909, + "grad_norm": 0.0916101336479187, + "learning_rate": 9.208979192835832e-05, + "loss": 2.5647, + "step": 27171 + }, + { + "epoch": 0.8057408890075023, + "grad_norm": 0.09891414642333984, + "learning_rate": 9.206258391328348e-05, + "loss": 2.5699, + "step": 27172 + }, + { + "epoch": 0.8057705423598138, + "grad_norm": 0.09483391791582108, + "learning_rate": 9.203537951056445e-05, + "loss": 2.5846, + "step": 27173 + }, + { + "epoch": 0.8058001957121252, + "grad_norm": 0.09341845661401749, + "learning_rate": 9.20081787204422e-05, + "loss": 2.5655, + "step": 27174 + }, + { + "epoch": 0.8058298490644368, + "grad_norm": 0.09799879789352417, + "learning_rate": 9.19809815431576e-05, + "loss": 2.5903, + "step": 27175 + }, + { + "epoch": 0.8058595024167482, + "grad_norm": 0.0910024642944336, + "learning_rate": 9.195378797895138e-05, + "loss": 2.5726, + "step": 27176 + }, + { + "epoch": 0.8058891557690597, + "grad_norm": 0.09601247310638428, + "learning_rate": 9.192659802806441e-05, + "loss": 2.595, + "step": 27177 + }, + { + "epoch": 0.8059188091213711, + "grad_norm": 0.08840116858482361, + "learning_rate": 9.189941169073751e-05, + "loss": 2.5617, + "step": 27178 + }, + { + "epoch": 0.8059484624736827, + "grad_norm": 0.09203116595745087, + "learning_rate": 9.187222896721131e-05, + "loss": 2.5349, + "step": 27179 + }, + { + "epoch": 0.8059781158259941, + "grad_norm": 0.0983070582151413, + "learning_rate": 9.184504985772663e-05, + "loss": 2.5924, + "step": 27180 + }, + { + "epoch": 0.8060077691783056, + "grad_norm": 0.08581032603979111, + "learning_rate": 9.18178743625241e-05, + "loss": 2.5565, + "step": 27181 + }, + { + "epoch": 0.806037422530617, + "grad_norm": 0.10103441774845123, + "learning_rate": 9.179070248184429e-05, + "loss": 2.5868, + "step": 27182 + }, + { + "epoch": 0.8060670758829286, + "grad_norm": 0.08311375230550766, + "learning_rate": 9.176353421592792e-05, + "loss": 2.556, + "step": 27183 + }, + { + "epoch": 0.80609672923524, + "grad_norm": 0.10519801080226898, + "learning_rate": 9.173636956501552e-05, + "loss": 2.5545, + "step": 27184 + }, + { + "epoch": 0.8061263825875515, + "grad_norm": 0.09000016003847122, + "learning_rate": 9.170920852934766e-05, + "loss": 2.5631, + "step": 27185 + }, + { + "epoch": 0.806156035939863, + "grad_norm": 0.09161215275526047, + "learning_rate": 9.168205110916483e-05, + "loss": 2.594, + "step": 27186 + }, + { + "epoch": 0.8061856892921745, + "grad_norm": 0.09175396710634232, + "learning_rate": 9.165489730470749e-05, + "loss": 2.556, + "step": 27187 + }, + { + "epoch": 0.8062153426444859, + "grad_norm": 0.09517557919025421, + "learning_rate": 9.162774711621619e-05, + "loss": 2.6137, + "step": 27188 + }, + { + "epoch": 0.8062449959967974, + "grad_norm": 0.0899096354842186, + "learning_rate": 9.160060054393121e-05, + "loss": 2.6064, + "step": 27189 + }, + { + "epoch": 0.8062746493491089, + "grad_norm": 0.08879973739385605, + "learning_rate": 9.157345758809304e-05, + "loss": 2.6144, + "step": 27190 + }, + { + "epoch": 0.8063043027014204, + "grad_norm": 0.09460367262363434, + "learning_rate": 9.154631824894205e-05, + "loss": 2.5556, + "step": 27191 + }, + { + "epoch": 0.8063339560537319, + "grad_norm": 0.08325488120317459, + "learning_rate": 9.151918252671849e-05, + "loss": 2.5882, + "step": 27192 + }, + { + "epoch": 0.8063636094060433, + "grad_norm": 0.09586183726787567, + "learning_rate": 9.149205042166269e-05, + "loss": 2.5838, + "step": 27193 + }, + { + "epoch": 0.8063932627583549, + "grad_norm": 0.09471157938241959, + "learning_rate": 9.14649219340149e-05, + "loss": 2.5384, + "step": 27194 + }, + { + "epoch": 0.8064229161106663, + "grad_norm": 0.09222476929426193, + "learning_rate": 9.143779706401533e-05, + "loss": 2.5926, + "step": 27195 + }, + { + "epoch": 0.8064525694629778, + "grad_norm": 0.10030599683523178, + "learning_rate": 9.141067581190426e-05, + "loss": 2.5769, + "step": 27196 + }, + { + "epoch": 0.8064822228152893, + "grad_norm": 0.09004797786474228, + "learning_rate": 9.138355817792176e-05, + "loss": 2.5652, + "step": 27197 + }, + { + "epoch": 0.8065118761676008, + "grad_norm": 0.09604717791080475, + "learning_rate": 9.135644416230798e-05, + "loss": 2.6034, + "step": 27198 + }, + { + "epoch": 0.8065415295199122, + "grad_norm": 0.08311336487531662, + "learning_rate": 9.1329333765303e-05, + "loss": 2.556, + "step": 27199 + }, + { + "epoch": 0.8065711828722237, + "grad_norm": 0.09270385652780533, + "learning_rate": 9.130222698714707e-05, + "loss": 2.5925, + "step": 27200 + }, + { + "epoch": 0.8066008362245352, + "grad_norm": 0.08335378766059875, + "learning_rate": 9.127512382807984e-05, + "loss": 2.5325, + "step": 27201 + }, + { + "epoch": 0.8066304895768467, + "grad_norm": 0.08225712180137634, + "learning_rate": 9.124802428834162e-05, + "loss": 2.5973, + "step": 27202 + }, + { + "epoch": 0.8066601429291581, + "grad_norm": 0.09783348441123962, + "learning_rate": 9.122092836817236e-05, + "loss": 2.59, + "step": 27203 + }, + { + "epoch": 0.8066897962814696, + "grad_norm": 0.08416464179754257, + "learning_rate": 9.11938360678119e-05, + "loss": 2.5875, + "step": 27204 + }, + { + "epoch": 0.8067194496337811, + "grad_norm": 0.1006573885679245, + "learning_rate": 9.116674738750025e-05, + "loss": 2.6176, + "step": 27205 + }, + { + "epoch": 0.8067491029860926, + "grad_norm": 0.0926152914762497, + "learning_rate": 9.113966232747717e-05, + "loss": 2.5972, + "step": 27206 + }, + { + "epoch": 0.806778756338404, + "grad_norm": 0.097350113093853, + "learning_rate": 9.111258088798258e-05, + "loss": 2.5765, + "step": 27207 + }, + { + "epoch": 0.8068084096907155, + "grad_norm": 0.09540120512247086, + "learning_rate": 9.108550306925628e-05, + "loss": 2.5938, + "step": 27208 + }, + { + "epoch": 0.806838063043027, + "grad_norm": 0.08808202296495438, + "learning_rate": 9.105842887153804e-05, + "loss": 2.5489, + "step": 27209 + }, + { + "epoch": 0.8068677163953385, + "grad_norm": 0.09388608485460281, + "learning_rate": 9.10313582950677e-05, + "loss": 2.5778, + "step": 27210 + }, + { + "epoch": 0.8068973697476499, + "grad_norm": 0.09285447746515274, + "learning_rate": 9.100429134008481e-05, + "loss": 2.6006, + "step": 27211 + }, + { + "epoch": 0.8069270230999614, + "grad_norm": 0.08704305440187454, + "learning_rate": 9.097722800682905e-05, + "loss": 2.5989, + "step": 27212 + }, + { + "epoch": 0.806956676452273, + "grad_norm": 0.10767266899347305, + "learning_rate": 9.09501682955402e-05, + "loss": 2.5848, + "step": 27213 + }, + { + "epoch": 0.8069863298045844, + "grad_norm": 0.0892045721411705, + "learning_rate": 9.092311220645772e-05, + "loss": 2.561, + "step": 27214 + }, + { + "epoch": 0.8070159831568959, + "grad_norm": 0.08995449542999268, + "learning_rate": 9.089605973982134e-05, + "loss": 2.5725, + "step": 27215 + }, + { + "epoch": 0.8070456365092074, + "grad_norm": 0.08424988389015198, + "learning_rate": 9.086901089587063e-05, + "loss": 2.5829, + "step": 27216 + }, + { + "epoch": 0.8070752898615189, + "grad_norm": 0.0943467766046524, + "learning_rate": 9.08419656748451e-05, + "loss": 2.6018, + "step": 27217 + }, + { + "epoch": 0.8071049432138303, + "grad_norm": 0.08354651182889938, + "learning_rate": 9.081492407698411e-05, + "loss": 2.5645, + "step": 27218 + }, + { + "epoch": 0.8071345965661418, + "grad_norm": 0.09064377099275589, + "learning_rate": 9.078788610252725e-05, + "loss": 2.5525, + "step": 27219 + }, + { + "epoch": 0.8071642499184533, + "grad_norm": 0.09324058890342712, + "learning_rate": 9.07608517517139e-05, + "loss": 2.5927, + "step": 27220 + }, + { + "epoch": 0.8071939032707648, + "grad_norm": 0.08909137547016144, + "learning_rate": 9.07338210247835e-05, + "loss": 2.6008, + "step": 27221 + }, + { + "epoch": 0.8072235566230762, + "grad_norm": 0.09303201735019684, + "learning_rate": 9.070679392197534e-05, + "loss": 2.5885, + "step": 27222 + }, + { + "epoch": 0.8072532099753877, + "grad_norm": 0.09158424288034439, + "learning_rate": 9.067977044352871e-05, + "loss": 2.5938, + "step": 27223 + }, + { + "epoch": 0.8072828633276992, + "grad_norm": 0.09331610053777695, + "learning_rate": 9.065275058968303e-05, + "loss": 2.5935, + "step": 27224 + }, + { + "epoch": 0.8073125166800107, + "grad_norm": 0.09326671063899994, + "learning_rate": 9.062573436067745e-05, + "loss": 2.5854, + "step": 27225 + }, + { + "epoch": 0.8073421700323221, + "grad_norm": 0.0915137231349945, + "learning_rate": 9.059872175675126e-05, + "loss": 2.5896, + "step": 27226 + }, + { + "epoch": 0.8073718233846336, + "grad_norm": 0.09213074296712875, + "learning_rate": 9.057171277814358e-05, + "loss": 2.5612, + "step": 27227 + }, + { + "epoch": 0.8074014767369451, + "grad_norm": 0.09355367720127106, + "learning_rate": 9.054470742509374e-05, + "loss": 2.5922, + "step": 27228 + }, + { + "epoch": 0.8074311300892566, + "grad_norm": 0.09362068772315979, + "learning_rate": 9.051770569784085e-05, + "loss": 2.5932, + "step": 27229 + }, + { + "epoch": 0.807460783441568, + "grad_norm": 0.08482933044433594, + "learning_rate": 9.049070759662392e-05, + "loss": 2.5945, + "step": 27230 + }, + { + "epoch": 0.8074904367938796, + "grad_norm": 0.09679339081048965, + "learning_rate": 9.046371312168222e-05, + "loss": 2.5634, + "step": 27231 + }, + { + "epoch": 0.807520090146191, + "grad_norm": 0.08880709856748581, + "learning_rate": 9.04367222732545e-05, + "loss": 2.5823, + "step": 27232 + }, + { + "epoch": 0.8075497434985025, + "grad_norm": 0.08459904789924622, + "learning_rate": 9.04097350515799e-05, + "loss": 2.5991, + "step": 27233 + }, + { + "epoch": 0.807579396850814, + "grad_norm": 0.10118622332811356, + "learning_rate": 9.038275145689739e-05, + "loss": 2.6036, + "step": 27234 + }, + { + "epoch": 0.8076090502031255, + "grad_norm": 0.08478018641471863, + "learning_rate": 9.035577148944596e-05, + "loss": 2.532, + "step": 27235 + }, + { + "epoch": 0.807638703555437, + "grad_norm": 0.09374938160181046, + "learning_rate": 9.032879514946445e-05, + "loss": 2.601, + "step": 27236 + }, + { + "epoch": 0.8076683569077484, + "grad_norm": 0.07921343296766281, + "learning_rate": 9.030182243719181e-05, + "loss": 2.5832, + "step": 27237 + }, + { + "epoch": 0.8076980102600599, + "grad_norm": 0.09037577360868454, + "learning_rate": 9.027485335286684e-05, + "loss": 2.5626, + "step": 27238 + }, + { + "epoch": 0.8077276636123714, + "grad_norm": 0.08801856637001038, + "learning_rate": 9.024788789672838e-05, + "loss": 2.5583, + "step": 27239 + }, + { + "epoch": 0.8077573169646829, + "grad_norm": 0.08211498707532883, + "learning_rate": 9.022092606901505e-05, + "loss": 2.5926, + "step": 27240 + }, + { + "epoch": 0.8077869703169943, + "grad_norm": 0.08907822519540787, + "learning_rate": 9.019396786996592e-05, + "loss": 2.5788, + "step": 27241 + }, + { + "epoch": 0.8078166236693058, + "grad_norm": 0.09161773324012756, + "learning_rate": 9.01670132998197e-05, + "loss": 2.5549, + "step": 27242 + }, + { + "epoch": 0.8078462770216173, + "grad_norm": 0.09185168892145157, + "learning_rate": 9.014006235881473e-05, + "loss": 2.5606, + "step": 27243 + }, + { + "epoch": 0.8078759303739288, + "grad_norm": 0.08621837943792343, + "learning_rate": 9.011311504718988e-05, + "loss": 2.5487, + "step": 27244 + }, + { + "epoch": 0.8079055837262402, + "grad_norm": 0.08398102223873138, + "learning_rate": 9.008617136518377e-05, + "loss": 2.5627, + "step": 27245 + }, + { + "epoch": 0.8079352370785517, + "grad_norm": 0.10470909625291824, + "learning_rate": 9.005923131303496e-05, + "loss": 2.6055, + "step": 27246 + }, + { + "epoch": 0.8079648904308632, + "grad_norm": 0.08851833641529083, + "learning_rate": 9.003229489098203e-05, + "loss": 2.5917, + "step": 27247 + }, + { + "epoch": 0.8079945437831747, + "grad_norm": 0.08580972999334335, + "learning_rate": 9.000536209926353e-05, + "loss": 2.5613, + "step": 27248 + }, + { + "epoch": 0.8080241971354861, + "grad_norm": 0.09302722662687302, + "learning_rate": 8.997843293811786e-05, + "loss": 2.5859, + "step": 27249 + }, + { + "epoch": 0.8080538504877977, + "grad_norm": 0.09410391747951508, + "learning_rate": 8.995150740778358e-05, + "loss": 2.5791, + "step": 27250 + }, + { + "epoch": 0.8080835038401091, + "grad_norm": 0.09092742204666138, + "learning_rate": 8.992458550849908e-05, + "loss": 2.6049, + "step": 27251 + }, + { + "epoch": 0.8081131571924206, + "grad_norm": 0.0914648026227951, + "learning_rate": 8.989766724050274e-05, + "loss": 2.5558, + "step": 27252 + }, + { + "epoch": 0.808142810544732, + "grad_norm": 0.09090568870306015, + "learning_rate": 8.987075260403299e-05, + "loss": 2.6147, + "step": 27253 + }, + { + "epoch": 0.8081724638970436, + "grad_norm": 0.09324884414672852, + "learning_rate": 8.984384159932807e-05, + "loss": 2.5869, + "step": 27254 + }, + { + "epoch": 0.8082021172493551, + "grad_norm": 0.0892934799194336, + "learning_rate": 8.98169342266264e-05, + "loss": 2.5797, + "step": 27255 + }, + { + "epoch": 0.8082317706016665, + "grad_norm": 0.10111378133296967, + "learning_rate": 8.979003048616613e-05, + "loss": 2.5459, + "step": 27256 + }, + { + "epoch": 0.808261423953978, + "grad_norm": 0.09233927726745605, + "learning_rate": 8.976313037818562e-05, + "loss": 2.5823, + "step": 27257 + }, + { + "epoch": 0.8082910773062895, + "grad_norm": 0.09910917282104492, + "learning_rate": 8.973623390292296e-05, + "loss": 2.5072, + "step": 27258 + }, + { + "epoch": 0.808320730658601, + "grad_norm": 0.092647023499012, + "learning_rate": 8.970934106061634e-05, + "loss": 2.6075, + "step": 27259 + }, + { + "epoch": 0.8083503840109124, + "grad_norm": 0.09454407542943954, + "learning_rate": 8.968245185150398e-05, + "loss": 2.5703, + "step": 27260 + }, + { + "epoch": 0.808380037363224, + "grad_norm": 0.09647075831890106, + "learning_rate": 8.965556627582394e-05, + "loss": 2.6075, + "step": 27261 + }, + { + "epoch": 0.8084096907155354, + "grad_norm": 0.08599869161844254, + "learning_rate": 8.962868433381427e-05, + "loss": 2.5803, + "step": 27262 + }, + { + "epoch": 0.8084393440678469, + "grad_norm": 0.08906155079603195, + "learning_rate": 8.960180602571305e-05, + "loss": 2.5935, + "step": 27263 + }, + { + "epoch": 0.8084689974201583, + "grad_norm": 0.09066352993249893, + "learning_rate": 8.957493135175825e-05, + "loss": 2.5653, + "step": 27264 + }, + { + "epoch": 0.8084986507724699, + "grad_norm": 0.09032373130321503, + "learning_rate": 8.954806031218793e-05, + "loss": 2.5731, + "step": 27265 + }, + { + "epoch": 0.8085283041247813, + "grad_norm": 0.09113319963216782, + "learning_rate": 8.952119290723999e-05, + "loss": 2.561, + "step": 27266 + }, + { + "epoch": 0.8085579574770928, + "grad_norm": 0.09779481589794159, + "learning_rate": 8.949432913715233e-05, + "loss": 2.5815, + "step": 27267 + }, + { + "epoch": 0.8085876108294042, + "grad_norm": 0.08971776068210602, + "learning_rate": 8.946746900216279e-05, + "loss": 2.5384, + "step": 27268 + }, + { + "epoch": 0.8086172641817158, + "grad_norm": 0.09007861465215683, + "learning_rate": 8.944061250250935e-05, + "loss": 2.5696, + "step": 27269 + }, + { + "epoch": 0.8086469175340272, + "grad_norm": 0.0872100219130516, + "learning_rate": 8.941375963842973e-05, + "loss": 2.5669, + "step": 27270 + }, + { + "epoch": 0.8086765708863387, + "grad_norm": 0.08578468859195709, + "learning_rate": 8.938691041016178e-05, + "loss": 2.5814, + "step": 27271 + }, + { + "epoch": 0.8087062242386501, + "grad_norm": 0.08272199332714081, + "learning_rate": 8.936006481794318e-05, + "loss": 2.5725, + "step": 27272 + }, + { + "epoch": 0.8087358775909617, + "grad_norm": 0.09330065548419952, + "learning_rate": 8.933322286201173e-05, + "loss": 2.5782, + "step": 27273 + }, + { + "epoch": 0.8087655309432732, + "grad_norm": 0.08747871220111847, + "learning_rate": 8.930638454260504e-05, + "loss": 2.6119, + "step": 27274 + }, + { + "epoch": 0.8087951842955846, + "grad_norm": 0.09970694780349731, + "learning_rate": 8.927954985996084e-05, + "loss": 2.5855, + "step": 27275 + }, + { + "epoch": 0.8088248376478961, + "grad_norm": 0.08586345613002777, + "learning_rate": 8.925271881431679e-05, + "loss": 2.5416, + "step": 27276 + }, + { + "epoch": 0.8088544910002076, + "grad_norm": 0.08952458947896957, + "learning_rate": 8.922589140591036e-05, + "loss": 2.5726, + "step": 27277 + }, + { + "epoch": 0.8088841443525191, + "grad_norm": 0.09894398599863052, + "learning_rate": 8.919906763497914e-05, + "loss": 2.6088, + "step": 27278 + }, + { + "epoch": 0.8089137977048305, + "grad_norm": 0.08420028537511826, + "learning_rate": 8.917224750176056e-05, + "loss": 2.584, + "step": 27279 + }, + { + "epoch": 0.808943451057142, + "grad_norm": 0.0981818214058876, + "learning_rate": 8.914543100649242e-05, + "loss": 2.5426, + "step": 27280 + }, + { + "epoch": 0.8089731044094535, + "grad_norm": 0.08427208662033081, + "learning_rate": 8.911861814941197e-05, + "loss": 2.6032, + "step": 27281 + }, + { + "epoch": 0.809002757761765, + "grad_norm": 0.09554142504930496, + "learning_rate": 8.90918089307567e-05, + "loss": 2.6058, + "step": 27282 + }, + { + "epoch": 0.8090324111140764, + "grad_norm": 0.09308762848377228, + "learning_rate": 8.906500335076395e-05, + "loss": 2.5691, + "step": 27283 + }, + { + "epoch": 0.809062064466388, + "grad_norm": 0.08424347639083862, + "learning_rate": 8.903820140967116e-05, + "loss": 2.5483, + "step": 27284 + }, + { + "epoch": 0.8090917178186994, + "grad_norm": 0.09892893582582474, + "learning_rate": 8.901140310771566e-05, + "loss": 2.5986, + "step": 27285 + }, + { + "epoch": 0.8091213711710109, + "grad_norm": 0.08528570085763931, + "learning_rate": 8.898460844513484e-05, + "loss": 2.5314, + "step": 27286 + }, + { + "epoch": 0.8091510245233223, + "grad_norm": 0.08990723639726639, + "learning_rate": 8.895781742216574e-05, + "loss": 2.5441, + "step": 27287 + }, + { + "epoch": 0.8091806778756339, + "grad_norm": 0.08339585363864899, + "learning_rate": 8.893103003904573e-05, + "loss": 2.6427, + "step": 27288 + }, + { + "epoch": 0.8092103312279453, + "grad_norm": 0.09159857779741287, + "learning_rate": 8.890424629601196e-05, + "loss": 2.5578, + "step": 27289 + }, + { + "epoch": 0.8092399845802568, + "grad_norm": 0.08737090975046158, + "learning_rate": 8.887746619330166e-05, + "loss": 2.5264, + "step": 27290 + }, + { + "epoch": 0.8092696379325682, + "grad_norm": 0.09071941673755646, + "learning_rate": 8.885068973115201e-05, + "loss": 2.6101, + "step": 27291 + }, + { + "epoch": 0.8092992912848798, + "grad_norm": 0.09128955006599426, + "learning_rate": 8.882391690979996e-05, + "loss": 2.5475, + "step": 27292 + }, + { + "epoch": 0.8093289446371912, + "grad_norm": 0.08212514966726303, + "learning_rate": 8.879714772948278e-05, + "loss": 2.5482, + "step": 27293 + }, + { + "epoch": 0.8093585979895027, + "grad_norm": 0.10031439363956451, + "learning_rate": 8.877038219043748e-05, + "loss": 2.602, + "step": 27294 + }, + { + "epoch": 0.8093882513418142, + "grad_norm": 0.0838383287191391, + "learning_rate": 8.8743620292901e-05, + "loss": 2.5885, + "step": 27295 + }, + { + "epoch": 0.8094179046941257, + "grad_norm": 0.14529751241207123, + "learning_rate": 8.871686203711038e-05, + "loss": 2.5619, + "step": 27296 + }, + { + "epoch": 0.8094475580464372, + "grad_norm": 0.0842689648270607, + "learning_rate": 8.869010742330264e-05, + "loss": 2.5649, + "step": 27297 + }, + { + "epoch": 0.8094772113987486, + "grad_norm": 0.09382209926843643, + "learning_rate": 8.866335645171447e-05, + "loss": 2.5672, + "step": 27298 + }, + { + "epoch": 0.8095068647510602, + "grad_norm": 0.0906984955072403, + "learning_rate": 8.863660912258292e-05, + "loss": 2.5748, + "step": 27299 + }, + { + "epoch": 0.8095365181033716, + "grad_norm": 0.10094954073429108, + "learning_rate": 8.86098654361448e-05, + "loss": 2.5873, + "step": 27300 + }, + { + "epoch": 0.8095661714556831, + "grad_norm": 0.08231193572282791, + "learning_rate": 8.858312539263691e-05, + "loss": 2.6009, + "step": 27301 + }, + { + "epoch": 0.8095958248079945, + "grad_norm": 0.09844378381967545, + "learning_rate": 8.855638899229607e-05, + "loss": 2.5947, + "step": 27302 + }, + { + "epoch": 0.8096254781603061, + "grad_norm": 0.09401267021894455, + "learning_rate": 8.852965623535903e-05, + "loss": 2.549, + "step": 27303 + }, + { + "epoch": 0.8096551315126175, + "grad_norm": 0.09075117856264114, + "learning_rate": 8.850292712206249e-05, + "loss": 2.5892, + "step": 27304 + }, + { + "epoch": 0.809684784864929, + "grad_norm": 0.09343108534812927, + "learning_rate": 8.847620165264308e-05, + "loss": 2.5692, + "step": 27305 + }, + { + "epoch": 0.8097144382172404, + "grad_norm": 0.09460444748401642, + "learning_rate": 8.844947982733765e-05, + "loss": 2.5711, + "step": 27306 + }, + { + "epoch": 0.809744091569552, + "grad_norm": 0.09167010337114334, + "learning_rate": 8.842276164638286e-05, + "loss": 2.5743, + "step": 27307 + }, + { + "epoch": 0.8097737449218634, + "grad_norm": 0.09477195143699646, + "learning_rate": 8.839604711001497e-05, + "loss": 2.5592, + "step": 27308 + }, + { + "epoch": 0.8098033982741749, + "grad_norm": 0.08722632378339767, + "learning_rate": 8.836933621847082e-05, + "loss": 2.5734, + "step": 27309 + }, + { + "epoch": 0.8098330516264863, + "grad_norm": 0.09297391027212143, + "learning_rate": 8.83426289719868e-05, + "loss": 2.6004, + "step": 27310 + }, + { + "epoch": 0.8098627049787979, + "grad_norm": 0.08489644527435303, + "learning_rate": 8.831592537079946e-05, + "loss": 2.6056, + "step": 27311 + }, + { + "epoch": 0.8098923583311093, + "grad_norm": 0.08840028196573257, + "learning_rate": 8.828922541514534e-05, + "loss": 2.5669, + "step": 27312 + }, + { + "epoch": 0.8099220116834208, + "grad_norm": 0.08838743716478348, + "learning_rate": 8.826252910526072e-05, + "loss": 2.5958, + "step": 27313 + }, + { + "epoch": 0.8099516650357322, + "grad_norm": 0.08677101880311966, + "learning_rate": 8.823583644138211e-05, + "loss": 2.6052, + "step": 27314 + }, + { + "epoch": 0.8099813183880438, + "grad_norm": 0.08197497576475143, + "learning_rate": 8.82091474237458e-05, + "loss": 2.5583, + "step": 27315 + }, + { + "epoch": 0.8100109717403553, + "grad_norm": 0.08484247326850891, + "learning_rate": 8.818246205258822e-05, + "loss": 2.5399, + "step": 27316 + }, + { + "epoch": 0.8100406250926667, + "grad_norm": 0.08429805189371109, + "learning_rate": 8.815578032814565e-05, + "loss": 2.5755, + "step": 27317 + }, + { + "epoch": 0.8100702784449783, + "grad_norm": 0.07843578606843948, + "learning_rate": 8.81291022506543e-05, + "loss": 2.5485, + "step": 27318 + }, + { + "epoch": 0.8100999317972897, + "grad_norm": 0.08227745443582535, + "learning_rate": 8.810242782035044e-05, + "loss": 2.5685, + "step": 27319 + }, + { + "epoch": 0.8101295851496012, + "grad_norm": 0.08574479073286057, + "learning_rate": 8.807575703747028e-05, + "loss": 2.6042, + "step": 27320 + }, + { + "epoch": 0.8101592385019126, + "grad_norm": 0.08161991834640503, + "learning_rate": 8.804908990224996e-05, + "loss": 2.6011, + "step": 27321 + }, + { + "epoch": 0.8101888918542242, + "grad_norm": 0.09207433462142944, + "learning_rate": 8.802242641492575e-05, + "loss": 2.5546, + "step": 27322 + }, + { + "epoch": 0.8102185452065356, + "grad_norm": 0.08431506156921387, + "learning_rate": 8.799576657573361e-05, + "loss": 2.6006, + "step": 27323 + }, + { + "epoch": 0.8102481985588471, + "grad_norm": 0.08382803201675415, + "learning_rate": 8.796911038490968e-05, + "loss": 2.6161, + "step": 27324 + }, + { + "epoch": 0.8102778519111585, + "grad_norm": 0.08762633800506592, + "learning_rate": 8.794245784269006e-05, + "loss": 2.5721, + "step": 27325 + }, + { + "epoch": 0.8103075052634701, + "grad_norm": 0.08201520889997482, + "learning_rate": 8.791580894931062e-05, + "loss": 2.5719, + "step": 27326 + }, + { + "epoch": 0.8103371586157815, + "grad_norm": 0.09022344648838043, + "learning_rate": 8.788916370500749e-05, + "loss": 2.6055, + "step": 27327 + }, + { + "epoch": 0.810366811968093, + "grad_norm": 0.08515484631061554, + "learning_rate": 8.78625221100165e-05, + "loss": 2.5551, + "step": 27328 + }, + { + "epoch": 0.8103964653204044, + "grad_norm": 0.08800514042377472, + "learning_rate": 8.783588416457367e-05, + "loss": 2.6393, + "step": 27329 + }, + { + "epoch": 0.810426118672716, + "grad_norm": 0.08656040579080582, + "learning_rate": 8.780924986891481e-05, + "loss": 2.572, + "step": 27330 + }, + { + "epoch": 0.8104557720250274, + "grad_norm": 0.09073000401258469, + "learning_rate": 8.77826192232758e-05, + "loss": 2.5471, + "step": 27331 + }, + { + "epoch": 0.8104854253773389, + "grad_norm": 0.08562526106834412, + "learning_rate": 8.775599222789244e-05, + "loss": 2.6129, + "step": 27332 + }, + { + "epoch": 0.8105150787296503, + "grad_norm": 0.0968160405755043, + "learning_rate": 8.772936888300053e-05, + "loss": 2.5995, + "step": 27333 + }, + { + "epoch": 0.8105447320819619, + "grad_norm": 0.09075716137886047, + "learning_rate": 8.770274918883586e-05, + "loss": 2.5655, + "step": 27334 + }, + { + "epoch": 0.8105743854342733, + "grad_norm": 0.09305606037378311, + "learning_rate": 8.767613314563405e-05, + "loss": 2.6241, + "step": 27335 + }, + { + "epoch": 0.8106040387865848, + "grad_norm": 0.08910888433456421, + "learning_rate": 8.764952075363092e-05, + "loss": 2.5805, + "step": 27336 + }, + { + "epoch": 0.8106336921388964, + "grad_norm": 0.08021784573793411, + "learning_rate": 8.762291201306199e-05, + "loss": 2.6123, + "step": 27337 + }, + { + "epoch": 0.8106633454912078, + "grad_norm": 0.09772240370512009, + "learning_rate": 8.759630692416304e-05, + "loss": 2.604, + "step": 27338 + }, + { + "epoch": 0.8106929988435193, + "grad_norm": 0.08446004986763, + "learning_rate": 8.756970548716953e-05, + "loss": 2.5297, + "step": 27339 + }, + { + "epoch": 0.8107226521958307, + "grad_norm": 0.08677010983228683, + "learning_rate": 8.75431077023171e-05, + "loss": 2.561, + "step": 27340 + }, + { + "epoch": 0.8107523055481423, + "grad_norm": 0.1072758138179779, + "learning_rate": 8.751651356984119e-05, + "loss": 2.6018, + "step": 27341 + }, + { + "epoch": 0.8107819589004537, + "grad_norm": 0.08479108661413193, + "learning_rate": 8.748992308997755e-05, + "loss": 2.6041, + "step": 27342 + }, + { + "epoch": 0.8108116122527652, + "grad_norm": 0.10366395115852356, + "learning_rate": 8.746333626296127e-05, + "loss": 2.568, + "step": 27343 + }, + { + "epoch": 0.8108412656050766, + "grad_norm": 0.08610913157463074, + "learning_rate": 8.743675308902787e-05, + "loss": 2.5498, + "step": 27344 + }, + { + "epoch": 0.8108709189573882, + "grad_norm": 0.1006425991654396, + "learning_rate": 8.741017356841297e-05, + "loss": 2.6056, + "step": 27345 + }, + { + "epoch": 0.8109005723096996, + "grad_norm": 0.08954354375600815, + "learning_rate": 8.738359770135179e-05, + "loss": 2.6096, + "step": 27346 + }, + { + "epoch": 0.8109302256620111, + "grad_norm": 0.0925290435552597, + "learning_rate": 8.735702548807966e-05, + "loss": 2.6402, + "step": 27347 + }, + { + "epoch": 0.8109598790143225, + "grad_norm": 0.08544279634952545, + "learning_rate": 8.733045692883191e-05, + "loss": 2.5559, + "step": 27348 + }, + { + "epoch": 0.8109895323666341, + "grad_norm": 0.08979549258947372, + "learning_rate": 8.730389202384382e-05, + "loss": 2.5763, + "step": 27349 + }, + { + "epoch": 0.8110191857189455, + "grad_norm": 0.08557116985321045, + "learning_rate": 8.727733077335053e-05, + "loss": 2.5683, + "step": 27350 + }, + { + "epoch": 0.811048839071257, + "grad_norm": 0.09928178787231445, + "learning_rate": 8.725077317758739e-05, + "loss": 2.5887, + "step": 27351 + }, + { + "epoch": 0.8110784924235684, + "grad_norm": 0.08598238229751587, + "learning_rate": 8.722421923678959e-05, + "loss": 2.592, + "step": 27352 + }, + { + "epoch": 0.81110814577588, + "grad_norm": 0.08411408215761185, + "learning_rate": 8.719766895119207e-05, + "loss": 2.5836, + "step": 27353 + }, + { + "epoch": 0.8111377991281914, + "grad_norm": 0.08140140026807785, + "learning_rate": 8.717112232103008e-05, + "loss": 2.578, + "step": 27354 + }, + { + "epoch": 0.8111674524805029, + "grad_norm": 0.08572444319725037, + "learning_rate": 8.714457934653863e-05, + "loss": 2.5428, + "step": 27355 + }, + { + "epoch": 0.8111971058328143, + "grad_norm": 0.08636531233787537, + "learning_rate": 8.711804002795276e-05, + "loss": 2.6105, + "step": 27356 + }, + { + "epoch": 0.8112267591851259, + "grad_norm": 0.08240972459316254, + "learning_rate": 8.709150436550744e-05, + "loss": 2.5686, + "step": 27357 + }, + { + "epoch": 0.8112564125374374, + "grad_norm": 0.08186500519514084, + "learning_rate": 8.706497235943783e-05, + "loss": 2.5359, + "step": 27358 + }, + { + "epoch": 0.8112860658897488, + "grad_norm": 0.09334615617990494, + "learning_rate": 8.703844400997878e-05, + "loss": 2.5919, + "step": 27359 + }, + { + "epoch": 0.8113157192420604, + "grad_norm": 0.08806832134723663, + "learning_rate": 8.701191931736518e-05, + "loss": 2.5821, + "step": 27360 + }, + { + "epoch": 0.8113453725943718, + "grad_norm": 0.07897934317588806, + "learning_rate": 8.698539828183193e-05, + "loss": 2.5912, + "step": 27361 + }, + { + "epoch": 0.8113750259466833, + "grad_norm": 0.09358146786689758, + "learning_rate": 8.695888090361386e-05, + "loss": 2.5912, + "step": 27362 + }, + { + "epoch": 0.8114046792989947, + "grad_norm": 0.09028082340955734, + "learning_rate": 8.693236718294595e-05, + "loss": 2.586, + "step": 27363 + }, + { + "epoch": 0.8114343326513063, + "grad_norm": 0.0881166160106659, + "learning_rate": 8.690585712006272e-05, + "loss": 2.5611, + "step": 27364 + }, + { + "epoch": 0.8114639860036177, + "grad_norm": 0.0892069935798645, + "learning_rate": 8.687935071519898e-05, + "loss": 2.568, + "step": 27365 + }, + { + "epoch": 0.8114936393559292, + "grad_norm": 0.0848018229007721, + "learning_rate": 8.685284796858955e-05, + "loss": 2.5787, + "step": 27366 + }, + { + "epoch": 0.8115232927082406, + "grad_norm": 0.08711962401866913, + "learning_rate": 8.682634888046903e-05, + "loss": 2.598, + "step": 27367 + }, + { + "epoch": 0.8115529460605522, + "grad_norm": 0.08532993495464325, + "learning_rate": 8.679985345107211e-05, + "loss": 2.5607, + "step": 27368 + }, + { + "epoch": 0.8115825994128636, + "grad_norm": 0.09094864130020142, + "learning_rate": 8.677336168063332e-05, + "loss": 2.5589, + "step": 27369 + }, + { + "epoch": 0.8116122527651751, + "grad_norm": 0.08492986857891083, + "learning_rate": 8.674687356938743e-05, + "loss": 2.5774, + "step": 27370 + }, + { + "epoch": 0.8116419061174865, + "grad_norm": 0.09187416732311249, + "learning_rate": 8.67203891175689e-05, + "loss": 2.593, + "step": 27371 + }, + { + "epoch": 0.8116715594697981, + "grad_norm": 0.09597554802894592, + "learning_rate": 8.66939083254123e-05, + "loss": 2.6114, + "step": 27372 + }, + { + "epoch": 0.8117012128221095, + "grad_norm": 0.08466890454292297, + "learning_rate": 8.666743119315218e-05, + "loss": 2.5912, + "step": 27373 + }, + { + "epoch": 0.811730866174421, + "grad_norm": 0.09466859698295593, + "learning_rate": 8.664095772102282e-05, + "loss": 2.5869, + "step": 27374 + }, + { + "epoch": 0.8117605195267324, + "grad_norm": 0.09041654318571091, + "learning_rate": 8.661448790925868e-05, + "loss": 2.6031, + "step": 27375 + }, + { + "epoch": 0.811790172879044, + "grad_norm": 0.09197181463241577, + "learning_rate": 8.658802175809427e-05, + "loss": 2.5815, + "step": 27376 + }, + { + "epoch": 0.8118198262313554, + "grad_norm": 0.10586518049240112, + "learning_rate": 8.656155926776383e-05, + "loss": 2.5618, + "step": 27377 + }, + { + "epoch": 0.8118494795836669, + "grad_norm": 0.08312857896089554, + "learning_rate": 8.653510043850176e-05, + "loss": 2.5953, + "step": 27378 + }, + { + "epoch": 0.8118791329359785, + "grad_norm": 0.09293169528245926, + "learning_rate": 8.650864527054237e-05, + "loss": 2.6086, + "step": 27379 + }, + { + "epoch": 0.8119087862882899, + "grad_norm": 0.09236880391836166, + "learning_rate": 8.648219376411986e-05, + "loss": 2.6022, + "step": 27380 + }, + { + "epoch": 0.8119384396406014, + "grad_norm": 0.08564291894435883, + "learning_rate": 8.645574591946859e-05, + "loss": 2.5901, + "step": 27381 + }, + { + "epoch": 0.8119680929929128, + "grad_norm": 0.08681531250476837, + "learning_rate": 8.642930173682245e-05, + "loss": 2.5586, + "step": 27382 + }, + { + "epoch": 0.8119977463452244, + "grad_norm": 0.09152913838624954, + "learning_rate": 8.640286121641611e-05, + "loss": 2.6055, + "step": 27383 + }, + { + "epoch": 0.8120273996975358, + "grad_norm": 0.09571704268455505, + "learning_rate": 8.637642435848336e-05, + "loss": 2.5726, + "step": 27384 + }, + { + "epoch": 0.8120570530498473, + "grad_norm": 0.08242803812026978, + "learning_rate": 8.634999116325832e-05, + "loss": 2.5535, + "step": 27385 + }, + { + "epoch": 0.8120867064021587, + "grad_norm": 0.08594180643558502, + "learning_rate": 8.63235616309751e-05, + "loss": 2.5664, + "step": 27386 + }, + { + "epoch": 0.8121163597544703, + "grad_norm": 0.08959037810564041, + "learning_rate": 8.629713576186776e-05, + "loss": 2.6031, + "step": 27387 + }, + { + "epoch": 0.8121460131067817, + "grad_norm": 0.0852711945772171, + "learning_rate": 8.627071355617027e-05, + "loss": 2.5919, + "step": 27388 + }, + { + "epoch": 0.8121756664590932, + "grad_norm": 0.08897058665752411, + "learning_rate": 8.624429501411667e-05, + "loss": 2.5736, + "step": 27389 + }, + { + "epoch": 0.8122053198114046, + "grad_norm": 0.0812622532248497, + "learning_rate": 8.621788013594084e-05, + "loss": 2.5764, + "step": 27390 + }, + { + "epoch": 0.8122349731637162, + "grad_norm": 0.0924786627292633, + "learning_rate": 8.619146892187674e-05, + "loss": 2.5982, + "step": 27391 + }, + { + "epoch": 0.8122646265160276, + "grad_norm": 0.08219049125909805, + "learning_rate": 8.616506137215813e-05, + "loss": 2.5748, + "step": 27392 + }, + { + "epoch": 0.8122942798683391, + "grad_norm": 0.09283830225467682, + "learning_rate": 8.613865748701899e-05, + "loss": 2.5848, + "step": 27393 + }, + { + "epoch": 0.8123239332206506, + "grad_norm": 0.08563993126153946, + "learning_rate": 8.611225726669309e-05, + "loss": 2.6104, + "step": 27394 + }, + { + "epoch": 0.8123535865729621, + "grad_norm": 0.09336702525615692, + "learning_rate": 8.608586071141417e-05, + "loss": 2.6186, + "step": 27395 + }, + { + "epoch": 0.8123832399252735, + "grad_norm": 0.08105794340372086, + "learning_rate": 8.605946782141599e-05, + "loss": 2.5758, + "step": 27396 + }, + { + "epoch": 0.812412893277585, + "grad_norm": 0.09614761173725128, + "learning_rate": 8.603307859693233e-05, + "loss": 2.6131, + "step": 27397 + }, + { + "epoch": 0.8124425466298965, + "grad_norm": 0.08943680673837662, + "learning_rate": 8.600669303819675e-05, + "loss": 2.5961, + "step": 27398 + }, + { + "epoch": 0.812472199982208, + "grad_norm": 0.0938781127333641, + "learning_rate": 8.598031114544303e-05, + "loss": 2.5887, + "step": 27399 + }, + { + "epoch": 0.8125018533345195, + "grad_norm": 0.08763180673122406, + "learning_rate": 8.595393291890463e-05, + "loss": 2.5658, + "step": 27400 + }, + { + "epoch": 0.8125315066868309, + "grad_norm": 0.08242328464984894, + "learning_rate": 8.592755835881527e-05, + "loss": 2.5871, + "step": 27401 + }, + { + "epoch": 0.8125611600391425, + "grad_norm": 0.08775699883699417, + "learning_rate": 8.590118746540847e-05, + "loss": 2.5828, + "step": 27402 + }, + { + "epoch": 0.8125908133914539, + "grad_norm": 0.09310311079025269, + "learning_rate": 8.587482023891773e-05, + "loss": 2.5758, + "step": 27403 + }, + { + "epoch": 0.8126204667437654, + "grad_norm": 0.08721473067998886, + "learning_rate": 8.584845667957653e-05, + "loss": 2.5723, + "step": 27404 + }, + { + "epoch": 0.8126501200960768, + "grad_norm": 0.09271591156721115, + "learning_rate": 8.582209678761837e-05, + "loss": 2.5925, + "step": 27405 + }, + { + "epoch": 0.8126797734483884, + "grad_norm": 0.10180795192718506, + "learning_rate": 8.57957405632766e-05, + "loss": 2.6024, + "step": 27406 + }, + { + "epoch": 0.8127094268006998, + "grad_norm": 0.08738940954208374, + "learning_rate": 8.576938800678474e-05, + "loss": 2.6103, + "step": 27407 + }, + { + "epoch": 0.8127390801530113, + "grad_norm": 0.09242211282253265, + "learning_rate": 8.574303911837589e-05, + "loss": 2.5784, + "step": 27408 + }, + { + "epoch": 0.8127687335053227, + "grad_norm": 0.0857449471950531, + "learning_rate": 8.571669389828358e-05, + "loss": 2.5597, + "step": 27409 + }, + { + "epoch": 0.8127983868576343, + "grad_norm": 0.09115008264780045, + "learning_rate": 8.569035234674105e-05, + "loss": 2.5825, + "step": 27410 + }, + { + "epoch": 0.8128280402099457, + "grad_norm": 0.09085515886545181, + "learning_rate": 8.566401446398165e-05, + "loss": 2.5914, + "step": 27411 + }, + { + "epoch": 0.8128576935622572, + "grad_norm": 0.08774704486131668, + "learning_rate": 8.563768025023844e-05, + "loss": 2.5877, + "step": 27412 + }, + { + "epoch": 0.8128873469145687, + "grad_norm": 0.08996433764696121, + "learning_rate": 8.561134970574474e-05, + "loss": 2.5837, + "step": 27413 + }, + { + "epoch": 0.8129170002668802, + "grad_norm": 0.09346424043178558, + "learning_rate": 8.558502283073366e-05, + "loss": 2.5974, + "step": 27414 + }, + { + "epoch": 0.8129466536191916, + "grad_norm": 0.08878827840089798, + "learning_rate": 8.555869962543834e-05, + "loss": 2.5772, + "step": 27415 + }, + { + "epoch": 0.8129763069715031, + "grad_norm": 0.08965427428483963, + "learning_rate": 8.553238009009184e-05, + "loss": 2.6089, + "step": 27416 + }, + { + "epoch": 0.8130059603238146, + "grad_norm": 0.08567467331886292, + "learning_rate": 8.550606422492729e-05, + "loss": 2.5748, + "step": 27417 + }, + { + "epoch": 0.8130356136761261, + "grad_norm": 0.0888478085398674, + "learning_rate": 8.547975203017777e-05, + "loss": 2.6183, + "step": 27418 + }, + { + "epoch": 0.8130652670284375, + "grad_norm": 0.10313130170106888, + "learning_rate": 8.545344350607609e-05, + "loss": 2.6035, + "step": 27419 + }, + { + "epoch": 0.813094920380749, + "grad_norm": 0.08713789284229279, + "learning_rate": 8.542713865285534e-05, + "loss": 2.5793, + "step": 27420 + }, + { + "epoch": 0.8131245737330606, + "grad_norm": 0.09878242015838623, + "learning_rate": 8.540083747074834e-05, + "loss": 2.5895, + "step": 27421 + }, + { + "epoch": 0.813154227085372, + "grad_norm": 0.08728481084108353, + "learning_rate": 8.537453995998818e-05, + "loss": 2.5819, + "step": 27422 + }, + { + "epoch": 0.8131838804376835, + "grad_norm": 0.08658801019191742, + "learning_rate": 8.534824612080766e-05, + "loss": 2.5995, + "step": 27423 + }, + { + "epoch": 0.813213533789995, + "grad_norm": 0.09557941555976868, + "learning_rate": 8.532195595343955e-05, + "loss": 2.553, + "step": 27424 + }, + { + "epoch": 0.8132431871423065, + "grad_norm": 0.08833729475736618, + "learning_rate": 8.529566945811673e-05, + "loss": 2.5984, + "step": 27425 + }, + { + "epoch": 0.8132728404946179, + "grad_norm": 0.09162390232086182, + "learning_rate": 8.526938663507194e-05, + "loss": 2.576, + "step": 27426 + }, + { + "epoch": 0.8133024938469294, + "grad_norm": 0.09369463473558426, + "learning_rate": 8.52431074845379e-05, + "loss": 2.5847, + "step": 27427 + }, + { + "epoch": 0.8133321471992409, + "grad_norm": 0.0946970209479332, + "learning_rate": 8.521683200674745e-05, + "loss": 2.612, + "step": 27428 + }, + { + "epoch": 0.8133618005515524, + "grad_norm": 0.08208049088716507, + "learning_rate": 8.519056020193305e-05, + "loss": 2.6095, + "step": 27429 + }, + { + "epoch": 0.8133914539038638, + "grad_norm": 0.0992651954293251, + "learning_rate": 8.516429207032744e-05, + "loss": 2.5703, + "step": 27430 + }, + { + "epoch": 0.8134211072561753, + "grad_norm": 0.08852720260620117, + "learning_rate": 8.513802761216327e-05, + "loss": 2.5381, + "step": 27431 + }, + { + "epoch": 0.8134507606084868, + "grad_norm": 0.09567904472351074, + "learning_rate": 8.511176682767302e-05, + "loss": 2.5261, + "step": 27432 + }, + { + "epoch": 0.8134804139607983, + "grad_norm": 0.09122312068939209, + "learning_rate": 8.508550971708929e-05, + "loss": 2.553, + "step": 27433 + }, + { + "epoch": 0.8135100673131097, + "grad_norm": 0.0914992019534111, + "learning_rate": 8.505925628064448e-05, + "loss": 2.5743, + "step": 27434 + }, + { + "epoch": 0.8135397206654212, + "grad_norm": 0.0939316675066948, + "learning_rate": 8.503300651857132e-05, + "loss": 2.5831, + "step": 27435 + }, + { + "epoch": 0.8135693740177327, + "grad_norm": 0.0933629646897316, + "learning_rate": 8.500676043110211e-05, + "loss": 2.5856, + "step": 27436 + }, + { + "epoch": 0.8135990273700442, + "grad_norm": 0.08477851003408432, + "learning_rate": 8.498051801846923e-05, + "loss": 2.5941, + "step": 27437 + }, + { + "epoch": 0.8136286807223556, + "grad_norm": 0.09182638674974442, + "learning_rate": 8.495427928090515e-05, + "loss": 2.5972, + "step": 27438 + }, + { + "epoch": 0.8136583340746671, + "grad_norm": 0.08886336535215378, + "learning_rate": 8.492804421864225e-05, + "loss": 2.6133, + "step": 27439 + }, + { + "epoch": 0.8136879874269786, + "grad_norm": 0.08738204091787338, + "learning_rate": 8.490181283191268e-05, + "loss": 2.5788, + "step": 27440 + }, + { + "epoch": 0.8137176407792901, + "grad_norm": 0.08817414939403534, + "learning_rate": 8.487558512094878e-05, + "loss": 2.5689, + "step": 27441 + }, + { + "epoch": 0.8137472941316016, + "grad_norm": 0.09117939323186874, + "learning_rate": 8.484936108598285e-05, + "loss": 2.5932, + "step": 27442 + }, + { + "epoch": 0.813776947483913, + "grad_norm": 0.08988957107067108, + "learning_rate": 8.482314072724706e-05, + "loss": 2.6242, + "step": 27443 + }, + { + "epoch": 0.8138066008362246, + "grad_norm": 0.09379664808511734, + "learning_rate": 8.479692404497363e-05, + "loss": 2.5771, + "step": 27444 + }, + { + "epoch": 0.813836254188536, + "grad_norm": 0.08630576729774475, + "learning_rate": 8.477071103939471e-05, + "loss": 2.5518, + "step": 27445 + }, + { + "epoch": 0.8138659075408475, + "grad_norm": 0.08917951583862305, + "learning_rate": 8.474450171074244e-05, + "loss": 2.5629, + "step": 27446 + }, + { + "epoch": 0.813895560893159, + "grad_norm": 0.08570494502782822, + "learning_rate": 8.471829605924874e-05, + "loss": 2.589, + "step": 27447 + }, + { + "epoch": 0.8139252142454705, + "grad_norm": 0.09142699092626572, + "learning_rate": 8.469209408514595e-05, + "loss": 2.5743, + "step": 27448 + }, + { + "epoch": 0.8139548675977819, + "grad_norm": 0.08950799703598022, + "learning_rate": 8.466589578866607e-05, + "loss": 2.6118, + "step": 27449 + }, + { + "epoch": 0.8139845209500934, + "grad_norm": 0.09186340868473053, + "learning_rate": 8.463970117004083e-05, + "loss": 2.599, + "step": 27450 + }, + { + "epoch": 0.8140141743024049, + "grad_norm": 0.09654946625232697, + "learning_rate": 8.461351022950236e-05, + "loss": 2.5842, + "step": 27451 + }, + { + "epoch": 0.8140438276547164, + "grad_norm": 0.0971456989645958, + "learning_rate": 8.458732296728255e-05, + "loss": 2.6028, + "step": 27452 + }, + { + "epoch": 0.8140734810070278, + "grad_norm": 0.08333076536655426, + "learning_rate": 8.456113938361326e-05, + "loss": 2.547, + "step": 27453 + }, + { + "epoch": 0.8141031343593393, + "grad_norm": 0.09665723145008087, + "learning_rate": 8.453495947872641e-05, + "loss": 2.5895, + "step": 27454 + }, + { + "epoch": 0.8141327877116508, + "grad_norm": 0.09527746587991714, + "learning_rate": 8.450878325285382e-05, + "loss": 2.5762, + "step": 27455 + }, + { + "epoch": 0.8141624410639623, + "grad_norm": 0.08979425579309464, + "learning_rate": 8.448261070622731e-05, + "loss": 2.5871, + "step": 27456 + }, + { + "epoch": 0.8141920944162737, + "grad_norm": 0.09431318938732147, + "learning_rate": 8.445644183907858e-05, + "loss": 2.5762, + "step": 27457 + }, + { + "epoch": 0.8142217477685852, + "grad_norm": 0.0939958319067955, + "learning_rate": 8.443027665163938e-05, + "loss": 2.6398, + "step": 27458 + }, + { + "epoch": 0.8142514011208967, + "grad_norm": 0.08968457579612732, + "learning_rate": 8.440411514414137e-05, + "loss": 2.5548, + "step": 27459 + }, + { + "epoch": 0.8142810544732082, + "grad_norm": 0.08997206389904022, + "learning_rate": 8.43779573168163e-05, + "loss": 2.5337, + "step": 27460 + }, + { + "epoch": 0.8143107078255196, + "grad_norm": 0.08820125460624695, + "learning_rate": 8.435180316989576e-05, + "loss": 2.5705, + "step": 27461 + }, + { + "epoch": 0.8143403611778312, + "grad_norm": 0.09541090577840805, + "learning_rate": 8.432565270361131e-05, + "loss": 2.5911, + "step": 27462 + }, + { + "epoch": 0.8143700145301427, + "grad_norm": 0.09211394935846329, + "learning_rate": 8.429950591819463e-05, + "loss": 2.6276, + "step": 27463 + }, + { + "epoch": 0.8143996678824541, + "grad_norm": 0.09270735085010529, + "learning_rate": 8.42733628138771e-05, + "loss": 2.5547, + "step": 27464 + }, + { + "epoch": 0.8144293212347656, + "grad_norm": 0.08276825398206711, + "learning_rate": 8.424722339089036e-05, + "loss": 2.5956, + "step": 27465 + }, + { + "epoch": 0.8144589745870771, + "grad_norm": 0.09111529588699341, + "learning_rate": 8.422108764946579e-05, + "loss": 2.5553, + "step": 27466 + }, + { + "epoch": 0.8144886279393886, + "grad_norm": 0.08523256331682205, + "learning_rate": 8.419495558983487e-05, + "loss": 2.581, + "step": 27467 + }, + { + "epoch": 0.8145182812917, + "grad_norm": 0.08658216893672943, + "learning_rate": 8.416882721222896e-05, + "loss": 2.5794, + "step": 27468 + }, + { + "epoch": 0.8145479346440115, + "grad_norm": 0.08620557934045792, + "learning_rate": 8.414270251687945e-05, + "loss": 2.5864, + "step": 27469 + }, + { + "epoch": 0.814577587996323, + "grad_norm": 0.08237515389919281, + "learning_rate": 8.411658150401774e-05, + "loss": 2.5508, + "step": 27470 + }, + { + "epoch": 0.8146072413486345, + "grad_norm": 0.09064095467329025, + "learning_rate": 8.409046417387505e-05, + "loss": 2.5936, + "step": 27471 + }, + { + "epoch": 0.8146368947009459, + "grad_norm": 0.09405490756034851, + "learning_rate": 8.406435052668271e-05, + "loss": 2.6194, + "step": 27472 + }, + { + "epoch": 0.8146665480532574, + "grad_norm": 0.08600252121686935, + "learning_rate": 8.403824056267195e-05, + "loss": 2.5646, + "step": 27473 + }, + { + "epoch": 0.8146962014055689, + "grad_norm": 0.1015995517373085, + "learning_rate": 8.401213428207394e-05, + "loss": 2.6086, + "step": 27474 + }, + { + "epoch": 0.8147258547578804, + "grad_norm": 0.0880105048418045, + "learning_rate": 8.398603168511992e-05, + "loss": 2.6065, + "step": 27475 + }, + { + "epoch": 0.8147555081101918, + "grad_norm": 0.09300152957439423, + "learning_rate": 8.395993277204095e-05, + "loss": 2.5856, + "step": 27476 + }, + { + "epoch": 0.8147851614625033, + "grad_norm": 0.0901501327753067, + "learning_rate": 8.393383754306821e-05, + "loss": 2.5979, + "step": 27477 + }, + { + "epoch": 0.8148148148148148, + "grad_norm": 0.08836545050144196, + "learning_rate": 8.390774599843282e-05, + "loss": 2.5759, + "step": 27478 + }, + { + "epoch": 0.8148444681671263, + "grad_norm": 0.09415488690137863, + "learning_rate": 8.388165813836568e-05, + "loss": 2.5664, + "step": 27479 + }, + { + "epoch": 0.8148741215194377, + "grad_norm": 0.1019861027598381, + "learning_rate": 8.385557396309789e-05, + "loss": 2.5792, + "step": 27480 + }, + { + "epoch": 0.8149037748717493, + "grad_norm": 0.08979973196983337, + "learning_rate": 8.382949347286045e-05, + "loss": 2.6003, + "step": 27481 + }, + { + "epoch": 0.8149334282240608, + "grad_norm": 0.0982411578297615, + "learning_rate": 8.380341666788427e-05, + "loss": 2.5736, + "step": 27482 + }, + { + "epoch": 0.8149630815763722, + "grad_norm": 0.09937754273414612, + "learning_rate": 8.377734354840039e-05, + "loss": 2.6244, + "step": 27483 + }, + { + "epoch": 0.8149927349286837, + "grad_norm": 0.08628121018409729, + "learning_rate": 8.375127411463945e-05, + "loss": 2.5584, + "step": 27484 + }, + { + "epoch": 0.8150223882809952, + "grad_norm": 0.09341048449277878, + "learning_rate": 8.372520836683244e-05, + "loss": 2.5792, + "step": 27485 + }, + { + "epoch": 0.8150520416333067, + "grad_norm": 0.09558101743459702, + "learning_rate": 8.369914630521003e-05, + "loss": 2.609, + "step": 27486 + }, + { + "epoch": 0.8150816949856181, + "grad_norm": 0.09525621682405472, + "learning_rate": 8.367308793000328e-05, + "loss": 2.5838, + "step": 27487 + }, + { + "epoch": 0.8151113483379296, + "grad_norm": 0.09304404258728027, + "learning_rate": 8.36470332414428e-05, + "loss": 2.5661, + "step": 27488 + }, + { + "epoch": 0.8151410016902411, + "grad_norm": 0.09286525845527649, + "learning_rate": 8.362098223975928e-05, + "loss": 2.5921, + "step": 27489 + }, + { + "epoch": 0.8151706550425526, + "grad_norm": 0.1044713705778122, + "learning_rate": 8.359493492518344e-05, + "loss": 2.6101, + "step": 27490 + }, + { + "epoch": 0.815200308394864, + "grad_norm": 0.09808453917503357, + "learning_rate": 8.356889129794593e-05, + "loss": 2.5725, + "step": 27491 + }, + { + "epoch": 0.8152299617471755, + "grad_norm": 0.0901617482304573, + "learning_rate": 8.354285135827733e-05, + "loss": 2.5749, + "step": 27492 + }, + { + "epoch": 0.815259615099487, + "grad_norm": 0.09026265144348145, + "learning_rate": 8.351681510640829e-05, + "loss": 2.5793, + "step": 27493 + }, + { + "epoch": 0.8152892684517985, + "grad_norm": 0.09225814044475555, + "learning_rate": 8.349078254256948e-05, + "loss": 2.592, + "step": 27494 + }, + { + "epoch": 0.8153189218041099, + "grad_norm": 0.08941437304019928, + "learning_rate": 8.346475366699119e-05, + "loss": 2.5635, + "step": 27495 + }, + { + "epoch": 0.8153485751564215, + "grad_norm": 0.08754786103963852, + "learning_rate": 8.343872847990392e-05, + "loss": 2.5708, + "step": 27496 + }, + { + "epoch": 0.8153782285087329, + "grad_norm": 0.08864506334066391, + "learning_rate": 8.34127069815383e-05, + "loss": 2.5637, + "step": 27497 + }, + { + "epoch": 0.8154078818610444, + "grad_norm": 0.09332861751317978, + "learning_rate": 8.33866891721246e-05, + "loss": 2.5814, + "step": 27498 + }, + { + "epoch": 0.8154375352133558, + "grad_norm": 0.08405434340238571, + "learning_rate": 8.336067505189316e-05, + "loss": 2.5772, + "step": 27499 + }, + { + "epoch": 0.8154671885656674, + "grad_norm": 0.09202630072832108, + "learning_rate": 8.333466462107465e-05, + "loss": 2.5958, + "step": 27500 + }, + { + "epoch": 0.8154968419179788, + "grad_norm": 0.11101637780666351, + "learning_rate": 8.330865787989911e-05, + "loss": 2.6092, + "step": 27501 + }, + { + "epoch": 0.8155264952702903, + "grad_norm": 0.09634122997522354, + "learning_rate": 8.328265482859698e-05, + "loss": 2.5789, + "step": 27502 + }, + { + "epoch": 0.8155561486226018, + "grad_norm": 0.09156070649623871, + "learning_rate": 8.325665546739846e-05, + "loss": 2.6071, + "step": 27503 + }, + { + "epoch": 0.8155858019749133, + "grad_norm": 0.08952076733112335, + "learning_rate": 8.323065979653394e-05, + "loss": 2.5767, + "step": 27504 + }, + { + "epoch": 0.8156154553272248, + "grad_norm": 0.1032726839184761, + "learning_rate": 8.320466781623331e-05, + "loss": 2.5853, + "step": 27505 + }, + { + "epoch": 0.8156451086795362, + "grad_norm": 0.08697840571403503, + "learning_rate": 8.317867952672686e-05, + "loss": 2.5981, + "step": 27506 + }, + { + "epoch": 0.8156747620318477, + "grad_norm": 0.10011119395494461, + "learning_rate": 8.315269492824478e-05, + "loss": 2.5987, + "step": 27507 + }, + { + "epoch": 0.8157044153841592, + "grad_norm": 0.08496523648500443, + "learning_rate": 8.312671402101717e-05, + "loss": 2.6031, + "step": 27508 + }, + { + "epoch": 0.8157340687364707, + "grad_norm": 0.09272296726703644, + "learning_rate": 8.310073680527402e-05, + "loss": 2.5615, + "step": 27509 + }, + { + "epoch": 0.8157637220887821, + "grad_norm": 0.09602770209312439, + "learning_rate": 8.307476328124542e-05, + "loss": 2.5492, + "step": 27510 + }, + { + "epoch": 0.8157933754410936, + "grad_norm": 0.08702711760997772, + "learning_rate": 8.304879344916139e-05, + "loss": 2.5683, + "step": 27511 + }, + { + "epoch": 0.8158230287934051, + "grad_norm": 0.09403109550476074, + "learning_rate": 8.30228273092517e-05, + "loss": 2.5393, + "step": 27512 + }, + { + "epoch": 0.8158526821457166, + "grad_norm": 0.0941372662782669, + "learning_rate": 8.299686486174657e-05, + "loss": 2.5857, + "step": 27513 + }, + { + "epoch": 0.815882335498028, + "grad_norm": 0.10019306093454361, + "learning_rate": 8.29709061068758e-05, + "loss": 2.5716, + "step": 27514 + }, + { + "epoch": 0.8159119888503396, + "grad_norm": 0.0961993858218193, + "learning_rate": 8.294495104486932e-05, + "loss": 2.5739, + "step": 27515 + }, + { + "epoch": 0.815941642202651, + "grad_norm": 0.09426552057266235, + "learning_rate": 8.291899967595678e-05, + "loss": 2.5725, + "step": 27516 + }, + { + "epoch": 0.8159712955549625, + "grad_norm": 0.09028711169958115, + "learning_rate": 8.289305200036812e-05, + "loss": 2.5878, + "step": 27517 + }, + { + "epoch": 0.8160009489072739, + "grad_norm": 0.09206107258796692, + "learning_rate": 8.286710801833303e-05, + "loss": 2.5994, + "step": 27518 + }, + { + "epoch": 0.8160306022595855, + "grad_norm": 0.08835041522979736, + "learning_rate": 8.284116773008132e-05, + "loss": 2.5929, + "step": 27519 + }, + { + "epoch": 0.8160602556118969, + "grad_norm": 0.0836014673113823, + "learning_rate": 8.281523113584272e-05, + "loss": 2.5431, + "step": 27520 + }, + { + "epoch": 0.8160899089642084, + "grad_norm": 0.08857844769954681, + "learning_rate": 8.27892982358468e-05, + "loss": 2.5768, + "step": 27521 + }, + { + "epoch": 0.8161195623165198, + "grad_norm": 0.08287451416254044, + "learning_rate": 8.276336903032327e-05, + "loss": 2.5656, + "step": 27522 + }, + { + "epoch": 0.8161492156688314, + "grad_norm": 0.08483083546161652, + "learning_rate": 8.273744351950174e-05, + "loss": 2.5776, + "step": 27523 + }, + { + "epoch": 0.8161788690211429, + "grad_norm": 0.08240838348865509, + "learning_rate": 8.271152170361174e-05, + "loss": 2.6127, + "step": 27524 + }, + { + "epoch": 0.8162085223734543, + "grad_norm": 0.08322107046842575, + "learning_rate": 8.268560358288285e-05, + "loss": 2.5825, + "step": 27525 + }, + { + "epoch": 0.8162381757257658, + "grad_norm": 0.0885629653930664, + "learning_rate": 8.265968915754463e-05, + "loss": 2.6018, + "step": 27526 + }, + { + "epoch": 0.8162678290780773, + "grad_norm": 0.08643045276403427, + "learning_rate": 8.263377842782644e-05, + "loss": 2.5562, + "step": 27527 + }, + { + "epoch": 0.8162974824303888, + "grad_norm": 0.08265109360218048, + "learning_rate": 8.260787139395775e-05, + "loss": 2.5663, + "step": 27528 + }, + { + "epoch": 0.8163271357827002, + "grad_norm": 0.07989774644374847, + "learning_rate": 8.258196805616808e-05, + "loss": 2.5569, + "step": 27529 + }, + { + "epoch": 0.8163567891350118, + "grad_norm": 0.08842354267835617, + "learning_rate": 8.255606841468672e-05, + "loss": 2.6016, + "step": 27530 + }, + { + "epoch": 0.8163864424873232, + "grad_norm": 0.08478191494941711, + "learning_rate": 8.253017246974298e-05, + "loss": 2.5672, + "step": 27531 + }, + { + "epoch": 0.8164160958396347, + "grad_norm": 0.09064197540283203, + "learning_rate": 8.250428022156626e-05, + "loss": 2.5764, + "step": 27532 + }, + { + "epoch": 0.8164457491919461, + "grad_norm": 0.08180531859397888, + "learning_rate": 8.247839167038579e-05, + "loss": 2.5623, + "step": 27533 + }, + { + "epoch": 0.8164754025442577, + "grad_norm": 0.09520772099494934, + "learning_rate": 8.24525068164308e-05, + "loss": 2.5911, + "step": 27534 + }, + { + "epoch": 0.8165050558965691, + "grad_norm": 0.09500813484191895, + "learning_rate": 8.242662565993059e-05, + "loss": 2.62, + "step": 27535 + }, + { + "epoch": 0.8165347092488806, + "grad_norm": 0.09163498878479004, + "learning_rate": 8.240074820111421e-05, + "loss": 2.5869, + "step": 27536 + }, + { + "epoch": 0.816564362601192, + "grad_norm": 0.0918145701289177, + "learning_rate": 8.237487444021096e-05, + "loss": 2.5737, + "step": 27537 + }, + { + "epoch": 0.8165940159535036, + "grad_norm": 0.08385312557220459, + "learning_rate": 8.234900437744985e-05, + "loss": 2.5901, + "step": 27538 + }, + { + "epoch": 0.816623669305815, + "grad_norm": 0.08650581538677216, + "learning_rate": 8.232313801305996e-05, + "loss": 2.6111, + "step": 27539 + }, + { + "epoch": 0.8166533226581265, + "grad_norm": 0.08971234411001205, + "learning_rate": 8.229727534727044e-05, + "loss": 2.577, + "step": 27540 + }, + { + "epoch": 0.8166829760104379, + "grad_norm": 0.08623965829610825, + "learning_rate": 8.227141638031022e-05, + "loss": 2.5763, + "step": 27541 + }, + { + "epoch": 0.8167126293627495, + "grad_norm": 0.08999433368444443, + "learning_rate": 8.224556111240827e-05, + "loss": 2.572, + "step": 27542 + }, + { + "epoch": 0.8167422827150609, + "grad_norm": 0.08680478483438492, + "learning_rate": 8.221970954379365e-05, + "loss": 2.5737, + "step": 27543 + }, + { + "epoch": 0.8167719360673724, + "grad_norm": 0.09373148530721664, + "learning_rate": 8.219386167469517e-05, + "loss": 2.5954, + "step": 27544 + }, + { + "epoch": 0.816801589419684, + "grad_norm": 0.08816974610090256, + "learning_rate": 8.216801750534176e-05, + "loss": 2.5638, + "step": 27545 + }, + { + "epoch": 0.8168312427719954, + "grad_norm": 0.09216835349798203, + "learning_rate": 8.214217703596228e-05, + "loss": 2.5884, + "step": 27546 + }, + { + "epoch": 0.8168608961243069, + "grad_norm": 0.08806844800710678, + "learning_rate": 8.211634026678554e-05, + "loss": 2.5835, + "step": 27547 + }, + { + "epoch": 0.8168905494766183, + "grad_norm": 0.09033187478780746, + "learning_rate": 8.20905071980404e-05, + "loss": 2.5538, + "step": 27548 + }, + { + "epoch": 0.8169202028289299, + "grad_norm": 0.0878879502415657, + "learning_rate": 8.206467782995558e-05, + "loss": 2.5896, + "step": 27549 + }, + { + "epoch": 0.8169498561812413, + "grad_norm": 0.09175986051559448, + "learning_rate": 8.203885216275958e-05, + "loss": 2.5908, + "step": 27550 + }, + { + "epoch": 0.8169795095335528, + "grad_norm": 0.10117097944021225, + "learning_rate": 8.201303019668143e-05, + "loss": 2.5904, + "step": 27551 + }, + { + "epoch": 0.8170091628858642, + "grad_norm": 0.08498642593622208, + "learning_rate": 8.198721193194964e-05, + "loss": 2.5724, + "step": 27552 + }, + { + "epoch": 0.8170388162381758, + "grad_norm": 0.09546181559562683, + "learning_rate": 8.196139736879276e-05, + "loss": 2.5767, + "step": 27553 + }, + { + "epoch": 0.8170684695904872, + "grad_norm": 0.08552974462509155, + "learning_rate": 8.193558650743955e-05, + "loss": 2.5818, + "step": 27554 + }, + { + "epoch": 0.8170981229427987, + "grad_norm": 0.08892720937728882, + "learning_rate": 8.190977934811843e-05, + "loss": 2.5857, + "step": 27555 + }, + { + "epoch": 0.8171277762951101, + "grad_norm": 0.08526905626058578, + "learning_rate": 8.1883975891058e-05, + "loss": 2.5583, + "step": 27556 + }, + { + "epoch": 0.8171574296474217, + "grad_norm": 0.09175071865320206, + "learning_rate": 8.185817613648672e-05, + "loss": 2.5742, + "step": 27557 + }, + { + "epoch": 0.8171870829997331, + "grad_norm": 0.08756684511899948, + "learning_rate": 8.183238008463312e-05, + "loss": 2.5632, + "step": 27558 + }, + { + "epoch": 0.8172167363520446, + "grad_norm": 0.09832184761762619, + "learning_rate": 8.180658773572563e-05, + "loss": 2.5897, + "step": 27559 + }, + { + "epoch": 0.817246389704356, + "grad_norm": 0.0954800546169281, + "learning_rate": 8.178079908999248e-05, + "loss": 2.5575, + "step": 27560 + }, + { + "epoch": 0.8172760430566676, + "grad_norm": 0.09189687669277191, + "learning_rate": 8.175501414766212e-05, + "loss": 2.6276, + "step": 27561 + }, + { + "epoch": 0.817305696408979, + "grad_norm": 0.08509200066328049, + "learning_rate": 8.172923290896295e-05, + "loss": 2.5688, + "step": 27562 + }, + { + "epoch": 0.8173353497612905, + "grad_norm": 0.08590573072433472, + "learning_rate": 8.170345537412305e-05, + "loss": 2.5739, + "step": 27563 + }, + { + "epoch": 0.8173650031136019, + "grad_norm": 0.08572360873222351, + "learning_rate": 8.167768154337101e-05, + "loss": 2.5905, + "step": 27564 + }, + { + "epoch": 0.8173946564659135, + "grad_norm": 0.08531023561954498, + "learning_rate": 8.165191141693489e-05, + "loss": 2.5931, + "step": 27565 + }, + { + "epoch": 0.817424309818225, + "grad_norm": 0.09283162653446198, + "learning_rate": 8.162614499504289e-05, + "loss": 2.6256, + "step": 27566 + }, + { + "epoch": 0.8174539631705364, + "grad_norm": 0.08293744921684265, + "learning_rate": 8.160038227792322e-05, + "loss": 2.5696, + "step": 27567 + }, + { + "epoch": 0.817483616522848, + "grad_norm": 0.08527745306491852, + "learning_rate": 8.157462326580395e-05, + "loss": 2.5283, + "step": 27568 + }, + { + "epoch": 0.8175132698751594, + "grad_norm": 0.08656799793243408, + "learning_rate": 8.15488679589132e-05, + "loss": 2.5611, + "step": 27569 + }, + { + "epoch": 0.8175429232274709, + "grad_norm": 0.08381415903568268, + "learning_rate": 8.152311635747922e-05, + "loss": 2.5706, + "step": 27570 + }, + { + "epoch": 0.8175725765797823, + "grad_norm": 0.09079862385988235, + "learning_rate": 8.149736846172972e-05, + "loss": 2.5899, + "step": 27571 + }, + { + "epoch": 0.8176022299320939, + "grad_norm": 0.08738410472869873, + "learning_rate": 8.147162427189287e-05, + "loss": 2.59, + "step": 27572 + }, + { + "epoch": 0.8176318832844053, + "grad_norm": 0.08758766204118729, + "learning_rate": 8.144588378819661e-05, + "loss": 2.571, + "step": 27573 + }, + { + "epoch": 0.8176615366367168, + "grad_norm": 0.09255508333444595, + "learning_rate": 8.142014701086892e-05, + "loss": 2.5932, + "step": 27574 + }, + { + "epoch": 0.8176911899890282, + "grad_norm": 0.0869121327996254, + "learning_rate": 8.139441394013769e-05, + "loss": 2.5498, + "step": 27575 + }, + { + "epoch": 0.8177208433413398, + "grad_norm": 0.09912408888339996, + "learning_rate": 8.13686845762306e-05, + "loss": 2.5786, + "step": 27576 + }, + { + "epoch": 0.8177504966936512, + "grad_norm": 0.08784458786249161, + "learning_rate": 8.134295891937582e-05, + "loss": 2.587, + "step": 27577 + }, + { + "epoch": 0.8177801500459627, + "grad_norm": 0.11029092222452164, + "learning_rate": 8.131723696980098e-05, + "loss": 2.5913, + "step": 27578 + }, + { + "epoch": 0.8178098033982741, + "grad_norm": 0.08878999203443527, + "learning_rate": 8.129151872773388e-05, + "loss": 2.5373, + "step": 27579 + }, + { + "epoch": 0.8178394567505857, + "grad_norm": 0.10196606814861298, + "learning_rate": 8.12658041934024e-05, + "loss": 2.5949, + "step": 27580 + }, + { + "epoch": 0.8178691101028971, + "grad_norm": 0.0988386794924736, + "learning_rate": 8.124009336703397e-05, + "loss": 2.569, + "step": 27581 + }, + { + "epoch": 0.8178987634552086, + "grad_norm": 0.09183206409215927, + "learning_rate": 8.121438624885636e-05, + "loss": 2.5582, + "step": 27582 + }, + { + "epoch": 0.81792841680752, + "grad_norm": 0.09033823013305664, + "learning_rate": 8.11886828390973e-05, + "loss": 2.5832, + "step": 27583 + }, + { + "epoch": 0.8179580701598316, + "grad_norm": 0.08542480319738388, + "learning_rate": 8.116298313798432e-05, + "loss": 2.568, + "step": 27584 + }, + { + "epoch": 0.817987723512143, + "grad_norm": 0.08932889997959137, + "learning_rate": 8.113728714574497e-05, + "loss": 2.6061, + "step": 27585 + }, + { + "epoch": 0.8180173768644545, + "grad_norm": 0.08028478920459747, + "learning_rate": 8.111159486260689e-05, + "loss": 2.5581, + "step": 27586 + }, + { + "epoch": 0.8180470302167661, + "grad_norm": 0.08623463660478592, + "learning_rate": 8.108590628879752e-05, + "loss": 2.5537, + "step": 27587 + }, + { + "epoch": 0.8180766835690775, + "grad_norm": 0.09236648678779602, + "learning_rate": 8.106022142454434e-05, + "loss": 2.574, + "step": 27588 + }, + { + "epoch": 0.818106336921389, + "grad_norm": 0.09273091703653336, + "learning_rate": 8.103454027007473e-05, + "loss": 2.5358, + "step": 27589 + }, + { + "epoch": 0.8181359902737004, + "grad_norm": 0.0895458534359932, + "learning_rate": 8.100886282561626e-05, + "loss": 2.5971, + "step": 27590 + }, + { + "epoch": 0.818165643626012, + "grad_norm": 0.0939783900976181, + "learning_rate": 8.098318909139634e-05, + "loss": 2.5523, + "step": 27591 + }, + { + "epoch": 0.8181952969783234, + "grad_norm": 0.08305452018976212, + "learning_rate": 8.095751906764214e-05, + "loss": 2.5847, + "step": 27592 + }, + { + "epoch": 0.8182249503306349, + "grad_norm": 0.09240950644016266, + "learning_rate": 8.093185275458098e-05, + "loss": 2.561, + "step": 27593 + }, + { + "epoch": 0.8182546036829463, + "grad_norm": 0.08279503881931305, + "learning_rate": 8.090619015244022e-05, + "loss": 2.5738, + "step": 27594 + }, + { + "epoch": 0.8182842570352579, + "grad_norm": 0.0835212841629982, + "learning_rate": 8.088053126144712e-05, + "loss": 2.5492, + "step": 27595 + }, + { + "epoch": 0.8183139103875693, + "grad_norm": 0.09222657233476639, + "learning_rate": 8.085487608182878e-05, + "loss": 2.5952, + "step": 27596 + }, + { + "epoch": 0.8183435637398808, + "grad_norm": 0.08674825727939606, + "learning_rate": 8.082922461381253e-05, + "loss": 2.5892, + "step": 27597 + }, + { + "epoch": 0.8183732170921922, + "grad_norm": 0.08959214389324188, + "learning_rate": 8.080357685762541e-05, + "loss": 2.5647, + "step": 27598 + }, + { + "epoch": 0.8184028704445038, + "grad_norm": 0.08734691143035889, + "learning_rate": 8.077793281349461e-05, + "loss": 2.5718, + "step": 27599 + }, + { + "epoch": 0.8184325237968152, + "grad_norm": 0.09002779424190521, + "learning_rate": 8.075229248164711e-05, + "loss": 2.5518, + "step": 27600 + }, + { + "epoch": 0.8184621771491267, + "grad_norm": 0.08280186355113983, + "learning_rate": 8.072665586231004e-05, + "loss": 2.58, + "step": 27601 + }, + { + "epoch": 0.8184918305014381, + "grad_norm": 0.10054413974285126, + "learning_rate": 8.070102295571041e-05, + "loss": 2.5796, + "step": 27602 + }, + { + "epoch": 0.8185214838537497, + "grad_norm": 0.09321596473455429, + "learning_rate": 8.067539376207523e-05, + "loss": 2.5719, + "step": 27603 + }, + { + "epoch": 0.8185511372060611, + "grad_norm": 0.09105085581541061, + "learning_rate": 8.064976828163134e-05, + "loss": 2.5747, + "step": 27604 + }, + { + "epoch": 0.8185807905583726, + "grad_norm": 0.09165004640817642, + "learning_rate": 8.06241465146058e-05, + "loss": 2.5727, + "step": 27605 + }, + { + "epoch": 0.818610443910684, + "grad_norm": 0.10314622521400452, + "learning_rate": 8.05985284612254e-05, + "loss": 2.5957, + "step": 27606 + }, + { + "epoch": 0.8186400972629956, + "grad_norm": 0.0913771539926529, + "learning_rate": 8.057291412171703e-05, + "loss": 2.5611, + "step": 27607 + }, + { + "epoch": 0.8186697506153071, + "grad_norm": 0.09487437456846237, + "learning_rate": 8.054730349630746e-05, + "loss": 2.5578, + "step": 27608 + }, + { + "epoch": 0.8186994039676185, + "grad_norm": 0.08442848920822144, + "learning_rate": 8.052169658522357e-05, + "loss": 2.5581, + "step": 27609 + }, + { + "epoch": 0.8187290573199301, + "grad_norm": 0.08634006232023239, + "learning_rate": 8.049609338869201e-05, + "loss": 2.6059, + "step": 27610 + }, + { + "epoch": 0.8187587106722415, + "grad_norm": 0.09121010452508926, + "learning_rate": 8.047049390693955e-05, + "loss": 2.5434, + "step": 27611 + }, + { + "epoch": 0.818788364024553, + "grad_norm": 0.08590834587812424, + "learning_rate": 8.04448981401929e-05, + "loss": 2.5846, + "step": 27612 + }, + { + "epoch": 0.8188180173768644, + "grad_norm": 0.09459611028432846, + "learning_rate": 8.04193060886787e-05, + "loss": 2.5983, + "step": 27613 + }, + { + "epoch": 0.818847670729176, + "grad_norm": 0.08866539597511292, + "learning_rate": 8.039371775262372e-05, + "loss": 2.5574, + "step": 27614 + }, + { + "epoch": 0.8188773240814874, + "grad_norm": 0.09553837031126022, + "learning_rate": 8.036813313225411e-05, + "loss": 2.5772, + "step": 27615 + }, + { + "epoch": 0.8189069774337989, + "grad_norm": 0.09229882806539536, + "learning_rate": 8.03425522277968e-05, + "loss": 2.5853, + "step": 27616 + }, + { + "epoch": 0.8189366307861103, + "grad_norm": 0.08867625147104263, + "learning_rate": 8.031697503947827e-05, + "loss": 2.554, + "step": 27617 + }, + { + "epoch": 0.8189662841384219, + "grad_norm": 0.0950135588645935, + "learning_rate": 8.029140156752495e-05, + "loss": 2.5654, + "step": 27618 + }, + { + "epoch": 0.8189959374907333, + "grad_norm": 0.08615752309560776, + "learning_rate": 8.026583181216329e-05, + "loss": 2.5882, + "step": 27619 + }, + { + "epoch": 0.8190255908430448, + "grad_norm": 0.09816981852054596, + "learning_rate": 8.024026577361976e-05, + "loss": 2.5826, + "step": 27620 + }, + { + "epoch": 0.8190552441953562, + "grad_norm": 0.08535411953926086, + "learning_rate": 8.021470345212073e-05, + "loss": 2.5795, + "step": 27621 + }, + { + "epoch": 0.8190848975476678, + "grad_norm": 0.09018592536449432, + "learning_rate": 8.018914484789252e-05, + "loss": 2.56, + "step": 27622 + }, + { + "epoch": 0.8191145508999792, + "grad_norm": 0.09193812310695648, + "learning_rate": 8.016358996116157e-05, + "loss": 2.5795, + "step": 27623 + }, + { + "epoch": 0.8191442042522907, + "grad_norm": 0.08082694560289383, + "learning_rate": 8.013803879215403e-05, + "loss": 2.5789, + "step": 27624 + }, + { + "epoch": 0.8191738576046022, + "grad_norm": 0.08946701884269714, + "learning_rate": 8.011249134109638e-05, + "loss": 2.562, + "step": 27625 + }, + { + "epoch": 0.8192035109569137, + "grad_norm": 0.09363475441932678, + "learning_rate": 8.008694760821456e-05, + "loss": 2.5868, + "step": 27626 + }, + { + "epoch": 0.8192331643092251, + "grad_norm": 0.08212330937385559, + "learning_rate": 8.006140759373486e-05, + "loss": 2.5584, + "step": 27627 + }, + { + "epoch": 0.8192628176615366, + "grad_norm": 0.09083743393421173, + "learning_rate": 8.003587129788337e-05, + "loss": 2.6108, + "step": 27628 + }, + { + "epoch": 0.8192924710138482, + "grad_norm": 0.08506257086992264, + "learning_rate": 8.001033872088648e-05, + "loss": 2.5717, + "step": 27629 + }, + { + "epoch": 0.8193221243661596, + "grad_norm": 0.08164626359939575, + "learning_rate": 7.998480986297013e-05, + "loss": 2.5685, + "step": 27630 + }, + { + "epoch": 0.8193517777184711, + "grad_norm": 0.08979234844446182, + "learning_rate": 7.995928472436037e-05, + "loss": 2.5614, + "step": 27631 + }, + { + "epoch": 0.8193814310707825, + "grad_norm": 0.0844506025314331, + "learning_rate": 7.993376330528323e-05, + "loss": 2.596, + "step": 27632 + }, + { + "epoch": 0.8194110844230941, + "grad_norm": 0.09299811720848083, + "learning_rate": 7.990824560596472e-05, + "loss": 2.5828, + "step": 27633 + }, + { + "epoch": 0.8194407377754055, + "grad_norm": 0.09580199420452118, + "learning_rate": 7.988273162663078e-05, + "loss": 2.5718, + "step": 27634 + }, + { + "epoch": 0.819470391127717, + "grad_norm": 0.09710322320461273, + "learning_rate": 7.985722136750755e-05, + "loss": 2.6125, + "step": 27635 + }, + { + "epoch": 0.8195000444800284, + "grad_norm": 0.08686951547861099, + "learning_rate": 7.983171482882057e-05, + "loss": 2.5414, + "step": 27636 + }, + { + "epoch": 0.81952969783234, + "grad_norm": 0.09406031668186188, + "learning_rate": 7.980621201079591e-05, + "loss": 2.556, + "step": 27637 + }, + { + "epoch": 0.8195593511846514, + "grad_norm": 0.09535912424325943, + "learning_rate": 7.978071291365935e-05, + "loss": 2.5994, + "step": 27638 + }, + { + "epoch": 0.8195890045369629, + "grad_norm": 0.09101035445928574, + "learning_rate": 7.97552175376367e-05, + "loss": 2.5873, + "step": 27639 + }, + { + "epoch": 0.8196186578892743, + "grad_norm": 0.08877592533826828, + "learning_rate": 7.972972588295374e-05, + "loss": 2.5479, + "step": 27640 + }, + { + "epoch": 0.8196483112415859, + "grad_norm": 0.09871300309896469, + "learning_rate": 7.970423794983606e-05, + "loss": 2.5586, + "step": 27641 + }, + { + "epoch": 0.8196779645938973, + "grad_norm": 0.0931575745344162, + "learning_rate": 7.967875373850964e-05, + "loss": 2.5889, + "step": 27642 + }, + { + "epoch": 0.8197076179462088, + "grad_norm": 0.09087734669446945, + "learning_rate": 7.965327324920002e-05, + "loss": 2.5759, + "step": 27643 + }, + { + "epoch": 0.8197372712985203, + "grad_norm": 0.09570690244436264, + "learning_rate": 7.962779648213276e-05, + "loss": 2.5844, + "step": 27644 + }, + { + "epoch": 0.8197669246508318, + "grad_norm": 0.08865848183631897, + "learning_rate": 7.960232343753354e-05, + "loss": 2.5504, + "step": 27645 + }, + { + "epoch": 0.8197965780031432, + "grad_norm": 0.09016221016645432, + "learning_rate": 7.957685411562804e-05, + "loss": 2.5549, + "step": 27646 + }, + { + "epoch": 0.8198262313554547, + "grad_norm": 0.08455543965101242, + "learning_rate": 7.955138851664156e-05, + "loss": 2.5674, + "step": 27647 + }, + { + "epoch": 0.8198558847077662, + "grad_norm": 0.09213974326848984, + "learning_rate": 7.952592664079966e-05, + "loss": 2.5956, + "step": 27648 + }, + { + "epoch": 0.8198855380600777, + "grad_norm": 0.0938999280333519, + "learning_rate": 7.950046848832787e-05, + "loss": 2.5604, + "step": 27649 + }, + { + "epoch": 0.8199151914123892, + "grad_norm": 0.08757451921701431, + "learning_rate": 7.947501405945162e-05, + "loss": 2.5644, + "step": 27650 + }, + { + "epoch": 0.8199448447647006, + "grad_norm": 0.09372477978467941, + "learning_rate": 7.944956335439629e-05, + "loss": 2.5778, + "step": 27651 + }, + { + "epoch": 0.8199744981170122, + "grad_norm": 0.089654840528965, + "learning_rate": 7.942411637338732e-05, + "loss": 2.5749, + "step": 27652 + }, + { + "epoch": 0.8200041514693236, + "grad_norm": 0.10077497363090515, + "learning_rate": 7.939867311664989e-05, + "loss": 2.5622, + "step": 27653 + }, + { + "epoch": 0.8200338048216351, + "grad_norm": 0.08557920902967453, + "learning_rate": 7.937323358440934e-05, + "loss": 2.5993, + "step": 27654 + }, + { + "epoch": 0.8200634581739465, + "grad_norm": 0.0844711884856224, + "learning_rate": 7.934779777689116e-05, + "loss": 2.5533, + "step": 27655 + }, + { + "epoch": 0.8200931115262581, + "grad_norm": 0.08414566516876221, + "learning_rate": 7.93223656943205e-05, + "loss": 2.5874, + "step": 27656 + }, + { + "epoch": 0.8201227648785695, + "grad_norm": 0.09614352136850357, + "learning_rate": 7.929693733692239e-05, + "loss": 2.5313, + "step": 27657 + }, + { + "epoch": 0.820152418230881, + "grad_norm": 0.08023770898580551, + "learning_rate": 7.92715127049221e-05, + "loss": 2.5337, + "step": 27658 + }, + { + "epoch": 0.8201820715831925, + "grad_norm": 0.09143868833780289, + "learning_rate": 7.92460917985448e-05, + "loss": 2.5794, + "step": 27659 + }, + { + "epoch": 0.820211724935504, + "grad_norm": 0.08718583732843399, + "learning_rate": 7.922067461801557e-05, + "loss": 2.5789, + "step": 27660 + }, + { + "epoch": 0.8202413782878154, + "grad_norm": 0.080922432243824, + "learning_rate": 7.919526116355952e-05, + "loss": 2.5719, + "step": 27661 + }, + { + "epoch": 0.8202710316401269, + "grad_norm": 0.08975598961114883, + "learning_rate": 7.916985143540168e-05, + "loss": 2.5851, + "step": 27662 + }, + { + "epoch": 0.8203006849924384, + "grad_norm": 0.08278416842222214, + "learning_rate": 7.914444543376698e-05, + "loss": 2.5924, + "step": 27663 + }, + { + "epoch": 0.8203303383447499, + "grad_norm": 0.08989175409078598, + "learning_rate": 7.911904315888047e-05, + "loss": 2.5406, + "step": 27664 + }, + { + "epoch": 0.8203599916970613, + "grad_norm": 0.081473708152771, + "learning_rate": 7.909364461096707e-05, + "loss": 2.5611, + "step": 27665 + }, + { + "epoch": 0.8203896450493728, + "grad_norm": 0.08587527275085449, + "learning_rate": 7.906824979025174e-05, + "loss": 2.6188, + "step": 27666 + }, + { + "epoch": 0.8204192984016843, + "grad_norm": 0.08869244903326035, + "learning_rate": 7.904285869695926e-05, + "loss": 2.5533, + "step": 27667 + }, + { + "epoch": 0.8204489517539958, + "grad_norm": 0.07926236093044281, + "learning_rate": 7.901747133131453e-05, + "loss": 2.5623, + "step": 27668 + }, + { + "epoch": 0.8204786051063072, + "grad_norm": 0.08636678755283356, + "learning_rate": 7.899208769354237e-05, + "loss": 2.5969, + "step": 27669 + }, + { + "epoch": 0.8205082584586187, + "grad_norm": 0.08627073466777802, + "learning_rate": 7.896670778386756e-05, + "loss": 2.5784, + "step": 27670 + }, + { + "epoch": 0.8205379118109303, + "grad_norm": 0.08827118575572968, + "learning_rate": 7.894133160251476e-05, + "loss": 2.5959, + "step": 27671 + }, + { + "epoch": 0.8205675651632417, + "grad_norm": 0.08205492049455643, + "learning_rate": 7.891595914970878e-05, + "loss": 2.5802, + "step": 27672 + }, + { + "epoch": 0.8205972185155532, + "grad_norm": 0.09276927262544632, + "learning_rate": 7.889059042567425e-05, + "loss": 2.6188, + "step": 27673 + }, + { + "epoch": 0.8206268718678646, + "grad_norm": 0.08433166891336441, + "learning_rate": 7.886522543063584e-05, + "loss": 2.6046, + "step": 27674 + }, + { + "epoch": 0.8206565252201762, + "grad_norm": 0.09340061992406845, + "learning_rate": 7.883986416481814e-05, + "loss": 2.5717, + "step": 27675 + }, + { + "epoch": 0.8206861785724876, + "grad_norm": 0.08173271268606186, + "learning_rate": 7.881450662844575e-05, + "loss": 2.5626, + "step": 27676 + }, + { + "epoch": 0.8207158319247991, + "grad_norm": 0.08763778954744339, + "learning_rate": 7.878915282174314e-05, + "loss": 2.5729, + "step": 27677 + }, + { + "epoch": 0.8207454852771106, + "grad_norm": 0.09495263546705246, + "learning_rate": 7.876380274493494e-05, + "loss": 2.6146, + "step": 27678 + }, + { + "epoch": 0.8207751386294221, + "grad_norm": 0.08614588528871536, + "learning_rate": 7.873845639824556e-05, + "loss": 2.585, + "step": 27679 + }, + { + "epoch": 0.8208047919817335, + "grad_norm": 0.10530164837837219, + "learning_rate": 7.871311378189943e-05, + "loss": 2.5674, + "step": 27680 + }, + { + "epoch": 0.820834445334045, + "grad_norm": 0.095499686896801, + "learning_rate": 7.868777489612105e-05, + "loss": 2.6036, + "step": 27681 + }, + { + "epoch": 0.8208640986863565, + "grad_norm": 0.09307299554347992, + "learning_rate": 7.866243974113469e-05, + "loss": 2.5887, + "step": 27682 + }, + { + "epoch": 0.820893752038668, + "grad_norm": 0.09577058255672455, + "learning_rate": 7.863710831716475e-05, + "loss": 2.6334, + "step": 27683 + }, + { + "epoch": 0.8209234053909794, + "grad_norm": 0.0869787335395813, + "learning_rate": 7.861178062443552e-05, + "loss": 2.596, + "step": 27684 + }, + { + "epoch": 0.8209530587432909, + "grad_norm": 0.08838531374931335, + "learning_rate": 7.858645666317138e-05, + "loss": 2.5953, + "step": 27685 + }, + { + "epoch": 0.8209827120956024, + "grad_norm": 0.09042511880397797, + "learning_rate": 7.856113643359642e-05, + "loss": 2.566, + "step": 27686 + }, + { + "epoch": 0.8210123654479139, + "grad_norm": 0.08837292343378067, + "learning_rate": 7.8535819935935e-05, + "loss": 2.5705, + "step": 27687 + }, + { + "epoch": 0.8210420188002253, + "grad_norm": 0.08532129973173141, + "learning_rate": 7.85105071704112e-05, + "loss": 2.5565, + "step": 27688 + }, + { + "epoch": 0.8210716721525368, + "grad_norm": 0.08617404848337173, + "learning_rate": 7.84851981372492e-05, + "loss": 2.5755, + "step": 27689 + }, + { + "epoch": 0.8211013255048484, + "grad_norm": 0.09123560041189194, + "learning_rate": 7.845989283667326e-05, + "loss": 2.6023, + "step": 27690 + }, + { + "epoch": 0.8211309788571598, + "grad_norm": 0.08459583669900894, + "learning_rate": 7.843459126890722e-05, + "loss": 2.5769, + "step": 27691 + }, + { + "epoch": 0.8211606322094713, + "grad_norm": 0.09350906312465668, + "learning_rate": 7.840929343417519e-05, + "loss": 2.5565, + "step": 27692 + }, + { + "epoch": 0.8211902855617828, + "grad_norm": 0.09239112585783005, + "learning_rate": 7.838399933270118e-05, + "loss": 2.5759, + "step": 27693 + }, + { + "epoch": 0.8212199389140943, + "grad_norm": 0.10113812983036041, + "learning_rate": 7.835870896470926e-05, + "loss": 2.5943, + "step": 27694 + }, + { + "epoch": 0.8212495922664057, + "grad_norm": 0.0981164425611496, + "learning_rate": 7.83334223304234e-05, + "loss": 2.5752, + "step": 27695 + }, + { + "epoch": 0.8212792456187172, + "grad_norm": 0.08898372948169708, + "learning_rate": 7.830813943006748e-05, + "loss": 2.546, + "step": 27696 + }, + { + "epoch": 0.8213088989710287, + "grad_norm": 0.09789536893367767, + "learning_rate": 7.828286026386533e-05, + "loss": 2.5748, + "step": 27697 + }, + { + "epoch": 0.8213385523233402, + "grad_norm": 0.08805904537439346, + "learning_rate": 7.825758483204087e-05, + "loss": 2.5829, + "step": 27698 + }, + { + "epoch": 0.8213682056756516, + "grad_norm": 0.09833620488643646, + "learning_rate": 7.823231313481787e-05, + "loss": 2.5698, + "step": 27699 + }, + { + "epoch": 0.8213978590279631, + "grad_norm": 0.0944654792547226, + "learning_rate": 7.82070451724201e-05, + "loss": 2.6378, + "step": 27700 + }, + { + "epoch": 0.8214275123802746, + "grad_norm": 0.0919705331325531, + "learning_rate": 7.81817809450715e-05, + "loss": 2.597, + "step": 27701 + }, + { + "epoch": 0.8214571657325861, + "grad_norm": 0.09511509537696838, + "learning_rate": 7.815652045299554e-05, + "loss": 2.5813, + "step": 27702 + }, + { + "epoch": 0.8214868190848975, + "grad_norm": 0.08477235585451126, + "learning_rate": 7.813126369641593e-05, + "loss": 2.5888, + "step": 27703 + }, + { + "epoch": 0.821516472437209, + "grad_norm": 0.08966251462697983, + "learning_rate": 7.810601067555645e-05, + "loss": 2.5829, + "step": 27704 + }, + { + "epoch": 0.8215461257895205, + "grad_norm": 0.09297221899032593, + "learning_rate": 7.808076139064064e-05, + "loss": 2.5502, + "step": 27705 + }, + { + "epoch": 0.821575779141832, + "grad_norm": 0.0852721780538559, + "learning_rate": 7.805551584189203e-05, + "loss": 2.579, + "step": 27706 + }, + { + "epoch": 0.8216054324941434, + "grad_norm": 0.0965200737118721, + "learning_rate": 7.803027402953433e-05, + "loss": 2.5958, + "step": 27707 + }, + { + "epoch": 0.821635085846455, + "grad_norm": 0.09324803948402405, + "learning_rate": 7.800503595379099e-05, + "loss": 2.5687, + "step": 27708 + }, + { + "epoch": 0.8216647391987664, + "grad_norm": 0.08561181277036667, + "learning_rate": 7.79798016148855e-05, + "loss": 2.5868, + "step": 27709 + }, + { + "epoch": 0.8216943925510779, + "grad_norm": 0.0921366959810257, + "learning_rate": 7.795457101304126e-05, + "loss": 2.594, + "step": 27710 + }, + { + "epoch": 0.8217240459033894, + "grad_norm": 0.08994363248348236, + "learning_rate": 7.792934414848191e-05, + "loss": 2.5904, + "step": 27711 + }, + { + "epoch": 0.8217536992557009, + "grad_norm": 0.087557353079319, + "learning_rate": 7.790412102143051e-05, + "loss": 2.615, + "step": 27712 + }, + { + "epoch": 0.8217833526080124, + "grad_norm": 0.08754774928092957, + "learning_rate": 7.787890163211058e-05, + "loss": 2.5942, + "step": 27713 + }, + { + "epoch": 0.8218130059603238, + "grad_norm": 0.08887268602848053, + "learning_rate": 7.785368598074549e-05, + "loss": 2.5643, + "step": 27714 + }, + { + "epoch": 0.8218426593126353, + "grad_norm": 0.0907556414604187, + "learning_rate": 7.782847406755839e-05, + "loss": 2.5566, + "step": 27715 + }, + { + "epoch": 0.8218723126649468, + "grad_norm": 0.08451899141073227, + "learning_rate": 7.780326589277264e-05, + "loss": 2.5582, + "step": 27716 + }, + { + "epoch": 0.8219019660172583, + "grad_norm": 0.08748665452003479, + "learning_rate": 7.777806145661149e-05, + "loss": 2.594, + "step": 27717 + }, + { + "epoch": 0.8219316193695697, + "grad_norm": 0.08985932916402817, + "learning_rate": 7.7752860759298e-05, + "loss": 2.5973, + "step": 27718 + }, + { + "epoch": 0.8219612727218812, + "grad_norm": 0.08979736268520355, + "learning_rate": 7.772766380105534e-05, + "loss": 2.6059, + "step": 27719 + }, + { + "epoch": 0.8219909260741927, + "grad_norm": 0.0890655592083931, + "learning_rate": 7.770247058210683e-05, + "loss": 2.546, + "step": 27720 + }, + { + "epoch": 0.8220205794265042, + "grad_norm": 0.09891638904809952, + "learning_rate": 7.767728110267535e-05, + "loss": 2.6253, + "step": 27721 + }, + { + "epoch": 0.8220502327788156, + "grad_norm": 0.08969128131866455, + "learning_rate": 7.765209536298423e-05, + "loss": 2.575, + "step": 27722 + }, + { + "epoch": 0.8220798861311271, + "grad_norm": 0.08955530822277069, + "learning_rate": 7.762691336325617e-05, + "loss": 2.5721, + "step": 27723 + }, + { + "epoch": 0.8221095394834386, + "grad_norm": 0.08439424633979797, + "learning_rate": 7.760173510371426e-05, + "loss": 2.5658, + "step": 27724 + }, + { + "epoch": 0.8221391928357501, + "grad_norm": 0.09374473243951797, + "learning_rate": 7.757656058458151e-05, + "loss": 2.5887, + "step": 27725 + }, + { + "epoch": 0.8221688461880615, + "grad_norm": 0.08129990100860596, + "learning_rate": 7.755138980608084e-05, + "loss": 2.5915, + "step": 27726 + }, + { + "epoch": 0.822198499540373, + "grad_norm": 0.09083783626556396, + "learning_rate": 7.752622276843513e-05, + "loss": 2.5926, + "step": 27727 + }, + { + "epoch": 0.8222281528926845, + "grad_norm": 0.09276028722524643, + "learning_rate": 7.75010594718672e-05, + "loss": 2.5981, + "step": 27728 + }, + { + "epoch": 0.822257806244996, + "grad_norm": 0.08877406269311905, + "learning_rate": 7.747589991659992e-05, + "loss": 2.5359, + "step": 27729 + }, + { + "epoch": 0.8222874595973074, + "grad_norm": 0.09607900679111481, + "learning_rate": 7.745074410285607e-05, + "loss": 2.565, + "step": 27730 + }, + { + "epoch": 0.822317112949619, + "grad_norm": 0.08824006468057632, + "learning_rate": 7.742559203085831e-05, + "loss": 2.5936, + "step": 27731 + }, + { + "epoch": 0.8223467663019305, + "grad_norm": 0.09171877801418304, + "learning_rate": 7.740044370082971e-05, + "loss": 2.5675, + "step": 27732 + }, + { + "epoch": 0.8223764196542419, + "grad_norm": 0.08777711540460587, + "learning_rate": 7.737529911299256e-05, + "loss": 2.572, + "step": 27733 + }, + { + "epoch": 0.8224060730065534, + "grad_norm": 0.09801676124334335, + "learning_rate": 7.735015826756969e-05, + "loss": 2.5707, + "step": 27734 + }, + { + "epoch": 0.8224357263588649, + "grad_norm": 0.08319830149412155, + "learning_rate": 7.732502116478373e-05, + "loss": 2.568, + "step": 27735 + }, + { + "epoch": 0.8224653797111764, + "grad_norm": 0.09444409608840942, + "learning_rate": 7.729988780485725e-05, + "loss": 2.5878, + "step": 27736 + }, + { + "epoch": 0.8224950330634878, + "grad_norm": 0.09772877395153046, + "learning_rate": 7.727475818801283e-05, + "loss": 2.6063, + "step": 27737 + }, + { + "epoch": 0.8225246864157993, + "grad_norm": 0.08934352546930313, + "learning_rate": 7.724963231447302e-05, + "loss": 2.564, + "step": 27738 + }, + { + "epoch": 0.8225543397681108, + "grad_norm": 0.08933927863836288, + "learning_rate": 7.722451018446025e-05, + "loss": 2.5851, + "step": 27739 + }, + { + "epoch": 0.8225839931204223, + "grad_norm": 0.10634283721446991, + "learning_rate": 7.719939179819696e-05, + "loss": 2.5844, + "step": 27740 + }, + { + "epoch": 0.8226136464727337, + "grad_norm": 0.08978917449712753, + "learning_rate": 7.717427715590569e-05, + "loss": 2.5713, + "step": 27741 + }, + { + "epoch": 0.8226432998250452, + "grad_norm": 0.0918588787317276, + "learning_rate": 7.714916625780877e-05, + "loss": 2.5675, + "step": 27742 + }, + { + "epoch": 0.8226729531773567, + "grad_norm": 0.09229974448680878, + "learning_rate": 7.712405910412851e-05, + "loss": 2.5423, + "step": 27743 + }, + { + "epoch": 0.8227026065296682, + "grad_norm": 0.08876130729913712, + "learning_rate": 7.709895569508734e-05, + "loss": 2.604, + "step": 27744 + }, + { + "epoch": 0.8227322598819796, + "grad_norm": 0.09965287148952484, + "learning_rate": 7.70738560309075e-05, + "loss": 2.5685, + "step": 27745 + }, + { + "epoch": 0.8227619132342912, + "grad_norm": 0.08062323927879333, + "learning_rate": 7.704876011181127e-05, + "loss": 2.564, + "step": 27746 + }, + { + "epoch": 0.8227915665866026, + "grad_norm": 0.09840170294046402, + "learning_rate": 7.702366793802085e-05, + "loss": 2.5816, + "step": 27747 + }, + { + "epoch": 0.8228212199389141, + "grad_norm": 0.08539030700922012, + "learning_rate": 7.699857950975847e-05, + "loss": 2.5882, + "step": 27748 + }, + { + "epoch": 0.8228508732912255, + "grad_norm": 0.09320767223834991, + "learning_rate": 7.697349482724625e-05, + "loss": 2.5597, + "step": 27749 + }, + { + "epoch": 0.8228805266435371, + "grad_norm": 0.09161696583032608, + "learning_rate": 7.694841389070633e-05, + "loss": 2.5835, + "step": 27750 + }, + { + "epoch": 0.8229101799958485, + "grad_norm": 0.08940370380878448, + "learning_rate": 7.692333670036089e-05, + "loss": 2.5999, + "step": 27751 + }, + { + "epoch": 0.82293983334816, + "grad_norm": 0.09231634438037872, + "learning_rate": 7.689826325643184e-05, + "loss": 2.5963, + "step": 27752 + }, + { + "epoch": 0.8229694867004715, + "grad_norm": 0.09555160254240036, + "learning_rate": 7.687319355914135e-05, + "loss": 2.554, + "step": 27753 + }, + { + "epoch": 0.822999140052783, + "grad_norm": 0.09652042388916016, + "learning_rate": 7.684812760871135e-05, + "loss": 2.5654, + "step": 27754 + }, + { + "epoch": 0.8230287934050945, + "grad_norm": 0.09017551690340042, + "learning_rate": 7.682306540536383e-05, + "loss": 2.5638, + "step": 27755 + }, + { + "epoch": 0.8230584467574059, + "grad_norm": 0.09043537825345993, + "learning_rate": 7.679800694932076e-05, + "loss": 2.5817, + "step": 27756 + }, + { + "epoch": 0.8230881001097174, + "grad_norm": 0.08826190233230591, + "learning_rate": 7.677295224080383e-05, + "loss": 2.5677, + "step": 27757 + }, + { + "epoch": 0.8231177534620289, + "grad_norm": 0.08850076794624329, + "learning_rate": 7.674790128003512e-05, + "loss": 2.6134, + "step": 27758 + }, + { + "epoch": 0.8231474068143404, + "grad_norm": 0.08518116921186447, + "learning_rate": 7.67228540672364e-05, + "loss": 2.5869, + "step": 27759 + }, + { + "epoch": 0.8231770601666518, + "grad_norm": 0.09471892565488815, + "learning_rate": 7.669781060262943e-05, + "loss": 2.5895, + "step": 27760 + }, + { + "epoch": 0.8232067135189634, + "grad_norm": 0.0892987921833992, + "learning_rate": 7.667277088643604e-05, + "loss": 2.6038, + "step": 27761 + }, + { + "epoch": 0.8232363668712748, + "grad_norm": 0.09225863963365555, + "learning_rate": 7.664773491887794e-05, + "loss": 2.5472, + "step": 27762 + }, + { + "epoch": 0.8232660202235863, + "grad_norm": 0.0865052342414856, + "learning_rate": 7.66227027001768e-05, + "loss": 2.5601, + "step": 27763 + }, + { + "epoch": 0.8232956735758977, + "grad_norm": 0.08630367368459702, + "learning_rate": 7.659767423055431e-05, + "loss": 2.5927, + "step": 27764 + }, + { + "epoch": 0.8233253269282093, + "grad_norm": 0.0926179438829422, + "learning_rate": 7.657264951023207e-05, + "loss": 2.5665, + "step": 27765 + }, + { + "epoch": 0.8233549802805207, + "grad_norm": 0.09492969512939453, + "learning_rate": 7.654762853943182e-05, + "loss": 2.5474, + "step": 27766 + }, + { + "epoch": 0.8233846336328322, + "grad_norm": 0.08625344187021255, + "learning_rate": 7.65226113183749e-05, + "loss": 2.5733, + "step": 27767 + }, + { + "epoch": 0.8234142869851436, + "grad_norm": 0.0932372435927391, + "learning_rate": 7.649759784728295e-05, + "loss": 2.5673, + "step": 27768 + }, + { + "epoch": 0.8234439403374552, + "grad_norm": 0.08875378221273422, + "learning_rate": 7.647258812637741e-05, + "loss": 2.5781, + "step": 27769 + }, + { + "epoch": 0.8234735936897666, + "grad_norm": 0.09238714724779129, + "learning_rate": 7.644758215587977e-05, + "loss": 2.5944, + "step": 27770 + }, + { + "epoch": 0.8235032470420781, + "grad_norm": 0.08230482041835785, + "learning_rate": 7.642257993601153e-05, + "loss": 2.5933, + "step": 27771 + }, + { + "epoch": 0.8235329003943895, + "grad_norm": 0.09409391134977341, + "learning_rate": 7.639758146699411e-05, + "loss": 2.5561, + "step": 27772 + }, + { + "epoch": 0.8235625537467011, + "grad_norm": 0.07994798570871353, + "learning_rate": 7.63725867490488e-05, + "loss": 2.5688, + "step": 27773 + }, + { + "epoch": 0.8235922070990126, + "grad_norm": 0.0813455656170845, + "learning_rate": 7.634759578239692e-05, + "loss": 2.5575, + "step": 27774 + }, + { + "epoch": 0.823621860451324, + "grad_norm": 0.09200479835271835, + "learning_rate": 7.632260856725981e-05, + "loss": 2.5618, + "step": 27775 + }, + { + "epoch": 0.8236515138036355, + "grad_norm": 0.0798337459564209, + "learning_rate": 7.629762510385874e-05, + "loss": 2.61, + "step": 27776 + }, + { + "epoch": 0.823681167155947, + "grad_norm": 0.09725840389728546, + "learning_rate": 7.627264539241508e-05, + "loss": 2.5828, + "step": 27777 + }, + { + "epoch": 0.8237108205082585, + "grad_norm": 0.0812097042798996, + "learning_rate": 7.62476694331497e-05, + "loss": 2.542, + "step": 27778 + }, + { + "epoch": 0.8237404738605699, + "grad_norm": 0.08738040179014206, + "learning_rate": 7.622269722628394e-05, + "loss": 2.5987, + "step": 27779 + }, + { + "epoch": 0.8237701272128815, + "grad_norm": 0.0913391262292862, + "learning_rate": 7.619772877203895e-05, + "loss": 2.5794, + "step": 27780 + }, + { + "epoch": 0.8237997805651929, + "grad_norm": 0.08001551777124405, + "learning_rate": 7.617276407063584e-05, + "loss": 2.5609, + "step": 27781 + }, + { + "epoch": 0.8238294339175044, + "grad_norm": 0.09586957097053528, + "learning_rate": 7.614780312229564e-05, + "loss": 2.5714, + "step": 27782 + }, + { + "epoch": 0.8238590872698158, + "grad_norm": 0.08581123501062393, + "learning_rate": 7.612284592723928e-05, + "loss": 2.5718, + "step": 27783 + }, + { + "epoch": 0.8238887406221274, + "grad_norm": 0.08264929056167603, + "learning_rate": 7.609789248568799e-05, + "loss": 2.5696, + "step": 27784 + }, + { + "epoch": 0.8239183939744388, + "grad_norm": 0.09205970913171768, + "learning_rate": 7.607294279786265e-05, + "loss": 2.5879, + "step": 27785 + }, + { + "epoch": 0.8239480473267503, + "grad_norm": 0.08855778723955154, + "learning_rate": 7.604799686398411e-05, + "loss": 2.5874, + "step": 27786 + }, + { + "epoch": 0.8239777006790617, + "grad_norm": 0.09060832113027573, + "learning_rate": 7.602305468427345e-05, + "loss": 2.5728, + "step": 27787 + }, + { + "epoch": 0.8240073540313733, + "grad_norm": 0.0875447541475296, + "learning_rate": 7.599811625895137e-05, + "loss": 2.5742, + "step": 27788 + }, + { + "epoch": 0.8240370073836847, + "grad_norm": 0.08652599155902863, + "learning_rate": 7.597318158823868e-05, + "loss": 2.5809, + "step": 27789 + }, + { + "epoch": 0.8240666607359962, + "grad_norm": 0.09236085414886475, + "learning_rate": 7.594825067235628e-05, + "loss": 2.5835, + "step": 27790 + }, + { + "epoch": 0.8240963140883076, + "grad_norm": 0.08250968158245087, + "learning_rate": 7.592332351152493e-05, + "loss": 2.5872, + "step": 27791 + }, + { + "epoch": 0.8241259674406192, + "grad_norm": 0.084047332406044, + "learning_rate": 7.589840010596527e-05, + "loss": 2.5318, + "step": 27792 + }, + { + "epoch": 0.8241556207929306, + "grad_norm": 0.08940421789884567, + "learning_rate": 7.587348045589815e-05, + "loss": 2.5626, + "step": 27793 + }, + { + "epoch": 0.8241852741452421, + "grad_norm": 0.08636123687028885, + "learning_rate": 7.58485645615441e-05, + "loss": 2.6075, + "step": 27794 + }, + { + "epoch": 0.8242149274975537, + "grad_norm": 0.08637997508049011, + "learning_rate": 7.582365242312389e-05, + "loss": 2.5632, + "step": 27795 + }, + { + "epoch": 0.8242445808498651, + "grad_norm": 0.0903288945555687, + "learning_rate": 7.579874404085785e-05, + "loss": 2.5807, + "step": 27796 + }, + { + "epoch": 0.8242742342021766, + "grad_norm": 0.08954554051160812, + "learning_rate": 7.57738394149669e-05, + "loss": 2.6019, + "step": 27797 + }, + { + "epoch": 0.824303887554488, + "grad_norm": 0.09005508571863174, + "learning_rate": 7.574893854567155e-05, + "loss": 2.5816, + "step": 27798 + }, + { + "epoch": 0.8243335409067996, + "grad_norm": 0.08952777832746506, + "learning_rate": 7.572404143319201e-05, + "loss": 2.5321, + "step": 27799 + }, + { + "epoch": 0.824363194259111, + "grad_norm": 0.09396445751190186, + "learning_rate": 7.569914807774896e-05, + "loss": 2.5949, + "step": 27800 + }, + { + "epoch": 0.8243928476114225, + "grad_norm": 0.09009548276662827, + "learning_rate": 7.567425847956278e-05, + "loss": 2.5805, + "step": 27801 + }, + { + "epoch": 0.8244225009637339, + "grad_norm": 0.0970730409026146, + "learning_rate": 7.564937263885385e-05, + "loss": 2.5603, + "step": 27802 + }, + { + "epoch": 0.8244521543160455, + "grad_norm": 0.08575944602489471, + "learning_rate": 7.562449055584254e-05, + "loss": 2.6014, + "step": 27803 + }, + { + "epoch": 0.8244818076683569, + "grad_norm": 0.10477051883935928, + "learning_rate": 7.559961223074924e-05, + "loss": 2.598, + "step": 27804 + }, + { + "epoch": 0.8245114610206684, + "grad_norm": 0.08693061023950577, + "learning_rate": 7.557473766379424e-05, + "loss": 2.5776, + "step": 27805 + }, + { + "epoch": 0.8245411143729798, + "grad_norm": 0.09584265947341919, + "learning_rate": 7.554986685519776e-05, + "loss": 2.5438, + "step": 27806 + }, + { + "epoch": 0.8245707677252914, + "grad_norm": 0.09256142377853394, + "learning_rate": 7.552499980518007e-05, + "loss": 2.593, + "step": 27807 + }, + { + "epoch": 0.8246004210776028, + "grad_norm": 0.09110835939645767, + "learning_rate": 7.550013651396137e-05, + "loss": 2.5811, + "step": 27808 + }, + { + "epoch": 0.8246300744299143, + "grad_norm": 0.10548698902130127, + "learning_rate": 7.547527698176182e-05, + "loss": 2.5923, + "step": 27809 + }, + { + "epoch": 0.8246597277822257, + "grad_norm": 0.08801288902759552, + "learning_rate": 7.545042120880158e-05, + "loss": 2.5776, + "step": 27810 + }, + { + "epoch": 0.8246893811345373, + "grad_norm": 0.09509675204753876, + "learning_rate": 7.542556919530075e-05, + "loss": 2.551, + "step": 27811 + }, + { + "epoch": 0.8247190344868487, + "grad_norm": 0.09303697943687439, + "learning_rate": 7.540072094147932e-05, + "loss": 2.57, + "step": 27812 + }, + { + "epoch": 0.8247486878391602, + "grad_norm": 0.08773834258317947, + "learning_rate": 7.537587644755745e-05, + "loss": 2.6059, + "step": 27813 + }, + { + "epoch": 0.8247783411914716, + "grad_norm": 0.08022861182689667, + "learning_rate": 7.535103571375501e-05, + "loss": 2.6021, + "step": 27814 + }, + { + "epoch": 0.8248079945437832, + "grad_norm": 0.09808757901191711, + "learning_rate": 7.532619874029212e-05, + "loss": 2.5818, + "step": 27815 + }, + { + "epoch": 0.8248376478960947, + "grad_norm": 0.08645735681056976, + "learning_rate": 7.530136552738859e-05, + "loss": 2.6013, + "step": 27816 + }, + { + "epoch": 0.8248673012484061, + "grad_norm": 0.09000120311975479, + "learning_rate": 7.527653607526435e-05, + "loss": 2.6033, + "step": 27817 + }, + { + "epoch": 0.8248969546007177, + "grad_norm": 0.08949221670627594, + "learning_rate": 7.52517103841393e-05, + "loss": 2.5508, + "step": 27818 + }, + { + "epoch": 0.8249266079530291, + "grad_norm": 0.08305086195468903, + "learning_rate": 7.522688845423325e-05, + "loss": 2.5802, + "step": 27819 + }, + { + "epoch": 0.8249562613053406, + "grad_norm": 0.08723766356706619, + "learning_rate": 7.520207028576609e-05, + "loss": 2.6009, + "step": 27820 + }, + { + "epoch": 0.824985914657652, + "grad_norm": 0.08728420734405518, + "learning_rate": 7.51772558789574e-05, + "loss": 2.5682, + "step": 27821 + }, + { + "epoch": 0.8250155680099636, + "grad_norm": 0.08991964906454086, + "learning_rate": 7.515244523402708e-05, + "loss": 2.5519, + "step": 27822 + }, + { + "epoch": 0.825045221362275, + "grad_norm": 0.09752710163593292, + "learning_rate": 7.51276383511948e-05, + "loss": 2.5463, + "step": 27823 + }, + { + "epoch": 0.8250748747145865, + "grad_norm": 0.08990436792373657, + "learning_rate": 7.510283523068023e-05, + "loss": 2.6082, + "step": 27824 + }, + { + "epoch": 0.8251045280668979, + "grad_norm": 0.0941985622048378, + "learning_rate": 7.507803587270295e-05, + "loss": 2.5914, + "step": 27825 + }, + { + "epoch": 0.8251341814192095, + "grad_norm": 0.09858953952789307, + "learning_rate": 7.505324027748262e-05, + "loss": 2.5511, + "step": 27826 + }, + { + "epoch": 0.8251638347715209, + "grad_norm": 0.084664486348629, + "learning_rate": 7.502844844523876e-05, + "loss": 2.6102, + "step": 27827 + }, + { + "epoch": 0.8251934881238324, + "grad_norm": 0.09972862899303436, + "learning_rate": 7.500366037619095e-05, + "loss": 2.5739, + "step": 27828 + }, + { + "epoch": 0.8252231414761438, + "grad_norm": 0.10018984973430634, + "learning_rate": 7.497887607055864e-05, + "loss": 2.5584, + "step": 27829 + }, + { + "epoch": 0.8252527948284554, + "grad_norm": 0.09056103974580765, + "learning_rate": 7.495409552856137e-05, + "loss": 2.5599, + "step": 27830 + }, + { + "epoch": 0.8252824481807668, + "grad_norm": 0.09099675714969635, + "learning_rate": 7.492931875041858e-05, + "loss": 2.5781, + "step": 27831 + }, + { + "epoch": 0.8253121015330783, + "grad_norm": 0.08926650136709213, + "learning_rate": 7.490454573634969e-05, + "loss": 2.5576, + "step": 27832 + }, + { + "epoch": 0.8253417548853897, + "grad_norm": 0.0875348299741745, + "learning_rate": 7.48797764865739e-05, + "loss": 2.5243, + "step": 27833 + }, + { + "epoch": 0.8253714082377013, + "grad_norm": 0.09465867280960083, + "learning_rate": 7.48550110013107e-05, + "loss": 2.5774, + "step": 27834 + }, + { + "epoch": 0.8254010615900127, + "grad_norm": 0.0844898447394371, + "learning_rate": 7.483024928077919e-05, + "loss": 2.5812, + "step": 27835 + }, + { + "epoch": 0.8254307149423242, + "grad_norm": 0.09289857745170593, + "learning_rate": 7.480549132519898e-05, + "loss": 2.5916, + "step": 27836 + }, + { + "epoch": 0.8254603682946358, + "grad_norm": 0.09902125597000122, + "learning_rate": 7.47807371347891e-05, + "loss": 2.6011, + "step": 27837 + }, + { + "epoch": 0.8254900216469472, + "grad_norm": 0.09591991454362869, + "learning_rate": 7.475598670976874e-05, + "loss": 2.5831, + "step": 27838 + }, + { + "epoch": 0.8255196749992587, + "grad_norm": 0.0903024673461914, + "learning_rate": 7.47312400503572e-05, + "loss": 2.5647, + "step": 27839 + }, + { + "epoch": 0.8255493283515701, + "grad_norm": 0.09168775379657745, + "learning_rate": 7.470649715677347e-05, + "loss": 2.5302, + "step": 27840 + }, + { + "epoch": 0.8255789817038817, + "grad_norm": 0.09122909605503082, + "learning_rate": 7.468175802923666e-05, + "loss": 2.5789, + "step": 27841 + }, + { + "epoch": 0.8256086350561931, + "grad_norm": 0.08850627392530441, + "learning_rate": 7.465702266796597e-05, + "loss": 2.5725, + "step": 27842 + }, + { + "epoch": 0.8256382884085046, + "grad_norm": 0.0989663153886795, + "learning_rate": 7.463229107318042e-05, + "loss": 2.5995, + "step": 27843 + }, + { + "epoch": 0.825667941760816, + "grad_norm": 0.08629420399665833, + "learning_rate": 7.460756324509888e-05, + "loss": 2.5345, + "step": 27844 + }, + { + "epoch": 0.8256975951131276, + "grad_norm": 0.09231483191251755, + "learning_rate": 7.458283918394033e-05, + "loss": 2.5867, + "step": 27845 + }, + { + "epoch": 0.825727248465439, + "grad_norm": 0.08982782810926437, + "learning_rate": 7.455811888992376e-05, + "loss": 2.5888, + "step": 27846 + }, + { + "epoch": 0.8257569018177505, + "grad_norm": 0.0935768261551857, + "learning_rate": 7.453340236326811e-05, + "loss": 2.5786, + "step": 27847 + }, + { + "epoch": 0.8257865551700619, + "grad_norm": 0.09413795918226242, + "learning_rate": 7.450868960419211e-05, + "loss": 2.5712, + "step": 27848 + }, + { + "epoch": 0.8258162085223735, + "grad_norm": 0.08977826684713364, + "learning_rate": 7.448398061291472e-05, + "loss": 2.6077, + "step": 27849 + }, + { + "epoch": 0.8258458618746849, + "grad_norm": 0.09750638157129288, + "learning_rate": 7.44592753896548e-05, + "loss": 2.5587, + "step": 27850 + }, + { + "epoch": 0.8258755152269964, + "grad_norm": 0.08506420999765396, + "learning_rate": 7.443457393463105e-05, + "loss": 2.5249, + "step": 27851 + }, + { + "epoch": 0.8259051685793078, + "grad_norm": 0.08986671268939972, + "learning_rate": 7.440987624806217e-05, + "loss": 2.6119, + "step": 27852 + }, + { + "epoch": 0.8259348219316194, + "grad_norm": 0.08969933539628983, + "learning_rate": 7.4385182330167e-05, + "loss": 2.5851, + "step": 27853 + }, + { + "epoch": 0.8259644752839308, + "grad_norm": 0.08670960366725922, + "learning_rate": 7.436049218116397e-05, + "loss": 2.5859, + "step": 27854 + }, + { + "epoch": 0.8259941286362423, + "grad_norm": 0.0857974961400032, + "learning_rate": 7.433580580127186e-05, + "loss": 2.5929, + "step": 27855 + }, + { + "epoch": 0.8260237819885538, + "grad_norm": 0.0844411551952362, + "learning_rate": 7.431112319070926e-05, + "loss": 2.5622, + "step": 27856 + }, + { + "epoch": 0.8260534353408653, + "grad_norm": 0.08060658723115921, + "learning_rate": 7.428644434969472e-05, + "loss": 2.5836, + "step": 27857 + }, + { + "epoch": 0.8260830886931768, + "grad_norm": 0.09405059367418289, + "learning_rate": 7.42617692784468e-05, + "loss": 2.585, + "step": 27858 + }, + { + "epoch": 0.8261127420454882, + "grad_norm": 0.08928465843200684, + "learning_rate": 7.423709797718397e-05, + "loss": 2.5915, + "step": 27859 + }, + { + "epoch": 0.8261423953977998, + "grad_norm": 0.08436808735132217, + "learning_rate": 7.421243044612475e-05, + "loss": 2.5526, + "step": 27860 + }, + { + "epoch": 0.8261720487501112, + "grad_norm": 0.09973766654729843, + "learning_rate": 7.418776668548738e-05, + "loss": 2.5612, + "step": 27861 + }, + { + "epoch": 0.8262017021024227, + "grad_norm": 0.09375475347042084, + "learning_rate": 7.416310669549059e-05, + "loss": 2.5395, + "step": 27862 + }, + { + "epoch": 0.8262313554547341, + "grad_norm": 0.09429445117712021, + "learning_rate": 7.41384504763527e-05, + "loss": 2.5665, + "step": 27863 + }, + { + "epoch": 0.8262610088070457, + "grad_norm": 0.0990859866142273, + "learning_rate": 7.411379802829176e-05, + "loss": 2.5927, + "step": 27864 + }, + { + "epoch": 0.8262906621593571, + "grad_norm": 0.0917855054140091, + "learning_rate": 7.408914935152628e-05, + "loss": 2.5714, + "step": 27865 + }, + { + "epoch": 0.8263203155116686, + "grad_norm": 0.09472573548555374, + "learning_rate": 7.40645044462745e-05, + "loss": 2.5414, + "step": 27866 + }, + { + "epoch": 0.82634996886398, + "grad_norm": 0.09086187183856964, + "learning_rate": 7.403986331275459e-05, + "loss": 2.6054, + "step": 27867 + }, + { + "epoch": 0.8263796222162916, + "grad_norm": 0.0911126434803009, + "learning_rate": 7.401522595118487e-05, + "loss": 2.6135, + "step": 27868 + }, + { + "epoch": 0.826409275568603, + "grad_norm": 0.09128598123788834, + "learning_rate": 7.39905923617834e-05, + "loss": 2.5792, + "step": 27869 + }, + { + "epoch": 0.8264389289209145, + "grad_norm": 0.08744101971387863, + "learning_rate": 7.396596254476839e-05, + "loss": 2.5656, + "step": 27870 + }, + { + "epoch": 0.826468582273226, + "grad_norm": 0.10131864994764328, + "learning_rate": 7.39413365003579e-05, + "loss": 2.5517, + "step": 27871 + }, + { + "epoch": 0.8264982356255375, + "grad_norm": 0.08963148295879364, + "learning_rate": 7.391671422877e-05, + "loss": 2.5853, + "step": 27872 + }, + { + "epoch": 0.8265278889778489, + "grad_norm": 0.09063791483640671, + "learning_rate": 7.38920957302227e-05, + "loss": 2.5664, + "step": 27873 + }, + { + "epoch": 0.8265575423301604, + "grad_norm": 0.08619694411754608, + "learning_rate": 7.386748100493407e-05, + "loss": 2.5666, + "step": 27874 + }, + { + "epoch": 0.8265871956824719, + "grad_norm": 0.09618250280618668, + "learning_rate": 7.384287005312207e-05, + "loss": 2.6019, + "step": 27875 + }, + { + "epoch": 0.8266168490347834, + "grad_norm": 0.08862879127264023, + "learning_rate": 7.381826287500454e-05, + "loss": 2.5887, + "step": 27876 + }, + { + "epoch": 0.8266465023870948, + "grad_norm": 0.09015609323978424, + "learning_rate": 7.379365947079946e-05, + "loss": 2.5728, + "step": 27877 + }, + { + "epoch": 0.8266761557394063, + "grad_norm": 0.09046942740678787, + "learning_rate": 7.376905984072473e-05, + "loss": 2.5744, + "step": 27878 + }, + { + "epoch": 0.8267058090917179, + "grad_norm": 0.08259972184896469, + "learning_rate": 7.374446398499812e-05, + "loss": 2.5905, + "step": 27879 + }, + { + "epoch": 0.8267354624440293, + "grad_norm": 0.09108570963144302, + "learning_rate": 7.371987190383745e-05, + "loss": 2.6063, + "step": 27880 + }, + { + "epoch": 0.8267651157963408, + "grad_norm": 0.08907576650381088, + "learning_rate": 7.369528359746042e-05, + "loss": 2.6133, + "step": 27881 + }, + { + "epoch": 0.8267947691486522, + "grad_norm": 0.09423930943012238, + "learning_rate": 7.367069906608486e-05, + "loss": 2.5938, + "step": 27882 + }, + { + "epoch": 0.8268244225009638, + "grad_norm": 0.08104611933231354, + "learning_rate": 7.364611830992846e-05, + "loss": 2.6054, + "step": 27883 + }, + { + "epoch": 0.8268540758532752, + "grad_norm": 0.08933044224977493, + "learning_rate": 7.36215413292089e-05, + "loss": 2.5892, + "step": 27884 + }, + { + "epoch": 0.8268837292055867, + "grad_norm": 0.08928809314966202, + "learning_rate": 7.359696812414374e-05, + "loss": 2.599, + "step": 27885 + }, + { + "epoch": 0.8269133825578981, + "grad_norm": 0.0821332037448883, + "learning_rate": 7.357239869495058e-05, + "loss": 2.5568, + "step": 27886 + }, + { + "epoch": 0.8269430359102097, + "grad_norm": 0.08371829986572266, + "learning_rate": 7.354783304184708e-05, + "loss": 2.5634, + "step": 27887 + }, + { + "epoch": 0.8269726892625211, + "grad_norm": 0.08984173089265823, + "learning_rate": 7.35232711650507e-05, + "loss": 2.5603, + "step": 27888 + }, + { + "epoch": 0.8270023426148326, + "grad_norm": 0.09068148583173752, + "learning_rate": 7.349871306477896e-05, + "loss": 2.5652, + "step": 27889 + }, + { + "epoch": 0.827031995967144, + "grad_norm": 0.08598639070987701, + "learning_rate": 7.34741587412493e-05, + "loss": 2.5932, + "step": 27890 + }, + { + "epoch": 0.8270616493194556, + "grad_norm": 0.08749809116125107, + "learning_rate": 7.344960819467922e-05, + "loss": 2.5858, + "step": 27891 + }, + { + "epoch": 0.827091302671767, + "grad_norm": 0.0932110920548439, + "learning_rate": 7.342506142528605e-05, + "loss": 2.5957, + "step": 27892 + }, + { + "epoch": 0.8271209560240785, + "grad_norm": 0.08936229348182678, + "learning_rate": 7.340051843328715e-05, + "loss": 2.5568, + "step": 27893 + }, + { + "epoch": 0.82715060937639, + "grad_norm": 0.08830969780683517, + "learning_rate": 7.337597921889993e-05, + "loss": 2.5329, + "step": 27894 + }, + { + "epoch": 0.8271802627287015, + "grad_norm": 0.09305877238512039, + "learning_rate": 7.335144378234165e-05, + "loss": 2.5585, + "step": 27895 + }, + { + "epoch": 0.8272099160810129, + "grad_norm": 0.0874098390340805, + "learning_rate": 7.332691212382952e-05, + "loss": 2.5421, + "step": 27896 + }, + { + "epoch": 0.8272395694333244, + "grad_norm": 0.0927022323012352, + "learning_rate": 7.330238424358088e-05, + "loss": 2.5772, + "step": 27897 + }, + { + "epoch": 0.827269222785636, + "grad_norm": 0.07956207543611526, + "learning_rate": 7.327786014181293e-05, + "loss": 2.543, + "step": 27898 + }, + { + "epoch": 0.8272988761379474, + "grad_norm": 0.09220833331346512, + "learning_rate": 7.325333981874271e-05, + "loss": 2.5687, + "step": 27899 + }, + { + "epoch": 0.8273285294902589, + "grad_norm": 0.08916865289211273, + "learning_rate": 7.322882327458724e-05, + "loss": 2.5605, + "step": 27900 + }, + { + "epoch": 0.8273581828425703, + "grad_norm": 0.08340749144554138, + "learning_rate": 7.320431050956394e-05, + "loss": 2.5879, + "step": 27901 + }, + { + "epoch": 0.8273878361948819, + "grad_norm": 0.0989336371421814, + "learning_rate": 7.317980152388975e-05, + "loss": 2.5874, + "step": 27902 + }, + { + "epoch": 0.8274174895471933, + "grad_norm": 0.0848199799656868, + "learning_rate": 7.315529631778167e-05, + "loss": 2.595, + "step": 27903 + }, + { + "epoch": 0.8274471428995048, + "grad_norm": 0.08701726794242859, + "learning_rate": 7.313079489145669e-05, + "loss": 2.5558, + "step": 27904 + }, + { + "epoch": 0.8274767962518162, + "grad_norm": 0.08751647174358368, + "learning_rate": 7.310629724513179e-05, + "loss": 2.5388, + "step": 27905 + }, + { + "epoch": 0.8275064496041278, + "grad_norm": 0.08371048420667648, + "learning_rate": 7.308180337902392e-05, + "loss": 2.5626, + "step": 27906 + }, + { + "epoch": 0.8275361029564392, + "grad_norm": 0.09229561686515808, + "learning_rate": 7.305731329334996e-05, + "loss": 2.5942, + "step": 27907 + }, + { + "epoch": 0.8275657563087507, + "grad_norm": 0.08841115236282349, + "learning_rate": 7.303282698832691e-05, + "loss": 2.6038, + "step": 27908 + }, + { + "epoch": 0.8275954096610622, + "grad_norm": 0.0874754786491394, + "learning_rate": 7.300834446417131e-05, + "loss": 2.6085, + "step": 27909 + }, + { + "epoch": 0.8276250630133737, + "grad_norm": 0.08481518179178238, + "learning_rate": 7.29838657211001e-05, + "loss": 2.5914, + "step": 27910 + }, + { + "epoch": 0.8276547163656851, + "grad_norm": 0.08750846236944199, + "learning_rate": 7.295939075933012e-05, + "loss": 2.5645, + "step": 27911 + }, + { + "epoch": 0.8276843697179966, + "grad_norm": 0.08124428987503052, + "learning_rate": 7.2934919579078e-05, + "loss": 2.5661, + "step": 27912 + }, + { + "epoch": 0.8277140230703081, + "grad_norm": 0.08797477930784225, + "learning_rate": 7.291045218056036e-05, + "loss": 2.5471, + "step": 27913 + }, + { + "epoch": 0.8277436764226196, + "grad_norm": 0.0854877457022667, + "learning_rate": 7.288598856399408e-05, + "loss": 2.5649, + "step": 27914 + }, + { + "epoch": 0.827773329774931, + "grad_norm": 0.08782776445150375, + "learning_rate": 7.286152872959567e-05, + "loss": 2.5993, + "step": 27915 + }, + { + "epoch": 0.8278029831272425, + "grad_norm": 0.08577380329370499, + "learning_rate": 7.283707267758177e-05, + "loss": 2.5547, + "step": 27916 + }, + { + "epoch": 0.827832636479554, + "grad_norm": 0.08272981643676758, + "learning_rate": 7.281262040816894e-05, + "loss": 2.5676, + "step": 27917 + }, + { + "epoch": 0.8278622898318655, + "grad_norm": 0.08515097945928574, + "learning_rate": 7.278817192157361e-05, + "loss": 2.593, + "step": 27918 + }, + { + "epoch": 0.827891943184177, + "grad_norm": 0.08090922981500626, + "learning_rate": 7.27637272180125e-05, + "loss": 2.5694, + "step": 27919 + }, + { + "epoch": 0.8279215965364884, + "grad_norm": 0.08890432119369507, + "learning_rate": 7.273928629770182e-05, + "loss": 2.6071, + "step": 27920 + }, + { + "epoch": 0.8279512498888, + "grad_norm": 0.08537988364696503, + "learning_rate": 7.271484916085808e-05, + "loss": 2.5692, + "step": 27921 + }, + { + "epoch": 0.8279809032411114, + "grad_norm": 0.09872741252183914, + "learning_rate": 7.269041580769769e-05, + "loss": 2.5804, + "step": 27922 + }, + { + "epoch": 0.8280105565934229, + "grad_norm": 0.08432115614414215, + "learning_rate": 7.266598623843701e-05, + "loss": 2.6047, + "step": 27923 + }, + { + "epoch": 0.8280402099457344, + "grad_norm": 0.09265799820423126, + "learning_rate": 7.26415604532924e-05, + "loss": 2.5829, + "step": 27924 + }, + { + "epoch": 0.8280698632980459, + "grad_norm": 0.08246026933193207, + "learning_rate": 7.261713845247998e-05, + "loss": 2.5691, + "step": 27925 + }, + { + "epoch": 0.8280995166503573, + "grad_norm": 0.08314244449138641, + "learning_rate": 7.259272023621627e-05, + "loss": 2.5931, + "step": 27926 + }, + { + "epoch": 0.8281291700026688, + "grad_norm": 0.09218821674585342, + "learning_rate": 7.25683058047174e-05, + "loss": 2.5852, + "step": 27927 + }, + { + "epoch": 0.8281588233549803, + "grad_norm": 0.08677929639816284, + "learning_rate": 7.254389515819959e-05, + "loss": 2.5945, + "step": 27928 + }, + { + "epoch": 0.8281884767072918, + "grad_norm": 0.08868876844644547, + "learning_rate": 7.251948829687905e-05, + "loss": 2.5415, + "step": 27929 + }, + { + "epoch": 0.8282181300596032, + "grad_norm": 0.08863285928964615, + "learning_rate": 7.249508522097164e-05, + "loss": 2.5852, + "step": 27930 + }, + { + "epoch": 0.8282477834119147, + "grad_norm": 0.09018896520137787, + "learning_rate": 7.247068593069373e-05, + "loss": 2.537, + "step": 27931 + }, + { + "epoch": 0.8282774367642262, + "grad_norm": 0.08947250992059708, + "learning_rate": 7.244629042626122e-05, + "loss": 2.5703, + "step": 27932 + }, + { + "epoch": 0.8283070901165377, + "grad_norm": 0.08749354630708694, + "learning_rate": 7.242189870789017e-05, + "loss": 2.5979, + "step": 27933 + }, + { + "epoch": 0.8283367434688491, + "grad_norm": 0.08835156261920929, + "learning_rate": 7.239751077579665e-05, + "loss": 2.5886, + "step": 27934 + }, + { + "epoch": 0.8283663968211606, + "grad_norm": 0.09043731540441513, + "learning_rate": 7.237312663019657e-05, + "loss": 2.553, + "step": 27935 + }, + { + "epoch": 0.8283960501734721, + "grad_norm": 0.09936944395303726, + "learning_rate": 7.234874627130584e-05, + "loss": 2.5735, + "step": 27936 + }, + { + "epoch": 0.8284257035257836, + "grad_norm": 0.09225068986415863, + "learning_rate": 7.232436969934036e-05, + "loss": 2.5755, + "step": 27937 + }, + { + "epoch": 0.828455356878095, + "grad_norm": 0.0926096960902214, + "learning_rate": 7.229999691451594e-05, + "loss": 2.5509, + "step": 27938 + }, + { + "epoch": 0.8284850102304065, + "grad_norm": 0.08492160588502884, + "learning_rate": 7.227562791704862e-05, + "loss": 2.6104, + "step": 27939 + }, + { + "epoch": 0.8285146635827181, + "grad_norm": 0.08570745587348938, + "learning_rate": 7.225126270715393e-05, + "loss": 2.5511, + "step": 27940 + }, + { + "epoch": 0.8285443169350295, + "grad_norm": 0.09509764611721039, + "learning_rate": 7.222690128504777e-05, + "loss": 2.5764, + "step": 27941 + }, + { + "epoch": 0.828573970287341, + "grad_norm": 0.09076446294784546, + "learning_rate": 7.220254365094575e-05, + "loss": 2.565, + "step": 27942 + }, + { + "epoch": 0.8286036236396525, + "grad_norm": 0.10520754754543304, + "learning_rate": 7.217818980506369e-05, + "loss": 2.5995, + "step": 27943 + }, + { + "epoch": 0.828633276991964, + "grad_norm": 0.08964664489030838, + "learning_rate": 7.215383974761719e-05, + "loss": 2.5705, + "step": 27944 + }, + { + "epoch": 0.8286629303442754, + "grad_norm": 0.09487565606832504, + "learning_rate": 7.212949347882187e-05, + "loss": 2.5857, + "step": 27945 + }, + { + "epoch": 0.8286925836965869, + "grad_norm": 0.08924227207899094, + "learning_rate": 7.210515099889336e-05, + "loss": 2.5771, + "step": 27946 + }, + { + "epoch": 0.8287222370488984, + "grad_norm": 0.09429585933685303, + "learning_rate": 7.208081230804714e-05, + "loss": 2.6105, + "step": 27947 + }, + { + "epoch": 0.8287518904012099, + "grad_norm": 0.08931174129247665, + "learning_rate": 7.205647740649879e-05, + "loss": 2.6016, + "step": 27948 + }, + { + "epoch": 0.8287815437535213, + "grad_norm": 0.09497767686843872, + "learning_rate": 7.203214629446381e-05, + "loss": 2.59, + "step": 27949 + }, + { + "epoch": 0.8288111971058328, + "grad_norm": 0.09441443532705307, + "learning_rate": 7.200781897215763e-05, + "loss": 2.5729, + "step": 27950 + }, + { + "epoch": 0.8288408504581443, + "grad_norm": 0.09458823502063751, + "learning_rate": 7.198349543979565e-05, + "loss": 2.5707, + "step": 27951 + }, + { + "epoch": 0.8288705038104558, + "grad_norm": 0.09663009643554688, + "learning_rate": 7.195917569759331e-05, + "loss": 2.6029, + "step": 27952 + }, + { + "epoch": 0.8289001571627672, + "grad_norm": 0.08172016590833664, + "learning_rate": 7.193485974576592e-05, + "loss": 2.5533, + "step": 27953 + }, + { + "epoch": 0.8289298105150787, + "grad_norm": 0.10100246220827103, + "learning_rate": 7.191054758452886e-05, + "loss": 2.5502, + "step": 27954 + }, + { + "epoch": 0.8289594638673902, + "grad_norm": 0.09853576123714447, + "learning_rate": 7.188623921409731e-05, + "loss": 2.5751, + "step": 27955 + }, + { + "epoch": 0.8289891172197017, + "grad_norm": 0.10232042521238327, + "learning_rate": 7.186193463468666e-05, + "loss": 2.5999, + "step": 27956 + }, + { + "epoch": 0.8290187705720131, + "grad_norm": 0.09737179428339005, + "learning_rate": 7.183763384651204e-05, + "loss": 2.6175, + "step": 27957 + }, + { + "epoch": 0.8290484239243247, + "grad_norm": 0.09019333869218826, + "learning_rate": 7.181333684978869e-05, + "loss": 2.5926, + "step": 27958 + }, + { + "epoch": 0.8290780772766361, + "grad_norm": 0.1007961556315422, + "learning_rate": 7.178904364473176e-05, + "loss": 2.5954, + "step": 27959 + }, + { + "epoch": 0.8291077306289476, + "grad_norm": 0.09087394177913666, + "learning_rate": 7.176475423155632e-05, + "loss": 2.5794, + "step": 27960 + }, + { + "epoch": 0.8291373839812591, + "grad_norm": 0.09387961775064468, + "learning_rate": 7.174046861047745e-05, + "loss": 2.5903, + "step": 27961 + }, + { + "epoch": 0.8291670373335706, + "grad_norm": 0.08734934031963348, + "learning_rate": 7.171618678171027e-05, + "loss": 2.5843, + "step": 27962 + }, + { + "epoch": 0.8291966906858821, + "grad_norm": 0.08977484703063965, + "learning_rate": 7.169190874546988e-05, + "loss": 2.5947, + "step": 27963 + }, + { + "epoch": 0.8292263440381935, + "grad_norm": 0.09661085158586502, + "learning_rate": 7.166763450197095e-05, + "loss": 2.5447, + "step": 27964 + }, + { + "epoch": 0.829255997390505, + "grad_norm": 0.09322328120470047, + "learning_rate": 7.164336405142874e-05, + "loss": 2.5695, + "step": 27965 + }, + { + "epoch": 0.8292856507428165, + "grad_norm": 0.09013692289590836, + "learning_rate": 7.161909739405809e-05, + "loss": 2.5266, + "step": 27966 + }, + { + "epoch": 0.829315304095128, + "grad_norm": 0.08911051601171494, + "learning_rate": 7.159483453007382e-05, + "loss": 2.5944, + "step": 27967 + }, + { + "epoch": 0.8293449574474394, + "grad_norm": 0.09604396671056747, + "learning_rate": 7.157057545969087e-05, + "loss": 2.5854, + "step": 27968 + }, + { + "epoch": 0.8293746107997509, + "grad_norm": 0.0855807363986969, + "learning_rate": 7.154632018312396e-05, + "loss": 2.5509, + "step": 27969 + }, + { + "epoch": 0.8294042641520624, + "grad_norm": 0.08744846284389496, + "learning_rate": 7.152206870058797e-05, + "loss": 2.6046, + "step": 27970 + }, + { + "epoch": 0.8294339175043739, + "grad_norm": 0.08882107585668564, + "learning_rate": 7.149782101229757e-05, + "loss": 2.5823, + "step": 27971 + }, + { + "epoch": 0.8294635708566853, + "grad_norm": 0.0864202082157135, + "learning_rate": 7.147357711846758e-05, + "loss": 2.6134, + "step": 27972 + }, + { + "epoch": 0.8294932242089968, + "grad_norm": 0.09717637300491333, + "learning_rate": 7.144933701931255e-05, + "loss": 2.5941, + "step": 27973 + }, + { + "epoch": 0.8295228775613083, + "grad_norm": 0.10120917856693268, + "learning_rate": 7.142510071504737e-05, + "loss": 2.5823, + "step": 27974 + }, + { + "epoch": 0.8295525309136198, + "grad_norm": 0.08696691691875458, + "learning_rate": 7.140086820588632e-05, + "loss": 2.5664, + "step": 27975 + }, + { + "epoch": 0.8295821842659312, + "grad_norm": 0.08895918726921082, + "learning_rate": 7.13766394920442e-05, + "loss": 2.5625, + "step": 27976 + }, + { + "epoch": 0.8296118376182428, + "grad_norm": 0.08721406012773514, + "learning_rate": 7.135241457373537e-05, + "loss": 2.5688, + "step": 27977 + }, + { + "epoch": 0.8296414909705542, + "grad_norm": 0.08609163761138916, + "learning_rate": 7.132819345117459e-05, + "loss": 2.5974, + "step": 27978 + }, + { + "epoch": 0.8296711443228657, + "grad_norm": 0.09071795642375946, + "learning_rate": 7.130397612457629e-05, + "loss": 2.5665, + "step": 27979 + }, + { + "epoch": 0.8297007976751771, + "grad_norm": 0.08468261361122131, + "learning_rate": 7.127976259415481e-05, + "loss": 2.5777, + "step": 27980 + }, + { + "epoch": 0.8297304510274887, + "grad_norm": 0.09150819480419159, + "learning_rate": 7.125555286012465e-05, + "loss": 2.5883, + "step": 27981 + }, + { + "epoch": 0.8297601043798002, + "grad_norm": 0.08707630634307861, + "learning_rate": 7.123134692270012e-05, + "loss": 2.571, + "step": 27982 + }, + { + "epoch": 0.8297897577321116, + "grad_norm": 0.0949854701757431, + "learning_rate": 7.120714478209567e-05, + "loss": 2.5729, + "step": 27983 + }, + { + "epoch": 0.8298194110844231, + "grad_norm": 0.09254458546638489, + "learning_rate": 7.118294643852562e-05, + "loss": 2.5952, + "step": 27984 + }, + { + "epoch": 0.8298490644367346, + "grad_norm": 0.09202355146408081, + "learning_rate": 7.115875189220411e-05, + "loss": 2.5726, + "step": 27985 + }, + { + "epoch": 0.8298787177890461, + "grad_norm": 0.09135603159666061, + "learning_rate": 7.113456114334543e-05, + "loss": 2.5601, + "step": 27986 + }, + { + "epoch": 0.8299083711413575, + "grad_norm": 0.09356856346130371, + "learning_rate": 7.111037419216382e-05, + "loss": 2.5771, + "step": 27987 + }, + { + "epoch": 0.829938024493669, + "grad_norm": 0.07760284841060638, + "learning_rate": 7.108619103887349e-05, + "loss": 2.5663, + "step": 27988 + }, + { + "epoch": 0.8299676778459805, + "grad_norm": 0.09389124065637589, + "learning_rate": 7.106201168368858e-05, + "loss": 2.6006, + "step": 27989 + }, + { + "epoch": 0.829997331198292, + "grad_norm": 0.08737807720899582, + "learning_rate": 7.103783612682302e-05, + "loss": 2.5719, + "step": 27990 + }, + { + "epoch": 0.8300269845506034, + "grad_norm": 0.08574898540973663, + "learning_rate": 7.101366436849122e-05, + "loss": 2.591, + "step": 27991 + }, + { + "epoch": 0.830056637902915, + "grad_norm": 0.08833157271146774, + "learning_rate": 7.0989496408907e-05, + "loss": 2.5403, + "step": 27992 + }, + { + "epoch": 0.8300862912552264, + "grad_norm": 0.0890495628118515, + "learning_rate": 7.096533224828444e-05, + "loss": 2.6236, + "step": 27993 + }, + { + "epoch": 0.8301159446075379, + "grad_norm": 0.08579360693693161, + "learning_rate": 7.094117188683752e-05, + "loss": 2.5876, + "step": 27994 + }, + { + "epoch": 0.8301455979598493, + "grad_norm": 0.08309105783700943, + "learning_rate": 7.091701532478029e-05, + "loss": 2.5913, + "step": 27995 + }, + { + "epoch": 0.8301752513121609, + "grad_norm": 0.08975735306739807, + "learning_rate": 7.089286256232641e-05, + "loss": 2.5343, + "step": 27996 + }, + { + "epoch": 0.8302049046644723, + "grad_norm": 0.08568368852138519, + "learning_rate": 7.086871359968988e-05, + "loss": 2.579, + "step": 27997 + }, + { + "epoch": 0.8302345580167838, + "grad_norm": 0.08197595179080963, + "learning_rate": 7.08445684370846e-05, + "loss": 2.601, + "step": 27998 + }, + { + "epoch": 0.8302642113690952, + "grad_norm": 0.09339230507612228, + "learning_rate": 7.08204270747243e-05, + "loss": 2.5481, + "step": 27999 + }, + { + "epoch": 0.8302938647214068, + "grad_norm": 0.08184781670570374, + "learning_rate": 7.079628951282274e-05, + "loss": 2.5681, + "step": 28000 + }, + { + "epoch": 0.8303235180737182, + "grad_norm": 0.0834927186369896, + "learning_rate": 7.07721557515938e-05, + "loss": 2.5583, + "step": 28001 + }, + { + "epoch": 0.8303531714260297, + "grad_norm": 0.08721167594194412, + "learning_rate": 7.0748025791251e-05, + "loss": 2.5807, + "step": 28002 + }, + { + "epoch": 0.8303828247783412, + "grad_norm": 0.0853479877114296, + "learning_rate": 7.072389963200804e-05, + "loss": 2.5724, + "step": 28003 + }, + { + "epoch": 0.8304124781306527, + "grad_norm": 0.07917723059654236, + "learning_rate": 7.069977727407878e-05, + "loss": 2.5492, + "step": 28004 + }, + { + "epoch": 0.8304421314829642, + "grad_norm": 0.08974266797304153, + "learning_rate": 7.067565871767673e-05, + "loss": 2.5728, + "step": 28005 + }, + { + "epoch": 0.8304717848352756, + "grad_norm": 0.08465812355279922, + "learning_rate": 7.065154396301538e-05, + "loss": 2.5798, + "step": 28006 + }, + { + "epoch": 0.8305014381875871, + "grad_norm": 0.07799849659204483, + "learning_rate": 7.062743301030822e-05, + "loss": 2.5771, + "step": 28007 + }, + { + "epoch": 0.8305310915398986, + "grad_norm": 0.08622289448976517, + "learning_rate": 7.060332585976892e-05, + "loss": 2.5873, + "step": 28008 + }, + { + "epoch": 0.8305607448922101, + "grad_norm": 0.08240983635187149, + "learning_rate": 7.057922251161081e-05, + "loss": 2.5467, + "step": 28009 + }, + { + "epoch": 0.8305903982445215, + "grad_norm": 0.07789810746908188, + "learning_rate": 7.055512296604744e-05, + "loss": 2.5678, + "step": 28010 + }, + { + "epoch": 0.830620051596833, + "grad_norm": 0.08922013640403748, + "learning_rate": 7.053102722329214e-05, + "loss": 2.5891, + "step": 28011 + }, + { + "epoch": 0.8306497049491445, + "grad_norm": 0.0865454152226448, + "learning_rate": 7.050693528355834e-05, + "loss": 2.5971, + "step": 28012 + }, + { + "epoch": 0.830679358301456, + "grad_norm": 0.08660311251878738, + "learning_rate": 7.048284714705932e-05, + "loss": 2.5851, + "step": 28013 + }, + { + "epoch": 0.8307090116537674, + "grad_norm": 0.0831313282251358, + "learning_rate": 7.045876281400842e-05, + "loss": 2.5611, + "step": 28014 + }, + { + "epoch": 0.830738665006079, + "grad_norm": 0.08230365067720413, + "learning_rate": 7.043468228461891e-05, + "loss": 2.5787, + "step": 28015 + }, + { + "epoch": 0.8307683183583904, + "grad_norm": 0.0893663763999939, + "learning_rate": 7.0410605559104e-05, + "loss": 2.5932, + "step": 28016 + }, + { + "epoch": 0.8307979717107019, + "grad_norm": 0.09149724245071411, + "learning_rate": 7.038653263767697e-05, + "loss": 2.6075, + "step": 28017 + }, + { + "epoch": 0.8308276250630133, + "grad_norm": 0.09173345565795898, + "learning_rate": 7.036246352055092e-05, + "loss": 2.5767, + "step": 28018 + }, + { + "epoch": 0.8308572784153249, + "grad_norm": 0.08369916677474976, + "learning_rate": 7.033839820793897e-05, + "loss": 2.6042, + "step": 28019 + }, + { + "epoch": 0.8308869317676363, + "grad_norm": 0.0906091183423996, + "learning_rate": 7.031433670005428e-05, + "loss": 2.5931, + "step": 28020 + }, + { + "epoch": 0.8309165851199478, + "grad_norm": 0.0906621515750885, + "learning_rate": 7.029027899710989e-05, + "loss": 2.5691, + "step": 28021 + }, + { + "epoch": 0.8309462384722592, + "grad_norm": 0.0874674916267395, + "learning_rate": 7.026622509931879e-05, + "loss": 2.5665, + "step": 28022 + }, + { + "epoch": 0.8309758918245708, + "grad_norm": 0.09131859987974167, + "learning_rate": 7.024217500689412e-05, + "loss": 2.5617, + "step": 28023 + }, + { + "epoch": 0.8310055451768823, + "grad_norm": 0.08195292949676514, + "learning_rate": 7.021812872004868e-05, + "loss": 2.6246, + "step": 28024 + }, + { + "epoch": 0.8310351985291937, + "grad_norm": 0.09855173528194427, + "learning_rate": 7.019408623899553e-05, + "loss": 2.6005, + "step": 28025 + }, + { + "epoch": 0.8310648518815053, + "grad_norm": 0.08979954570531845, + "learning_rate": 7.017004756394746e-05, + "loss": 2.5772, + "step": 28026 + }, + { + "epoch": 0.8310945052338167, + "grad_norm": 0.0859374850988388, + "learning_rate": 7.014601269511745e-05, + "loss": 2.5807, + "step": 28027 + }, + { + "epoch": 0.8311241585861282, + "grad_norm": 0.09124571830034256, + "learning_rate": 7.012198163271827e-05, + "loss": 2.5652, + "step": 28028 + }, + { + "epoch": 0.8311538119384396, + "grad_norm": 0.09883435070514679, + "learning_rate": 7.009795437696276e-05, + "loss": 2.5973, + "step": 28029 + }, + { + "epoch": 0.8311834652907512, + "grad_norm": 0.09533828496932983, + "learning_rate": 7.007393092806363e-05, + "loss": 2.5635, + "step": 28030 + }, + { + "epoch": 0.8312131186430626, + "grad_norm": 0.09497058391571045, + "learning_rate": 7.004991128623361e-05, + "loss": 2.5582, + "step": 28031 + }, + { + "epoch": 0.8312427719953741, + "grad_norm": 0.09755514562129974, + "learning_rate": 7.002589545168548e-05, + "loss": 2.5752, + "step": 28032 + }, + { + "epoch": 0.8312724253476855, + "grad_norm": 0.08674904704093933, + "learning_rate": 7.000188342463182e-05, + "loss": 2.5656, + "step": 28033 + }, + { + "epoch": 0.8313020786999971, + "grad_norm": 0.09199602156877518, + "learning_rate": 6.997787520528526e-05, + "loss": 2.5538, + "step": 28034 + }, + { + "epoch": 0.8313317320523085, + "grad_norm": 0.08606790751218796, + "learning_rate": 6.995387079385845e-05, + "loss": 2.5673, + "step": 28035 + }, + { + "epoch": 0.83136138540462, + "grad_norm": 0.08885196596384048, + "learning_rate": 6.992987019056396e-05, + "loss": 2.6184, + "step": 28036 + }, + { + "epoch": 0.8313910387569314, + "grad_norm": 0.08533231914043427, + "learning_rate": 6.990587339561427e-05, + "loss": 2.5784, + "step": 28037 + }, + { + "epoch": 0.831420692109243, + "grad_norm": 0.08603455871343613, + "learning_rate": 6.988188040922189e-05, + "loss": 2.5789, + "step": 28038 + }, + { + "epoch": 0.8314503454615544, + "grad_norm": 0.0850491002202034, + "learning_rate": 6.985789123159942e-05, + "loss": 2.5908, + "step": 28039 + }, + { + "epoch": 0.8314799988138659, + "grad_norm": 0.08116693794727325, + "learning_rate": 6.983390586295902e-05, + "loss": 2.5816, + "step": 28040 + }, + { + "epoch": 0.8315096521661773, + "grad_norm": 0.09118986129760742, + "learning_rate": 6.980992430351324e-05, + "loss": 2.5467, + "step": 28041 + }, + { + "epoch": 0.8315393055184889, + "grad_norm": 0.07918021827936172, + "learning_rate": 6.978594655347426e-05, + "loss": 2.5536, + "step": 28042 + }, + { + "epoch": 0.8315689588708003, + "grad_norm": 0.08864276111125946, + "learning_rate": 6.976197261305472e-05, + "loss": 2.5651, + "step": 28043 + }, + { + "epoch": 0.8315986122231118, + "grad_norm": 0.08354565501213074, + "learning_rate": 6.973800248246676e-05, + "loss": 2.599, + "step": 28044 + }, + { + "epoch": 0.8316282655754234, + "grad_norm": 0.07850521802902222, + "learning_rate": 6.971403616192262e-05, + "loss": 2.5833, + "step": 28045 + }, + { + "epoch": 0.8316579189277348, + "grad_norm": 0.08745311945676804, + "learning_rate": 6.96900736516346e-05, + "loss": 2.5934, + "step": 28046 + }, + { + "epoch": 0.8316875722800463, + "grad_norm": 0.08099254965782166, + "learning_rate": 6.96661149518148e-05, + "loss": 2.5893, + "step": 28047 + }, + { + "epoch": 0.8317172256323577, + "grad_norm": 0.08104753494262695, + "learning_rate": 6.964216006267543e-05, + "loss": 2.5966, + "step": 28048 + }, + { + "epoch": 0.8317468789846693, + "grad_norm": 0.08687502890825272, + "learning_rate": 6.961820898442861e-05, + "loss": 2.5692, + "step": 28049 + }, + { + "epoch": 0.8317765323369807, + "grad_norm": 0.08444710075855255, + "learning_rate": 6.95942617172865e-05, + "loss": 2.5881, + "step": 28050 + }, + { + "epoch": 0.8318061856892922, + "grad_norm": 0.08480257540941238, + "learning_rate": 6.957031826146098e-05, + "loss": 2.5819, + "step": 28051 + }, + { + "epoch": 0.8318358390416036, + "grad_norm": 0.08891037851572037, + "learning_rate": 6.954637861716423e-05, + "loss": 2.5535, + "step": 28052 + }, + { + "epoch": 0.8318654923939152, + "grad_norm": 0.08300633728504181, + "learning_rate": 6.952244278460812e-05, + "loss": 2.5626, + "step": 28053 + }, + { + "epoch": 0.8318951457462266, + "grad_norm": 0.08692789077758789, + "learning_rate": 6.94985107640047e-05, + "loss": 2.5931, + "step": 28054 + }, + { + "epoch": 0.8319247990985381, + "grad_norm": 0.09842634946107864, + "learning_rate": 6.947458255556576e-05, + "loss": 2.5753, + "step": 28055 + }, + { + "epoch": 0.8319544524508495, + "grad_norm": 0.0837695300579071, + "learning_rate": 6.945065815950336e-05, + "loss": 2.5856, + "step": 28056 + }, + { + "epoch": 0.8319841058031611, + "grad_norm": 0.08141642808914185, + "learning_rate": 6.94267375760293e-05, + "loss": 2.5655, + "step": 28057 + }, + { + "epoch": 0.8320137591554725, + "grad_norm": 0.09251933544874191, + "learning_rate": 6.940282080535543e-05, + "loss": 2.5995, + "step": 28058 + }, + { + "epoch": 0.832043412507784, + "grad_norm": 0.08329317718744278, + "learning_rate": 6.937890784769341e-05, + "loss": 2.5841, + "step": 28059 + }, + { + "epoch": 0.8320730658600954, + "grad_norm": 0.08631603419780731, + "learning_rate": 6.935499870325524e-05, + "loss": 2.5758, + "step": 28060 + }, + { + "epoch": 0.832102719212407, + "grad_norm": 0.08364760875701904, + "learning_rate": 6.933109337225236e-05, + "loss": 2.5817, + "step": 28061 + }, + { + "epoch": 0.8321323725647184, + "grad_norm": 0.08260109275579453, + "learning_rate": 6.930719185489659e-05, + "loss": 2.5522, + "step": 28062 + }, + { + "epoch": 0.8321620259170299, + "grad_norm": 0.08618658781051636, + "learning_rate": 6.928329415139956e-05, + "loss": 2.5944, + "step": 28063 + }, + { + "epoch": 0.8321916792693413, + "grad_norm": 0.0876789540052414, + "learning_rate": 6.925940026197287e-05, + "loss": 2.5747, + "step": 28064 + }, + { + "epoch": 0.8322213326216529, + "grad_norm": 0.07921435683965683, + "learning_rate": 6.923551018682811e-05, + "loss": 2.5704, + "step": 28065 + }, + { + "epoch": 0.8322509859739644, + "grad_norm": 0.08825889974832535, + "learning_rate": 6.921162392617686e-05, + "loss": 2.5894, + "step": 28066 + }, + { + "epoch": 0.8322806393262758, + "grad_norm": 0.08665438741445541, + "learning_rate": 6.918774148023066e-05, + "loss": 2.5825, + "step": 28067 + }, + { + "epoch": 0.8323102926785874, + "grad_norm": 0.07762694358825684, + "learning_rate": 6.916386284920078e-05, + "loss": 2.561, + "step": 28068 + }, + { + "epoch": 0.8323399460308988, + "grad_norm": 0.09226153045892715, + "learning_rate": 6.913998803329902e-05, + "loss": 2.5917, + "step": 28069 + }, + { + "epoch": 0.8323695993832103, + "grad_norm": 0.08807780593633652, + "learning_rate": 6.911611703273663e-05, + "loss": 2.5695, + "step": 28070 + }, + { + "epoch": 0.8323992527355217, + "grad_norm": 0.0832793191075325, + "learning_rate": 6.909224984772505e-05, + "loss": 2.569, + "step": 28071 + }, + { + "epoch": 0.8324289060878333, + "grad_norm": 0.09416785091161728, + "learning_rate": 6.906838647847547e-05, + "loss": 2.5805, + "step": 28072 + }, + { + "epoch": 0.8324585594401447, + "grad_norm": 0.08661843836307526, + "learning_rate": 6.904452692519925e-05, + "loss": 2.5921, + "step": 28073 + }, + { + "epoch": 0.8324882127924562, + "grad_norm": 0.0913502424955368, + "learning_rate": 6.902067118810779e-05, + "loss": 2.5653, + "step": 28074 + }, + { + "epoch": 0.8325178661447676, + "grad_norm": 0.09116146713495255, + "learning_rate": 6.899681926741219e-05, + "loss": 2.5515, + "step": 28075 + }, + { + "epoch": 0.8325475194970792, + "grad_norm": 0.09064172208309174, + "learning_rate": 6.89729711633238e-05, + "loss": 2.6064, + "step": 28076 + }, + { + "epoch": 0.8325771728493906, + "grad_norm": 0.08456344902515411, + "learning_rate": 6.89491268760537e-05, + "loss": 2.546, + "step": 28077 + }, + { + "epoch": 0.8326068262017021, + "grad_norm": 0.08633384853601456, + "learning_rate": 6.892528640581308e-05, + "loss": 2.5691, + "step": 28078 + }, + { + "epoch": 0.8326364795540135, + "grad_norm": 0.08890146017074585, + "learning_rate": 6.890144975281305e-05, + "loss": 2.5842, + "step": 28079 + }, + { + "epoch": 0.8326661329063251, + "grad_norm": 0.084981270134449, + "learning_rate": 6.887761691726468e-05, + "loss": 2.5887, + "step": 28080 + }, + { + "epoch": 0.8326957862586365, + "grad_norm": 0.08533397316932678, + "learning_rate": 6.885378789937901e-05, + "loss": 2.5975, + "step": 28081 + }, + { + "epoch": 0.832725439610948, + "grad_norm": 0.09010865539312363, + "learning_rate": 6.882996269936703e-05, + "loss": 2.5864, + "step": 28082 + }, + { + "epoch": 0.8327550929632594, + "grad_norm": 0.09215280413627625, + "learning_rate": 6.880614131743978e-05, + "loss": 2.559, + "step": 28083 + }, + { + "epoch": 0.832784746315571, + "grad_norm": 0.07792747020721436, + "learning_rate": 6.878232375380817e-05, + "loss": 2.5595, + "step": 28084 + }, + { + "epoch": 0.8328143996678824, + "grad_norm": 0.08966191112995148, + "learning_rate": 6.875851000868305e-05, + "loss": 2.5784, + "step": 28085 + }, + { + "epoch": 0.8328440530201939, + "grad_norm": 0.08690575510263443, + "learning_rate": 6.873470008227539e-05, + "loss": 2.5814, + "step": 28086 + }, + { + "epoch": 0.8328737063725055, + "grad_norm": 0.08333567529916763, + "learning_rate": 6.871089397479596e-05, + "loss": 2.594, + "step": 28087 + }, + { + "epoch": 0.8329033597248169, + "grad_norm": 0.08490603417158127, + "learning_rate": 6.868709168645559e-05, + "loss": 2.5986, + "step": 28088 + }, + { + "epoch": 0.8329330130771284, + "grad_norm": 0.08035532385110855, + "learning_rate": 6.866329321746505e-05, + "loss": 2.5638, + "step": 28089 + }, + { + "epoch": 0.8329626664294398, + "grad_norm": 0.08470301330089569, + "learning_rate": 6.863949856803509e-05, + "loss": 2.5435, + "step": 28090 + }, + { + "epoch": 0.8329923197817514, + "grad_norm": 0.08605222404003143, + "learning_rate": 6.861570773837644e-05, + "loss": 2.5499, + "step": 28091 + }, + { + "epoch": 0.8330219731340628, + "grad_norm": 0.08459612727165222, + "learning_rate": 6.859192072869974e-05, + "loss": 2.6023, + "step": 28092 + }, + { + "epoch": 0.8330516264863743, + "grad_norm": 0.08564531803131104, + "learning_rate": 6.85681375392156e-05, + "loss": 2.5691, + "step": 28093 + }, + { + "epoch": 0.8330812798386857, + "grad_norm": 0.08809974044561386, + "learning_rate": 6.854435817013472e-05, + "loss": 2.566, + "step": 28094 + }, + { + "epoch": 0.8331109331909973, + "grad_norm": 0.08675269037485123, + "learning_rate": 6.852058262166755e-05, + "loss": 2.5846, + "step": 28095 + }, + { + "epoch": 0.8331405865433087, + "grad_norm": 0.0786183699965477, + "learning_rate": 6.849681089402471e-05, + "loss": 2.6072, + "step": 28096 + }, + { + "epoch": 0.8331702398956202, + "grad_norm": 0.0862697958946228, + "learning_rate": 6.847304298741664e-05, + "loss": 2.574, + "step": 28097 + }, + { + "epoch": 0.8331998932479316, + "grad_norm": 0.09645392000675201, + "learning_rate": 6.844927890205389e-05, + "loss": 2.5423, + "step": 28098 + }, + { + "epoch": 0.8332295466002432, + "grad_norm": 0.08413038402795792, + "learning_rate": 6.842551863814678e-05, + "loss": 2.5812, + "step": 28099 + }, + { + "epoch": 0.8332591999525546, + "grad_norm": 0.08872661739587784, + "learning_rate": 6.840176219590582e-05, + "loss": 2.5639, + "step": 28100 + }, + { + "epoch": 0.8332888533048661, + "grad_norm": 0.078329898416996, + "learning_rate": 6.837800957554136e-05, + "loss": 2.5798, + "step": 28101 + }, + { + "epoch": 0.8333185066571775, + "grad_norm": 0.08215632289648056, + "learning_rate": 6.83542607772637e-05, + "loss": 2.6001, + "step": 28102 + }, + { + "epoch": 0.8333481600094891, + "grad_norm": 0.08272218704223633, + "learning_rate": 6.833051580128319e-05, + "loss": 2.551, + "step": 28103 + }, + { + "epoch": 0.8333778133618005, + "grad_norm": 0.07972736656665802, + "learning_rate": 6.830677464780999e-05, + "loss": 2.5567, + "step": 28104 + }, + { + "epoch": 0.833407466714112, + "grad_norm": 0.08553164452314377, + "learning_rate": 6.828303731705454e-05, + "loss": 2.5792, + "step": 28105 + }, + { + "epoch": 0.8334371200664236, + "grad_norm": 0.08884884417057037, + "learning_rate": 6.825930380922668e-05, + "loss": 2.5686, + "step": 28106 + }, + { + "epoch": 0.833466773418735, + "grad_norm": 0.0903572142124176, + "learning_rate": 6.82355741245369e-05, + "loss": 2.5487, + "step": 28107 + }, + { + "epoch": 0.8334964267710465, + "grad_norm": 0.09032225608825684, + "learning_rate": 6.821184826319521e-05, + "loss": 2.571, + "step": 28108 + }, + { + "epoch": 0.8335260801233579, + "grad_norm": 0.08578328043222427, + "learning_rate": 6.818812622541176e-05, + "loss": 2.5685, + "step": 28109 + }, + { + "epoch": 0.8335557334756695, + "grad_norm": 0.09386801719665527, + "learning_rate": 6.816440801139657e-05, + "loss": 2.557, + "step": 28110 + }, + { + "epoch": 0.8335853868279809, + "grad_norm": 0.09536506980657578, + "learning_rate": 6.81406936213596e-05, + "loss": 2.5896, + "step": 28111 + }, + { + "epoch": 0.8336150401802924, + "grad_norm": 0.0878828838467598, + "learning_rate": 6.811698305551095e-05, + "loss": 2.5978, + "step": 28112 + }, + { + "epoch": 0.8336446935326038, + "grad_norm": 0.08017248660326004, + "learning_rate": 6.80932763140606e-05, + "loss": 2.601, + "step": 28113 + }, + { + "epoch": 0.8336743468849154, + "grad_norm": 0.08806726336479187, + "learning_rate": 6.806957339721837e-05, + "loss": 2.5673, + "step": 28114 + }, + { + "epoch": 0.8337040002372268, + "grad_norm": 0.08199505507946014, + "learning_rate": 6.804587430519433e-05, + "loss": 2.5941, + "step": 28115 + }, + { + "epoch": 0.8337336535895383, + "grad_norm": 0.0792049914598465, + "learning_rate": 6.802217903819808e-05, + "loss": 2.5726, + "step": 28116 + }, + { + "epoch": 0.8337633069418497, + "grad_norm": 0.0877036303281784, + "learning_rate": 6.799848759643962e-05, + "loss": 2.597, + "step": 28117 + }, + { + "epoch": 0.8337929602941613, + "grad_norm": 0.08746809512376785, + "learning_rate": 6.797479998012867e-05, + "loss": 2.6073, + "step": 28118 + }, + { + "epoch": 0.8338226136464727, + "grad_norm": 0.08005336672067642, + "learning_rate": 6.795111618947497e-05, + "loss": 2.5594, + "step": 28119 + }, + { + "epoch": 0.8338522669987842, + "grad_norm": 0.08739357441663742, + "learning_rate": 6.792743622468833e-05, + "loss": 2.579, + "step": 28120 + }, + { + "epoch": 0.8338819203510957, + "grad_norm": 0.08662452548742294, + "learning_rate": 6.790376008597848e-05, + "loss": 2.5871, + "step": 28121 + }, + { + "epoch": 0.8339115737034072, + "grad_norm": 0.08785951882600784, + "learning_rate": 6.788008777355498e-05, + "loss": 2.5863, + "step": 28122 + }, + { + "epoch": 0.8339412270557186, + "grad_norm": 0.08609837293624878, + "learning_rate": 6.785641928762743e-05, + "loss": 2.5809, + "step": 28123 + }, + { + "epoch": 0.8339708804080301, + "grad_norm": 0.08698506653308868, + "learning_rate": 6.78327546284055e-05, + "loss": 2.5932, + "step": 28124 + }, + { + "epoch": 0.8340005337603416, + "grad_norm": 0.08086960017681122, + "learning_rate": 6.780909379609874e-05, + "loss": 2.5516, + "step": 28125 + }, + { + "epoch": 0.8340301871126531, + "grad_norm": 0.08956049382686615, + "learning_rate": 6.77854367909167e-05, + "loss": 2.5823, + "step": 28126 + }, + { + "epoch": 0.8340598404649646, + "grad_norm": 0.08692233264446259, + "learning_rate": 6.776178361306872e-05, + "loss": 2.5787, + "step": 28127 + }, + { + "epoch": 0.834089493817276, + "grad_norm": 0.09333250671625137, + "learning_rate": 6.773813426276431e-05, + "loss": 2.5843, + "step": 28128 + }, + { + "epoch": 0.8341191471695876, + "grad_norm": 0.09091667830944061, + "learning_rate": 6.771448874021297e-05, + "loss": 2.5373, + "step": 28129 + }, + { + "epoch": 0.834148800521899, + "grad_norm": 0.09401652216911316, + "learning_rate": 6.7690847045624e-05, + "loss": 2.5683, + "step": 28130 + }, + { + "epoch": 0.8341784538742105, + "grad_norm": 0.0916668176651001, + "learning_rate": 6.766720917920677e-05, + "loss": 2.5843, + "step": 28131 + }, + { + "epoch": 0.8342081072265219, + "grad_norm": 0.10036607831716537, + "learning_rate": 6.764357514117053e-05, + "loss": 2.5766, + "step": 28132 + }, + { + "epoch": 0.8342377605788335, + "grad_norm": 0.08970798552036285, + "learning_rate": 6.761994493172474e-05, + "loss": 2.5951, + "step": 28133 + }, + { + "epoch": 0.8342674139311449, + "grad_norm": 0.10919862240552902, + "learning_rate": 6.759631855107856e-05, + "loss": 2.5977, + "step": 28134 + }, + { + "epoch": 0.8342970672834564, + "grad_norm": 0.09000794589519501, + "learning_rate": 6.757269599944115e-05, + "loss": 2.5756, + "step": 28135 + }, + { + "epoch": 0.8343267206357678, + "grad_norm": 0.09243784099817276, + "learning_rate": 6.754907727702193e-05, + "loss": 2.5834, + "step": 28136 + }, + { + "epoch": 0.8343563739880794, + "grad_norm": 0.09912414103746414, + "learning_rate": 6.752546238402973e-05, + "loss": 2.5932, + "step": 28137 + }, + { + "epoch": 0.8343860273403908, + "grad_norm": 0.10279419273138046, + "learning_rate": 6.750185132067376e-05, + "loss": 2.5474, + "step": 28138 + }, + { + "epoch": 0.8344156806927023, + "grad_norm": 0.10008231550455093, + "learning_rate": 6.747824408716318e-05, + "loss": 2.5689, + "step": 28139 + }, + { + "epoch": 0.8344453340450138, + "grad_norm": 0.09841911494731903, + "learning_rate": 6.745464068370694e-05, + "loss": 2.5927, + "step": 28140 + }, + { + "epoch": 0.8344749873973253, + "grad_norm": 0.09337859600782394, + "learning_rate": 6.743104111051412e-05, + "loss": 2.6032, + "step": 28141 + }, + { + "epoch": 0.8345046407496367, + "grad_norm": 0.10128942877054214, + "learning_rate": 6.74074453677937e-05, + "loss": 2.5713, + "step": 28142 + }, + { + "epoch": 0.8345342941019482, + "grad_norm": 0.08364144712686539, + "learning_rate": 6.73838534557546e-05, + "loss": 2.6029, + "step": 28143 + }, + { + "epoch": 0.8345639474542597, + "grad_norm": 0.08896845579147339, + "learning_rate": 6.736026537460577e-05, + "loss": 2.5601, + "step": 28144 + }, + { + "epoch": 0.8345936008065712, + "grad_norm": 0.09371450543403625, + "learning_rate": 6.733668112455588e-05, + "loss": 2.5796, + "step": 28145 + }, + { + "epoch": 0.8346232541588826, + "grad_norm": 0.08740729838609695, + "learning_rate": 6.731310070581409e-05, + "loss": 2.5599, + "step": 28146 + }, + { + "epoch": 0.8346529075111941, + "grad_norm": 0.08552850037813187, + "learning_rate": 6.728952411858913e-05, + "loss": 2.5417, + "step": 28147 + }, + { + "epoch": 0.8346825608635057, + "grad_norm": 0.09152456372976303, + "learning_rate": 6.726595136308967e-05, + "loss": 2.5863, + "step": 28148 + }, + { + "epoch": 0.8347122142158171, + "grad_norm": 0.094442218542099, + "learning_rate": 6.72423824395244e-05, + "loss": 2.587, + "step": 28149 + }, + { + "epoch": 0.8347418675681286, + "grad_norm": 0.08301584422588348, + "learning_rate": 6.72188173481022e-05, + "loss": 2.5697, + "step": 28150 + }, + { + "epoch": 0.83477152092044, + "grad_norm": 0.09021838754415512, + "learning_rate": 6.71952560890316e-05, + "loss": 2.5706, + "step": 28151 + }, + { + "epoch": 0.8348011742727516, + "grad_norm": 0.08892308920621872, + "learning_rate": 6.717169866252132e-05, + "loss": 2.5656, + "step": 28152 + }, + { + "epoch": 0.834830827625063, + "grad_norm": 0.08202328532934189, + "learning_rate": 6.714814506877992e-05, + "loss": 2.5875, + "step": 28153 + }, + { + "epoch": 0.8348604809773745, + "grad_norm": 0.09147094935178757, + "learning_rate": 6.712459530801601e-05, + "loss": 2.5648, + "step": 28154 + }, + { + "epoch": 0.834890134329686, + "grad_norm": 0.08570807427167892, + "learning_rate": 6.710104938043815e-05, + "loss": 2.5574, + "step": 28155 + }, + { + "epoch": 0.8349197876819975, + "grad_norm": 0.0831325501203537, + "learning_rate": 6.707750728625472e-05, + "loss": 2.5832, + "step": 28156 + }, + { + "epoch": 0.8349494410343089, + "grad_norm": 0.08272448182106018, + "learning_rate": 6.705396902567434e-05, + "loss": 2.574, + "step": 28157 + }, + { + "epoch": 0.8349790943866204, + "grad_norm": 0.08568481355905533, + "learning_rate": 6.703043459890534e-05, + "loss": 2.5954, + "step": 28158 + }, + { + "epoch": 0.8350087477389319, + "grad_norm": 0.08986279368400574, + "learning_rate": 6.700690400615622e-05, + "loss": 2.5758, + "step": 28159 + }, + { + "epoch": 0.8350384010912434, + "grad_norm": 0.08826850354671478, + "learning_rate": 6.698337724763521e-05, + "loss": 2.5812, + "step": 28160 + }, + { + "epoch": 0.8350680544435548, + "grad_norm": 0.08165346086025238, + "learning_rate": 6.695985432355078e-05, + "loss": 2.5744, + "step": 28161 + }, + { + "epoch": 0.8350977077958663, + "grad_norm": 0.0857396349310875, + "learning_rate": 6.693633523411114e-05, + "loss": 2.5904, + "step": 28162 + }, + { + "epoch": 0.8351273611481778, + "grad_norm": 0.08607097715139389, + "learning_rate": 6.691281997952459e-05, + "loss": 2.5811, + "step": 28163 + }, + { + "epoch": 0.8351570145004893, + "grad_norm": 0.08171077072620392, + "learning_rate": 6.688930855999936e-05, + "loss": 2.5474, + "step": 28164 + }, + { + "epoch": 0.8351866678528007, + "grad_norm": 0.08103806525468826, + "learning_rate": 6.686580097574363e-05, + "loss": 2.5482, + "step": 28165 + }, + { + "epoch": 0.8352163212051122, + "grad_norm": 0.08742980659008026, + "learning_rate": 6.684229722696561e-05, + "loss": 2.5831, + "step": 28166 + }, + { + "epoch": 0.8352459745574237, + "grad_norm": 0.08524258434772491, + "learning_rate": 6.681879731387341e-05, + "loss": 2.5786, + "step": 28167 + }, + { + "epoch": 0.8352756279097352, + "grad_norm": 0.07710926234722137, + "learning_rate": 6.679530123667505e-05, + "loss": 2.5941, + "step": 28168 + }, + { + "epoch": 0.8353052812620467, + "grad_norm": 0.08982416987419128, + "learning_rate": 6.677180899557873e-05, + "loss": 2.596, + "step": 28169 + }, + { + "epoch": 0.8353349346143581, + "grad_norm": 0.0782233327627182, + "learning_rate": 6.674832059079244e-05, + "loss": 2.5599, + "step": 28170 + }, + { + "epoch": 0.8353645879666697, + "grad_norm": 0.08194559067487717, + "learning_rate": 6.672483602252399e-05, + "loss": 2.5875, + "step": 28171 + }, + { + "epoch": 0.8353942413189811, + "grad_norm": 0.09381401538848877, + "learning_rate": 6.670135529098154e-05, + "loss": 2.5877, + "step": 28172 + }, + { + "epoch": 0.8354238946712926, + "grad_norm": 0.08633232116699219, + "learning_rate": 6.667787839637301e-05, + "loss": 2.5588, + "step": 28173 + }, + { + "epoch": 0.835453548023604, + "grad_norm": 0.08092770725488663, + "learning_rate": 6.665440533890621e-05, + "loss": 2.5836, + "step": 28174 + }, + { + "epoch": 0.8354832013759156, + "grad_norm": 0.08752576261758804, + "learning_rate": 6.663093611878907e-05, + "loss": 2.5605, + "step": 28175 + }, + { + "epoch": 0.835512854728227, + "grad_norm": 0.08791568130254745, + "learning_rate": 6.66074707362293e-05, + "loss": 2.5528, + "step": 28176 + }, + { + "epoch": 0.8355425080805385, + "grad_norm": 0.09054961055517197, + "learning_rate": 6.658400919143487e-05, + "loss": 2.5644, + "step": 28177 + }, + { + "epoch": 0.83557216143285, + "grad_norm": 0.08101942390203476, + "learning_rate": 6.656055148461337e-05, + "loss": 2.5642, + "step": 28178 + }, + { + "epoch": 0.8356018147851615, + "grad_norm": 0.09842705726623535, + "learning_rate": 6.65370976159726e-05, + "loss": 2.5852, + "step": 28179 + }, + { + "epoch": 0.8356314681374729, + "grad_norm": 0.09219670295715332, + "learning_rate": 6.65136475857202e-05, + "loss": 2.6015, + "step": 28180 + }, + { + "epoch": 0.8356611214897844, + "grad_norm": 0.07950355112552643, + "learning_rate": 6.649020139406403e-05, + "loss": 2.5673, + "step": 28181 + }, + { + "epoch": 0.8356907748420959, + "grad_norm": 0.10475268959999084, + "learning_rate": 6.64667590412114e-05, + "loss": 2.5732, + "step": 28182 + }, + { + "epoch": 0.8357204281944074, + "grad_norm": 0.0935843363404274, + "learning_rate": 6.644332052737001e-05, + "loss": 2.5438, + "step": 28183 + }, + { + "epoch": 0.8357500815467188, + "grad_norm": 0.09242619574069977, + "learning_rate": 6.641988585274739e-05, + "loss": 2.5641, + "step": 28184 + }, + { + "epoch": 0.8357797348990303, + "grad_norm": 0.09135347604751587, + "learning_rate": 6.639645501755115e-05, + "loss": 2.5906, + "step": 28185 + }, + { + "epoch": 0.8358093882513418, + "grad_norm": 0.09452522546052933, + "learning_rate": 6.637302802198875e-05, + "loss": 2.5758, + "step": 28186 + }, + { + "epoch": 0.8358390416036533, + "grad_norm": 0.09061684459447861, + "learning_rate": 6.634960486626763e-05, + "loss": 2.5869, + "step": 28187 + }, + { + "epoch": 0.8358686949559647, + "grad_norm": 0.08699920773506165, + "learning_rate": 6.632618555059517e-05, + "loss": 2.5806, + "step": 28188 + }, + { + "epoch": 0.8358983483082763, + "grad_norm": 0.08703655004501343, + "learning_rate": 6.630277007517876e-05, + "loss": 2.5843, + "step": 28189 + }, + { + "epoch": 0.8359280016605878, + "grad_norm": 0.08012212067842484, + "learning_rate": 6.627935844022582e-05, + "loss": 2.5817, + "step": 28190 + }, + { + "epoch": 0.8359576550128992, + "grad_norm": 0.08172620087862015, + "learning_rate": 6.62559506459437e-05, + "loss": 2.5576, + "step": 28191 + }, + { + "epoch": 0.8359873083652107, + "grad_norm": 0.08571205288171768, + "learning_rate": 6.623254669253948e-05, + "loss": 2.5754, + "step": 28192 + }, + { + "epoch": 0.8360169617175222, + "grad_norm": 0.08448788523674011, + "learning_rate": 6.620914658022048e-05, + "loss": 2.5785, + "step": 28193 + }, + { + "epoch": 0.8360466150698337, + "grad_norm": 0.08309104293584824, + "learning_rate": 6.6185750309194e-05, + "loss": 2.6093, + "step": 28194 + }, + { + "epoch": 0.8360762684221451, + "grad_norm": 0.08643652498722076, + "learning_rate": 6.616235787966713e-05, + "loss": 2.5504, + "step": 28195 + }, + { + "epoch": 0.8361059217744566, + "grad_norm": 0.095448337495327, + "learning_rate": 6.613896929184705e-05, + "loss": 2.5827, + "step": 28196 + }, + { + "epoch": 0.8361355751267681, + "grad_norm": 0.08438442647457123, + "learning_rate": 6.611558454594074e-05, + "loss": 2.5447, + "step": 28197 + }, + { + "epoch": 0.8361652284790796, + "grad_norm": 0.08402369916439056, + "learning_rate": 6.609220364215551e-05, + "loss": 2.5729, + "step": 28198 + }, + { + "epoch": 0.836194881831391, + "grad_norm": 0.0909358561038971, + "learning_rate": 6.606882658069834e-05, + "loss": 2.5937, + "step": 28199 + }, + { + "epoch": 0.8362245351837025, + "grad_norm": 0.08096689730882645, + "learning_rate": 6.60454533617762e-05, + "loss": 2.5542, + "step": 28200 + }, + { + "epoch": 0.836254188536014, + "grad_norm": 0.0891014114022255, + "learning_rate": 6.602208398559601e-05, + "loss": 2.5395, + "step": 28201 + }, + { + "epoch": 0.8362838418883255, + "grad_norm": 0.09153693914413452, + "learning_rate": 6.599871845236488e-05, + "loss": 2.5866, + "step": 28202 + }, + { + "epoch": 0.8363134952406369, + "grad_norm": 0.08762066066265106, + "learning_rate": 6.59753567622895e-05, + "loss": 2.6033, + "step": 28203 + }, + { + "epoch": 0.8363431485929484, + "grad_norm": 0.08305475115776062, + "learning_rate": 6.595199891557685e-05, + "loss": 2.599, + "step": 28204 + }, + { + "epoch": 0.8363728019452599, + "grad_norm": 0.09064101427793503, + "learning_rate": 6.592864491243373e-05, + "loss": 2.5798, + "step": 28205 + }, + { + "epoch": 0.8364024552975714, + "grad_norm": 0.0827520415186882, + "learning_rate": 6.590529475306695e-05, + "loss": 2.5808, + "step": 28206 + }, + { + "epoch": 0.8364321086498828, + "grad_norm": 0.08358825743198395, + "learning_rate": 6.58819484376833e-05, + "loss": 2.5549, + "step": 28207 + }, + { + "epoch": 0.8364617620021944, + "grad_norm": 0.0853334590792656, + "learning_rate": 6.585860596648952e-05, + "loss": 2.5493, + "step": 28208 + }, + { + "epoch": 0.8364914153545058, + "grad_norm": 0.07719921320676804, + "learning_rate": 6.58352673396923e-05, + "loss": 2.5472, + "step": 28209 + }, + { + "epoch": 0.8365210687068173, + "grad_norm": 0.08649826049804688, + "learning_rate": 6.581193255749823e-05, + "loss": 2.5709, + "step": 28210 + }, + { + "epoch": 0.8365507220591288, + "grad_norm": 0.0843380019068718, + "learning_rate": 6.578860162011413e-05, + "loss": 2.575, + "step": 28211 + }, + { + "epoch": 0.8365803754114403, + "grad_norm": 0.08573298156261444, + "learning_rate": 6.576527452774656e-05, + "loss": 2.5831, + "step": 28212 + }, + { + "epoch": 0.8366100287637518, + "grad_norm": 0.08355584740638733, + "learning_rate": 6.574195128060196e-05, + "loss": 2.5781, + "step": 28213 + }, + { + "epoch": 0.8366396821160632, + "grad_norm": 0.089451365172863, + "learning_rate": 6.571863187888688e-05, + "loss": 2.5899, + "step": 28214 + }, + { + "epoch": 0.8366693354683747, + "grad_norm": 0.0874982625246048, + "learning_rate": 6.56953163228079e-05, + "loss": 2.6018, + "step": 28215 + }, + { + "epoch": 0.8366989888206862, + "grad_norm": 0.0910634845495224, + "learning_rate": 6.567200461257145e-05, + "loss": 2.605, + "step": 28216 + }, + { + "epoch": 0.8367286421729977, + "grad_norm": 0.08569380640983582, + "learning_rate": 6.564869674838386e-05, + "loss": 2.5643, + "step": 28217 + }, + { + "epoch": 0.8367582955253091, + "grad_norm": 0.08953367918729782, + "learning_rate": 6.562539273045171e-05, + "loss": 2.5383, + "step": 28218 + }, + { + "epoch": 0.8367879488776206, + "grad_norm": 0.08602938055992126, + "learning_rate": 6.560209255898126e-05, + "loss": 2.5621, + "step": 28219 + }, + { + "epoch": 0.8368176022299321, + "grad_norm": 0.08977963030338287, + "learning_rate": 6.557879623417878e-05, + "loss": 2.589, + "step": 28220 + }, + { + "epoch": 0.8368472555822436, + "grad_norm": 0.09259883314371109, + "learning_rate": 6.55555037562507e-05, + "loss": 2.577, + "step": 28221 + }, + { + "epoch": 0.836876908934555, + "grad_norm": 0.08353988826274872, + "learning_rate": 6.553221512540314e-05, + "loss": 2.5362, + "step": 28222 + }, + { + "epoch": 0.8369065622868666, + "grad_norm": 0.08958012610673904, + "learning_rate": 6.550893034184241e-05, + "loss": 2.6031, + "step": 28223 + }, + { + "epoch": 0.836936215639178, + "grad_norm": 0.08943081647157669, + "learning_rate": 6.548564940577467e-05, + "loss": 2.5677, + "step": 28224 + }, + { + "epoch": 0.8369658689914895, + "grad_norm": 0.0921509712934494, + "learning_rate": 6.546237231740614e-05, + "loss": 2.5897, + "step": 28225 + }, + { + "epoch": 0.8369955223438009, + "grad_norm": 0.09057758748531342, + "learning_rate": 6.543909907694284e-05, + "loss": 2.623, + "step": 28226 + }, + { + "epoch": 0.8370251756961125, + "grad_norm": 0.09127192199230194, + "learning_rate": 6.541582968459092e-05, + "loss": 2.5756, + "step": 28227 + }, + { + "epoch": 0.8370548290484239, + "grad_norm": 0.08492661267518997, + "learning_rate": 6.539256414055644e-05, + "loss": 2.5709, + "step": 28228 + }, + { + "epoch": 0.8370844824007354, + "grad_norm": 0.08468812704086304, + "learning_rate": 6.536930244504541e-05, + "loss": 2.5694, + "step": 28229 + }, + { + "epoch": 0.8371141357530468, + "grad_norm": 0.08768823742866516, + "learning_rate": 6.534604459826377e-05, + "loss": 2.5972, + "step": 28230 + }, + { + "epoch": 0.8371437891053584, + "grad_norm": 0.08742944896221161, + "learning_rate": 6.532279060041757e-05, + "loss": 2.5906, + "step": 28231 + }, + { + "epoch": 0.8371734424576699, + "grad_norm": 0.07968270778656006, + "learning_rate": 6.52995404517126e-05, + "loss": 2.555, + "step": 28232 + }, + { + "epoch": 0.8372030958099813, + "grad_norm": 0.08304207026958466, + "learning_rate": 6.527629415235486e-05, + "loss": 2.5588, + "step": 28233 + }, + { + "epoch": 0.8372327491622928, + "grad_norm": 0.09143564105033875, + "learning_rate": 6.52530517025502e-05, + "loss": 2.5746, + "step": 28234 + }, + { + "epoch": 0.8372624025146043, + "grad_norm": 0.08281396329402924, + "learning_rate": 6.522981310250431e-05, + "loss": 2.5996, + "step": 28235 + }, + { + "epoch": 0.8372920558669158, + "grad_norm": 0.08369418978691101, + "learning_rate": 6.520657835242311e-05, + "loss": 2.5703, + "step": 28236 + }, + { + "epoch": 0.8373217092192272, + "grad_norm": 0.100026935338974, + "learning_rate": 6.518334745251225e-05, + "loss": 2.5468, + "step": 28237 + }, + { + "epoch": 0.8373513625715387, + "grad_norm": 0.09128013998270035, + "learning_rate": 6.516012040297747e-05, + "loss": 2.5644, + "step": 28238 + }, + { + "epoch": 0.8373810159238502, + "grad_norm": 0.08674237132072449, + "learning_rate": 6.513689720402449e-05, + "loss": 2.5923, + "step": 28239 + }, + { + "epoch": 0.8374106692761617, + "grad_norm": 0.08672087639570236, + "learning_rate": 6.511367785585893e-05, + "loss": 2.6075, + "step": 28240 + }, + { + "epoch": 0.8374403226284731, + "grad_norm": 0.08750177919864655, + "learning_rate": 6.509046235868637e-05, + "loss": 2.5335, + "step": 28241 + }, + { + "epoch": 0.8374699759807847, + "grad_norm": 0.09034282714128494, + "learning_rate": 6.506725071271246e-05, + "loss": 2.5647, + "step": 28242 + }, + { + "epoch": 0.8374996293330961, + "grad_norm": 0.0839424803853035, + "learning_rate": 6.50440429181427e-05, + "loss": 2.5534, + "step": 28243 + }, + { + "epoch": 0.8375292826854076, + "grad_norm": 0.08655789494514465, + "learning_rate": 6.502083897518258e-05, + "loss": 2.5465, + "step": 28244 + }, + { + "epoch": 0.837558936037719, + "grad_norm": 0.08626234531402588, + "learning_rate": 6.49976388840376e-05, + "loss": 2.5651, + "step": 28245 + }, + { + "epoch": 0.8375885893900306, + "grad_norm": 0.07973749190568924, + "learning_rate": 6.49744426449132e-05, + "loss": 2.5328, + "step": 28246 + }, + { + "epoch": 0.837618242742342, + "grad_norm": 0.08712437748908997, + "learning_rate": 6.495125025801485e-05, + "loss": 2.6092, + "step": 28247 + }, + { + "epoch": 0.8376478960946535, + "grad_norm": 0.08400695025920868, + "learning_rate": 6.492806172354782e-05, + "loss": 2.5375, + "step": 28248 + }, + { + "epoch": 0.8376775494469649, + "grad_norm": 0.0847630724310875, + "learning_rate": 6.490487704171733e-05, + "loss": 2.5532, + "step": 28249 + }, + { + "epoch": 0.8377072027992765, + "grad_norm": 0.08086942881345749, + "learning_rate": 6.488169621272894e-05, + "loss": 2.5516, + "step": 28250 + }, + { + "epoch": 0.8377368561515879, + "grad_norm": 0.08037783950567245, + "learning_rate": 6.485851923678781e-05, + "loss": 2.5886, + "step": 28251 + }, + { + "epoch": 0.8377665095038994, + "grad_norm": 0.0817384123802185, + "learning_rate": 6.483534611409919e-05, + "loss": 2.556, + "step": 28252 + }, + { + "epoch": 0.837796162856211, + "grad_norm": 0.09073010087013245, + "learning_rate": 6.481217684486829e-05, + "loss": 2.5537, + "step": 28253 + }, + { + "epoch": 0.8378258162085224, + "grad_norm": 0.08118504285812378, + "learning_rate": 6.478901142930027e-05, + "loss": 2.5422, + "step": 28254 + }, + { + "epoch": 0.8378554695608339, + "grad_norm": 0.08987068384885788, + "learning_rate": 6.47658498676002e-05, + "loss": 2.5471, + "step": 28255 + }, + { + "epoch": 0.8378851229131453, + "grad_norm": 0.09298075735569, + "learning_rate": 6.474269215997331e-05, + "loss": 2.5543, + "step": 28256 + }, + { + "epoch": 0.8379147762654569, + "grad_norm": 0.08758634328842163, + "learning_rate": 6.471953830662463e-05, + "loss": 2.6034, + "step": 28257 + }, + { + "epoch": 0.8379444296177683, + "grad_norm": 0.09600469470024109, + "learning_rate": 6.46963883077591e-05, + "loss": 2.6015, + "step": 28258 + }, + { + "epoch": 0.8379740829700798, + "grad_norm": 0.09363792091608047, + "learning_rate": 6.467324216358179e-05, + "loss": 2.5714, + "step": 28259 + }, + { + "epoch": 0.8380037363223912, + "grad_norm": 0.08170933276414871, + "learning_rate": 6.465009987429759e-05, + "loss": 2.5677, + "step": 28260 + }, + { + "epoch": 0.8380333896747028, + "grad_norm": 0.09444486349821091, + "learning_rate": 6.462696144011149e-05, + "loss": 2.6046, + "step": 28261 + }, + { + "epoch": 0.8380630430270142, + "grad_norm": 0.09053555130958557, + "learning_rate": 6.460382686122828e-05, + "loss": 2.5522, + "step": 28262 + }, + { + "epoch": 0.8380926963793257, + "grad_norm": 0.08507605642080307, + "learning_rate": 6.4580696137853e-05, + "loss": 2.5695, + "step": 28263 + }, + { + "epoch": 0.8381223497316371, + "grad_norm": 0.09316878765821457, + "learning_rate": 6.455756927019046e-05, + "loss": 2.5854, + "step": 28264 + }, + { + "epoch": 0.8381520030839487, + "grad_norm": 0.09006728231906891, + "learning_rate": 6.453444625844535e-05, + "loss": 2.5567, + "step": 28265 + }, + { + "epoch": 0.8381816564362601, + "grad_norm": 0.08983416110277176, + "learning_rate": 6.451132710282243e-05, + "loss": 2.5465, + "step": 28266 + }, + { + "epoch": 0.8382113097885716, + "grad_norm": 0.09338393807411194, + "learning_rate": 6.448821180352659e-05, + "loss": 2.6122, + "step": 28267 + }, + { + "epoch": 0.838240963140883, + "grad_norm": 0.08233395218849182, + "learning_rate": 6.44651003607623e-05, + "loss": 2.5693, + "step": 28268 + }, + { + "epoch": 0.8382706164931946, + "grad_norm": 0.08724720031023026, + "learning_rate": 6.444199277473428e-05, + "loss": 2.5747, + "step": 28269 + }, + { + "epoch": 0.838300269845506, + "grad_norm": 0.08916129916906357, + "learning_rate": 6.441888904564725e-05, + "loss": 2.5853, + "step": 28270 + }, + { + "epoch": 0.8383299231978175, + "grad_norm": 0.08616311103105545, + "learning_rate": 6.439578917370564e-05, + "loss": 2.5901, + "step": 28271 + }, + { + "epoch": 0.8383595765501289, + "grad_norm": 0.08541858941316605, + "learning_rate": 6.437269315911409e-05, + "loss": 2.6017, + "step": 28272 + }, + { + "epoch": 0.8383892299024405, + "grad_norm": 0.08739384263753891, + "learning_rate": 6.434960100207716e-05, + "loss": 2.5723, + "step": 28273 + }, + { + "epoch": 0.838418883254752, + "grad_norm": 0.08568232506513596, + "learning_rate": 6.432651270279926e-05, + "loss": 2.5706, + "step": 28274 + }, + { + "epoch": 0.8384485366070634, + "grad_norm": 0.09401015937328339, + "learning_rate": 6.430342826148477e-05, + "loss": 2.571, + "step": 28275 + }, + { + "epoch": 0.838478189959375, + "grad_norm": 0.08531001210212708, + "learning_rate": 6.428034767833835e-05, + "loss": 2.5866, + "step": 28276 + }, + { + "epoch": 0.8385078433116864, + "grad_norm": 0.08546020835638046, + "learning_rate": 6.425727095356421e-05, + "loss": 2.6089, + "step": 28277 + }, + { + "epoch": 0.8385374966639979, + "grad_norm": 0.08726714551448822, + "learning_rate": 6.42341980873668e-05, + "loss": 2.5699, + "step": 28278 + }, + { + "epoch": 0.8385671500163093, + "grad_norm": 0.08446525037288666, + "learning_rate": 6.42111290799503e-05, + "loss": 2.6224, + "step": 28279 + }, + { + "epoch": 0.8385968033686209, + "grad_norm": 0.08527717739343643, + "learning_rate": 6.41880639315191e-05, + "loss": 2.6082, + "step": 28280 + }, + { + "epoch": 0.8386264567209323, + "grad_norm": 0.08504809439182281, + "learning_rate": 6.416500264227731e-05, + "loss": 2.6099, + "step": 28281 + }, + { + "epoch": 0.8386561100732438, + "grad_norm": 0.08301106095314026, + "learning_rate": 6.414194521242928e-05, + "loss": 2.5516, + "step": 28282 + }, + { + "epoch": 0.8386857634255552, + "grad_norm": 0.08518339693546295, + "learning_rate": 6.411889164217916e-05, + "loss": 2.57, + "step": 28283 + }, + { + "epoch": 0.8387154167778668, + "grad_norm": 0.08653383702039719, + "learning_rate": 6.409584193173101e-05, + "loss": 2.6237, + "step": 28284 + }, + { + "epoch": 0.8387450701301782, + "grad_norm": 0.08728321641683578, + "learning_rate": 6.407279608128907e-05, + "loss": 2.5851, + "step": 28285 + }, + { + "epoch": 0.8387747234824897, + "grad_norm": 0.09239019453525543, + "learning_rate": 6.40497540910573e-05, + "loss": 2.5745, + "step": 28286 + }, + { + "epoch": 0.8388043768348011, + "grad_norm": 0.08307912200689316, + "learning_rate": 6.402671596123972e-05, + "loss": 2.5502, + "step": 28287 + }, + { + "epoch": 0.8388340301871127, + "grad_norm": 0.08583837747573853, + "learning_rate": 6.400368169204057e-05, + "loss": 2.5636, + "step": 28288 + }, + { + "epoch": 0.8388636835394241, + "grad_norm": 0.09098780155181885, + "learning_rate": 6.398065128366359e-05, + "loss": 2.6057, + "step": 28289 + }, + { + "epoch": 0.8388933368917356, + "grad_norm": 0.09299568831920624, + "learning_rate": 6.395762473631273e-05, + "loss": 2.5949, + "step": 28290 + }, + { + "epoch": 0.838922990244047, + "grad_norm": 0.09243100136518478, + "learning_rate": 6.393460205019202e-05, + "loss": 2.5752, + "step": 28291 + }, + { + "epoch": 0.8389526435963586, + "grad_norm": 0.09678072482347488, + "learning_rate": 6.391158322550522e-05, + "loss": 2.5783, + "step": 28292 + }, + { + "epoch": 0.83898229694867, + "grad_norm": 0.08523111790418625, + "learning_rate": 6.38885682624562e-05, + "loss": 2.5449, + "step": 28293 + }, + { + "epoch": 0.8390119503009815, + "grad_norm": 0.08497105538845062, + "learning_rate": 6.386555716124875e-05, + "loss": 2.5943, + "step": 28294 + }, + { + "epoch": 0.839041603653293, + "grad_norm": 0.10086248815059662, + "learning_rate": 6.384254992208671e-05, + "loss": 2.5534, + "step": 28295 + }, + { + "epoch": 0.8390712570056045, + "grad_norm": 0.08551565557718277, + "learning_rate": 6.38195465451737e-05, + "loss": 2.5811, + "step": 28296 + }, + { + "epoch": 0.839100910357916, + "grad_norm": 0.09697365760803223, + "learning_rate": 6.379654703071353e-05, + "loss": 2.6222, + "step": 28297 + }, + { + "epoch": 0.8391305637102274, + "grad_norm": 0.08277241140604019, + "learning_rate": 6.377355137890972e-05, + "loss": 2.6196, + "step": 28298 + }, + { + "epoch": 0.839160217062539, + "grad_norm": 0.09582895785570145, + "learning_rate": 6.375055958996606e-05, + "loss": 2.5737, + "step": 28299 + }, + { + "epoch": 0.8391898704148504, + "grad_norm": 0.08668742328882217, + "learning_rate": 6.372757166408605e-05, + "loss": 2.5967, + "step": 28300 + }, + { + "epoch": 0.8392195237671619, + "grad_norm": 0.0805424377322197, + "learning_rate": 6.37045876014733e-05, + "loss": 2.5794, + "step": 28301 + }, + { + "epoch": 0.8392491771194733, + "grad_norm": 0.08559300750494003, + "learning_rate": 6.368160740233132e-05, + "loss": 2.5838, + "step": 28302 + }, + { + "epoch": 0.8392788304717849, + "grad_norm": 0.08545216917991638, + "learning_rate": 6.36586310668636e-05, + "loss": 2.5821, + "step": 28303 + }, + { + "epoch": 0.8393084838240963, + "grad_norm": 0.08723627030849457, + "learning_rate": 6.36356585952736e-05, + "loss": 2.6137, + "step": 28304 + }, + { + "epoch": 0.8393381371764078, + "grad_norm": 0.08987759053707123, + "learning_rate": 6.36126899877647e-05, + "loss": 2.5757, + "step": 28305 + }, + { + "epoch": 0.8393677905287192, + "grad_norm": 0.09031027555465698, + "learning_rate": 6.358972524454037e-05, + "loss": 2.6119, + "step": 28306 + }, + { + "epoch": 0.8393974438810308, + "grad_norm": 0.08124465495347977, + "learning_rate": 6.356676436580394e-05, + "loss": 2.572, + "step": 28307 + }, + { + "epoch": 0.8394270972333422, + "grad_norm": 0.1019158810377121, + "learning_rate": 6.354380735175869e-05, + "loss": 2.5926, + "step": 28308 + }, + { + "epoch": 0.8394567505856537, + "grad_norm": 0.08809412270784378, + "learning_rate": 6.352085420260794e-05, + "loss": 2.5939, + "step": 28309 + }, + { + "epoch": 0.8394864039379651, + "grad_norm": 0.09015409648418427, + "learning_rate": 6.349790491855501e-05, + "loss": 2.5699, + "step": 28310 + }, + { + "epoch": 0.8395160572902767, + "grad_norm": 0.09240912646055222, + "learning_rate": 6.347495949980297e-05, + "loss": 2.5682, + "step": 28311 + }, + { + "epoch": 0.8395457106425881, + "grad_norm": 0.08790267258882523, + "learning_rate": 6.345201794655525e-05, + "loss": 2.5646, + "step": 28312 + }, + { + "epoch": 0.8395753639948996, + "grad_norm": 0.09303601086139679, + "learning_rate": 6.342908025901461e-05, + "loss": 2.5598, + "step": 28313 + }, + { + "epoch": 0.839605017347211, + "grad_norm": 0.08630770444869995, + "learning_rate": 6.340614643738457e-05, + "loss": 2.5449, + "step": 28314 + }, + { + "epoch": 0.8396346706995226, + "grad_norm": 0.08585318177938461, + "learning_rate": 6.338321648186795e-05, + "loss": 2.5943, + "step": 28315 + }, + { + "epoch": 0.8396643240518341, + "grad_norm": 0.08018989115953445, + "learning_rate": 6.336029039266794e-05, + "loss": 2.576, + "step": 28316 + }, + { + "epoch": 0.8396939774041455, + "grad_norm": 0.09125994890928268, + "learning_rate": 6.333736816998753e-05, + "loss": 2.5774, + "step": 28317 + }, + { + "epoch": 0.8397236307564571, + "grad_norm": 0.08751948177814484, + "learning_rate": 6.331444981402968e-05, + "loss": 2.5662, + "step": 28318 + }, + { + "epoch": 0.8397532841087685, + "grad_norm": 0.09076345711946487, + "learning_rate": 6.329153532499726e-05, + "loss": 2.5965, + "step": 28319 + }, + { + "epoch": 0.83978293746108, + "grad_norm": 0.08911438286304474, + "learning_rate": 6.32686247030933e-05, + "loss": 2.5773, + "step": 28320 + }, + { + "epoch": 0.8398125908133914, + "grad_norm": 0.09666789323091507, + "learning_rate": 6.324571794852063e-05, + "loss": 2.5842, + "step": 28321 + }, + { + "epoch": 0.839842244165703, + "grad_norm": 0.09067495912313461, + "learning_rate": 6.322281506148215e-05, + "loss": 2.5536, + "step": 28322 + }, + { + "epoch": 0.8398718975180144, + "grad_norm": 0.07981414347887039, + "learning_rate": 6.319991604218062e-05, + "loss": 2.5133, + "step": 28323 + }, + { + "epoch": 0.8399015508703259, + "grad_norm": 0.0983441174030304, + "learning_rate": 6.317702089081879e-05, + "loss": 2.5833, + "step": 28324 + }, + { + "epoch": 0.8399312042226373, + "grad_norm": 0.08169015496969223, + "learning_rate": 6.315412960759936e-05, + "loss": 2.5956, + "step": 28325 + }, + { + "epoch": 0.8399608575749489, + "grad_norm": 0.08846967667341232, + "learning_rate": 6.313124219272498e-05, + "loss": 2.593, + "step": 28326 + }, + { + "epoch": 0.8399905109272603, + "grad_norm": 0.09535044431686401, + "learning_rate": 6.310835864639858e-05, + "loss": 2.5949, + "step": 28327 + }, + { + "epoch": 0.8400201642795718, + "grad_norm": 0.08799813687801361, + "learning_rate": 6.308547896882266e-05, + "loss": 2.5738, + "step": 28328 + }, + { + "epoch": 0.8400498176318832, + "grad_norm": 0.08322979509830475, + "learning_rate": 6.306260316019985e-05, + "loss": 2.5855, + "step": 28329 + }, + { + "epoch": 0.8400794709841948, + "grad_norm": 0.09600181877613068, + "learning_rate": 6.303973122073265e-05, + "loss": 2.5908, + "step": 28330 + }, + { + "epoch": 0.8401091243365062, + "grad_norm": 0.0830712765455246, + "learning_rate": 6.30168631506236e-05, + "loss": 2.5597, + "step": 28331 + }, + { + "epoch": 0.8401387776888177, + "grad_norm": 0.08724064379930496, + "learning_rate": 6.29939989500753e-05, + "loss": 2.5669, + "step": 28332 + }, + { + "epoch": 0.8401684310411291, + "grad_norm": 0.08266877382993698, + "learning_rate": 6.297113861929022e-05, + "loss": 2.5683, + "step": 28333 + }, + { + "epoch": 0.8401980843934407, + "grad_norm": 0.09068723022937775, + "learning_rate": 6.294828215847059e-05, + "loss": 2.6016, + "step": 28334 + }, + { + "epoch": 0.8402277377457522, + "grad_norm": 0.08423862606287003, + "learning_rate": 6.292542956781899e-05, + "loss": 2.5681, + "step": 28335 + }, + { + "epoch": 0.8402573910980636, + "grad_norm": 0.0861339420080185, + "learning_rate": 6.29025808475377e-05, + "loss": 2.5453, + "step": 28336 + }, + { + "epoch": 0.8402870444503752, + "grad_norm": 0.08533576875925064, + "learning_rate": 6.28797359978291e-05, + "loss": 2.5889, + "step": 28337 + }, + { + "epoch": 0.8403166978026866, + "grad_norm": 0.08435025066137314, + "learning_rate": 6.285689501889546e-05, + "loss": 2.538, + "step": 28338 + }, + { + "epoch": 0.8403463511549981, + "grad_norm": 0.08847808092832565, + "learning_rate": 6.283405791093893e-05, + "loss": 2.5947, + "step": 28339 + }, + { + "epoch": 0.8403760045073095, + "grad_norm": 0.08484380692243576, + "learning_rate": 6.2811224674162e-05, + "loss": 2.5813, + "step": 28340 + }, + { + "epoch": 0.8404056578596211, + "grad_norm": 0.08335594832897186, + "learning_rate": 6.278839530876667e-05, + "loss": 2.5854, + "step": 28341 + }, + { + "epoch": 0.8404353112119325, + "grad_norm": 0.09230412542819977, + "learning_rate": 6.276556981495518e-05, + "loss": 2.5691, + "step": 28342 + }, + { + "epoch": 0.840464964564244, + "grad_norm": 0.08507393300533295, + "learning_rate": 6.274274819292975e-05, + "loss": 2.5984, + "step": 28343 + }, + { + "epoch": 0.8404946179165554, + "grad_norm": 0.07826636731624603, + "learning_rate": 6.271993044289215e-05, + "loss": 2.5972, + "step": 28344 + }, + { + "epoch": 0.840524271268867, + "grad_norm": 0.09087472409009933, + "learning_rate": 6.269711656504467e-05, + "loss": 2.5636, + "step": 28345 + }, + { + "epoch": 0.8405539246211784, + "grad_norm": 0.07860135287046432, + "learning_rate": 6.267430655958934e-05, + "loss": 2.5554, + "step": 28346 + }, + { + "epoch": 0.8405835779734899, + "grad_norm": 0.08791343867778778, + "learning_rate": 6.265150042672802e-05, + "loss": 2.5484, + "step": 28347 + }, + { + "epoch": 0.8406132313258013, + "grad_norm": 0.07991867512464523, + "learning_rate": 6.262869816666277e-05, + "loss": 2.5812, + "step": 28348 + }, + { + "epoch": 0.8406428846781129, + "grad_norm": 0.0816713497042656, + "learning_rate": 6.260589977959546e-05, + "loss": 2.6017, + "step": 28349 + }, + { + "epoch": 0.8406725380304243, + "grad_norm": 0.08753858506679535, + "learning_rate": 6.258310526572797e-05, + "loss": 2.552, + "step": 28350 + }, + { + "epoch": 0.8407021913827358, + "grad_norm": 0.07929946482181549, + "learning_rate": 6.256031462526219e-05, + "loss": 2.5968, + "step": 28351 + }, + { + "epoch": 0.8407318447350473, + "grad_norm": 0.08345726132392883, + "learning_rate": 6.253752785839977e-05, + "loss": 2.5778, + "step": 28352 + }, + { + "epoch": 0.8407614980873588, + "grad_norm": 0.09083971381187439, + "learning_rate": 6.251474496534277e-05, + "loss": 2.5739, + "step": 28353 + }, + { + "epoch": 0.8407911514396702, + "grad_norm": 0.0860997661948204, + "learning_rate": 6.249196594629286e-05, + "loss": 2.5648, + "step": 28354 + }, + { + "epoch": 0.8408208047919817, + "grad_norm": 0.09168968349695206, + "learning_rate": 6.246919080145164e-05, + "loss": 2.6109, + "step": 28355 + }, + { + "epoch": 0.8408504581442933, + "grad_norm": 0.08788976073265076, + "learning_rate": 6.244641953102081e-05, + "loss": 2.5798, + "step": 28356 + }, + { + "epoch": 0.8408801114966047, + "grad_norm": 0.09164394438266754, + "learning_rate": 6.242365213520201e-05, + "loss": 2.5834, + "step": 28357 + }, + { + "epoch": 0.8409097648489162, + "grad_norm": 0.08645272254943848, + "learning_rate": 6.24008886141969e-05, + "loss": 2.6026, + "step": 28358 + }, + { + "epoch": 0.8409394182012276, + "grad_norm": 0.08088561147451401, + "learning_rate": 6.237812896820705e-05, + "loss": 2.573, + "step": 28359 + }, + { + "epoch": 0.8409690715535392, + "grad_norm": 0.09615056961774826, + "learning_rate": 6.235537319743401e-05, + "loss": 2.5884, + "step": 28360 + }, + { + "epoch": 0.8409987249058506, + "grad_norm": 0.08563395589590073, + "learning_rate": 6.233262130207923e-05, + "loss": 2.5668, + "step": 28361 + }, + { + "epoch": 0.8410283782581621, + "grad_norm": 0.09204766154289246, + "learning_rate": 6.230987328234423e-05, + "loss": 2.5997, + "step": 28362 + }, + { + "epoch": 0.8410580316104735, + "grad_norm": 0.08970107138156891, + "learning_rate": 6.228712913843037e-05, + "loss": 2.5822, + "step": 28363 + }, + { + "epoch": 0.8410876849627851, + "grad_norm": 0.09024159610271454, + "learning_rate": 6.226438887053915e-05, + "loss": 2.5385, + "step": 28364 + }, + { + "epoch": 0.8411173383150965, + "grad_norm": 0.09083860367536545, + "learning_rate": 6.224165247887192e-05, + "loss": 2.5769, + "step": 28365 + }, + { + "epoch": 0.841146991667408, + "grad_norm": 0.08160445094108582, + "learning_rate": 6.221891996363e-05, + "loss": 2.5948, + "step": 28366 + }, + { + "epoch": 0.8411766450197194, + "grad_norm": 0.08939163386821747, + "learning_rate": 6.21961913250147e-05, + "loss": 2.5892, + "step": 28367 + }, + { + "epoch": 0.841206298372031, + "grad_norm": 0.09170205146074295, + "learning_rate": 6.21734665632272e-05, + "loss": 2.6159, + "step": 28368 + }, + { + "epoch": 0.8412359517243424, + "grad_norm": 0.08741055428981781, + "learning_rate": 6.215074567846885e-05, + "loss": 2.5646, + "step": 28369 + }, + { + "epoch": 0.8412656050766539, + "grad_norm": 0.08610795438289642, + "learning_rate": 6.212802867094081e-05, + "loss": 2.5884, + "step": 28370 + }, + { + "epoch": 0.8412952584289654, + "grad_norm": 0.08571185171604156, + "learning_rate": 6.210531554084426e-05, + "loss": 2.5599, + "step": 28371 + }, + { + "epoch": 0.8413249117812769, + "grad_norm": 0.08503524959087372, + "learning_rate": 6.208260628838025e-05, + "loss": 2.5838, + "step": 28372 + }, + { + "epoch": 0.8413545651335883, + "grad_norm": 0.0811801552772522, + "learning_rate": 6.205990091374997e-05, + "loss": 2.5614, + "step": 28373 + }, + { + "epoch": 0.8413842184858998, + "grad_norm": 0.0843537449836731, + "learning_rate": 6.203719941715441e-05, + "loss": 2.5861, + "step": 28374 + }, + { + "epoch": 0.8414138718382113, + "grad_norm": 0.08457914739847183, + "learning_rate": 6.201450179879465e-05, + "loss": 2.5645, + "step": 28375 + }, + { + "epoch": 0.8414435251905228, + "grad_norm": 0.08564895391464233, + "learning_rate": 6.199180805887167e-05, + "loss": 2.6001, + "step": 28376 + }, + { + "epoch": 0.8414731785428343, + "grad_norm": 0.08189978450536728, + "learning_rate": 6.196911819758638e-05, + "loss": 2.5459, + "step": 28377 + }, + { + "epoch": 0.8415028318951457, + "grad_norm": 0.08342280238866806, + "learning_rate": 6.194643221513974e-05, + "loss": 2.5567, + "step": 28378 + }, + { + "epoch": 0.8415324852474573, + "grad_norm": 0.08377731591463089, + "learning_rate": 6.192375011173263e-05, + "loss": 2.5929, + "step": 28379 + }, + { + "epoch": 0.8415621385997687, + "grad_norm": 0.08470948040485382, + "learning_rate": 6.190107188756594e-05, + "loss": 2.5342, + "step": 28380 + }, + { + "epoch": 0.8415917919520802, + "grad_norm": 0.08708357065916061, + "learning_rate": 6.187839754284041e-05, + "loss": 2.5499, + "step": 28381 + }, + { + "epoch": 0.8416214453043916, + "grad_norm": 0.08263887465000153, + "learning_rate": 6.185572707775688e-05, + "loss": 2.5699, + "step": 28382 + }, + { + "epoch": 0.8416510986567032, + "grad_norm": 0.08689339458942413, + "learning_rate": 6.183306049251614e-05, + "loss": 2.5655, + "step": 28383 + }, + { + "epoch": 0.8416807520090146, + "grad_norm": 0.08962905406951904, + "learning_rate": 6.181039778731878e-05, + "loss": 2.5653, + "step": 28384 + }, + { + "epoch": 0.8417104053613261, + "grad_norm": 0.08820950984954834, + "learning_rate": 6.178773896236562e-05, + "loss": 2.591, + "step": 28385 + }, + { + "epoch": 0.8417400587136376, + "grad_norm": 0.09436163306236267, + "learning_rate": 6.176508401785725e-05, + "loss": 2.5836, + "step": 28386 + }, + { + "epoch": 0.8417697120659491, + "grad_norm": 0.08908364921808243, + "learning_rate": 6.174243295399429e-05, + "loss": 2.561, + "step": 28387 + }, + { + "epoch": 0.8417993654182605, + "grad_norm": 0.09492363035678864, + "learning_rate": 6.171978577097736e-05, + "loss": 2.5833, + "step": 28388 + }, + { + "epoch": 0.841829018770572, + "grad_norm": 0.08841058611869812, + "learning_rate": 6.169714246900693e-05, + "loss": 2.5394, + "step": 28389 + }, + { + "epoch": 0.8418586721228835, + "grad_norm": 0.08743841201066971, + "learning_rate": 6.167450304828348e-05, + "loss": 2.5765, + "step": 28390 + }, + { + "epoch": 0.841888325475195, + "grad_norm": 0.0953316017985344, + "learning_rate": 6.165186750900747e-05, + "loss": 2.6104, + "step": 28391 + }, + { + "epoch": 0.8419179788275064, + "grad_norm": 0.08452026546001434, + "learning_rate": 6.162923585137947e-05, + "loss": 2.5982, + "step": 28392 + }, + { + "epoch": 0.8419476321798179, + "grad_norm": 0.08767765015363693, + "learning_rate": 6.160660807559986e-05, + "loss": 2.5284, + "step": 28393 + }, + { + "epoch": 0.8419772855321294, + "grad_norm": 0.09375840425491333, + "learning_rate": 6.1583984181869e-05, + "loss": 2.5939, + "step": 28394 + }, + { + "epoch": 0.8420069388844409, + "grad_norm": 0.08669745922088623, + "learning_rate": 6.156136417038721e-05, + "loss": 2.5799, + "step": 28395 + }, + { + "epoch": 0.8420365922367523, + "grad_norm": 0.08071808516979218, + "learning_rate": 6.153874804135479e-05, + "loss": 2.5514, + "step": 28396 + }, + { + "epoch": 0.8420662455890638, + "grad_norm": 0.08528328686952591, + "learning_rate": 6.151613579497207e-05, + "loss": 2.561, + "step": 28397 + }, + { + "epoch": 0.8420958989413754, + "grad_norm": 0.08145767450332642, + "learning_rate": 6.149352743143916e-05, + "loss": 2.5867, + "step": 28398 + }, + { + "epoch": 0.8421255522936868, + "grad_norm": 0.08813485503196716, + "learning_rate": 6.147092295095647e-05, + "loss": 2.5391, + "step": 28399 + }, + { + "epoch": 0.8421552056459983, + "grad_norm": 0.09191298484802246, + "learning_rate": 6.144832235372389e-05, + "loss": 2.5629, + "step": 28400 + }, + { + "epoch": 0.8421848589983097, + "grad_norm": 0.08642907440662384, + "learning_rate": 6.14257256399417e-05, + "loss": 2.5368, + "step": 28401 + }, + { + "epoch": 0.8422145123506213, + "grad_norm": 0.08620531111955643, + "learning_rate": 6.140313280981002e-05, + "loss": 2.5977, + "step": 28402 + }, + { + "epoch": 0.8422441657029327, + "grad_norm": 0.08851002156734467, + "learning_rate": 6.138054386352888e-05, + "loss": 2.5422, + "step": 28403 + }, + { + "epoch": 0.8422738190552442, + "grad_norm": 0.08992156386375427, + "learning_rate": 6.135795880129819e-05, + "loss": 2.5405, + "step": 28404 + }, + { + "epoch": 0.8423034724075557, + "grad_norm": 0.08381923288106918, + "learning_rate": 6.13353776233182e-05, + "loss": 2.5493, + "step": 28405 + }, + { + "epoch": 0.8423331257598672, + "grad_norm": 0.0903751403093338, + "learning_rate": 6.13128003297887e-05, + "loss": 2.5622, + "step": 28406 + }, + { + "epoch": 0.8423627791121786, + "grad_norm": 0.08190572261810303, + "learning_rate": 6.129022692090969e-05, + "loss": 2.5852, + "step": 28407 + }, + { + "epoch": 0.8423924324644901, + "grad_norm": 0.08302900940179825, + "learning_rate": 6.126765739688095e-05, + "loss": 2.5488, + "step": 28408 + }, + { + "epoch": 0.8424220858168016, + "grad_norm": 0.08540183305740356, + "learning_rate": 6.12450917579026e-05, + "loss": 2.5751, + "step": 28409 + }, + { + "epoch": 0.8424517391691131, + "grad_norm": 0.09198722243309021, + "learning_rate": 6.122253000417417e-05, + "loss": 2.5898, + "step": 28410 + }, + { + "epoch": 0.8424813925214245, + "grad_norm": 0.08227910846471786, + "learning_rate": 6.119997213589551e-05, + "loss": 2.5975, + "step": 28411 + }, + { + "epoch": 0.842511045873736, + "grad_norm": 0.08629097044467926, + "learning_rate": 6.117741815326638e-05, + "loss": 2.5944, + "step": 28412 + }, + { + "epoch": 0.8425406992260475, + "grad_norm": 0.08982345461845398, + "learning_rate": 6.115486805648663e-05, + "loss": 2.5842, + "step": 28413 + }, + { + "epoch": 0.842570352578359, + "grad_norm": 0.07997975498437881, + "learning_rate": 6.113232184575579e-05, + "loss": 2.5325, + "step": 28414 + }, + { + "epoch": 0.8426000059306704, + "grad_norm": 0.08012409508228302, + "learning_rate": 6.110977952127355e-05, + "loss": 2.5822, + "step": 28415 + }, + { + "epoch": 0.842629659282982, + "grad_norm": 0.09202767163515091, + "learning_rate": 6.10872410832396e-05, + "loss": 2.6047, + "step": 28416 + }, + { + "epoch": 0.8426593126352934, + "grad_norm": 0.08918651938438416, + "learning_rate": 6.106470653185331e-05, + "loss": 2.5277, + "step": 28417 + }, + { + "epoch": 0.8426889659876049, + "grad_norm": 0.08971437066793442, + "learning_rate": 6.104217586731453e-05, + "loss": 2.5772, + "step": 28418 + }, + { + "epoch": 0.8427186193399164, + "grad_norm": 0.08732382208108902, + "learning_rate": 6.101964908982266e-05, + "loss": 2.569, + "step": 28419 + }, + { + "epoch": 0.8427482726922279, + "grad_norm": 0.08373616635799408, + "learning_rate": 6.09971261995772e-05, + "loss": 2.5788, + "step": 28420 + }, + { + "epoch": 0.8427779260445394, + "grad_norm": 0.09135939180850983, + "learning_rate": 6.0974607196777446e-05, + "loss": 2.5847, + "step": 28421 + }, + { + "epoch": 0.8428075793968508, + "grad_norm": 0.08996622264385223, + "learning_rate": 6.095209208162289e-05, + "loss": 2.5742, + "step": 28422 + }, + { + "epoch": 0.8428372327491623, + "grad_norm": 0.08540818095207214, + "learning_rate": 6.092958085431294e-05, + "loss": 2.5594, + "step": 28423 + }, + { + "epoch": 0.8428668861014738, + "grad_norm": 0.09842637926340103, + "learning_rate": 6.0907073515046926e-05, + "loss": 2.5893, + "step": 28424 + }, + { + "epoch": 0.8428965394537853, + "grad_norm": 0.08230391889810562, + "learning_rate": 6.0884570064024145e-05, + "loss": 2.5624, + "step": 28425 + }, + { + "epoch": 0.8429261928060967, + "grad_norm": 0.08949175477027893, + "learning_rate": 6.086207050144382e-05, + "loss": 2.599, + "step": 28426 + }, + { + "epoch": 0.8429558461584082, + "grad_norm": 0.09184329211711884, + "learning_rate": 6.0839574827505295e-05, + "loss": 2.5727, + "step": 28427 + }, + { + "epoch": 0.8429854995107197, + "grad_norm": 0.08264288306236267, + "learning_rate": 6.0817083042407685e-05, + "loss": 2.568, + "step": 28428 + }, + { + "epoch": 0.8430151528630312, + "grad_norm": 0.08556272089481354, + "learning_rate": 6.079459514635022e-05, + "loss": 2.5557, + "step": 28429 + }, + { + "epoch": 0.8430448062153426, + "grad_norm": 0.08759884536266327, + "learning_rate": 6.077211113953196e-05, + "loss": 2.5703, + "step": 28430 + }, + { + "epoch": 0.8430744595676541, + "grad_norm": 0.08912049978971481, + "learning_rate": 6.0749631022152083e-05, + "loss": 2.5649, + "step": 28431 + }, + { + "epoch": 0.8431041129199656, + "grad_norm": 0.09198020398616791, + "learning_rate": 6.07271547944096e-05, + "loss": 2.5687, + "step": 28432 + }, + { + "epoch": 0.8431337662722771, + "grad_norm": 0.08323420584201813, + "learning_rate": 6.070468245650357e-05, + "loss": 2.5586, + "step": 28433 + }, + { + "epoch": 0.8431634196245885, + "grad_norm": 0.08286622911691666, + "learning_rate": 6.0682214008633e-05, + "loss": 2.5856, + "step": 28434 + }, + { + "epoch": 0.8431930729769, + "grad_norm": 0.09875938296318054, + "learning_rate": 6.065974945099684e-05, + "loss": 2.5517, + "step": 28435 + }, + { + "epoch": 0.8432227263292115, + "grad_norm": 0.08396637439727783, + "learning_rate": 6.0637288783793986e-05, + "loss": 2.5779, + "step": 28436 + }, + { + "epoch": 0.843252379681523, + "grad_norm": 0.08985255658626556, + "learning_rate": 6.061483200722334e-05, + "loss": 2.5796, + "step": 28437 + }, + { + "epoch": 0.8432820330338344, + "grad_norm": 0.0843869149684906, + "learning_rate": 6.05923791214838e-05, + "loss": 2.57, + "step": 28438 + }, + { + "epoch": 0.843311686386146, + "grad_norm": 0.07988660037517548, + "learning_rate": 6.056993012677414e-05, + "loss": 2.5916, + "step": 28439 + }, + { + "epoch": 0.8433413397384575, + "grad_norm": 0.07999568432569504, + "learning_rate": 6.054748502329321e-05, + "loss": 2.5556, + "step": 28440 + }, + { + "epoch": 0.8433709930907689, + "grad_norm": 0.08236848562955856, + "learning_rate": 6.052504381123969e-05, + "loss": 2.5929, + "step": 28441 + }, + { + "epoch": 0.8434006464430804, + "grad_norm": 0.07962913811206818, + "learning_rate": 6.050260649081235e-05, + "loss": 2.5988, + "step": 28442 + }, + { + "epoch": 0.8434302997953919, + "grad_norm": 0.08622001856565475, + "learning_rate": 6.048017306220988e-05, + "loss": 2.5738, + "step": 28443 + }, + { + "epoch": 0.8434599531477034, + "grad_norm": 0.08647856116294861, + "learning_rate": 6.045774352563094e-05, + "loss": 2.5682, + "step": 28444 + }, + { + "epoch": 0.8434896065000148, + "grad_norm": 0.08720191568136215, + "learning_rate": 6.04353178812741e-05, + "loss": 2.5632, + "step": 28445 + }, + { + "epoch": 0.8435192598523263, + "grad_norm": 0.0813901275396347, + "learning_rate": 6.041289612933798e-05, + "loss": 2.5372, + "step": 28446 + }, + { + "epoch": 0.8435489132046378, + "grad_norm": 0.09472649544477463, + "learning_rate": 6.0390478270021144e-05, + "loss": 2.521, + "step": 28447 + }, + { + "epoch": 0.8435785665569493, + "grad_norm": 0.07800985872745514, + "learning_rate": 6.036806430352204e-05, + "loss": 2.5591, + "step": 28448 + }, + { + "epoch": 0.8436082199092607, + "grad_norm": 0.09196747839450836, + "learning_rate": 6.0345654230039235e-05, + "loss": 2.5934, + "step": 28449 + }, + { + "epoch": 0.8436378732615722, + "grad_norm": 0.0890875980257988, + "learning_rate": 6.032324804977108e-05, + "loss": 2.5763, + "step": 28450 + }, + { + "epoch": 0.8436675266138837, + "grad_norm": 0.0881306380033493, + "learning_rate": 6.030084576291606e-05, + "loss": 2.5912, + "step": 28451 + }, + { + "epoch": 0.8436971799661952, + "grad_norm": 0.09968145936727524, + "learning_rate": 6.027844736967253e-05, + "loss": 2.5438, + "step": 28452 + }, + { + "epoch": 0.8437268333185066, + "grad_norm": 0.07934992015361786, + "learning_rate": 6.025605287023877e-05, + "loss": 2.5474, + "step": 28453 + }, + { + "epoch": 0.8437564866708182, + "grad_norm": 0.08934298902750015, + "learning_rate": 6.0233662264813306e-05, + "loss": 2.5891, + "step": 28454 + }, + { + "epoch": 0.8437861400231296, + "grad_norm": 0.0928669273853302, + "learning_rate": 6.0211275553594126e-05, + "loss": 2.5663, + "step": 28455 + }, + { + "epoch": 0.8438157933754411, + "grad_norm": 0.08940871059894562, + "learning_rate": 6.018889273677952e-05, + "loss": 2.5922, + "step": 28456 + }, + { + "epoch": 0.8438454467277525, + "grad_norm": 0.090335413813591, + "learning_rate": 6.016651381456778e-05, + "loss": 2.6061, + "step": 28457 + }, + { + "epoch": 0.8438751000800641, + "grad_norm": 0.08886723965406418, + "learning_rate": 6.014413878715713e-05, + "loss": 2.573, + "step": 28458 + }, + { + "epoch": 0.8439047534323755, + "grad_norm": 0.08526226133108139, + "learning_rate": 6.012176765474564e-05, + "loss": 2.5721, + "step": 28459 + }, + { + "epoch": 0.843934406784687, + "grad_norm": 0.08857568353414536, + "learning_rate": 6.009940041753137e-05, + "loss": 2.6002, + "step": 28460 + }, + { + "epoch": 0.8439640601369985, + "grad_norm": 0.08384742587804794, + "learning_rate": 6.007703707571238e-05, + "loss": 2.5416, + "step": 28461 + }, + { + "epoch": 0.84399371348931, + "grad_norm": 0.08554300665855408, + "learning_rate": 6.0054677629486795e-05, + "loss": 2.5829, + "step": 28462 + }, + { + "epoch": 0.8440233668416215, + "grad_norm": 0.08437740057706833, + "learning_rate": 6.003232207905251e-05, + "loss": 2.5379, + "step": 28463 + }, + { + "epoch": 0.8440530201939329, + "grad_norm": 0.08780723065137863, + "learning_rate": 6.0009970424607704e-05, + "loss": 2.5443, + "step": 28464 + }, + { + "epoch": 0.8440826735462444, + "grad_norm": 0.08654149621725082, + "learning_rate": 5.998762266634999e-05, + "loss": 2.5407, + "step": 28465 + }, + { + "epoch": 0.8441123268985559, + "grad_norm": 0.07970365136861801, + "learning_rate": 5.996527880447739e-05, + "loss": 2.5675, + "step": 28466 + }, + { + "epoch": 0.8441419802508674, + "grad_norm": 0.08403380215167999, + "learning_rate": 5.994293883918778e-05, + "loss": 2.5488, + "step": 28467 + }, + { + "epoch": 0.8441716336031788, + "grad_norm": 0.08669089525938034, + "learning_rate": 5.99206027706789e-05, + "loss": 2.5626, + "step": 28468 + }, + { + "epoch": 0.8442012869554903, + "grad_norm": 0.08430890738964081, + "learning_rate": 5.989827059914871e-05, + "loss": 2.583, + "step": 28469 + }, + { + "epoch": 0.8442309403078018, + "grad_norm": 0.08100850880146027, + "learning_rate": 5.9875942324794874e-05, + "loss": 2.5815, + "step": 28470 + }, + { + "epoch": 0.8442605936601133, + "grad_norm": 0.08628950268030167, + "learning_rate": 5.985361794781513e-05, + "loss": 2.5792, + "step": 28471 + }, + { + "epoch": 0.8442902470124247, + "grad_norm": 0.08507627993822098, + "learning_rate": 5.9831297468407156e-05, + "loss": 2.5382, + "step": 28472 + }, + { + "epoch": 0.8443199003647363, + "grad_norm": 0.07636664807796478, + "learning_rate": 5.980898088676856e-05, + "loss": 2.5926, + "step": 28473 + }, + { + "epoch": 0.8443495537170477, + "grad_norm": 0.0867486372590065, + "learning_rate": 5.978666820309703e-05, + "loss": 2.5538, + "step": 28474 + }, + { + "epoch": 0.8443792070693592, + "grad_norm": 0.08211211115121841, + "learning_rate": 5.976435941759018e-05, + "loss": 2.5878, + "step": 28475 + }, + { + "epoch": 0.8444088604216706, + "grad_norm": 0.0832270160317421, + "learning_rate": 5.97420545304454e-05, + "loss": 2.6104, + "step": 28476 + }, + { + "epoch": 0.8444385137739822, + "grad_norm": 0.08528469502925873, + "learning_rate": 5.971975354186032e-05, + "loss": 2.5968, + "step": 28477 + }, + { + "epoch": 0.8444681671262936, + "grad_norm": 0.0819694772362709, + "learning_rate": 5.9697456452032395e-05, + "loss": 2.5832, + "step": 28478 + }, + { + "epoch": 0.8444978204786051, + "grad_norm": 0.08039873838424683, + "learning_rate": 5.967516326115907e-05, + "loss": 2.5851, + "step": 28479 + }, + { + "epoch": 0.8445274738309165, + "grad_norm": 0.08671274036169052, + "learning_rate": 5.965287396943775e-05, + "loss": 2.5918, + "step": 28480 + }, + { + "epoch": 0.8445571271832281, + "grad_norm": 0.08095487207174301, + "learning_rate": 5.963058857706572e-05, + "loss": 2.5639, + "step": 28481 + }, + { + "epoch": 0.8445867805355396, + "grad_norm": 0.08060931414365768, + "learning_rate": 5.960830708424048e-05, + "loss": 2.5718, + "step": 28482 + }, + { + "epoch": 0.844616433887851, + "grad_norm": 0.09357386827468872, + "learning_rate": 5.958602949115932e-05, + "loss": 2.5755, + "step": 28483 + }, + { + "epoch": 0.8446460872401625, + "grad_norm": 0.08669079840183258, + "learning_rate": 5.9563755798019424e-05, + "loss": 2.5582, + "step": 28484 + }, + { + "epoch": 0.844675740592474, + "grad_norm": 0.09969959408044815, + "learning_rate": 5.954148600501818e-05, + "loss": 2.6005, + "step": 28485 + }, + { + "epoch": 0.8447053939447855, + "grad_norm": 0.0903918519616127, + "learning_rate": 5.951922011235261e-05, + "loss": 2.581, + "step": 28486 + }, + { + "epoch": 0.8447350472970969, + "grad_norm": 0.08585448563098907, + "learning_rate": 5.949695812021994e-05, + "loss": 2.5947, + "step": 28487 + }, + { + "epoch": 0.8447647006494085, + "grad_norm": 0.09938450902700424, + "learning_rate": 5.947470002881733e-05, + "loss": 2.5929, + "step": 28488 + }, + { + "epoch": 0.8447943540017199, + "grad_norm": 0.0896826684474945, + "learning_rate": 5.9452445838341864e-05, + "loss": 2.5533, + "step": 28489 + }, + { + "epoch": 0.8448240073540314, + "grad_norm": 0.09688819944858551, + "learning_rate": 5.943019554899059e-05, + "loss": 2.5648, + "step": 28490 + }, + { + "epoch": 0.8448536607063428, + "grad_norm": 0.09989897161722183, + "learning_rate": 5.940794916096054e-05, + "loss": 2.5607, + "step": 28491 + }, + { + "epoch": 0.8448833140586544, + "grad_norm": 0.09086041897535324, + "learning_rate": 5.938570667444876e-05, + "loss": 2.5924, + "step": 28492 + }, + { + "epoch": 0.8449129674109658, + "grad_norm": 0.0947229191660881, + "learning_rate": 5.936346808965215e-05, + "loss": 2.571, + "step": 28493 + }, + { + "epoch": 0.8449426207632773, + "grad_norm": 0.08555957674980164, + "learning_rate": 5.934123340676756e-05, + "loss": 2.513, + "step": 28494 + }, + { + "epoch": 0.8449722741155887, + "grad_norm": 0.1009979173541069, + "learning_rate": 5.9319002625992156e-05, + "loss": 2.5622, + "step": 28495 + }, + { + "epoch": 0.8450019274679003, + "grad_norm": 0.09166105091571808, + "learning_rate": 5.929677574752268e-05, + "loss": 2.5717, + "step": 28496 + }, + { + "epoch": 0.8450315808202117, + "grad_norm": 0.09065484255552292, + "learning_rate": 5.92745527715558e-05, + "loss": 2.5555, + "step": 28497 + }, + { + "epoch": 0.8450612341725232, + "grad_norm": 0.09181904792785645, + "learning_rate": 5.925233369828836e-05, + "loss": 2.571, + "step": 28498 + }, + { + "epoch": 0.8450908875248346, + "grad_norm": 0.0882207527756691, + "learning_rate": 5.923011852791721e-05, + "loss": 2.6053, + "step": 28499 + }, + { + "epoch": 0.8451205408771462, + "grad_norm": 0.09548437595367432, + "learning_rate": 5.9207907260639016e-05, + "loss": 2.5924, + "step": 28500 + }, + { + "epoch": 0.8451501942294576, + "grad_norm": 0.08251377195119858, + "learning_rate": 5.918569989665046e-05, + "loss": 2.6156, + "step": 28501 + }, + { + "epoch": 0.8451798475817691, + "grad_norm": 0.09267506003379822, + "learning_rate": 5.9163496436148214e-05, + "loss": 2.5564, + "step": 28502 + }, + { + "epoch": 0.8452095009340806, + "grad_norm": 0.086520716547966, + "learning_rate": 5.914129687932884e-05, + "loss": 2.5635, + "step": 28503 + }, + { + "epoch": 0.8452391542863921, + "grad_norm": 0.086945079267025, + "learning_rate": 5.911910122638897e-05, + "loss": 2.5589, + "step": 28504 + }, + { + "epoch": 0.8452688076387036, + "grad_norm": 0.08608704060316086, + "learning_rate": 5.909690947752511e-05, + "loss": 2.5779, + "step": 28505 + }, + { + "epoch": 0.845298460991015, + "grad_norm": 0.09309584647417068, + "learning_rate": 5.907472163293387e-05, + "loss": 2.5918, + "step": 28506 + }, + { + "epoch": 0.8453281143433266, + "grad_norm": 0.08457835763692856, + "learning_rate": 5.9052537692811604e-05, + "loss": 2.5925, + "step": 28507 + }, + { + "epoch": 0.845357767695638, + "grad_norm": 0.08372551947832108, + "learning_rate": 5.903035765735476e-05, + "loss": 2.5689, + "step": 28508 + }, + { + "epoch": 0.8453874210479495, + "grad_norm": 0.09464330226182938, + "learning_rate": 5.900818152675985e-05, + "loss": 2.5609, + "step": 28509 + }, + { + "epoch": 0.8454170744002609, + "grad_norm": 0.08363894373178482, + "learning_rate": 5.898600930122316e-05, + "loss": 2.5429, + "step": 28510 + }, + { + "epoch": 0.8454467277525725, + "grad_norm": 0.08518586307764053, + "learning_rate": 5.89638409809411e-05, + "loss": 2.5453, + "step": 28511 + }, + { + "epoch": 0.8454763811048839, + "grad_norm": 0.0844687819480896, + "learning_rate": 5.894167656610988e-05, + "loss": 2.5562, + "step": 28512 + }, + { + "epoch": 0.8455060344571954, + "grad_norm": 0.08661838620901108, + "learning_rate": 5.891951605692586e-05, + "loss": 2.579, + "step": 28513 + }, + { + "epoch": 0.8455356878095068, + "grad_norm": 0.08175094425678253, + "learning_rate": 5.8897359453585206e-05, + "loss": 2.569, + "step": 28514 + }, + { + "epoch": 0.8455653411618184, + "grad_norm": 0.09179772436618805, + "learning_rate": 5.887520675628416e-05, + "loss": 2.5699, + "step": 28515 + }, + { + "epoch": 0.8455949945141298, + "grad_norm": 0.0863359346985817, + "learning_rate": 5.8853057965218895e-05, + "loss": 2.5551, + "step": 28516 + }, + { + "epoch": 0.8456246478664413, + "grad_norm": 0.08844855427742004, + "learning_rate": 5.883091308058547e-05, + "loss": 2.5771, + "step": 28517 + }, + { + "epoch": 0.8456543012187527, + "grad_norm": 0.08731867372989655, + "learning_rate": 5.880877210258007e-05, + "loss": 2.5762, + "step": 28518 + }, + { + "epoch": 0.8456839545710643, + "grad_norm": 0.08256082236766815, + "learning_rate": 5.878663503139886e-05, + "loss": 2.5881, + "step": 28519 + }, + { + "epoch": 0.8457136079233757, + "grad_norm": 0.08452268689870834, + "learning_rate": 5.8764501867237474e-05, + "loss": 2.5768, + "step": 28520 + }, + { + "epoch": 0.8457432612756872, + "grad_norm": 0.08985734730958939, + "learning_rate": 5.8742372610292306e-05, + "loss": 2.5487, + "step": 28521 + }, + { + "epoch": 0.8457729146279986, + "grad_norm": 0.08394617587327957, + "learning_rate": 5.872024726075914e-05, + "loss": 2.5518, + "step": 28522 + }, + { + "epoch": 0.8458025679803102, + "grad_norm": 0.0860678181052208, + "learning_rate": 5.8698125818833934e-05, + "loss": 2.5943, + "step": 28523 + }, + { + "epoch": 0.8458322213326217, + "grad_norm": 0.08990056067705154, + "learning_rate": 5.8676008284712535e-05, + "loss": 2.5701, + "step": 28524 + }, + { + "epoch": 0.8458618746849331, + "grad_norm": 0.08371763676404953, + "learning_rate": 5.865389465859089e-05, + "loss": 2.5653, + "step": 28525 + }, + { + "epoch": 0.8458915280372447, + "grad_norm": 0.0837610736489296, + "learning_rate": 5.8631784940664734e-05, + "loss": 2.5551, + "step": 28526 + }, + { + "epoch": 0.8459211813895561, + "grad_norm": 0.09250947833061218, + "learning_rate": 5.8609679131129914e-05, + "loss": 2.5742, + "step": 28527 + }, + { + "epoch": 0.8459508347418676, + "grad_norm": 0.08673828095197678, + "learning_rate": 5.8587577230182096e-05, + "loss": 2.596, + "step": 28528 + }, + { + "epoch": 0.845980488094179, + "grad_norm": 0.08580461889505386, + "learning_rate": 5.856547923801708e-05, + "loss": 2.5194, + "step": 28529 + }, + { + "epoch": 0.8460101414464906, + "grad_norm": 0.09336115419864655, + "learning_rate": 5.8543385154830655e-05, + "loss": 2.5732, + "step": 28530 + }, + { + "epoch": 0.846039794798802, + "grad_norm": 0.08250177651643753, + "learning_rate": 5.852129498081815e-05, + "loss": 2.5741, + "step": 28531 + }, + { + "epoch": 0.8460694481511135, + "grad_norm": 0.095921590924263, + "learning_rate": 5.849920871617542e-05, + "loss": 2.5663, + "step": 28532 + }, + { + "epoch": 0.8460991015034249, + "grad_norm": 0.09062903374433517, + "learning_rate": 5.8477126361097864e-05, + "loss": 2.605, + "step": 28533 + }, + { + "epoch": 0.8461287548557365, + "grad_norm": 0.08659899234771729, + "learning_rate": 5.8455047915781215e-05, + "loss": 2.5701, + "step": 28534 + }, + { + "epoch": 0.8461584082080479, + "grad_norm": 0.08340851217508316, + "learning_rate": 5.8432973380420915e-05, + "loss": 2.5674, + "step": 28535 + }, + { + "epoch": 0.8461880615603594, + "grad_norm": 0.08537738025188446, + "learning_rate": 5.8410902755212494e-05, + "loss": 2.5596, + "step": 28536 + }, + { + "epoch": 0.8462177149126708, + "grad_norm": 0.09324797987937927, + "learning_rate": 5.8388836040351224e-05, + "loss": 2.5939, + "step": 28537 + }, + { + "epoch": 0.8462473682649824, + "grad_norm": 0.08647575229406357, + "learning_rate": 5.8366773236032674e-05, + "loss": 2.57, + "step": 28538 + }, + { + "epoch": 0.8462770216172938, + "grad_norm": 0.07959329336881638, + "learning_rate": 5.834471434245214e-05, + "loss": 2.529, + "step": 28539 + }, + { + "epoch": 0.8463066749696053, + "grad_norm": 0.09042634069919586, + "learning_rate": 5.832265935980507e-05, + "loss": 2.5703, + "step": 28540 + }, + { + "epoch": 0.8463363283219167, + "grad_norm": 0.09047730267047882, + "learning_rate": 5.83006082882866e-05, + "loss": 2.5895, + "step": 28541 + }, + { + "epoch": 0.8463659816742283, + "grad_norm": 0.09201731532812119, + "learning_rate": 5.827856112809199e-05, + "loss": 2.5463, + "step": 28542 + }, + { + "epoch": 0.8463956350265398, + "grad_norm": 0.08980204910039902, + "learning_rate": 5.82565178794166e-05, + "loss": 2.5679, + "step": 28543 + }, + { + "epoch": 0.8464252883788512, + "grad_norm": 0.08019141852855682, + "learning_rate": 5.823447854245556e-05, + "loss": 2.5007, + "step": 28544 + }, + { + "epoch": 0.8464549417311628, + "grad_norm": 0.08733509480953217, + "learning_rate": 5.8212443117404035e-05, + "loss": 2.5446, + "step": 28545 + }, + { + "epoch": 0.8464845950834742, + "grad_norm": 0.08987584710121155, + "learning_rate": 5.819041160445704e-05, + "loss": 2.5707, + "step": 28546 + }, + { + "epoch": 0.8465142484357857, + "grad_norm": 0.08094162493944168, + "learning_rate": 5.816838400380986e-05, + "loss": 2.5939, + "step": 28547 + }, + { + "epoch": 0.8465439017880971, + "grad_norm": 0.08015012741088867, + "learning_rate": 5.814636031565751e-05, + "loss": 2.5938, + "step": 28548 + }, + { + "epoch": 0.8465735551404087, + "grad_norm": 0.09073397517204285, + "learning_rate": 5.8124340540194996e-05, + "loss": 2.5769, + "step": 28549 + }, + { + "epoch": 0.8466032084927201, + "grad_norm": 0.08460011333227158, + "learning_rate": 5.810232467761728e-05, + "loss": 2.6005, + "step": 28550 + }, + { + "epoch": 0.8466328618450316, + "grad_norm": 0.08416403830051422, + "learning_rate": 5.8080312728119476e-05, + "loss": 2.5738, + "step": 28551 + }, + { + "epoch": 0.846662515197343, + "grad_norm": 0.08485399931669235, + "learning_rate": 5.805830469189621e-05, + "loss": 2.5958, + "step": 28552 + }, + { + "epoch": 0.8466921685496546, + "grad_norm": 0.08936822414398193, + "learning_rate": 5.8036300569142497e-05, + "loss": 2.5712, + "step": 28553 + }, + { + "epoch": 0.846721821901966, + "grad_norm": 0.08326807618141174, + "learning_rate": 5.801430036005323e-05, + "loss": 2.577, + "step": 28554 + }, + { + "epoch": 0.8467514752542775, + "grad_norm": 0.07931332290172577, + "learning_rate": 5.7992304064823196e-05, + "loss": 2.5699, + "step": 28555 + }, + { + "epoch": 0.8467811286065889, + "grad_norm": 0.0818033367395401, + "learning_rate": 5.797031168364719e-05, + "loss": 2.5542, + "step": 28556 + }, + { + "epoch": 0.8468107819589005, + "grad_norm": 0.08224605023860931, + "learning_rate": 5.7948323216719944e-05, + "loss": 2.5755, + "step": 28557 + }, + { + "epoch": 0.8468404353112119, + "grad_norm": 0.08865280449390411, + "learning_rate": 5.7926338664236134e-05, + "loss": 2.5716, + "step": 28558 + }, + { + "epoch": 0.8468700886635234, + "grad_norm": 0.08534408360719681, + "learning_rate": 5.7904358026390436e-05, + "loss": 2.5799, + "step": 28559 + }, + { + "epoch": 0.8468997420158348, + "grad_norm": 0.07953791320323944, + "learning_rate": 5.7882381303377584e-05, + "loss": 2.5728, + "step": 28560 + }, + { + "epoch": 0.8469293953681464, + "grad_norm": 0.08171103149652481, + "learning_rate": 5.7860408495392255e-05, + "loss": 2.5847, + "step": 28561 + }, + { + "epoch": 0.8469590487204578, + "grad_norm": 0.0771806538105011, + "learning_rate": 5.78384396026288e-05, + "loss": 2.5975, + "step": 28562 + }, + { + "epoch": 0.8469887020727693, + "grad_norm": 0.08602043241262436, + "learning_rate": 5.781647462528189e-05, + "loss": 2.5616, + "step": 28563 + }, + { + "epoch": 0.8470183554250809, + "grad_norm": 0.09142359346151352, + "learning_rate": 5.779451356354593e-05, + "loss": 2.5654, + "step": 28564 + }, + { + "epoch": 0.8470480087773923, + "grad_norm": 0.07764368504285812, + "learning_rate": 5.777255641761553e-05, + "loss": 2.5778, + "step": 28565 + }, + { + "epoch": 0.8470776621297038, + "grad_norm": 0.0913516953587532, + "learning_rate": 5.775060318768499e-05, + "loss": 2.5299, + "step": 28566 + }, + { + "epoch": 0.8471073154820152, + "grad_norm": 0.08511337637901306, + "learning_rate": 5.772865387394877e-05, + "loss": 2.5958, + "step": 28567 + }, + { + "epoch": 0.8471369688343268, + "grad_norm": 0.08358725905418396, + "learning_rate": 5.770670847660126e-05, + "loss": 2.5533, + "step": 28568 + }, + { + "epoch": 0.8471666221866382, + "grad_norm": 0.09038087725639343, + "learning_rate": 5.76847669958368e-05, + "loss": 2.5905, + "step": 28569 + }, + { + "epoch": 0.8471962755389497, + "grad_norm": 0.08278699219226837, + "learning_rate": 5.766282943184958e-05, + "loss": 2.5799, + "step": 28570 + }, + { + "epoch": 0.8472259288912611, + "grad_norm": 0.07795208692550659, + "learning_rate": 5.764089578483395e-05, + "loss": 2.5945, + "step": 28571 + }, + { + "epoch": 0.8472555822435727, + "grad_norm": 0.08869767189025879, + "learning_rate": 5.761896605498418e-05, + "loss": 2.578, + "step": 28572 + }, + { + "epoch": 0.8472852355958841, + "grad_norm": 0.08091417700052261, + "learning_rate": 5.7597040242494346e-05, + "loss": 2.6039, + "step": 28573 + }, + { + "epoch": 0.8473148889481956, + "grad_norm": 0.08367183059453964, + "learning_rate": 5.757511834755863e-05, + "loss": 2.5616, + "step": 28574 + }, + { + "epoch": 0.847344542300507, + "grad_norm": 0.08943145722150803, + "learning_rate": 5.7553200370371204e-05, + "loss": 2.5497, + "step": 28575 + }, + { + "epoch": 0.8473741956528186, + "grad_norm": 0.08336184173822403, + "learning_rate": 5.75312863111262e-05, + "loss": 2.5793, + "step": 28576 + }, + { + "epoch": 0.84740384900513, + "grad_norm": 0.0871628001332283, + "learning_rate": 5.75093761700175e-05, + "loss": 2.5857, + "step": 28577 + }, + { + "epoch": 0.8474335023574415, + "grad_norm": 0.08485718071460724, + "learning_rate": 5.74874699472393e-05, + "loss": 2.5632, + "step": 28578 + }, + { + "epoch": 0.847463155709753, + "grad_norm": 0.08165629208087921, + "learning_rate": 5.746556764298549e-05, + "loss": 2.5645, + "step": 28579 + }, + { + "epoch": 0.8474928090620645, + "grad_norm": 0.08909645676612854, + "learning_rate": 5.7443669257450035e-05, + "loss": 2.5792, + "step": 28580 + }, + { + "epoch": 0.8475224624143759, + "grad_norm": 0.08244526386260986, + "learning_rate": 5.742177479082683e-05, + "loss": 2.5561, + "step": 28581 + }, + { + "epoch": 0.8475521157666874, + "grad_norm": 0.08755399286746979, + "learning_rate": 5.739988424330983e-05, + "loss": 2.5636, + "step": 28582 + }, + { + "epoch": 0.8475817691189989, + "grad_norm": 0.08390677720308304, + "learning_rate": 5.737799761509277e-05, + "loss": 2.585, + "step": 28583 + }, + { + "epoch": 0.8476114224713104, + "grad_norm": 0.08777975291013718, + "learning_rate": 5.735611490636955e-05, + "loss": 2.5573, + "step": 28584 + }, + { + "epoch": 0.8476410758236219, + "grad_norm": 0.08393029123544693, + "learning_rate": 5.733423611733391e-05, + "loss": 2.5962, + "step": 28585 + }, + { + "epoch": 0.8476707291759333, + "grad_norm": 0.08647823333740234, + "learning_rate": 5.7312361248179566e-05, + "loss": 2.5803, + "step": 28586 + }, + { + "epoch": 0.8477003825282449, + "grad_norm": 0.0841364786028862, + "learning_rate": 5.729049029910027e-05, + "loss": 2.5724, + "step": 28587 + }, + { + "epoch": 0.8477300358805563, + "grad_norm": 0.08483317494392395, + "learning_rate": 5.7268623270289696e-05, + "loss": 2.6042, + "step": 28588 + }, + { + "epoch": 0.8477596892328678, + "grad_norm": 0.08969615399837494, + "learning_rate": 5.7246760161941416e-05, + "loss": 2.5729, + "step": 28589 + }, + { + "epoch": 0.8477893425851792, + "grad_norm": 0.08131548762321472, + "learning_rate": 5.722490097424909e-05, + "loss": 2.5515, + "step": 28590 + }, + { + "epoch": 0.8478189959374908, + "grad_norm": 0.09097962826490402, + "learning_rate": 5.720304570740625e-05, + "loss": 2.5826, + "step": 28591 + }, + { + "epoch": 0.8478486492898022, + "grad_norm": 0.08185159415006638, + "learning_rate": 5.718119436160646e-05, + "loss": 2.5954, + "step": 28592 + }, + { + "epoch": 0.8478783026421137, + "grad_norm": 0.08758051693439484, + "learning_rate": 5.715934693704322e-05, + "loss": 2.6104, + "step": 28593 + }, + { + "epoch": 0.8479079559944251, + "grad_norm": 0.07984442263841629, + "learning_rate": 5.713750343390994e-05, + "loss": 2.5898, + "step": 28594 + }, + { + "epoch": 0.8479376093467367, + "grad_norm": 0.09436236321926117, + "learning_rate": 5.711566385240025e-05, + "loss": 2.5684, + "step": 28595 + }, + { + "epoch": 0.8479672626990481, + "grad_norm": 0.08819282799959183, + "learning_rate": 5.70938281927072e-05, + "loss": 2.5586, + "step": 28596 + }, + { + "epoch": 0.8479969160513596, + "grad_norm": 0.07769759744405746, + "learning_rate": 5.7071996455024365e-05, + "loss": 2.5355, + "step": 28597 + }, + { + "epoch": 0.848026569403671, + "grad_norm": 0.088811956346035, + "learning_rate": 5.705016863954493e-05, + "loss": 2.6282, + "step": 28598 + }, + { + "epoch": 0.8480562227559826, + "grad_norm": 0.09108881652355194, + "learning_rate": 5.7028344746462345e-05, + "loss": 2.6152, + "step": 28599 + }, + { + "epoch": 0.848085876108294, + "grad_norm": 0.07937649637460709, + "learning_rate": 5.700652477596985e-05, + "loss": 2.6225, + "step": 28600 + }, + { + "epoch": 0.8481155294606055, + "grad_norm": 0.08684045821428299, + "learning_rate": 5.6984708728260556e-05, + "loss": 2.6081, + "step": 28601 + }, + { + "epoch": 0.848145182812917, + "grad_norm": 0.08948289602994919, + "learning_rate": 5.696289660352777e-05, + "loss": 2.5819, + "step": 28602 + }, + { + "epoch": 0.8481748361652285, + "grad_norm": 0.08250735700130463, + "learning_rate": 5.694108840196455e-05, + "loss": 2.551, + "step": 28603 + }, + { + "epoch": 0.8482044895175399, + "grad_norm": 0.08173010498285294, + "learning_rate": 5.691928412376407e-05, + "loss": 2.607, + "step": 28604 + }, + { + "epoch": 0.8482341428698514, + "grad_norm": 0.08678373694419861, + "learning_rate": 5.689748376911935e-05, + "loss": 2.5945, + "step": 28605 + }, + { + "epoch": 0.848263796222163, + "grad_norm": 0.09045004844665527, + "learning_rate": 5.687568733822357e-05, + "loss": 2.6006, + "step": 28606 + }, + { + "epoch": 0.8482934495744744, + "grad_norm": 0.08614517748355865, + "learning_rate": 5.685389483126957e-05, + "loss": 2.5471, + "step": 28607 + }, + { + "epoch": 0.8483231029267859, + "grad_norm": 0.0797966942191124, + "learning_rate": 5.6832106248450366e-05, + "loss": 2.5836, + "step": 28608 + }, + { + "epoch": 0.8483527562790973, + "grad_norm": 0.08196824789047241, + "learning_rate": 5.681032158995897e-05, + "loss": 2.6208, + "step": 28609 + }, + { + "epoch": 0.8483824096314089, + "grad_norm": 0.09114367514848709, + "learning_rate": 5.678854085598822e-05, + "loss": 2.5895, + "step": 28610 + }, + { + "epoch": 0.8484120629837203, + "grad_norm": 0.08977951109409332, + "learning_rate": 5.6766764046730924e-05, + "loss": 2.5859, + "step": 28611 + }, + { + "epoch": 0.8484417163360318, + "grad_norm": 0.08120542019605637, + "learning_rate": 5.674499116238008e-05, + "loss": 2.5532, + "step": 28612 + }, + { + "epoch": 0.8484713696883432, + "grad_norm": 0.08291415125131607, + "learning_rate": 5.6723222203128475e-05, + "loss": 2.5498, + "step": 28613 + }, + { + "epoch": 0.8485010230406548, + "grad_norm": 0.09343033283948898, + "learning_rate": 5.6701457169168805e-05, + "loss": 2.6004, + "step": 28614 + }, + { + "epoch": 0.8485306763929662, + "grad_norm": 0.08425283432006836, + "learning_rate": 5.667969606069379e-05, + "loss": 2.5588, + "step": 28615 + }, + { + "epoch": 0.8485603297452777, + "grad_norm": 0.08509937673807144, + "learning_rate": 5.665793887789633e-05, + "loss": 2.5754, + "step": 28616 + }, + { + "epoch": 0.8485899830975892, + "grad_norm": 0.08446499705314636, + "learning_rate": 5.663618562096878e-05, + "loss": 2.5709, + "step": 28617 + }, + { + "epoch": 0.8486196364499007, + "grad_norm": 0.08199997991323471, + "learning_rate": 5.6614436290103875e-05, + "loss": 2.569, + "step": 28618 + }, + { + "epoch": 0.8486492898022121, + "grad_norm": 0.08379849046468735, + "learning_rate": 5.659269088549429e-05, + "loss": 2.5704, + "step": 28619 + }, + { + "epoch": 0.8486789431545236, + "grad_norm": 0.08269793540239334, + "learning_rate": 5.657094940733254e-05, + "loss": 2.5862, + "step": 28620 + }, + { + "epoch": 0.8487085965068351, + "grad_norm": 0.08048971742391586, + "learning_rate": 5.654921185581114e-05, + "loss": 2.5575, + "step": 28621 + }, + { + "epoch": 0.8487382498591466, + "grad_norm": 0.0875362679362297, + "learning_rate": 5.652747823112253e-05, + "loss": 2.5908, + "step": 28622 + }, + { + "epoch": 0.848767903211458, + "grad_norm": 0.08454333990812302, + "learning_rate": 5.65057485334593e-05, + "loss": 2.5686, + "step": 28623 + }, + { + "epoch": 0.8487975565637695, + "grad_norm": 0.08723333477973938, + "learning_rate": 5.648402276301362e-05, + "loss": 2.582, + "step": 28624 + }, + { + "epoch": 0.848827209916081, + "grad_norm": 0.08605814725160599, + "learning_rate": 5.646230091997823e-05, + "loss": 2.5532, + "step": 28625 + }, + { + "epoch": 0.8488568632683925, + "grad_norm": 0.08241980522871017, + "learning_rate": 5.644058300454524e-05, + "loss": 2.5864, + "step": 28626 + }, + { + "epoch": 0.848886516620704, + "grad_norm": 0.07958600670099258, + "learning_rate": 5.641886901690713e-05, + "loss": 2.5732, + "step": 28627 + }, + { + "epoch": 0.8489161699730154, + "grad_norm": 0.07877609878778458, + "learning_rate": 5.6397158957256e-05, + "loss": 2.5842, + "step": 28628 + }, + { + "epoch": 0.848945823325327, + "grad_norm": 0.08392981439828873, + "learning_rate": 5.6375452825784155e-05, + "loss": 2.562, + "step": 28629 + }, + { + "epoch": 0.8489754766776384, + "grad_norm": 0.09407202899456024, + "learning_rate": 5.6353750622683775e-05, + "loss": 2.5434, + "step": 28630 + }, + { + "epoch": 0.8490051300299499, + "grad_norm": 0.08101886510848999, + "learning_rate": 5.633205234814715e-05, + "loss": 2.5768, + "step": 28631 + }, + { + "epoch": 0.8490347833822613, + "grad_norm": 0.08850019425153732, + "learning_rate": 5.631035800236633e-05, + "loss": 2.631, + "step": 28632 + }, + { + "epoch": 0.8490644367345729, + "grad_norm": 0.08396551012992859, + "learning_rate": 5.628866758553347e-05, + "loss": 2.598, + "step": 28633 + }, + { + "epoch": 0.8490940900868843, + "grad_norm": 0.08436333388090134, + "learning_rate": 5.6266981097840616e-05, + "loss": 2.5631, + "step": 28634 + }, + { + "epoch": 0.8491237434391958, + "grad_norm": 0.08685274422168732, + "learning_rate": 5.624529853947979e-05, + "loss": 2.587, + "step": 28635 + }, + { + "epoch": 0.8491533967915073, + "grad_norm": 0.08253628760576248, + "learning_rate": 5.622361991064301e-05, + "loss": 2.5976, + "step": 28636 + }, + { + "epoch": 0.8491830501438188, + "grad_norm": 0.09085998684167862, + "learning_rate": 5.620194521152228e-05, + "loss": 2.5744, + "step": 28637 + }, + { + "epoch": 0.8492127034961302, + "grad_norm": 0.08696481585502625, + "learning_rate": 5.618027444230944e-05, + "loss": 2.5492, + "step": 28638 + }, + { + "epoch": 0.8492423568484417, + "grad_norm": 0.08642018586397171, + "learning_rate": 5.615860760319652e-05, + "loss": 2.591, + "step": 28639 + }, + { + "epoch": 0.8492720102007532, + "grad_norm": 0.08884737640619278, + "learning_rate": 5.6136944694375304e-05, + "loss": 2.5389, + "step": 28640 + }, + { + "epoch": 0.8493016635530647, + "grad_norm": 0.08901459723711014, + "learning_rate": 5.611528571603758e-05, + "loss": 2.5985, + "step": 28641 + }, + { + "epoch": 0.8493313169053761, + "grad_norm": 0.08804517239332199, + "learning_rate": 5.609363066837525e-05, + "loss": 2.5642, + "step": 28642 + }, + { + "epoch": 0.8493609702576876, + "grad_norm": 0.09312733262777328, + "learning_rate": 5.607197955158e-05, + "loss": 2.5616, + "step": 28643 + }, + { + "epoch": 0.8493906236099991, + "grad_norm": 0.09063936769962311, + "learning_rate": 5.605033236584356e-05, + "loss": 2.6037, + "step": 28644 + }, + { + "epoch": 0.8494202769623106, + "grad_norm": 0.08339286595582962, + "learning_rate": 5.602868911135761e-05, + "loss": 2.5896, + "step": 28645 + }, + { + "epoch": 0.849449930314622, + "grad_norm": 0.08360303938388824, + "learning_rate": 5.600704978831389e-05, + "loss": 2.5404, + "step": 28646 + }, + { + "epoch": 0.8494795836669335, + "grad_norm": 0.08487710356712341, + "learning_rate": 5.598541439690391e-05, + "loss": 2.5739, + "step": 28647 + }, + { + "epoch": 0.8495092370192451, + "grad_norm": 0.07855986803770065, + "learning_rate": 5.59637829373193e-05, + "loss": 2.5639, + "step": 28648 + }, + { + "epoch": 0.8495388903715565, + "grad_norm": 0.08795814961194992, + "learning_rate": 5.594215540975162e-05, + "loss": 2.5799, + "step": 28649 + }, + { + "epoch": 0.849568543723868, + "grad_norm": 0.08223319798707962, + "learning_rate": 5.592053181439233e-05, + "loss": 2.5642, + "step": 28650 + }, + { + "epoch": 0.8495981970761795, + "grad_norm": 0.08520124107599258, + "learning_rate": 5.5898912151433e-05, + "loss": 2.5945, + "step": 28651 + }, + { + "epoch": 0.849627850428491, + "grad_norm": 0.08468514680862427, + "learning_rate": 5.5877296421065035e-05, + "loss": 2.5938, + "step": 28652 + }, + { + "epoch": 0.8496575037808024, + "grad_norm": 0.08385688811540604, + "learning_rate": 5.585568462347984e-05, + "loss": 2.5656, + "step": 28653 + }, + { + "epoch": 0.8496871571331139, + "grad_norm": 0.08882343024015427, + "learning_rate": 5.5834076758868814e-05, + "loss": 2.5485, + "step": 28654 + }, + { + "epoch": 0.8497168104854254, + "grad_norm": 0.07954943180084229, + "learning_rate": 5.5812472827423245e-05, + "loss": 2.5702, + "step": 28655 + }, + { + "epoch": 0.8497464638377369, + "grad_norm": 0.08754828572273254, + "learning_rate": 5.579087282933448e-05, + "loss": 2.5625, + "step": 28656 + }, + { + "epoch": 0.8497761171900483, + "grad_norm": 0.08409146964550018, + "learning_rate": 5.576927676479376e-05, + "loss": 2.591, + "step": 28657 + }, + { + "epoch": 0.8498057705423598, + "grad_norm": 0.08833378553390503, + "learning_rate": 5.574768463399238e-05, + "loss": 2.5779, + "step": 28658 + }, + { + "epoch": 0.8498354238946713, + "grad_norm": 0.09200365841388702, + "learning_rate": 5.572609643712151e-05, + "loss": 2.546, + "step": 28659 + }, + { + "epoch": 0.8498650772469828, + "grad_norm": 0.08649001270532608, + "learning_rate": 5.570451217437228e-05, + "loss": 2.5431, + "step": 28660 + }, + { + "epoch": 0.8498947305992942, + "grad_norm": 0.08902905881404877, + "learning_rate": 5.568293184593598e-05, + "loss": 2.5771, + "step": 28661 + }, + { + "epoch": 0.8499243839516057, + "grad_norm": 0.09388787299394608, + "learning_rate": 5.5661355452003404e-05, + "loss": 2.541, + "step": 28662 + }, + { + "epoch": 0.8499540373039172, + "grad_norm": 0.08752796798944473, + "learning_rate": 5.563978299276584e-05, + "loss": 2.5965, + "step": 28663 + }, + { + "epoch": 0.8499836906562287, + "grad_norm": 0.08725787699222565, + "learning_rate": 5.561821446841431e-05, + "loss": 2.5588, + "step": 28664 + }, + { + "epoch": 0.8500133440085401, + "grad_norm": 0.08945973217487335, + "learning_rate": 5.559664987913976e-05, + "loss": 2.5773, + "step": 28665 + }, + { + "epoch": 0.8500429973608516, + "grad_norm": 0.08231855928897858, + "learning_rate": 5.557508922513316e-05, + "loss": 2.5313, + "step": 28666 + }, + { + "epoch": 0.8500726507131631, + "grad_norm": 0.08286318182945251, + "learning_rate": 5.555353250658546e-05, + "loss": 2.5467, + "step": 28667 + }, + { + "epoch": 0.8501023040654746, + "grad_norm": 0.0893978700041771, + "learning_rate": 5.553197972368745e-05, + "loss": 2.5699, + "step": 28668 + }, + { + "epoch": 0.8501319574177861, + "grad_norm": 0.08433156460523605, + "learning_rate": 5.55104308766301e-05, + "loss": 2.5731, + "step": 28669 + }, + { + "epoch": 0.8501616107700976, + "grad_norm": 0.08254086226224899, + "learning_rate": 5.54888859656042e-05, + "loss": 2.5774, + "step": 28670 + }, + { + "epoch": 0.8501912641224091, + "grad_norm": 0.08803033083677292, + "learning_rate": 5.5467344990800585e-05, + "loss": 2.5795, + "step": 28671 + }, + { + "epoch": 0.8502209174747205, + "grad_norm": 0.08998379856348038, + "learning_rate": 5.544580795240983e-05, + "loss": 2.5888, + "step": 28672 + }, + { + "epoch": 0.850250570827032, + "grad_norm": 0.07783462107181549, + "learning_rate": 5.542427485062273e-05, + "loss": 2.5542, + "step": 28673 + }, + { + "epoch": 0.8502802241793435, + "grad_norm": 0.09233902394771576, + "learning_rate": 5.540274568563003e-05, + "loss": 2.596, + "step": 28674 + }, + { + "epoch": 0.850309877531655, + "grad_norm": 0.08917342126369476, + "learning_rate": 5.538122045762217e-05, + "loss": 2.5777, + "step": 28675 + }, + { + "epoch": 0.8503395308839664, + "grad_norm": 0.08663655072450638, + "learning_rate": 5.5359699166790066e-05, + "loss": 2.5825, + "step": 28676 + }, + { + "epoch": 0.8503691842362779, + "grad_norm": 0.08552408218383789, + "learning_rate": 5.533818181332417e-05, + "loss": 2.6075, + "step": 28677 + }, + { + "epoch": 0.8503988375885894, + "grad_norm": 0.08511192351579666, + "learning_rate": 5.531666839741495e-05, + "loss": 2.5304, + "step": 28678 + }, + { + "epoch": 0.8504284909409009, + "grad_norm": 0.07926983386278152, + "learning_rate": 5.529515891925302e-05, + "loss": 2.5483, + "step": 28679 + }, + { + "epoch": 0.8504581442932123, + "grad_norm": 0.07911672443151474, + "learning_rate": 5.5273653379028735e-05, + "loss": 2.5442, + "step": 28680 + }, + { + "epoch": 0.8504877976455238, + "grad_norm": 0.0818641409277916, + "learning_rate": 5.52521517769326e-05, + "loss": 2.5524, + "step": 28681 + }, + { + "epoch": 0.8505174509978353, + "grad_norm": 0.08878818154335022, + "learning_rate": 5.5230654113155084e-05, + "loss": 2.5566, + "step": 28682 + }, + { + "epoch": 0.8505471043501468, + "grad_norm": 0.08582493662834167, + "learning_rate": 5.520916038788642e-05, + "loss": 2.5604, + "step": 28683 + }, + { + "epoch": 0.8505767577024582, + "grad_norm": 0.08312027901411057, + "learning_rate": 5.518767060131696e-05, + "loss": 2.5604, + "step": 28684 + }, + { + "epoch": 0.8506064110547698, + "grad_norm": 0.08032018691301346, + "learning_rate": 5.5166184753637e-05, + "loss": 2.5541, + "step": 28685 + }, + { + "epoch": 0.8506360644070812, + "grad_norm": 0.08489412814378738, + "learning_rate": 5.514470284503686e-05, + "loss": 2.5579, + "step": 28686 + }, + { + "epoch": 0.8506657177593927, + "grad_norm": 0.07927501946687698, + "learning_rate": 5.5123224875706754e-05, + "loss": 2.5716, + "step": 28687 + }, + { + "epoch": 0.8506953711117041, + "grad_norm": 0.07907027006149292, + "learning_rate": 5.510175084583674e-05, + "loss": 2.6027, + "step": 28688 + }, + { + "epoch": 0.8507250244640157, + "grad_norm": 0.08083520084619522, + "learning_rate": 5.508028075561716e-05, + "loss": 2.5492, + "step": 28689 + }, + { + "epoch": 0.8507546778163272, + "grad_norm": 0.07638025283813477, + "learning_rate": 5.5058814605238096e-05, + "loss": 2.5739, + "step": 28690 + }, + { + "epoch": 0.8507843311686386, + "grad_norm": 0.08342073112726212, + "learning_rate": 5.503735239488955e-05, + "loss": 2.5796, + "step": 28691 + }, + { + "epoch": 0.8508139845209501, + "grad_norm": 0.08691523224115372, + "learning_rate": 5.5015894124761766e-05, + "loss": 2.5435, + "step": 28692 + }, + { + "epoch": 0.8508436378732616, + "grad_norm": 0.07901419699192047, + "learning_rate": 5.4994439795044535e-05, + "loss": 2.5549, + "step": 28693 + }, + { + "epoch": 0.8508732912255731, + "grad_norm": 0.08829993009567261, + "learning_rate": 5.4972989405927875e-05, + "loss": 2.5646, + "step": 28694 + }, + { + "epoch": 0.8509029445778845, + "grad_norm": 0.08345896750688553, + "learning_rate": 5.4951542957601856e-05, + "loss": 2.5836, + "step": 28695 + }, + { + "epoch": 0.850932597930196, + "grad_norm": 0.07730066776275635, + "learning_rate": 5.493010045025626e-05, + "loss": 2.5709, + "step": 28696 + }, + { + "epoch": 0.8509622512825075, + "grad_norm": 0.07821304351091385, + "learning_rate": 5.4908661884081e-05, + "loss": 2.5536, + "step": 28697 + }, + { + "epoch": 0.850991904634819, + "grad_norm": 0.08106374740600586, + "learning_rate": 5.488722725926598e-05, + "loss": 2.5865, + "step": 28698 + }, + { + "epoch": 0.8510215579871304, + "grad_norm": 0.08115468919277191, + "learning_rate": 5.486579657600099e-05, + "loss": 2.5267, + "step": 28699 + }, + { + "epoch": 0.851051211339442, + "grad_norm": 0.08289378881454468, + "learning_rate": 5.484436983447572e-05, + "loss": 2.5465, + "step": 28700 + }, + { + "epoch": 0.8510808646917534, + "grad_norm": 0.08752638101577759, + "learning_rate": 5.482294703487989e-05, + "loss": 2.5791, + "step": 28701 + }, + { + "epoch": 0.8511105180440649, + "grad_norm": 0.08203780651092529, + "learning_rate": 5.480152817740336e-05, + "loss": 2.5627, + "step": 28702 + }, + { + "epoch": 0.8511401713963763, + "grad_norm": 0.0824495330452919, + "learning_rate": 5.478011326223587e-05, + "loss": 2.5659, + "step": 28703 + }, + { + "epoch": 0.8511698247486879, + "grad_norm": 0.08587431162595749, + "learning_rate": 5.475870228956675e-05, + "loss": 2.5463, + "step": 28704 + }, + { + "epoch": 0.8511994781009993, + "grad_norm": 0.08865515142679214, + "learning_rate": 5.473729525958571e-05, + "loss": 2.6033, + "step": 28705 + }, + { + "epoch": 0.8512291314533108, + "grad_norm": 0.0855066105723381, + "learning_rate": 5.4715892172482404e-05, + "loss": 2.6119, + "step": 28706 + }, + { + "epoch": 0.8512587848056222, + "grad_norm": 0.0801984965801239, + "learning_rate": 5.469449302844631e-05, + "loss": 2.573, + "step": 28707 + }, + { + "epoch": 0.8512884381579338, + "grad_norm": 0.08968233317136765, + "learning_rate": 5.467309782766688e-05, + "loss": 2.5617, + "step": 28708 + }, + { + "epoch": 0.8513180915102452, + "grad_norm": 0.08083920180797577, + "learning_rate": 5.4651706570333637e-05, + "loss": 2.5909, + "step": 28709 + }, + { + "epoch": 0.8513477448625567, + "grad_norm": 0.09312362223863602, + "learning_rate": 5.463031925663598e-05, + "loss": 2.5891, + "step": 28710 + }, + { + "epoch": 0.8513773982148682, + "grad_norm": 0.08433835953474045, + "learning_rate": 5.4608935886763245e-05, + "loss": 2.5785, + "step": 28711 + }, + { + "epoch": 0.8514070515671797, + "grad_norm": 0.08868836611509323, + "learning_rate": 5.4587556460904906e-05, + "loss": 2.5753, + "step": 28712 + }, + { + "epoch": 0.8514367049194912, + "grad_norm": 0.08804507553577423, + "learning_rate": 5.456618097925015e-05, + "loss": 2.6022, + "step": 28713 + }, + { + "epoch": 0.8514663582718026, + "grad_norm": 0.08324727416038513, + "learning_rate": 5.454480944198836e-05, + "loss": 2.5555, + "step": 28714 + }, + { + "epoch": 0.8514960116241141, + "grad_norm": 0.08710671216249466, + "learning_rate": 5.452344184930869e-05, + "loss": 2.5905, + "step": 28715 + }, + { + "epoch": 0.8515256649764256, + "grad_norm": 0.09240375459194183, + "learning_rate": 5.450207820140046e-05, + "loss": 2.5844, + "step": 28716 + }, + { + "epoch": 0.8515553183287371, + "grad_norm": 0.08386840671300888, + "learning_rate": 5.4480718498452764e-05, + "loss": 2.5536, + "step": 28717 + }, + { + "epoch": 0.8515849716810485, + "grad_norm": 0.08304595202207565, + "learning_rate": 5.445936274065477e-05, + "loss": 2.5991, + "step": 28718 + }, + { + "epoch": 0.85161462503336, + "grad_norm": 0.0821646898984909, + "learning_rate": 5.443801092819567e-05, + "loss": 2.5608, + "step": 28719 + }, + { + "epoch": 0.8516442783856715, + "grad_norm": 0.08096078783273697, + "learning_rate": 5.441666306126436e-05, + "loss": 2.5784, + "step": 28720 + }, + { + "epoch": 0.851673931737983, + "grad_norm": 0.08687271177768707, + "learning_rate": 5.439531914005002e-05, + "loss": 2.5661, + "step": 28721 + }, + { + "epoch": 0.8517035850902944, + "grad_norm": 0.08027022331953049, + "learning_rate": 5.437397916474168e-05, + "loss": 2.5991, + "step": 28722 + }, + { + "epoch": 0.851733238442606, + "grad_norm": 0.08182201534509659, + "learning_rate": 5.435264313552818e-05, + "loss": 2.5693, + "step": 28723 + }, + { + "epoch": 0.8517628917949174, + "grad_norm": 0.0826457217335701, + "learning_rate": 5.433131105259853e-05, + "loss": 2.5756, + "step": 28724 + }, + { + "epoch": 0.8517925451472289, + "grad_norm": 0.08400387316942215, + "learning_rate": 5.430998291614159e-05, + "loss": 2.5813, + "step": 28725 + }, + { + "epoch": 0.8518221984995403, + "grad_norm": 0.076298289000988, + "learning_rate": 5.428865872634631e-05, + "loss": 2.5975, + "step": 28726 + }, + { + "epoch": 0.8518518518518519, + "grad_norm": 0.0831654891371727, + "learning_rate": 5.426733848340143e-05, + "loss": 2.5874, + "step": 28727 + }, + { + "epoch": 0.8518815052041633, + "grad_norm": 0.08312840759754181, + "learning_rate": 5.42460221874958e-05, + "loss": 2.5537, + "step": 28728 + }, + { + "epoch": 0.8519111585564748, + "grad_norm": 0.0903448760509491, + "learning_rate": 5.422470983881811e-05, + "loss": 2.611, + "step": 28729 + }, + { + "epoch": 0.8519408119087862, + "grad_norm": 0.086589515209198, + "learning_rate": 5.420340143755714e-05, + "loss": 2.6161, + "step": 28730 + }, + { + "epoch": 0.8519704652610978, + "grad_norm": 0.08970481902360916, + "learning_rate": 5.418209698390164e-05, + "loss": 2.5597, + "step": 28731 + }, + { + "epoch": 0.8520001186134093, + "grad_norm": 0.08002675324678421, + "learning_rate": 5.416079647804012e-05, + "loss": 2.593, + "step": 28732 + }, + { + "epoch": 0.8520297719657207, + "grad_norm": 0.09635275602340698, + "learning_rate": 5.4139499920161316e-05, + "loss": 2.5864, + "step": 28733 + }, + { + "epoch": 0.8520594253180322, + "grad_norm": 0.08613971620798111, + "learning_rate": 5.411820731045375e-05, + "loss": 2.5808, + "step": 28734 + }, + { + "epoch": 0.8520890786703437, + "grad_norm": 0.09797075390815735, + "learning_rate": 5.409691864910599e-05, + "loss": 2.5945, + "step": 28735 + }, + { + "epoch": 0.8521187320226552, + "grad_norm": 0.09607785940170288, + "learning_rate": 5.4075633936306545e-05, + "loss": 2.5745, + "step": 28736 + }, + { + "epoch": 0.8521483853749666, + "grad_norm": 0.08785609155893326, + "learning_rate": 5.4054353172243994e-05, + "loss": 2.5482, + "step": 28737 + }, + { + "epoch": 0.8521780387272782, + "grad_norm": 0.08512742817401886, + "learning_rate": 5.4033076357106635e-05, + "loss": 2.5722, + "step": 28738 + }, + { + "epoch": 0.8522076920795896, + "grad_norm": 0.09445376694202423, + "learning_rate": 5.401180349108292e-05, + "loss": 2.5675, + "step": 28739 + }, + { + "epoch": 0.8522373454319011, + "grad_norm": 0.0883214920759201, + "learning_rate": 5.399053457436115e-05, + "loss": 2.5567, + "step": 28740 + }, + { + "epoch": 0.8522669987842125, + "grad_norm": 0.09611449390649796, + "learning_rate": 5.396926960712983e-05, + "loss": 2.5635, + "step": 28741 + }, + { + "epoch": 0.8522966521365241, + "grad_norm": 0.089555524289608, + "learning_rate": 5.3948008589577155e-05, + "loss": 2.5742, + "step": 28742 + }, + { + "epoch": 0.8523263054888355, + "grad_norm": 0.09261420369148254, + "learning_rate": 5.392675152189147e-05, + "loss": 2.5734, + "step": 28743 + }, + { + "epoch": 0.852355958841147, + "grad_norm": 0.08556360006332397, + "learning_rate": 5.390549840426101e-05, + "loss": 2.6004, + "step": 28744 + }, + { + "epoch": 0.8523856121934584, + "grad_norm": 0.09579216688871384, + "learning_rate": 5.388424923687385e-05, + "loss": 2.583, + "step": 28745 + }, + { + "epoch": 0.85241526554577, + "grad_norm": 0.08305388689041138, + "learning_rate": 5.3863004019918285e-05, + "loss": 2.5444, + "step": 28746 + }, + { + "epoch": 0.8524449188980814, + "grad_norm": 0.0856754332780838, + "learning_rate": 5.384176275358249e-05, + "loss": 2.5705, + "step": 28747 + }, + { + "epoch": 0.8524745722503929, + "grad_norm": 0.08268759399652481, + "learning_rate": 5.382052543805438e-05, + "loss": 2.5639, + "step": 28748 + }, + { + "epoch": 0.8525042256027043, + "grad_norm": 0.08075380325317383, + "learning_rate": 5.379929207352208e-05, + "loss": 2.5801, + "step": 28749 + }, + { + "epoch": 0.8525338789550159, + "grad_norm": 0.08293585479259491, + "learning_rate": 5.377806266017365e-05, + "loss": 2.5462, + "step": 28750 + }, + { + "epoch": 0.8525635323073274, + "grad_norm": 0.08174509555101395, + "learning_rate": 5.375683719819707e-05, + "loss": 2.5877, + "step": 28751 + }, + { + "epoch": 0.8525931856596388, + "grad_norm": 0.07978708297014236, + "learning_rate": 5.373561568778029e-05, + "loss": 2.5759, + "step": 28752 + }, + { + "epoch": 0.8526228390119504, + "grad_norm": 0.08067198097705841, + "learning_rate": 5.371439812911111e-05, + "loss": 2.5711, + "step": 28753 + }, + { + "epoch": 0.8526524923642618, + "grad_norm": 0.08044836670160294, + "learning_rate": 5.3693184522377645e-05, + "loss": 2.572, + "step": 28754 + }, + { + "epoch": 0.8526821457165733, + "grad_norm": 0.08408015221357346, + "learning_rate": 5.367197486776765e-05, + "loss": 2.5774, + "step": 28755 + }, + { + "epoch": 0.8527117990688847, + "grad_norm": 0.07976371049880981, + "learning_rate": 5.365076916546896e-05, + "loss": 2.5851, + "step": 28756 + }, + { + "epoch": 0.8527414524211963, + "grad_norm": 0.08795630931854248, + "learning_rate": 5.3629567415669264e-05, + "loss": 2.6259, + "step": 28757 + }, + { + "epoch": 0.8527711057735077, + "grad_norm": 0.08630665391683578, + "learning_rate": 5.360836961855653e-05, + "loss": 2.5923, + "step": 28758 + }, + { + "epoch": 0.8528007591258192, + "grad_norm": 0.08770827949047089, + "learning_rate": 5.3587175774318156e-05, + "loss": 2.5457, + "step": 28759 + }, + { + "epoch": 0.8528304124781306, + "grad_norm": 0.07607435435056686, + "learning_rate": 5.356598588314199e-05, + "loss": 2.5831, + "step": 28760 + }, + { + "epoch": 0.8528600658304422, + "grad_norm": 0.08413107693195343, + "learning_rate": 5.354479994521566e-05, + "loss": 2.6029, + "step": 28761 + }, + { + "epoch": 0.8528897191827536, + "grad_norm": 0.08626420050859451, + "learning_rate": 5.352361796072675e-05, + "loss": 2.5781, + "step": 28762 + }, + { + "epoch": 0.8529193725350651, + "grad_norm": 0.07863906770944595, + "learning_rate": 5.350243992986281e-05, + "loss": 2.5843, + "step": 28763 + }, + { + "epoch": 0.8529490258873765, + "grad_norm": 0.08048971742391586, + "learning_rate": 5.3481265852811436e-05, + "loss": 2.5556, + "step": 28764 + }, + { + "epoch": 0.8529786792396881, + "grad_norm": 0.08619604259729385, + "learning_rate": 5.346009572976007e-05, + "loss": 2.5391, + "step": 28765 + }, + { + "epoch": 0.8530083325919995, + "grad_norm": 0.08105091005563736, + "learning_rate": 5.343892956089613e-05, + "loss": 2.5688, + "step": 28766 + }, + { + "epoch": 0.853037985944311, + "grad_norm": 0.0822424665093422, + "learning_rate": 5.341776734640719e-05, + "loss": 2.5751, + "step": 28767 + }, + { + "epoch": 0.8530676392966224, + "grad_norm": 0.08141619712114334, + "learning_rate": 5.3396609086480696e-05, + "loss": 2.576, + "step": 28768 + }, + { + "epoch": 0.853097292648934, + "grad_norm": 0.08580856025218964, + "learning_rate": 5.337545478130379e-05, + "loss": 2.5653, + "step": 28769 + }, + { + "epoch": 0.8531269460012454, + "grad_norm": 0.07913876324892044, + "learning_rate": 5.3354304431063926e-05, + "loss": 2.551, + "step": 28770 + }, + { + "epoch": 0.8531565993535569, + "grad_norm": 0.08047662675380707, + "learning_rate": 5.333315803594829e-05, + "loss": 2.5395, + "step": 28771 + }, + { + "epoch": 0.8531862527058685, + "grad_norm": 0.09538577497005463, + "learning_rate": 5.3312015596144236e-05, + "loss": 2.5724, + "step": 28772 + }, + { + "epoch": 0.8532159060581799, + "grad_norm": 0.08293653279542923, + "learning_rate": 5.3290877111839e-05, + "loss": 2.5909, + "step": 28773 + }, + { + "epoch": 0.8532455594104914, + "grad_norm": 0.08517884463071823, + "learning_rate": 5.32697425832197e-05, + "loss": 2.5839, + "step": 28774 + }, + { + "epoch": 0.8532752127628028, + "grad_norm": 0.0839477926492691, + "learning_rate": 5.3248612010473484e-05, + "loss": 2.5455, + "step": 28775 + }, + { + "epoch": 0.8533048661151144, + "grad_norm": 0.07993888109922409, + "learning_rate": 5.3227485393787525e-05, + "loss": 2.549, + "step": 28776 + }, + { + "epoch": 0.8533345194674258, + "grad_norm": 0.0827312022447586, + "learning_rate": 5.3206362733348836e-05, + "loss": 2.5647, + "step": 28777 + }, + { + "epoch": 0.8533641728197373, + "grad_norm": 0.07995428144931793, + "learning_rate": 5.3185244029344495e-05, + "loss": 2.5789, + "step": 28778 + }, + { + "epoch": 0.8533938261720487, + "grad_norm": 0.07838278263807297, + "learning_rate": 5.316412928196157e-05, + "loss": 2.5392, + "step": 28779 + }, + { + "epoch": 0.8534234795243603, + "grad_norm": 0.08726043999195099, + "learning_rate": 5.314301849138692e-05, + "loss": 2.5774, + "step": 28780 + }, + { + "epoch": 0.8534531328766717, + "grad_norm": 0.08340147137641907, + "learning_rate": 5.3121911657807556e-05, + "loss": 2.5929, + "step": 28781 + }, + { + "epoch": 0.8534827862289832, + "grad_norm": 0.08224878460168839, + "learning_rate": 5.3100808781410384e-05, + "loss": 2.6268, + "step": 28782 + }, + { + "epoch": 0.8535124395812946, + "grad_norm": 0.08202952891588211, + "learning_rate": 5.307970986238225e-05, + "loss": 2.5468, + "step": 28783 + }, + { + "epoch": 0.8535420929336062, + "grad_norm": 0.09054483473300934, + "learning_rate": 5.305861490091002e-05, + "loss": 2.5952, + "step": 28784 + }, + { + "epoch": 0.8535717462859176, + "grad_norm": 0.08298997581005096, + "learning_rate": 5.303752389718042e-05, + "loss": 2.542, + "step": 28785 + }, + { + "epoch": 0.8536013996382291, + "grad_norm": 0.09055998176336288, + "learning_rate": 5.30164368513803e-05, + "loss": 2.579, + "step": 28786 + }, + { + "epoch": 0.8536310529905405, + "grad_norm": 0.08344754576683044, + "learning_rate": 5.299535376369635e-05, + "loss": 2.6239, + "step": 28787 + }, + { + "epoch": 0.8536607063428521, + "grad_norm": 0.082776740193367, + "learning_rate": 5.2974274634315314e-05, + "loss": 2.5487, + "step": 28788 + }, + { + "epoch": 0.8536903596951635, + "grad_norm": 0.08178598433732986, + "learning_rate": 5.295319946342375e-05, + "loss": 2.5874, + "step": 28789 + }, + { + "epoch": 0.853720013047475, + "grad_norm": 0.0831090584397316, + "learning_rate": 5.293212825120835e-05, + "loss": 2.5722, + "step": 28790 + }, + { + "epoch": 0.8537496663997864, + "grad_norm": 0.08434179425239563, + "learning_rate": 5.29110609978557e-05, + "loss": 2.5468, + "step": 28791 + }, + { + "epoch": 0.853779319752098, + "grad_norm": 0.0816257894039154, + "learning_rate": 5.288999770355235e-05, + "loss": 2.5501, + "step": 28792 + }, + { + "epoch": 0.8538089731044095, + "grad_norm": 0.07926537841558456, + "learning_rate": 5.286893836848483e-05, + "loss": 2.584, + "step": 28793 + }, + { + "epoch": 0.8538386264567209, + "grad_norm": 0.08278656750917435, + "learning_rate": 5.284788299283955e-05, + "loss": 2.5508, + "step": 28794 + }, + { + "epoch": 0.8538682798090325, + "grad_norm": 0.08661405742168427, + "learning_rate": 5.282683157680307e-05, + "loss": 2.5526, + "step": 28795 + }, + { + "epoch": 0.8538979331613439, + "grad_norm": 0.08113158494234085, + "learning_rate": 5.280578412056175e-05, + "loss": 2.5622, + "step": 28796 + }, + { + "epoch": 0.8539275865136554, + "grad_norm": 0.08183121681213379, + "learning_rate": 5.2784740624301994e-05, + "loss": 2.5683, + "step": 28797 + }, + { + "epoch": 0.8539572398659668, + "grad_norm": 0.08893956989049911, + "learning_rate": 5.2763701088210045e-05, + "loss": 2.5363, + "step": 28798 + }, + { + "epoch": 0.8539868932182784, + "grad_norm": 0.08109374344348907, + "learning_rate": 5.2742665512472365e-05, + "loss": 2.5948, + "step": 28799 + }, + { + "epoch": 0.8540165465705898, + "grad_norm": 0.08260896801948547, + "learning_rate": 5.272163389727514e-05, + "loss": 2.5608, + "step": 28800 + }, + { + "epoch": 0.8540461999229013, + "grad_norm": 0.09363576024770737, + "learning_rate": 5.2700606242804594e-05, + "loss": 2.5966, + "step": 28801 + }, + { + "epoch": 0.8540758532752127, + "grad_norm": 0.08770844340324402, + "learning_rate": 5.267958254924698e-05, + "loss": 2.5821, + "step": 28802 + }, + { + "epoch": 0.8541055066275243, + "grad_norm": 0.07594583183526993, + "learning_rate": 5.2658562816788545e-05, + "loss": 2.5797, + "step": 28803 + }, + { + "epoch": 0.8541351599798357, + "grad_norm": 0.07806374877691269, + "learning_rate": 5.2637547045615185e-05, + "loss": 2.5217, + "step": 28804 + }, + { + "epoch": 0.8541648133321472, + "grad_norm": 0.08418826013803482, + "learning_rate": 5.2616535235913085e-05, + "loss": 2.5309, + "step": 28805 + }, + { + "epoch": 0.8541944666844586, + "grad_norm": 0.08477627485990524, + "learning_rate": 5.259552738786844e-05, + "loss": 2.5687, + "step": 28806 + }, + { + "epoch": 0.8542241200367702, + "grad_norm": 0.08245600014925003, + "learning_rate": 5.2574523501667134e-05, + "loss": 2.5916, + "step": 28807 + }, + { + "epoch": 0.8542537733890816, + "grad_norm": 0.08432476222515106, + "learning_rate": 5.255352357749532e-05, + "loss": 2.5734, + "step": 28808 + }, + { + "epoch": 0.8542834267413931, + "grad_norm": 0.09445507824420929, + "learning_rate": 5.253252761553878e-05, + "loss": 2.577, + "step": 28809 + }, + { + "epoch": 0.8543130800937045, + "grad_norm": 0.0877159908413887, + "learning_rate": 5.25115356159836e-05, + "loss": 2.5781, + "step": 28810 + }, + { + "epoch": 0.8543427334460161, + "grad_norm": 0.08169525116682053, + "learning_rate": 5.2490547579015504e-05, + "loss": 2.5812, + "step": 28811 + }, + { + "epoch": 0.8543723867983275, + "grad_norm": 0.08217592537403107, + "learning_rate": 5.246956350482046e-05, + "loss": 2.5738, + "step": 28812 + }, + { + "epoch": 0.854402040150639, + "grad_norm": 0.07904531806707382, + "learning_rate": 5.2448583393584326e-05, + "loss": 2.5527, + "step": 28813 + }, + { + "epoch": 0.8544316935029506, + "grad_norm": 0.08122509717941284, + "learning_rate": 5.2427607245492725e-05, + "loss": 2.562, + "step": 28814 + }, + { + "epoch": 0.854461346855262, + "grad_norm": 0.08169200271368027, + "learning_rate": 5.240663506073151e-05, + "loss": 2.5957, + "step": 28815 + }, + { + "epoch": 0.8544910002075735, + "grad_norm": 0.09083998948335648, + "learning_rate": 5.238566683948631e-05, + "loss": 2.5897, + "step": 28816 + }, + { + "epoch": 0.8545206535598849, + "grad_norm": 0.08664865046739578, + "learning_rate": 5.236470258194292e-05, + "loss": 2.5465, + "step": 28817 + }, + { + "epoch": 0.8545503069121965, + "grad_norm": 0.09292091429233551, + "learning_rate": 5.234374228828681e-05, + "loss": 2.6024, + "step": 28818 + }, + { + "epoch": 0.8545799602645079, + "grad_norm": 0.08536392450332642, + "learning_rate": 5.2322785958703764e-05, + "loss": 2.564, + "step": 28819 + }, + { + "epoch": 0.8546096136168194, + "grad_norm": 0.08518902212381363, + "learning_rate": 5.2301833593379314e-05, + "loss": 2.5761, + "step": 28820 + }, + { + "epoch": 0.8546392669691308, + "grad_norm": 0.0971694141626358, + "learning_rate": 5.228088519249902e-05, + "loss": 2.5879, + "step": 28821 + }, + { + "epoch": 0.8546689203214424, + "grad_norm": 0.08917824923992157, + "learning_rate": 5.2259940756248295e-05, + "loss": 2.5655, + "step": 28822 + }, + { + "epoch": 0.8546985736737538, + "grad_norm": 0.08334779739379883, + "learning_rate": 5.223900028481271e-05, + "loss": 2.575, + "step": 28823 + }, + { + "epoch": 0.8547282270260653, + "grad_norm": 0.09399448335170746, + "learning_rate": 5.2218063778377565e-05, + "loss": 2.5883, + "step": 28824 + }, + { + "epoch": 0.8547578803783767, + "grad_norm": 0.08706840127706528, + "learning_rate": 5.219713123712838e-05, + "loss": 2.5492, + "step": 28825 + }, + { + "epoch": 0.8547875337306883, + "grad_norm": 0.08045288175344467, + "learning_rate": 5.2176202661250394e-05, + "loss": 2.5524, + "step": 28826 + }, + { + "epoch": 0.8548171870829997, + "grad_norm": 0.08203177154064178, + "learning_rate": 5.215527805092901e-05, + "loss": 2.5581, + "step": 28827 + }, + { + "epoch": 0.8548468404353112, + "grad_norm": 0.07692652195692062, + "learning_rate": 5.213435740634953e-05, + "loss": 2.5484, + "step": 28828 + }, + { + "epoch": 0.8548764937876226, + "grad_norm": 0.08356036990880966, + "learning_rate": 5.211344072769719e-05, + "loss": 2.575, + "step": 28829 + }, + { + "epoch": 0.8549061471399342, + "grad_norm": 0.09027525782585144, + "learning_rate": 5.209252801515718e-05, + "loss": 2.5713, + "step": 28830 + }, + { + "epoch": 0.8549358004922456, + "grad_norm": 0.0807683914899826, + "learning_rate": 5.207161926891468e-05, + "loss": 2.6053, + "step": 28831 + }, + { + "epoch": 0.8549654538445571, + "grad_norm": 0.08427499234676361, + "learning_rate": 5.205071448915494e-05, + "loss": 2.5689, + "step": 28832 + }, + { + "epoch": 0.8549951071968686, + "grad_norm": 0.07915767282247543, + "learning_rate": 5.202981367606302e-05, + "loss": 2.5472, + "step": 28833 + }, + { + "epoch": 0.8550247605491801, + "grad_norm": 0.08250701427459717, + "learning_rate": 5.200891682982406e-05, + "loss": 2.6213, + "step": 28834 + }, + { + "epoch": 0.8550544139014916, + "grad_norm": 0.08317200839519501, + "learning_rate": 5.198802395062296e-05, + "loss": 2.5694, + "step": 28835 + }, + { + "epoch": 0.855084067253803, + "grad_norm": 0.07905328273773193, + "learning_rate": 5.196713503864481e-05, + "loss": 2.5927, + "step": 28836 + }, + { + "epoch": 0.8551137206061146, + "grad_norm": 0.08008110523223877, + "learning_rate": 5.194625009407461e-05, + "loss": 2.5585, + "step": 28837 + }, + { + "epoch": 0.855143373958426, + "grad_norm": 0.079912930727005, + "learning_rate": 5.192536911709722e-05, + "loss": 2.5981, + "step": 28838 + }, + { + "epoch": 0.8551730273107375, + "grad_norm": 0.0823625922203064, + "learning_rate": 5.190449210789766e-05, + "loss": 2.5564, + "step": 28839 + }, + { + "epoch": 0.8552026806630489, + "grad_norm": 0.09055017679929733, + "learning_rate": 5.188361906666067e-05, + "loss": 2.5426, + "step": 28840 + }, + { + "epoch": 0.8552323340153605, + "grad_norm": 0.0878581777215004, + "learning_rate": 5.1862749993571214e-05, + "loss": 2.5829, + "step": 28841 + }, + { + "epoch": 0.8552619873676719, + "grad_norm": 0.07634378969669342, + "learning_rate": 5.184188488881397e-05, + "loss": 2.5392, + "step": 28842 + }, + { + "epoch": 0.8552916407199834, + "grad_norm": 0.08246046304702759, + "learning_rate": 5.182102375257369e-05, + "loss": 2.6039, + "step": 28843 + }, + { + "epoch": 0.8553212940722948, + "grad_norm": 0.08619806170463562, + "learning_rate": 5.180016658503539e-05, + "loss": 2.5672, + "step": 28844 + }, + { + "epoch": 0.8553509474246064, + "grad_norm": 0.08436309546232224, + "learning_rate": 5.177931338638342e-05, + "loss": 2.5639, + "step": 28845 + }, + { + "epoch": 0.8553806007769178, + "grad_norm": 0.08303435891866684, + "learning_rate": 5.1758464156802574e-05, + "loss": 2.5727, + "step": 28846 + }, + { + "epoch": 0.8554102541292293, + "grad_norm": 0.0907733216881752, + "learning_rate": 5.173761889647749e-05, + "loss": 2.6036, + "step": 28847 + }, + { + "epoch": 0.8554399074815408, + "grad_norm": 0.08206388354301453, + "learning_rate": 5.171677760559268e-05, + "loss": 2.5672, + "step": 28848 + }, + { + "epoch": 0.8554695608338523, + "grad_norm": 0.08045859634876251, + "learning_rate": 5.169594028433283e-05, + "loss": 2.5622, + "step": 28849 + }, + { + "epoch": 0.8554992141861637, + "grad_norm": 0.08101530373096466, + "learning_rate": 5.16751069328823e-05, + "loss": 2.5926, + "step": 28850 + }, + { + "epoch": 0.8555288675384752, + "grad_norm": 0.08368232846260071, + "learning_rate": 5.165427755142571e-05, + "loss": 2.5667, + "step": 28851 + }, + { + "epoch": 0.8555585208907867, + "grad_norm": 0.08038479834794998, + "learning_rate": 5.163345214014742e-05, + "loss": 2.5281, + "step": 28852 + }, + { + "epoch": 0.8555881742430982, + "grad_norm": 0.07990943640470505, + "learning_rate": 5.161263069923189e-05, + "loss": 2.579, + "step": 28853 + }, + { + "epoch": 0.8556178275954096, + "grad_norm": 0.08206896483898163, + "learning_rate": 5.159181322886347e-05, + "loss": 2.5372, + "step": 28854 + }, + { + "epoch": 0.8556474809477211, + "grad_norm": 0.0787087008357048, + "learning_rate": 5.157099972922652e-05, + "loss": 2.5507, + "step": 28855 + }, + { + "epoch": 0.8556771343000327, + "grad_norm": 0.07971933484077454, + "learning_rate": 5.155019020050533e-05, + "loss": 2.5658, + "step": 28856 + }, + { + "epoch": 0.8557067876523441, + "grad_norm": 0.08283846080303192, + "learning_rate": 5.152938464288415e-05, + "loss": 2.5652, + "step": 28857 + }, + { + "epoch": 0.8557364410046556, + "grad_norm": 0.07991745322942734, + "learning_rate": 5.1508583056547266e-05, + "loss": 2.5369, + "step": 28858 + }, + { + "epoch": 0.855766094356967, + "grad_norm": 0.08416714519262314, + "learning_rate": 5.148778544167887e-05, + "loss": 2.5996, + "step": 28859 + }, + { + "epoch": 0.8557957477092786, + "grad_norm": 0.08663016557693481, + "learning_rate": 5.1466991798463145e-05, + "loss": 2.6016, + "step": 28860 + }, + { + "epoch": 0.85582540106159, + "grad_norm": 0.08543943613767624, + "learning_rate": 5.1446202127084176e-05, + "loss": 2.5753, + "step": 28861 + }, + { + "epoch": 0.8558550544139015, + "grad_norm": 0.08224932849407196, + "learning_rate": 5.142541642772608e-05, + "loss": 2.5514, + "step": 28862 + }, + { + "epoch": 0.855884707766213, + "grad_norm": 0.08131356537342072, + "learning_rate": 5.140463470057294e-05, + "loss": 2.5993, + "step": 28863 + }, + { + "epoch": 0.8559143611185245, + "grad_norm": 0.0867394208908081, + "learning_rate": 5.138385694580871e-05, + "loss": 2.5264, + "step": 28864 + }, + { + "epoch": 0.8559440144708359, + "grad_norm": 0.08113743364810944, + "learning_rate": 5.136308316361749e-05, + "loss": 2.5784, + "step": 28865 + }, + { + "epoch": 0.8559736678231474, + "grad_norm": 0.08595634251832962, + "learning_rate": 5.1342313354183154e-05, + "loss": 2.569, + "step": 28866 + }, + { + "epoch": 0.8560033211754589, + "grad_norm": 0.08837660402059555, + "learning_rate": 5.132154751768964e-05, + "loss": 2.5406, + "step": 28867 + }, + { + "epoch": 0.8560329745277704, + "grad_norm": 0.08080416172742844, + "learning_rate": 5.130078565432089e-05, + "loss": 2.5466, + "step": 28868 + }, + { + "epoch": 0.8560626278800818, + "grad_norm": 0.08424830436706543, + "learning_rate": 5.128002776426055e-05, + "loss": 2.5666, + "step": 28869 + }, + { + "epoch": 0.8560922812323933, + "grad_norm": 0.09275657683610916, + "learning_rate": 5.125927384769269e-05, + "loss": 2.5575, + "step": 28870 + }, + { + "epoch": 0.8561219345847048, + "grad_norm": 0.08665022253990173, + "learning_rate": 5.123852390480099e-05, + "loss": 2.5712, + "step": 28871 + }, + { + "epoch": 0.8561515879370163, + "grad_norm": 0.08958519995212555, + "learning_rate": 5.121777793576915e-05, + "loss": 2.5732, + "step": 28872 + }, + { + "epoch": 0.8561812412893277, + "grad_norm": 0.08045249432325363, + "learning_rate": 5.1197035940780955e-05, + "loss": 2.5659, + "step": 28873 + }, + { + "epoch": 0.8562108946416392, + "grad_norm": 0.09042397141456604, + "learning_rate": 5.117629792002004e-05, + "loss": 2.5818, + "step": 28874 + }, + { + "epoch": 0.8562405479939507, + "grad_norm": 0.08530434966087341, + "learning_rate": 5.115556387367004e-05, + "loss": 2.5278, + "step": 28875 + }, + { + "epoch": 0.8562702013462622, + "grad_norm": 0.09066241979598999, + "learning_rate": 5.113483380191458e-05, + "loss": 2.5534, + "step": 28876 + }, + { + "epoch": 0.8562998546985737, + "grad_norm": 0.0827043205499649, + "learning_rate": 5.1114107704937185e-05, + "loss": 2.5688, + "step": 28877 + }, + { + "epoch": 0.8563295080508851, + "grad_norm": 0.08113712072372437, + "learning_rate": 5.109338558292143e-05, + "loss": 2.5857, + "step": 28878 + }, + { + "epoch": 0.8563591614031967, + "grad_norm": 0.09223775565624237, + "learning_rate": 5.107266743605088e-05, + "loss": 2.5946, + "step": 28879 + }, + { + "epoch": 0.8563888147555081, + "grad_norm": 0.08183744549751282, + "learning_rate": 5.105195326450884e-05, + "loss": 2.5799, + "step": 28880 + }, + { + "epoch": 0.8564184681078196, + "grad_norm": 0.08938553929328918, + "learning_rate": 5.103124306847884e-05, + "loss": 2.5857, + "step": 28881 + }, + { + "epoch": 0.856448121460131, + "grad_norm": 0.08341721445322037, + "learning_rate": 5.1010536848144106e-05, + "loss": 2.5681, + "step": 28882 + }, + { + "epoch": 0.8564777748124426, + "grad_norm": 0.08220826089382172, + "learning_rate": 5.0989834603688226e-05, + "loss": 2.5341, + "step": 28883 + }, + { + "epoch": 0.856507428164754, + "grad_norm": 0.08207782357931137, + "learning_rate": 5.096913633529449e-05, + "loss": 2.5566, + "step": 28884 + }, + { + "epoch": 0.8565370815170655, + "grad_norm": 0.09176749736070633, + "learning_rate": 5.094844204314608e-05, + "loss": 2.5672, + "step": 28885 + }, + { + "epoch": 0.856566734869377, + "grad_norm": 0.07959579676389694, + "learning_rate": 5.092775172742631e-05, + "loss": 2.6294, + "step": 28886 + }, + { + "epoch": 0.8565963882216885, + "grad_norm": 0.08850044012069702, + "learning_rate": 5.0907065388318355e-05, + "loss": 2.5233, + "step": 28887 + }, + { + "epoch": 0.8566260415739999, + "grad_norm": 0.08447535336017609, + "learning_rate": 5.088638302600546e-05, + "loss": 2.5518, + "step": 28888 + }, + { + "epoch": 0.8566556949263114, + "grad_norm": 0.0809544175863266, + "learning_rate": 5.0865704640670806e-05, + "loss": 2.5713, + "step": 28889 + }, + { + "epoch": 0.8566853482786229, + "grad_norm": 0.09019840508699417, + "learning_rate": 5.084503023249737e-05, + "loss": 2.5975, + "step": 28890 + }, + { + "epoch": 0.8567150016309344, + "grad_norm": 0.07751113176345825, + "learning_rate": 5.0824359801668216e-05, + "loss": 2.5662, + "step": 28891 + }, + { + "epoch": 0.8567446549832458, + "grad_norm": 0.08726257085800171, + "learning_rate": 5.080369334836649e-05, + "loss": 2.6026, + "step": 28892 + }, + { + "epoch": 0.8567743083355573, + "grad_norm": 0.08595280349254608, + "learning_rate": 5.0783030872775194e-05, + "loss": 2.5898, + "step": 28893 + }, + { + "epoch": 0.8568039616878688, + "grad_norm": 0.0789426863193512, + "learning_rate": 5.0762372375077245e-05, + "loss": 2.6097, + "step": 28894 + }, + { + "epoch": 0.8568336150401803, + "grad_norm": 0.08282757550477982, + "learning_rate": 5.0741717855455506e-05, + "loss": 2.5782, + "step": 28895 + }, + { + "epoch": 0.8568632683924917, + "grad_norm": 0.08987351506948471, + "learning_rate": 5.072106731409304e-05, + "loss": 2.5739, + "step": 28896 + }, + { + "epoch": 0.8568929217448032, + "grad_norm": 0.08247888833284378, + "learning_rate": 5.0700420751172705e-05, + "loss": 2.5499, + "step": 28897 + }, + { + "epoch": 0.8569225750971148, + "grad_norm": 0.08542229235172272, + "learning_rate": 5.067977816687719e-05, + "loss": 2.5838, + "step": 28898 + }, + { + "epoch": 0.8569522284494262, + "grad_norm": 0.09224853664636612, + "learning_rate": 5.06591395613894e-05, + "loss": 2.559, + "step": 28899 + }, + { + "epoch": 0.8569818818017377, + "grad_norm": 0.07911040633916855, + "learning_rate": 5.0638504934892135e-05, + "loss": 2.5936, + "step": 28900 + }, + { + "epoch": 0.8570115351540492, + "grad_norm": 0.09261665493249893, + "learning_rate": 5.0617874287567974e-05, + "loss": 2.5936, + "step": 28901 + }, + { + "epoch": 0.8570411885063607, + "grad_norm": 0.08734818547964096, + "learning_rate": 5.059724761959966e-05, + "loss": 2.5591, + "step": 28902 + }, + { + "epoch": 0.8570708418586721, + "grad_norm": 0.08173923939466476, + "learning_rate": 5.057662493116988e-05, + "loss": 2.6233, + "step": 28903 + }, + { + "epoch": 0.8571004952109836, + "grad_norm": 0.09281028062105179, + "learning_rate": 5.05560062224612e-05, + "loss": 2.5619, + "step": 28904 + }, + { + "epoch": 0.8571301485632951, + "grad_norm": 0.09062286466360092, + "learning_rate": 5.0535391493656215e-05, + "loss": 2.5819, + "step": 28905 + }, + { + "epoch": 0.8571598019156066, + "grad_norm": 0.08588605374097824, + "learning_rate": 5.051478074493748e-05, + "loss": 2.5852, + "step": 28906 + }, + { + "epoch": 0.857189455267918, + "grad_norm": 0.08704478293657303, + "learning_rate": 5.049417397648759e-05, + "loss": 2.5796, + "step": 28907 + }, + { + "epoch": 0.8572191086202295, + "grad_norm": 0.08236578106880188, + "learning_rate": 5.0473571188488776e-05, + "loss": 2.5711, + "step": 28908 + }, + { + "epoch": 0.857248761972541, + "grad_norm": 0.08183625340461731, + "learning_rate": 5.0452972381123785e-05, + "loss": 2.5859, + "step": 28909 + }, + { + "epoch": 0.8572784153248525, + "grad_norm": 0.08144095540046692, + "learning_rate": 5.0432377554574973e-05, + "loss": 2.5895, + "step": 28910 + }, + { + "epoch": 0.8573080686771639, + "grad_norm": 0.08185598254203796, + "learning_rate": 5.041178670902452e-05, + "loss": 2.5649, + "step": 28911 + }, + { + "epoch": 0.8573377220294754, + "grad_norm": 0.08636448532342911, + "learning_rate": 5.039119984465484e-05, + "loss": 2.5739, + "step": 28912 + }, + { + "epoch": 0.8573673753817869, + "grad_norm": 0.07684765756130219, + "learning_rate": 5.0370616961648295e-05, + "loss": 2.5657, + "step": 28913 + }, + { + "epoch": 0.8573970287340984, + "grad_norm": 0.07739946991205215, + "learning_rate": 5.035003806018712e-05, + "loss": 2.5547, + "step": 28914 + }, + { + "epoch": 0.8574266820864098, + "grad_norm": 0.0900852233171463, + "learning_rate": 5.032946314045356e-05, + "loss": 2.5734, + "step": 28915 + }, + { + "epoch": 0.8574563354387214, + "grad_norm": 0.08433423191308975, + "learning_rate": 5.030889220262974e-05, + "loss": 2.5586, + "step": 28916 + }, + { + "epoch": 0.8574859887910328, + "grad_norm": 0.0804707258939743, + "learning_rate": 5.028832524689791e-05, + "loss": 2.5584, + "step": 28917 + }, + { + "epoch": 0.8575156421433443, + "grad_norm": 0.08782177418470383, + "learning_rate": 5.02677622734401e-05, + "loss": 2.5698, + "step": 28918 + }, + { + "epoch": 0.8575452954956558, + "grad_norm": 0.08171539008617401, + "learning_rate": 5.024720328243848e-05, + "loss": 2.5483, + "step": 28919 + }, + { + "epoch": 0.8575749488479673, + "grad_norm": 0.08926662057638168, + "learning_rate": 5.0226648274075083e-05, + "loss": 2.5821, + "step": 28920 + }, + { + "epoch": 0.8576046022002788, + "grad_norm": 0.08295576274394989, + "learning_rate": 5.020609724853192e-05, + "loss": 2.5667, + "step": 28921 + }, + { + "epoch": 0.8576342555525902, + "grad_norm": 0.08105743676424026, + "learning_rate": 5.018555020599097e-05, + "loss": 2.5935, + "step": 28922 + }, + { + "epoch": 0.8576639089049017, + "grad_norm": 0.08888091892004013, + "learning_rate": 5.016500714663419e-05, + "loss": 2.5924, + "step": 28923 + }, + { + "epoch": 0.8576935622572132, + "grad_norm": 0.0911167562007904, + "learning_rate": 5.0144468070643435e-05, + "loss": 2.5724, + "step": 28924 + }, + { + "epoch": 0.8577232156095247, + "grad_norm": 0.08391545712947845, + "learning_rate": 5.012393297820067e-05, + "loss": 2.6012, + "step": 28925 + }, + { + "epoch": 0.8577528689618361, + "grad_norm": 0.07770878076553345, + "learning_rate": 5.01034018694877e-05, + "loss": 2.5763, + "step": 28926 + }, + { + "epoch": 0.8577825223141476, + "grad_norm": 0.08499325066804886, + "learning_rate": 5.008287474468631e-05, + "loss": 2.5695, + "step": 28927 + }, + { + "epoch": 0.8578121756664591, + "grad_norm": 0.08282564580440521, + "learning_rate": 5.0062351603978316e-05, + "loss": 2.5626, + "step": 28928 + }, + { + "epoch": 0.8578418290187706, + "grad_norm": 0.0780133306980133, + "learning_rate": 5.004183244754546e-05, + "loss": 2.5925, + "step": 28929 + }, + { + "epoch": 0.857871482371082, + "grad_norm": 0.08498314768075943, + "learning_rate": 5.002131727556936e-05, + "loss": 2.5817, + "step": 28930 + }, + { + "epoch": 0.8579011357233935, + "grad_norm": 0.0750904232263565, + "learning_rate": 5.000080608823171e-05, + "loss": 2.5814, + "step": 28931 + }, + { + "epoch": 0.857930789075705, + "grad_norm": 0.08093619346618652, + "learning_rate": 4.9980298885714205e-05, + "loss": 2.591, + "step": 28932 + }, + { + "epoch": 0.8579604424280165, + "grad_norm": 0.0768054723739624, + "learning_rate": 4.9959795668198416e-05, + "loss": 2.5922, + "step": 28933 + }, + { + "epoch": 0.8579900957803279, + "grad_norm": 0.08042407035827637, + "learning_rate": 4.993929643586587e-05, + "loss": 2.6058, + "step": 28934 + }, + { + "epoch": 0.8580197491326395, + "grad_norm": 0.08050593733787537, + "learning_rate": 4.9918801188898074e-05, + "loss": 2.5876, + "step": 28935 + }, + { + "epoch": 0.8580494024849509, + "grad_norm": 0.08432991802692413, + "learning_rate": 4.989830992747657e-05, + "loss": 2.6029, + "step": 28936 + }, + { + "epoch": 0.8580790558372624, + "grad_norm": 0.08002903312444687, + "learning_rate": 4.987782265178281e-05, + "loss": 2.5446, + "step": 28937 + }, + { + "epoch": 0.8581087091895738, + "grad_norm": 0.08811257779598236, + "learning_rate": 4.985733936199815e-05, + "loss": 2.5792, + "step": 28938 + }, + { + "epoch": 0.8581383625418854, + "grad_norm": 0.07670831680297852, + "learning_rate": 4.983686005830407e-05, + "loss": 2.5486, + "step": 28939 + }, + { + "epoch": 0.8581680158941969, + "grad_norm": 0.08353134244680405, + "learning_rate": 4.981638474088179e-05, + "loss": 2.5946, + "step": 28940 + }, + { + "epoch": 0.8581976692465083, + "grad_norm": 0.08219150453805923, + "learning_rate": 4.979591340991274e-05, + "loss": 2.5696, + "step": 28941 + }, + { + "epoch": 0.8582273225988198, + "grad_norm": 0.08254606276750565, + "learning_rate": 4.9775446065578155e-05, + "loss": 2.5917, + "step": 28942 + }, + { + "epoch": 0.8582569759511313, + "grad_norm": 0.08399096131324768, + "learning_rate": 4.975498270805928e-05, + "loss": 2.5578, + "step": 28943 + }, + { + "epoch": 0.8582866293034428, + "grad_norm": 0.08306187391281128, + "learning_rate": 4.973452333753742e-05, + "loss": 2.5772, + "step": 28944 + }, + { + "epoch": 0.8583162826557542, + "grad_norm": 0.08203814923763275, + "learning_rate": 4.9714067954193534e-05, + "loss": 2.5858, + "step": 28945 + }, + { + "epoch": 0.8583459360080657, + "grad_norm": 0.07945047318935394, + "learning_rate": 4.9693616558208866e-05, + "loss": 2.554, + "step": 28946 + }, + { + "epoch": 0.8583755893603772, + "grad_norm": 0.07797617465257645, + "learning_rate": 4.9673169149764444e-05, + "loss": 2.5546, + "step": 28947 + }, + { + "epoch": 0.8584052427126887, + "grad_norm": 0.07444824278354645, + "learning_rate": 4.965272572904145e-05, + "loss": 2.5556, + "step": 28948 + }, + { + "epoch": 0.8584348960650001, + "grad_norm": 0.0756768137216568, + "learning_rate": 4.963228629622091e-05, + "loss": 2.5745, + "step": 28949 + }, + { + "epoch": 0.8584645494173117, + "grad_norm": 0.08124158531427383, + "learning_rate": 4.961185085148379e-05, + "loss": 2.564, + "step": 28950 + }, + { + "epoch": 0.8584942027696231, + "grad_norm": 0.07789522409439087, + "learning_rate": 4.9591419395010996e-05, + "loss": 2.5974, + "step": 28951 + }, + { + "epoch": 0.8585238561219346, + "grad_norm": 0.08018935471773148, + "learning_rate": 4.957099192698355e-05, + "loss": 2.5728, + "step": 28952 + }, + { + "epoch": 0.858553509474246, + "grad_norm": 0.08613473176956177, + "learning_rate": 4.955056844758221e-05, + "loss": 2.6071, + "step": 28953 + }, + { + "epoch": 0.8585831628265576, + "grad_norm": 0.07997073978185654, + "learning_rate": 4.953014895698799e-05, + "loss": 2.6128, + "step": 28954 + }, + { + "epoch": 0.858612816178869, + "grad_norm": 0.0838325172662735, + "learning_rate": 4.950973345538168e-05, + "loss": 2.5722, + "step": 28955 + }, + { + "epoch": 0.8586424695311805, + "grad_norm": 0.08291631937026978, + "learning_rate": 4.948932194294387e-05, + "loss": 2.5744, + "step": 28956 + }, + { + "epoch": 0.8586721228834919, + "grad_norm": 0.07998030632734299, + "learning_rate": 4.946891441985552e-05, + "loss": 2.5255, + "step": 28957 + }, + { + "epoch": 0.8587017762358035, + "grad_norm": 0.07915838062763214, + "learning_rate": 4.944851088629721e-05, + "loss": 2.5203, + "step": 28958 + }, + { + "epoch": 0.858731429588115, + "grad_norm": 0.08018098026514053, + "learning_rate": 4.942811134244968e-05, + "loss": 2.5775, + "step": 28959 + }, + { + "epoch": 0.8587610829404264, + "grad_norm": 0.0802600085735321, + "learning_rate": 4.940771578849351e-05, + "loss": 2.5736, + "step": 28960 + }, + { + "epoch": 0.8587907362927379, + "grad_norm": 0.08215975761413574, + "learning_rate": 4.93873242246094e-05, + "loss": 2.5872, + "step": 28961 + }, + { + "epoch": 0.8588203896450494, + "grad_norm": 0.07912000268697739, + "learning_rate": 4.936693665097791e-05, + "loss": 2.5935, + "step": 28962 + }, + { + "epoch": 0.8588500429973609, + "grad_norm": 0.08043748140335083, + "learning_rate": 4.934655306777952e-05, + "loss": 2.5359, + "step": 28963 + }, + { + "epoch": 0.8588796963496723, + "grad_norm": 0.08220674842596054, + "learning_rate": 4.93261734751948e-05, + "loss": 2.562, + "step": 28964 + }, + { + "epoch": 0.8589093497019838, + "grad_norm": 0.07627661526203156, + "learning_rate": 4.9305797873404224e-05, + "loss": 2.5529, + "step": 28965 + }, + { + "epoch": 0.8589390030542953, + "grad_norm": 0.07836532592773438, + "learning_rate": 4.9285426262588086e-05, + "loss": 2.5672, + "step": 28966 + }, + { + "epoch": 0.8589686564066068, + "grad_norm": 0.08347875624895096, + "learning_rate": 4.926505864292691e-05, + "loss": 2.585, + "step": 28967 + }, + { + "epoch": 0.8589983097589182, + "grad_norm": 0.0811644196510315, + "learning_rate": 4.9244695014600936e-05, + "loss": 2.5741, + "step": 28968 + }, + { + "epoch": 0.8590279631112298, + "grad_norm": 0.08186504244804382, + "learning_rate": 4.9224335377790584e-05, + "loss": 2.5682, + "step": 28969 + }, + { + "epoch": 0.8590576164635412, + "grad_norm": 0.08072458207607269, + "learning_rate": 4.9203979732676155e-05, + "loss": 2.5834, + "step": 28970 + }, + { + "epoch": 0.8590872698158527, + "grad_norm": 0.07748706638813019, + "learning_rate": 4.9183628079437824e-05, + "loss": 2.5986, + "step": 28971 + }, + { + "epoch": 0.8591169231681641, + "grad_norm": 0.0828145295381546, + "learning_rate": 4.916328041825585e-05, + "loss": 2.5385, + "step": 28972 + }, + { + "epoch": 0.8591465765204757, + "grad_norm": 0.07850270718336105, + "learning_rate": 4.914293674931031e-05, + "loss": 2.5679, + "step": 28973 + }, + { + "epoch": 0.8591762298727871, + "grad_norm": 0.08124133199453354, + "learning_rate": 4.912259707278155e-05, + "loss": 2.5888, + "step": 28974 + }, + { + "epoch": 0.8592058832250986, + "grad_norm": 0.08394786715507507, + "learning_rate": 4.91022613888496e-05, + "loss": 2.5711, + "step": 28975 + }, + { + "epoch": 0.85923553657741, + "grad_norm": 0.0819961205124855, + "learning_rate": 4.9081929697694596e-05, + "loss": 2.535, + "step": 28976 + }, + { + "epoch": 0.8592651899297216, + "grad_norm": 0.07876081764698029, + "learning_rate": 4.906160199949644e-05, + "loss": 2.5346, + "step": 28977 + }, + { + "epoch": 0.859294843282033, + "grad_norm": 0.08544866740703583, + "learning_rate": 4.9041278294435165e-05, + "loss": 2.5827, + "step": 28978 + }, + { + "epoch": 0.8593244966343445, + "grad_norm": 0.08030937612056732, + "learning_rate": 4.902095858269079e-05, + "loss": 2.5869, + "step": 28979 + }, + { + "epoch": 0.859354149986656, + "grad_norm": 0.08926573395729065, + "learning_rate": 4.900064286444328e-05, + "loss": 2.6153, + "step": 28980 + }, + { + "epoch": 0.8593838033389675, + "grad_norm": 0.08235623687505722, + "learning_rate": 4.8980331139872435e-05, + "loss": 2.521, + "step": 28981 + }, + { + "epoch": 0.859413456691279, + "grad_norm": 0.08380662649869919, + "learning_rate": 4.896002340915823e-05, + "loss": 2.5557, + "step": 28982 + }, + { + "epoch": 0.8594431100435904, + "grad_norm": 0.08831397444009781, + "learning_rate": 4.8939719672480396e-05, + "loss": 2.5749, + "step": 28983 + }, + { + "epoch": 0.859472763395902, + "grad_norm": 0.0747513622045517, + "learning_rate": 4.891941993001875e-05, + "loss": 2.5452, + "step": 28984 + }, + { + "epoch": 0.8595024167482134, + "grad_norm": 0.08381661027669907, + "learning_rate": 4.889912418195308e-05, + "loss": 2.567, + "step": 28985 + }, + { + "epoch": 0.8595320701005249, + "grad_norm": 0.08044426143169403, + "learning_rate": 4.887883242846314e-05, + "loss": 2.5641, + "step": 28986 + }, + { + "epoch": 0.8595617234528363, + "grad_norm": 0.08744383603334427, + "learning_rate": 4.88585446697285e-05, + "loss": 2.5819, + "step": 28987 + }, + { + "epoch": 0.8595913768051479, + "grad_norm": 0.08235769718885422, + "learning_rate": 4.883826090592891e-05, + "loss": 2.5907, + "step": 28988 + }, + { + "epoch": 0.8596210301574593, + "grad_norm": 0.08251804113388062, + "learning_rate": 4.8817981137244004e-05, + "loss": 2.5842, + "step": 28989 + }, + { + "epoch": 0.8596506835097708, + "grad_norm": 0.08188911527395248, + "learning_rate": 4.8797705363853305e-05, + "loss": 2.5993, + "step": 28990 + }, + { + "epoch": 0.8596803368620822, + "grad_norm": 0.08771413564682007, + "learning_rate": 4.877743358593634e-05, + "loss": 2.5953, + "step": 28991 + }, + { + "epoch": 0.8597099902143938, + "grad_norm": 0.08344881236553192, + "learning_rate": 4.875716580367268e-05, + "loss": 2.5705, + "step": 28992 + }, + { + "epoch": 0.8597396435667052, + "grad_norm": 0.07895892858505249, + "learning_rate": 4.873690201724174e-05, + "loss": 2.5756, + "step": 28993 + }, + { + "epoch": 0.8597692969190167, + "grad_norm": 0.08362432569265366, + "learning_rate": 4.871664222682304e-05, + "loss": 2.5641, + "step": 28994 + }, + { + "epoch": 0.8597989502713281, + "grad_norm": 0.0821123942732811, + "learning_rate": 4.8696386432595886e-05, + "loss": 2.5795, + "step": 28995 + }, + { + "epoch": 0.8598286036236397, + "grad_norm": 0.08185974508523941, + "learning_rate": 4.867613463473969e-05, + "loss": 2.562, + "step": 28996 + }, + { + "epoch": 0.8598582569759511, + "grad_norm": 0.08415074646472931, + "learning_rate": 4.865588683343386e-05, + "loss": 2.5895, + "step": 28997 + }, + { + "epoch": 0.8598879103282626, + "grad_norm": 0.08055278658866882, + "learning_rate": 4.8635643028857535e-05, + "loss": 2.5934, + "step": 28998 + }, + { + "epoch": 0.859917563680574, + "grad_norm": 0.08690174669027328, + "learning_rate": 4.861540322119012e-05, + "loss": 2.5591, + "step": 28999 + }, + { + "epoch": 0.8599472170328856, + "grad_norm": 0.08481891453266144, + "learning_rate": 4.859516741061082e-05, + "loss": 2.6152, + "step": 29000 + }, + { + "epoch": 0.8599768703851971, + "grad_norm": 0.08796324580907822, + "learning_rate": 4.857493559729875e-05, + "loss": 2.5621, + "step": 29001 + }, + { + "epoch": 0.8600065237375085, + "grad_norm": 0.08070909976959229, + "learning_rate": 4.8554707781433116e-05, + "loss": 2.5386, + "step": 29002 + }, + { + "epoch": 0.86003617708982, + "grad_norm": 0.07451259344816208, + "learning_rate": 4.853448396319304e-05, + "loss": 2.5812, + "step": 29003 + }, + { + "epoch": 0.8600658304421315, + "grad_norm": 0.0865035280585289, + "learning_rate": 4.8514264142757556e-05, + "loss": 2.5985, + "step": 29004 + }, + { + "epoch": 0.860095483794443, + "grad_norm": 0.08285293728113174, + "learning_rate": 4.849404832030579e-05, + "loss": 2.5599, + "step": 29005 + }, + { + "epoch": 0.8601251371467544, + "grad_norm": 0.07610747963190079, + "learning_rate": 4.8473836496016777e-05, + "loss": 2.5407, + "step": 29006 + }, + { + "epoch": 0.860154790499066, + "grad_norm": 0.08561104536056519, + "learning_rate": 4.845362867006942e-05, + "loss": 2.5493, + "step": 29007 + }, + { + "epoch": 0.8601844438513774, + "grad_norm": 0.08682302385568619, + "learning_rate": 4.843342484264263e-05, + "loss": 2.5365, + "step": 29008 + }, + { + "epoch": 0.8602140972036889, + "grad_norm": 0.07814747095108032, + "learning_rate": 4.841322501391543e-05, + "loss": 2.5587, + "step": 29009 + }, + { + "epoch": 0.8602437505560003, + "grad_norm": 0.08585968613624573, + "learning_rate": 4.839302918406674e-05, + "loss": 2.5666, + "step": 29010 + }, + { + "epoch": 0.8602734039083119, + "grad_norm": 0.0777200311422348, + "learning_rate": 4.8372837353275135e-05, + "loss": 2.5579, + "step": 29011 + }, + { + "epoch": 0.8603030572606233, + "grad_norm": 0.08192289620637894, + "learning_rate": 4.8352649521719525e-05, + "loss": 2.5854, + "step": 29012 + }, + { + "epoch": 0.8603327106129348, + "grad_norm": 0.07733938843011856, + "learning_rate": 4.833246568957883e-05, + "loss": 2.5531, + "step": 29013 + }, + { + "epoch": 0.8603623639652462, + "grad_norm": 0.07946910709142685, + "learning_rate": 4.831228585703168e-05, + "loss": 2.5695, + "step": 29014 + }, + { + "epoch": 0.8603920173175578, + "grad_norm": 0.07858388870954514, + "learning_rate": 4.8292110024256766e-05, + "loss": 2.5763, + "step": 29015 + }, + { + "epoch": 0.8604216706698692, + "grad_norm": 0.0785641297698021, + "learning_rate": 4.827193819143272e-05, + "loss": 2.587, + "step": 29016 + }, + { + "epoch": 0.8604513240221807, + "grad_norm": 0.08045245707035065, + "learning_rate": 4.8251770358738235e-05, + "loss": 2.5849, + "step": 29017 + }, + { + "epoch": 0.8604809773744921, + "grad_norm": 0.07567856460809708, + "learning_rate": 4.823160652635189e-05, + "loss": 2.6074, + "step": 29018 + }, + { + "epoch": 0.8605106307268037, + "grad_norm": 0.0800631120800972, + "learning_rate": 4.8211446694452155e-05, + "loss": 2.5833, + "step": 29019 + }, + { + "epoch": 0.8605402840791151, + "grad_norm": 0.07888687402009964, + "learning_rate": 4.819129086321772e-05, + "loss": 2.5583, + "step": 29020 + }, + { + "epoch": 0.8605699374314266, + "grad_norm": 0.08212020248174667, + "learning_rate": 4.817113903282688e-05, + "loss": 2.5865, + "step": 29021 + }, + { + "epoch": 0.8605995907837382, + "grad_norm": 0.0793217346072197, + "learning_rate": 4.8150991203458116e-05, + "loss": 2.5992, + "step": 29022 + }, + { + "epoch": 0.8606292441360496, + "grad_norm": 0.08331454545259476, + "learning_rate": 4.813084737528995e-05, + "loss": 2.6076, + "step": 29023 + }, + { + "epoch": 0.8606588974883611, + "grad_norm": 0.07536676526069641, + "learning_rate": 4.8110707548500555e-05, + "loss": 2.5526, + "step": 29024 + }, + { + "epoch": 0.8606885508406725, + "grad_norm": 0.08960257470607758, + "learning_rate": 4.809057172326853e-05, + "loss": 2.5671, + "step": 29025 + }, + { + "epoch": 0.8607182041929841, + "grad_norm": 0.07571527361869812, + "learning_rate": 4.807043989977206e-05, + "loss": 2.5823, + "step": 29026 + }, + { + "epoch": 0.8607478575452955, + "grad_norm": 0.08365733176469803, + "learning_rate": 4.8050312078189384e-05, + "loss": 2.5733, + "step": 29027 + }, + { + "epoch": 0.860777510897607, + "grad_norm": 0.08433276414871216, + "learning_rate": 4.8030188258698757e-05, + "loss": 2.5552, + "step": 29028 + }, + { + "epoch": 0.8608071642499184, + "grad_norm": 0.08343234658241272, + "learning_rate": 4.8010068441478426e-05, + "loss": 2.5794, + "step": 29029 + }, + { + "epoch": 0.86083681760223, + "grad_norm": 0.08223651349544525, + "learning_rate": 4.798995262670653e-05, + "loss": 2.5909, + "step": 29030 + }, + { + "epoch": 0.8608664709545414, + "grad_norm": 0.08044037967920303, + "learning_rate": 4.7969840814561294e-05, + "loss": 2.5796, + "step": 29031 + }, + { + "epoch": 0.8608961243068529, + "grad_norm": 0.08851556479930878, + "learning_rate": 4.79497330052206e-05, + "loss": 2.587, + "step": 29032 + }, + { + "epoch": 0.8609257776591643, + "grad_norm": 0.07850641757249832, + "learning_rate": 4.792962919886262e-05, + "loss": 2.6073, + "step": 29033 + }, + { + "epoch": 0.8609554310114759, + "grad_norm": 0.07936695963144302, + "learning_rate": 4.790952939566534e-05, + "loss": 2.5731, + "step": 29034 + }, + { + "epoch": 0.8609850843637873, + "grad_norm": 0.08089340478181839, + "learning_rate": 4.7889433595806776e-05, + "loss": 2.5843, + "step": 29035 + }, + { + "epoch": 0.8610147377160988, + "grad_norm": 0.07934699207544327, + "learning_rate": 4.7869341799464896e-05, + "loss": 2.5965, + "step": 29036 + }, + { + "epoch": 0.8610443910684102, + "grad_norm": 0.08282626420259476, + "learning_rate": 4.7849254006817555e-05, + "loss": 2.5851, + "step": 29037 + }, + { + "epoch": 0.8610740444207218, + "grad_norm": 0.08072559535503387, + "learning_rate": 4.782917021804273e-05, + "loss": 2.5531, + "step": 29038 + }, + { + "epoch": 0.8611036977730332, + "grad_norm": 0.08103042095899582, + "learning_rate": 4.780909043331821e-05, + "loss": 2.5525, + "step": 29039 + }, + { + "epoch": 0.8611333511253447, + "grad_norm": 0.08331888169050217, + "learning_rate": 4.7789014652821815e-05, + "loss": 2.5718, + "step": 29040 + }, + { + "epoch": 0.8611630044776561, + "grad_norm": 0.08401288837194443, + "learning_rate": 4.776894287673139e-05, + "loss": 2.5735, + "step": 29041 + }, + { + "epoch": 0.8611926578299677, + "grad_norm": 0.07964842766523361, + "learning_rate": 4.7748875105224577e-05, + "loss": 2.5825, + "step": 29042 + }, + { + "epoch": 0.8612223111822792, + "grad_norm": 0.08114509284496307, + "learning_rate": 4.772881133847906e-05, + "loss": 2.5952, + "step": 29043 + }, + { + "epoch": 0.8612519645345906, + "grad_norm": 0.0784527137875557, + "learning_rate": 4.770875157667254e-05, + "loss": 2.5854, + "step": 29044 + }, + { + "epoch": 0.8612816178869022, + "grad_norm": 0.0781681016087532, + "learning_rate": 4.768869581998264e-05, + "loss": 2.5544, + "step": 29045 + }, + { + "epoch": 0.8613112712392136, + "grad_norm": 0.08526221662759781, + "learning_rate": 4.766864406858701e-05, + "loss": 2.5946, + "step": 29046 + }, + { + "epoch": 0.8613409245915251, + "grad_norm": 0.08070144057273865, + "learning_rate": 4.764859632266316e-05, + "loss": 2.5775, + "step": 29047 + }, + { + "epoch": 0.8613705779438365, + "grad_norm": 0.07893633097410202, + "learning_rate": 4.7628552582388686e-05, + "loss": 2.5612, + "step": 29048 + }, + { + "epoch": 0.8614002312961481, + "grad_norm": 0.07903892546892166, + "learning_rate": 4.760851284794099e-05, + "loss": 2.5587, + "step": 29049 + }, + { + "epoch": 0.8614298846484595, + "grad_norm": 0.08218492567539215, + "learning_rate": 4.758847711949749e-05, + "loss": 2.5631, + "step": 29050 + }, + { + "epoch": 0.861459538000771, + "grad_norm": 0.08340118080377579, + "learning_rate": 4.7568445397235704e-05, + "loss": 2.5525, + "step": 29051 + }, + { + "epoch": 0.8614891913530824, + "grad_norm": 0.08895713835954666, + "learning_rate": 4.7548417681333164e-05, + "loss": 2.556, + "step": 29052 + }, + { + "epoch": 0.861518844705394, + "grad_norm": 0.0791519284248352, + "learning_rate": 4.7528393971966897e-05, + "loss": 2.5723, + "step": 29053 + }, + { + "epoch": 0.8615484980577054, + "grad_norm": 0.07575735449790955, + "learning_rate": 4.750837426931443e-05, + "loss": 2.5436, + "step": 29054 + }, + { + "epoch": 0.8615781514100169, + "grad_norm": 0.0989571288228035, + "learning_rate": 4.748835857355293e-05, + "loss": 2.576, + "step": 29055 + }, + { + "epoch": 0.8616078047623283, + "grad_norm": 0.08810658007860184, + "learning_rate": 4.746834688485974e-05, + "loss": 2.5551, + "step": 29056 + }, + { + "epoch": 0.8616374581146399, + "grad_norm": 0.07807102054357529, + "learning_rate": 4.744833920341196e-05, + "loss": 2.5481, + "step": 29057 + }, + { + "epoch": 0.8616671114669513, + "grad_norm": 0.0882272943854332, + "learning_rate": 4.74283355293868e-05, + "loss": 2.5789, + "step": 29058 + }, + { + "epoch": 0.8616967648192628, + "grad_norm": 0.08226394653320312, + "learning_rate": 4.740833586296145e-05, + "loss": 2.5762, + "step": 29059 + }, + { + "epoch": 0.8617264181715742, + "grad_norm": 0.09266074746847153, + "learning_rate": 4.7388340204312984e-05, + "loss": 2.5747, + "step": 29060 + }, + { + "epoch": 0.8617560715238858, + "grad_norm": 0.08245109021663666, + "learning_rate": 4.7368348553618434e-05, + "loss": 2.5716, + "step": 29061 + }, + { + "epoch": 0.8617857248761972, + "grad_norm": 0.08354369550943375, + "learning_rate": 4.7348360911054875e-05, + "loss": 2.5646, + "step": 29062 + }, + { + "epoch": 0.8618153782285087, + "grad_norm": 0.08799770474433899, + "learning_rate": 4.732837727679923e-05, + "loss": 2.5837, + "step": 29063 + }, + { + "epoch": 0.8618450315808203, + "grad_norm": 0.08507271856069565, + "learning_rate": 4.730839765102857e-05, + "loss": 2.5573, + "step": 29064 + }, + { + "epoch": 0.8618746849331317, + "grad_norm": 0.07778023928403854, + "learning_rate": 4.72884220339197e-05, + "loss": 2.5802, + "step": 29065 + }, + { + "epoch": 0.8619043382854432, + "grad_norm": 0.08346232026815414, + "learning_rate": 4.726845042564959e-05, + "loss": 2.5531, + "step": 29066 + }, + { + "epoch": 0.8619339916377546, + "grad_norm": 0.0829804390668869, + "learning_rate": 4.724848282639505e-05, + "loss": 2.6096, + "step": 29067 + }, + { + "epoch": 0.8619636449900662, + "grad_norm": 0.08176010102033615, + "learning_rate": 4.722851923633287e-05, + "loss": 2.6032, + "step": 29068 + }, + { + "epoch": 0.8619932983423776, + "grad_norm": 0.07997166365385056, + "learning_rate": 4.720855965563992e-05, + "loss": 2.5886, + "step": 29069 + }, + { + "epoch": 0.8620229516946891, + "grad_norm": 0.08194811642169952, + "learning_rate": 4.718860408449288e-05, + "loss": 2.566, + "step": 29070 + }, + { + "epoch": 0.8620526050470005, + "grad_norm": 0.07601100206375122, + "learning_rate": 4.7168652523068456e-05, + "loss": 2.5453, + "step": 29071 + }, + { + "epoch": 0.8620822583993121, + "grad_norm": 0.0752992033958435, + "learning_rate": 4.714870497154333e-05, + "loss": 2.5907, + "step": 29072 + }, + { + "epoch": 0.8621119117516235, + "grad_norm": 0.07606375217437744, + "learning_rate": 4.71287614300942e-05, + "loss": 2.588, + "step": 29073 + }, + { + "epoch": 0.862141565103935, + "grad_norm": 0.07601282745599747, + "learning_rate": 4.710882189889759e-05, + "loss": 2.5568, + "step": 29074 + }, + { + "epoch": 0.8621712184562464, + "grad_norm": 0.07763935625553131, + "learning_rate": 4.7088886378130243e-05, + "loss": 2.5564, + "step": 29075 + }, + { + "epoch": 0.862200871808558, + "grad_norm": 0.07857023924589157, + "learning_rate": 4.706895486796831e-05, + "loss": 2.5752, + "step": 29076 + }, + { + "epoch": 0.8622305251608694, + "grad_norm": 0.07618094235658646, + "learning_rate": 4.704902736858863e-05, + "loss": 2.5824, + "step": 29077 + }, + { + "epoch": 0.8622601785131809, + "grad_norm": 0.07966624945402145, + "learning_rate": 4.702910388016757e-05, + "loss": 2.587, + "step": 29078 + }, + { + "epoch": 0.8622898318654924, + "grad_norm": 0.07901279628276825, + "learning_rate": 4.7009184402881545e-05, + "loss": 2.5699, + "step": 29079 + }, + { + "epoch": 0.8623194852178039, + "grad_norm": 0.07781385630369186, + "learning_rate": 4.698926893690692e-05, + "loss": 2.5345, + "step": 29080 + }, + { + "epoch": 0.8623491385701153, + "grad_norm": 0.08043313771486282, + "learning_rate": 4.6969357482420094e-05, + "loss": 2.5685, + "step": 29081 + }, + { + "epoch": 0.8623787919224268, + "grad_norm": 0.0794568806886673, + "learning_rate": 4.6949450039597386e-05, + "loss": 2.554, + "step": 29082 + }, + { + "epoch": 0.8624084452747383, + "grad_norm": 0.08605196326971054, + "learning_rate": 4.692954660861509e-05, + "loss": 2.544, + "step": 29083 + }, + { + "epoch": 0.8624380986270498, + "grad_norm": 0.08573602139949799, + "learning_rate": 4.690964718964935e-05, + "loss": 2.5695, + "step": 29084 + }, + { + "epoch": 0.8624677519793613, + "grad_norm": 0.08237456530332565, + "learning_rate": 4.688975178287652e-05, + "loss": 2.5492, + "step": 29085 + }, + { + "epoch": 0.8624974053316727, + "grad_norm": 0.08450471609830856, + "learning_rate": 4.68698603884728e-05, + "loss": 2.5697, + "step": 29086 + }, + { + "epoch": 0.8625270586839843, + "grad_norm": 0.08247647434473038, + "learning_rate": 4.684997300661409e-05, + "loss": 2.5924, + "step": 29087 + }, + { + "epoch": 0.8625567120362957, + "grad_norm": 0.07716374844312668, + "learning_rate": 4.6830089637476705e-05, + "loss": 2.5698, + "step": 29088 + }, + { + "epoch": 0.8625863653886072, + "grad_norm": 0.08561260253190994, + "learning_rate": 4.681021028123656e-05, + "loss": 2.5523, + "step": 29089 + }, + { + "epoch": 0.8626160187409186, + "grad_norm": 0.0754595696926117, + "learning_rate": 4.679033493806989e-05, + "loss": 2.5723, + "step": 29090 + }, + { + "epoch": 0.8626456720932302, + "grad_norm": 0.07728234678506851, + "learning_rate": 4.677046360815257e-05, + "loss": 2.5946, + "step": 29091 + }, + { + "epoch": 0.8626753254455416, + "grad_norm": 0.08895350992679596, + "learning_rate": 4.675059629166062e-05, + "loss": 2.5938, + "step": 29092 + }, + { + "epoch": 0.8627049787978531, + "grad_norm": 0.0770772397518158, + "learning_rate": 4.6730732988769956e-05, + "loss": 2.5424, + "step": 29093 + }, + { + "epoch": 0.8627346321501645, + "grad_norm": 0.07823997735977173, + "learning_rate": 4.671087369965643e-05, + "loss": 2.5746, + "step": 29094 + }, + { + "epoch": 0.8627642855024761, + "grad_norm": 0.09247035533189774, + "learning_rate": 4.6691018424495915e-05, + "loss": 2.5668, + "step": 29095 + }, + { + "epoch": 0.8627939388547875, + "grad_norm": 0.08098876476287842, + "learning_rate": 4.667116716346431e-05, + "loss": 2.57, + "step": 29096 + }, + { + "epoch": 0.862823592207099, + "grad_norm": 0.07994499057531357, + "learning_rate": 4.6651319916737264e-05, + "loss": 2.566, + "step": 29097 + }, + { + "epoch": 0.8628532455594105, + "grad_norm": 0.09335090965032578, + "learning_rate": 4.6631476684490635e-05, + "loss": 2.5761, + "step": 29098 + }, + { + "epoch": 0.862882898911722, + "grad_norm": 0.08072738349437714, + "learning_rate": 4.661163746690006e-05, + "loss": 2.5697, + "step": 29099 + }, + { + "epoch": 0.8629125522640334, + "grad_norm": 0.08259227126836777, + "learning_rate": 4.6591802264141284e-05, + "loss": 2.5813, + "step": 29100 + }, + { + "epoch": 0.8629422056163449, + "grad_norm": 0.08635993301868439, + "learning_rate": 4.6571971076389885e-05, + "loss": 2.5679, + "step": 29101 + }, + { + "epoch": 0.8629718589686564, + "grad_norm": 0.08491925895214081, + "learning_rate": 4.655214390382145e-05, + "loss": 2.5846, + "step": 29102 + }, + { + "epoch": 0.8630015123209679, + "grad_norm": 0.08277596533298492, + "learning_rate": 4.653232074661168e-05, + "loss": 2.5985, + "step": 29103 + }, + { + "epoch": 0.8630311656732793, + "grad_norm": 0.08340337872505188, + "learning_rate": 4.6512501604936086e-05, + "loss": 2.5575, + "step": 29104 + }, + { + "epoch": 0.8630608190255908, + "grad_norm": 0.08320994675159454, + "learning_rate": 4.649268647897009e-05, + "loss": 2.5862, + "step": 29105 + }, + { + "epoch": 0.8630904723779024, + "grad_norm": 0.08039098232984543, + "learning_rate": 4.6472875368889223e-05, + "loss": 2.5594, + "step": 29106 + }, + { + "epoch": 0.8631201257302138, + "grad_norm": 0.08094698190689087, + "learning_rate": 4.6453068274869005e-05, + "loss": 2.5544, + "step": 29107 + }, + { + "epoch": 0.8631497790825253, + "grad_norm": 0.08018787950277328, + "learning_rate": 4.6433265197084576e-05, + "loss": 2.5508, + "step": 29108 + }, + { + "epoch": 0.8631794324348367, + "grad_norm": 0.08192487806081772, + "learning_rate": 4.6413466135711455e-05, + "loss": 2.5767, + "step": 29109 + }, + { + "epoch": 0.8632090857871483, + "grad_norm": 0.08199996501207352, + "learning_rate": 4.639367109092496e-05, + "loss": 2.5821, + "step": 29110 + }, + { + "epoch": 0.8632387391394597, + "grad_norm": 0.08314444869756699, + "learning_rate": 4.637388006290039e-05, + "loss": 2.5744, + "step": 29111 + }, + { + "epoch": 0.8632683924917712, + "grad_norm": 0.0846615806221962, + "learning_rate": 4.635409305181293e-05, + "loss": 2.5858, + "step": 29112 + }, + { + "epoch": 0.8632980458440827, + "grad_norm": 0.08217529207468033, + "learning_rate": 4.633431005783789e-05, + "loss": 2.5862, + "step": 29113 + }, + { + "epoch": 0.8633276991963942, + "grad_norm": 0.08582413196563721, + "learning_rate": 4.6314531081150354e-05, + "loss": 2.5623, + "step": 29114 + }, + { + "epoch": 0.8633573525487056, + "grad_norm": 0.07935241609811783, + "learning_rate": 4.629475612192546e-05, + "loss": 2.5842, + "step": 29115 + }, + { + "epoch": 0.8633870059010171, + "grad_norm": 0.08259500563144684, + "learning_rate": 4.6274985180338457e-05, + "loss": 2.6013, + "step": 29116 + }, + { + "epoch": 0.8634166592533286, + "grad_norm": 0.07908336073160172, + "learning_rate": 4.625521825656442e-05, + "loss": 2.5408, + "step": 29117 + }, + { + "epoch": 0.8634463126056401, + "grad_norm": 0.08685939013957977, + "learning_rate": 4.623545535077822e-05, + "loss": 2.5702, + "step": 29118 + }, + { + "epoch": 0.8634759659579515, + "grad_norm": 0.08048923313617706, + "learning_rate": 4.621569646315499e-05, + "loss": 2.5833, + "step": 29119 + }, + { + "epoch": 0.863505619310263, + "grad_norm": 0.07903793454170227, + "learning_rate": 4.619594159386964e-05, + "loss": 2.5649, + "step": 29120 + }, + { + "epoch": 0.8635352726625745, + "grad_norm": 0.08690446615219116, + "learning_rate": 4.617619074309709e-05, + "loss": 2.5839, + "step": 29121 + }, + { + "epoch": 0.863564926014886, + "grad_norm": 0.08350130170583725, + "learning_rate": 4.6156443911012316e-05, + "loss": 2.5795, + "step": 29122 + }, + { + "epoch": 0.8635945793671974, + "grad_norm": 0.08121985197067261, + "learning_rate": 4.6136701097790114e-05, + "loss": 2.5615, + "step": 29123 + }, + { + "epoch": 0.8636242327195089, + "grad_norm": 0.08111095428466797, + "learning_rate": 4.6116962303605294e-05, + "loss": 2.5952, + "step": 29124 + }, + { + "epoch": 0.8636538860718204, + "grad_norm": 0.08042476326227188, + "learning_rate": 4.609722752863271e-05, + "loss": 2.5822, + "step": 29125 + }, + { + "epoch": 0.8636835394241319, + "grad_norm": 0.08257212489843369, + "learning_rate": 4.607749677304712e-05, + "loss": 2.5566, + "step": 29126 + }, + { + "epoch": 0.8637131927764434, + "grad_norm": 0.07776456326246262, + "learning_rate": 4.605777003702316e-05, + "loss": 2.6015, + "step": 29127 + }, + { + "epoch": 0.8637428461287548, + "grad_norm": 0.07648659497499466, + "learning_rate": 4.6038047320735575e-05, + "loss": 2.5711, + "step": 29128 + }, + { + "epoch": 0.8637724994810664, + "grad_norm": 0.08619105070829391, + "learning_rate": 4.601832862435906e-05, + "loss": 2.5701, + "step": 29129 + }, + { + "epoch": 0.8638021528333778, + "grad_norm": 0.0858340784907341, + "learning_rate": 4.5998613948068094e-05, + "loss": 2.5739, + "step": 29130 + }, + { + "epoch": 0.8638318061856893, + "grad_norm": 0.07761736214160919, + "learning_rate": 4.597890329203735e-05, + "loss": 2.5584, + "step": 29131 + }, + { + "epoch": 0.8638614595380008, + "grad_norm": 0.08739040046930313, + "learning_rate": 4.595919665644138e-05, + "loss": 2.5669, + "step": 29132 + }, + { + "epoch": 0.8638911128903123, + "grad_norm": 0.07628719508647919, + "learning_rate": 4.5939494041454646e-05, + "loss": 2.5609, + "step": 29133 + }, + { + "epoch": 0.8639207662426237, + "grad_norm": 0.08202866464853287, + "learning_rate": 4.591979544725161e-05, + "loss": 2.5614, + "step": 29134 + }, + { + "epoch": 0.8639504195949352, + "grad_norm": 0.08569606393575668, + "learning_rate": 4.590010087400676e-05, + "loss": 2.5979, + "step": 29135 + }, + { + "epoch": 0.8639800729472467, + "grad_norm": 0.07561013847589493, + "learning_rate": 4.588041032189438e-05, + "loss": 2.5687, + "step": 29136 + }, + { + "epoch": 0.8640097262995582, + "grad_norm": 0.08374819159507751, + "learning_rate": 4.586072379108902e-05, + "loss": 2.5463, + "step": 29137 + }, + { + "epoch": 0.8640393796518696, + "grad_norm": 0.07787162810564041, + "learning_rate": 4.584104128176486e-05, + "loss": 2.5513, + "step": 29138 + }, + { + "epoch": 0.8640690330041811, + "grad_norm": 0.08255306631326675, + "learning_rate": 4.582136279409621e-05, + "loss": 2.5733, + "step": 29139 + }, + { + "epoch": 0.8640986863564926, + "grad_norm": 0.08669807016849518, + "learning_rate": 4.580168832825737e-05, + "loss": 2.5744, + "step": 29140 + }, + { + "epoch": 0.8641283397088041, + "grad_norm": 0.08591420948505402, + "learning_rate": 4.578201788442254e-05, + "loss": 2.5851, + "step": 29141 + }, + { + "epoch": 0.8641579930611155, + "grad_norm": 0.08824557811021805, + "learning_rate": 4.5762351462765914e-05, + "loss": 2.5563, + "step": 29142 + }, + { + "epoch": 0.864187646413427, + "grad_norm": 0.07973838597536087, + "learning_rate": 4.574268906346163e-05, + "loss": 2.5634, + "step": 29143 + }, + { + "epoch": 0.8642172997657385, + "grad_norm": 0.08557789772748947, + "learning_rate": 4.572303068668382e-05, + "loss": 2.5651, + "step": 29144 + }, + { + "epoch": 0.86424695311805, + "grad_norm": 0.08160823583602905, + "learning_rate": 4.5703376332606516e-05, + "loss": 2.5733, + "step": 29145 + }, + { + "epoch": 0.8642766064703614, + "grad_norm": 0.08486801385879517, + "learning_rate": 4.568372600140386e-05, + "loss": 2.5529, + "step": 29146 + }, + { + "epoch": 0.864306259822673, + "grad_norm": 0.08047018945217133, + "learning_rate": 4.566407969324976e-05, + "loss": 2.559, + "step": 29147 + }, + { + "epoch": 0.8643359131749845, + "grad_norm": 0.08179458975791931, + "learning_rate": 4.5644437408318196e-05, + "loss": 2.5805, + "step": 29148 + }, + { + "epoch": 0.8643655665272959, + "grad_norm": 0.08162505179643631, + "learning_rate": 4.5624799146783134e-05, + "loss": 2.5583, + "step": 29149 + }, + { + "epoch": 0.8643952198796074, + "grad_norm": 0.08619918674230576, + "learning_rate": 4.560516490881844e-05, + "loss": 2.5851, + "step": 29150 + }, + { + "epoch": 0.8644248732319189, + "grad_norm": 0.08609815686941147, + "learning_rate": 4.558553469459814e-05, + "loss": 2.5608, + "step": 29151 + }, + { + "epoch": 0.8644545265842304, + "grad_norm": 0.07753197848796844, + "learning_rate": 4.556590850429587e-05, + "loss": 2.5887, + "step": 29152 + }, + { + "epoch": 0.8644841799365418, + "grad_norm": 0.0814833715558052, + "learning_rate": 4.554628633808544e-05, + "loss": 2.5607, + "step": 29153 + }, + { + "epoch": 0.8645138332888533, + "grad_norm": 0.08564863353967667, + "learning_rate": 4.552666819614054e-05, + "loss": 2.5342, + "step": 29154 + }, + { + "epoch": 0.8645434866411648, + "grad_norm": 0.08278417587280273, + "learning_rate": 4.550705407863515e-05, + "loss": 2.5608, + "step": 29155 + }, + { + "epoch": 0.8645731399934763, + "grad_norm": 0.08389145880937576, + "learning_rate": 4.548744398574273e-05, + "loss": 2.5832, + "step": 29156 + }, + { + "epoch": 0.8646027933457877, + "grad_norm": 0.08268485963344574, + "learning_rate": 4.546783791763709e-05, + "loss": 2.5796, + "step": 29157 + }, + { + "epoch": 0.8646324466980992, + "grad_norm": 0.08124791830778122, + "learning_rate": 4.544823587449171e-05, + "loss": 2.5894, + "step": 29158 + }, + { + "epoch": 0.8646621000504107, + "grad_norm": 0.08906687051057816, + "learning_rate": 4.5428637856480274e-05, + "loss": 2.5488, + "step": 29159 + }, + { + "epoch": 0.8646917534027222, + "grad_norm": 0.08297446370124817, + "learning_rate": 4.54090438637762e-05, + "loss": 2.5369, + "step": 29160 + }, + { + "epoch": 0.8647214067550336, + "grad_norm": 0.08241970837116241, + "learning_rate": 4.538945389655314e-05, + "loss": 2.5716, + "step": 29161 + }, + { + "epoch": 0.8647510601073451, + "grad_norm": 0.09036251902580261, + "learning_rate": 4.536986795498454e-05, + "loss": 2.5941, + "step": 29162 + }, + { + "epoch": 0.8647807134596566, + "grad_norm": 0.07855460792779922, + "learning_rate": 4.5350286039243735e-05, + "loss": 2.5808, + "step": 29163 + }, + { + "epoch": 0.8648103668119681, + "grad_norm": 0.09001178294420242, + "learning_rate": 4.533070814950424e-05, + "loss": 2.6024, + "step": 29164 + }, + { + "epoch": 0.8648400201642795, + "grad_norm": 0.08536845445632935, + "learning_rate": 4.53111342859393e-05, + "loss": 2.5328, + "step": 29165 + }, + { + "epoch": 0.864869673516591, + "grad_norm": 0.08053059875965118, + "learning_rate": 4.529156444872229e-05, + "loss": 2.5771, + "step": 29166 + }, + { + "epoch": 0.8648993268689026, + "grad_norm": 0.08922586590051651, + "learning_rate": 4.52719986380265e-05, + "loss": 2.5777, + "step": 29167 + }, + { + "epoch": 0.864928980221214, + "grad_norm": 0.08575163036584854, + "learning_rate": 4.5252436854025244e-05, + "loss": 2.5803, + "step": 29168 + }, + { + "epoch": 0.8649586335735255, + "grad_norm": 0.089402936398983, + "learning_rate": 4.5232879096891775e-05, + "loss": 2.5463, + "step": 29169 + }, + { + "epoch": 0.864988286925837, + "grad_norm": 0.08803696930408478, + "learning_rate": 4.521332536679917e-05, + "loss": 2.5931, + "step": 29170 + }, + { + "epoch": 0.8650179402781485, + "grad_norm": 0.0810880959033966, + "learning_rate": 4.5193775663920686e-05, + "loss": 2.5441, + "step": 29171 + }, + { + "epoch": 0.8650475936304599, + "grad_norm": 0.08233632892370224, + "learning_rate": 4.51742299884294e-05, + "loss": 2.5608, + "step": 29172 + }, + { + "epoch": 0.8650772469827714, + "grad_norm": 0.08187469840049744, + "learning_rate": 4.515468834049835e-05, + "loss": 2.5586, + "step": 29173 + }, + { + "epoch": 0.8651069003350829, + "grad_norm": 0.08448104560375214, + "learning_rate": 4.5135150720300614e-05, + "loss": 2.5775, + "step": 29174 + }, + { + "epoch": 0.8651365536873944, + "grad_norm": 0.08109711855649948, + "learning_rate": 4.5115617128009154e-05, + "loss": 2.5432, + "step": 29175 + }, + { + "epoch": 0.8651662070397058, + "grad_norm": 0.08889543265104294, + "learning_rate": 4.5096087563797016e-05, + "loss": 2.5899, + "step": 29176 + }, + { + "epoch": 0.8651958603920173, + "grad_norm": 0.07796261459589005, + "learning_rate": 4.507656202783705e-05, + "loss": 2.6096, + "step": 29177 + }, + { + "epoch": 0.8652255137443288, + "grad_norm": 0.08216845244169235, + "learning_rate": 4.5057040520302295e-05, + "loss": 2.5681, + "step": 29178 + }, + { + "epoch": 0.8652551670966403, + "grad_norm": 0.09078340232372284, + "learning_rate": 4.503752304136549e-05, + "loss": 2.5678, + "step": 29179 + }, + { + "epoch": 0.8652848204489517, + "grad_norm": 0.08610480278730392, + "learning_rate": 4.501800959119939e-05, + "loss": 2.5704, + "step": 29180 + }, + { + "epoch": 0.8653144738012633, + "grad_norm": 0.08680468797683716, + "learning_rate": 4.499850016997708e-05, + "loss": 2.5967, + "step": 29181 + }, + { + "epoch": 0.8653441271535747, + "grad_norm": 0.08497294038534164, + "learning_rate": 4.4978994777871095e-05, + "loss": 2.6076, + "step": 29182 + }, + { + "epoch": 0.8653737805058862, + "grad_norm": 0.08045018464326859, + "learning_rate": 4.495949341505434e-05, + "loss": 2.5604, + "step": 29183 + }, + { + "epoch": 0.8654034338581976, + "grad_norm": 0.08154288679361343, + "learning_rate": 4.4939996081699244e-05, + "loss": 2.5613, + "step": 29184 + }, + { + "epoch": 0.8654330872105092, + "grad_norm": 0.08026330173015594, + "learning_rate": 4.49205027779786e-05, + "loss": 2.5882, + "step": 29185 + }, + { + "epoch": 0.8654627405628206, + "grad_norm": 0.08013119548559189, + "learning_rate": 4.4901013504065056e-05, + "loss": 2.544, + "step": 29186 + }, + { + "epoch": 0.8654923939151321, + "grad_norm": 0.08420159667730331, + "learning_rate": 4.488152826013114e-05, + "loss": 2.5619, + "step": 29187 + }, + { + "epoch": 0.8655220472674436, + "grad_norm": 0.08288294821977615, + "learning_rate": 4.4862047046349386e-05, + "loss": 2.6235, + "step": 29188 + }, + { + "epoch": 0.8655517006197551, + "grad_norm": 0.08507627993822098, + "learning_rate": 4.484256986289237e-05, + "loss": 2.5313, + "step": 29189 + }, + { + "epoch": 0.8655813539720666, + "grad_norm": 0.08862976729869843, + "learning_rate": 4.482309670993251e-05, + "loss": 2.6048, + "step": 29190 + }, + { + "epoch": 0.865611007324378, + "grad_norm": 0.09131849557161331, + "learning_rate": 4.4803627587642226e-05, + "loss": 2.5393, + "step": 29191 + }, + { + "epoch": 0.8656406606766895, + "grad_norm": 0.0836310014128685, + "learning_rate": 4.4784162496193994e-05, + "loss": 2.5549, + "step": 29192 + }, + { + "epoch": 0.865670314029001, + "grad_norm": 0.08944232761859894, + "learning_rate": 4.4764701435760116e-05, + "loss": 2.5886, + "step": 29193 + }, + { + "epoch": 0.8656999673813125, + "grad_norm": 0.08344848453998566, + "learning_rate": 4.474524440651295e-05, + "loss": 2.5379, + "step": 29194 + }, + { + "epoch": 0.8657296207336239, + "grad_norm": 0.08676628768444061, + "learning_rate": 4.472579140862476e-05, + "loss": 2.5861, + "step": 29195 + }, + { + "epoch": 0.8657592740859354, + "grad_norm": 0.08373981714248657, + "learning_rate": 4.47063424422679e-05, + "loss": 2.5922, + "step": 29196 + }, + { + "epoch": 0.8657889274382469, + "grad_norm": 0.07916504144668579, + "learning_rate": 4.468689750761446e-05, + "loss": 2.5821, + "step": 29197 + }, + { + "epoch": 0.8658185807905584, + "grad_norm": 0.08265268057584763, + "learning_rate": 4.466745660483673e-05, + "loss": 2.5724, + "step": 29198 + }, + { + "epoch": 0.8658482341428698, + "grad_norm": 0.08045139908790588, + "learning_rate": 4.464801973410687e-05, + "loss": 2.5788, + "step": 29199 + }, + { + "epoch": 0.8658778874951814, + "grad_norm": 0.08574535697698593, + "learning_rate": 4.46285868955969e-05, + "loss": 2.5729, + "step": 29200 + }, + { + "epoch": 0.8659075408474928, + "grad_norm": 0.08936942368745804, + "learning_rate": 4.460915808947896e-05, + "loss": 2.5895, + "step": 29201 + }, + { + "epoch": 0.8659371941998043, + "grad_norm": 0.07899875193834305, + "learning_rate": 4.458973331592508e-05, + "loss": 2.548, + "step": 29202 + }, + { + "epoch": 0.8659668475521157, + "grad_norm": 0.0921861082315445, + "learning_rate": 4.45703125751073e-05, + "loss": 2.5835, + "step": 29203 + }, + { + "epoch": 0.8659965009044273, + "grad_norm": 0.08371338248252869, + "learning_rate": 4.4550895867197624e-05, + "loss": 2.586, + "step": 29204 + }, + { + "epoch": 0.8660261542567387, + "grad_norm": 0.08583396673202515, + "learning_rate": 4.453148319236788e-05, + "loss": 2.5615, + "step": 29205 + }, + { + "epoch": 0.8660558076090502, + "grad_norm": 0.08062274008989334, + "learning_rate": 4.451207455079004e-05, + "loss": 2.5783, + "step": 29206 + }, + { + "epoch": 0.8660854609613616, + "grad_norm": 0.07961801439523697, + "learning_rate": 4.4492669942635954e-05, + "loss": 2.5658, + "step": 29207 + }, + { + "epoch": 0.8661151143136732, + "grad_norm": 0.08191179484128952, + "learning_rate": 4.44732693680775e-05, + "loss": 2.5384, + "step": 29208 + }, + { + "epoch": 0.8661447676659847, + "grad_norm": 0.08348670601844788, + "learning_rate": 4.4453872827286415e-05, + "loss": 2.5531, + "step": 29209 + }, + { + "epoch": 0.8661744210182961, + "grad_norm": 0.08633461594581604, + "learning_rate": 4.443448032043451e-05, + "loss": 2.5619, + "step": 29210 + }, + { + "epoch": 0.8662040743706076, + "grad_norm": 0.0852990448474884, + "learning_rate": 4.441509184769343e-05, + "loss": 2.5859, + "step": 29211 + }, + { + "epoch": 0.8662337277229191, + "grad_norm": 0.0836934968829155, + "learning_rate": 4.4395707409234934e-05, + "loss": 2.5748, + "step": 29212 + }, + { + "epoch": 0.8662633810752306, + "grad_norm": 0.08503439277410507, + "learning_rate": 4.437632700523064e-05, + "loss": 2.5512, + "step": 29213 + }, + { + "epoch": 0.866293034427542, + "grad_norm": 0.0876627117395401, + "learning_rate": 4.435695063585221e-05, + "loss": 2.5741, + "step": 29214 + }, + { + "epoch": 0.8663226877798536, + "grad_norm": 0.08848255127668381, + "learning_rate": 4.433757830127116e-05, + "loss": 2.5668, + "step": 29215 + }, + { + "epoch": 0.866352341132165, + "grad_norm": 0.09392598271369934, + "learning_rate": 4.4318210001659086e-05, + "loss": 2.5949, + "step": 29216 + }, + { + "epoch": 0.8663819944844765, + "grad_norm": 0.08487781137228012, + "learning_rate": 4.4298845737187566e-05, + "loss": 2.5432, + "step": 29217 + }, + { + "epoch": 0.8664116478367879, + "grad_norm": 0.09381657093763351, + "learning_rate": 4.42794855080278e-05, + "loss": 2.5998, + "step": 29218 + }, + { + "epoch": 0.8664413011890995, + "grad_norm": 0.08629768341779709, + "learning_rate": 4.426012931435147e-05, + "loss": 2.5715, + "step": 29219 + }, + { + "epoch": 0.8664709545414109, + "grad_norm": 0.0768107995390892, + "learning_rate": 4.424077715632996e-05, + "loss": 2.5818, + "step": 29220 + }, + { + "epoch": 0.8665006078937224, + "grad_norm": 0.07942322641611099, + "learning_rate": 4.422142903413462e-05, + "loss": 2.5971, + "step": 29221 + }, + { + "epoch": 0.8665302612460338, + "grad_norm": 0.08560100197792053, + "learning_rate": 4.420208494793671e-05, + "loss": 2.5819, + "step": 29222 + }, + { + "epoch": 0.8665599145983454, + "grad_norm": 0.080547995865345, + "learning_rate": 4.4182744897907576e-05, + "loss": 2.5556, + "step": 29223 + }, + { + "epoch": 0.8665895679506568, + "grad_norm": 0.07592038065195084, + "learning_rate": 4.416340888421855e-05, + "loss": 2.5242, + "step": 29224 + }, + { + "epoch": 0.8666192213029683, + "grad_norm": 0.07954810559749603, + "learning_rate": 4.4144076907040696e-05, + "loss": 2.5735, + "step": 29225 + }, + { + "epoch": 0.8666488746552797, + "grad_norm": 0.076986163854599, + "learning_rate": 4.412474896654534e-05, + "loss": 2.5753, + "step": 29226 + }, + { + "epoch": 0.8666785280075913, + "grad_norm": 0.07973077148199081, + "learning_rate": 4.4105425062903656e-05, + "loss": 2.5714, + "step": 29227 + }, + { + "epoch": 0.8667081813599027, + "grad_norm": 0.0889231413602829, + "learning_rate": 4.408610519628659e-05, + "loss": 2.5644, + "step": 29228 + }, + { + "epoch": 0.8667378347122142, + "grad_norm": 0.08400700986385345, + "learning_rate": 4.406678936686531e-05, + "loss": 2.6092, + "step": 29229 + }, + { + "epoch": 0.8667674880645257, + "grad_norm": 0.08409903198480606, + "learning_rate": 4.404747757481087e-05, + "loss": 2.5545, + "step": 29230 + }, + { + "epoch": 0.8667971414168372, + "grad_norm": 0.08670958131551743, + "learning_rate": 4.4028169820294226e-05, + "loss": 2.54, + "step": 29231 + }, + { + "epoch": 0.8668267947691487, + "grad_norm": 0.08624041825532913, + "learning_rate": 4.400886610348648e-05, + "loss": 2.5923, + "step": 29232 + }, + { + "epoch": 0.8668564481214601, + "grad_norm": 0.08446114510297775, + "learning_rate": 4.3989566424558483e-05, + "loss": 2.5819, + "step": 29233 + }, + { + "epoch": 0.8668861014737717, + "grad_norm": 0.08750683069229126, + "learning_rate": 4.397027078368116e-05, + "loss": 2.5882, + "step": 29234 + }, + { + "epoch": 0.8669157548260831, + "grad_norm": 0.08309858292341232, + "learning_rate": 4.395097918102531e-05, + "loss": 2.5773, + "step": 29235 + }, + { + "epoch": 0.8669454081783946, + "grad_norm": 0.08241362869739532, + "learning_rate": 4.393169161676192e-05, + "loss": 2.5629, + "step": 29236 + }, + { + "epoch": 0.866975061530706, + "grad_norm": 0.08090842515230179, + "learning_rate": 4.391240809106162e-05, + "loss": 2.5462, + "step": 29237 + }, + { + "epoch": 0.8670047148830176, + "grad_norm": 0.08030252903699875, + "learning_rate": 4.389312860409533e-05, + "loss": 2.5714, + "step": 29238 + }, + { + "epoch": 0.867034368235329, + "grad_norm": 0.08004054427146912, + "learning_rate": 4.387385315603359e-05, + "loss": 2.5601, + "step": 29239 + }, + { + "epoch": 0.8670640215876405, + "grad_norm": 0.08183828741312027, + "learning_rate": 4.385458174704715e-05, + "loss": 2.5874, + "step": 29240 + }, + { + "epoch": 0.8670936749399519, + "grad_norm": 0.08723224699497223, + "learning_rate": 4.38353143773067e-05, + "loss": 2.5681, + "step": 29241 + }, + { + "epoch": 0.8671233282922635, + "grad_norm": 0.0787743553519249, + "learning_rate": 4.3816051046982886e-05, + "loss": 2.6093, + "step": 29242 + }, + { + "epoch": 0.8671529816445749, + "grad_norm": 0.07919490337371826, + "learning_rate": 4.3796791756246176e-05, + "loss": 2.5565, + "step": 29243 + }, + { + "epoch": 0.8671826349968864, + "grad_norm": 0.07614358514547348, + "learning_rate": 4.377753650526711e-05, + "loss": 2.5809, + "step": 29244 + }, + { + "epoch": 0.8672122883491978, + "grad_norm": 0.082331083714962, + "learning_rate": 4.375828529421638e-05, + "loss": 2.6135, + "step": 29245 + }, + { + "epoch": 0.8672419417015094, + "grad_norm": 0.08311200141906738, + "learning_rate": 4.3739038123264354e-05, + "loss": 2.5733, + "step": 29246 + }, + { + "epoch": 0.8672715950538208, + "grad_norm": 0.08195894956588745, + "learning_rate": 4.371979499258144e-05, + "loss": 2.5834, + "step": 29247 + }, + { + "epoch": 0.8673012484061323, + "grad_norm": 0.08150096237659454, + "learning_rate": 4.3700555902338124e-05, + "loss": 2.5338, + "step": 29248 + }, + { + "epoch": 0.8673309017584437, + "grad_norm": 0.08173047006130219, + "learning_rate": 4.368132085270465e-05, + "loss": 2.552, + "step": 29249 + }, + { + "epoch": 0.8673605551107553, + "grad_norm": 0.08166525512933731, + "learning_rate": 4.3662089843851396e-05, + "loss": 2.5669, + "step": 29250 + }, + { + "epoch": 0.8673902084630668, + "grad_norm": 0.08374826610088348, + "learning_rate": 4.3642862875948705e-05, + "loss": 2.5695, + "step": 29251 + }, + { + "epoch": 0.8674198618153782, + "grad_norm": 0.08230678737163544, + "learning_rate": 4.362363994916674e-05, + "loss": 2.5731, + "step": 29252 + }, + { + "epoch": 0.8674495151676898, + "grad_norm": 0.0850917249917984, + "learning_rate": 4.360442106367579e-05, + "loss": 2.5802, + "step": 29253 + }, + { + "epoch": 0.8674791685200012, + "grad_norm": 0.07773721218109131, + "learning_rate": 4.3585206219646066e-05, + "loss": 2.5512, + "step": 29254 + }, + { + "epoch": 0.8675088218723127, + "grad_norm": 0.08096115291118622, + "learning_rate": 4.356599541724765e-05, + "loss": 2.5662, + "step": 29255 + }, + { + "epoch": 0.8675384752246241, + "grad_norm": 0.08671721071004868, + "learning_rate": 4.354678865665068e-05, + "loss": 2.6106, + "step": 29256 + }, + { + "epoch": 0.8675681285769357, + "grad_norm": 0.08175545930862427, + "learning_rate": 4.3527585938025196e-05, + "loss": 2.5656, + "step": 29257 + }, + { + "epoch": 0.8675977819292471, + "grad_norm": 0.07859157770872116, + "learning_rate": 4.350838726154138e-05, + "loss": 2.5766, + "step": 29258 + }, + { + "epoch": 0.8676274352815586, + "grad_norm": 0.08317835628986359, + "learning_rate": 4.3489192627369225e-05, + "loss": 2.5671, + "step": 29259 + }, + { + "epoch": 0.86765708863387, + "grad_norm": 0.08291176706552505, + "learning_rate": 4.3470002035678526e-05, + "loss": 2.5638, + "step": 29260 + }, + { + "epoch": 0.8676867419861816, + "grad_norm": 0.08419136703014374, + "learning_rate": 4.3450815486639315e-05, + "loss": 2.5871, + "step": 29261 + }, + { + "epoch": 0.867716395338493, + "grad_norm": 0.07777728885412216, + "learning_rate": 4.343163298042152e-05, + "loss": 2.5706, + "step": 29262 + }, + { + "epoch": 0.8677460486908045, + "grad_norm": 0.07860182970762253, + "learning_rate": 4.3412454517194996e-05, + "loss": 2.5751, + "step": 29263 + }, + { + "epoch": 0.8677757020431159, + "grad_norm": 0.08778180927038193, + "learning_rate": 4.339328009712951e-05, + "loss": 2.5682, + "step": 29264 + }, + { + "epoch": 0.8678053553954275, + "grad_norm": 0.08807574212551117, + "learning_rate": 4.337410972039496e-05, + "loss": 2.5734, + "step": 29265 + }, + { + "epoch": 0.8678350087477389, + "grad_norm": 0.08433448523283005, + "learning_rate": 4.3354943387161005e-05, + "loss": 2.5548, + "step": 29266 + }, + { + "epoch": 0.8678646621000504, + "grad_norm": 0.08786345273256302, + "learning_rate": 4.333578109759745e-05, + "loss": 2.5574, + "step": 29267 + }, + { + "epoch": 0.8678943154523618, + "grad_norm": 0.08288411051034927, + "learning_rate": 4.3316622851873935e-05, + "loss": 2.5714, + "step": 29268 + }, + { + "epoch": 0.8679239688046734, + "grad_norm": 0.08693672716617584, + "learning_rate": 4.329746865016004e-05, + "loss": 2.561, + "step": 29269 + }, + { + "epoch": 0.8679536221569848, + "grad_norm": 0.08630628138780594, + "learning_rate": 4.327831849262554e-05, + "loss": 2.5538, + "step": 29270 + }, + { + "epoch": 0.8679832755092963, + "grad_norm": 0.08353809267282486, + "learning_rate": 4.325917237943988e-05, + "loss": 2.5611, + "step": 29271 + }, + { + "epoch": 0.8680129288616079, + "grad_norm": 0.08668716251850128, + "learning_rate": 4.324003031077267e-05, + "loss": 2.5741, + "step": 29272 + }, + { + "epoch": 0.8680425822139193, + "grad_norm": 0.07974997162818909, + "learning_rate": 4.322089228679338e-05, + "loss": 2.5686, + "step": 29273 + }, + { + "epoch": 0.8680722355662308, + "grad_norm": 0.09130609035491943, + "learning_rate": 4.320175830767148e-05, + "loss": 2.5769, + "step": 29274 + }, + { + "epoch": 0.8681018889185422, + "grad_norm": 0.09308277815580368, + "learning_rate": 4.3182628373576447e-05, + "loss": 2.6013, + "step": 29275 + }, + { + "epoch": 0.8681315422708538, + "grad_norm": 0.0832349881529808, + "learning_rate": 4.316350248467765e-05, + "loss": 2.5836, + "step": 29276 + }, + { + "epoch": 0.8681611956231652, + "grad_norm": 0.09223858267068863, + "learning_rate": 4.314438064114445e-05, + "loss": 2.5521, + "step": 29277 + }, + { + "epoch": 0.8681908489754767, + "grad_norm": 0.08466033637523651, + "learning_rate": 4.3125262843146165e-05, + "loss": 2.57, + "step": 29278 + }, + { + "epoch": 0.8682205023277881, + "grad_norm": 0.08796771615743637, + "learning_rate": 4.310614909085209e-05, + "loss": 2.5599, + "step": 29279 + }, + { + "epoch": 0.8682501556800997, + "grad_norm": 0.0836043655872345, + "learning_rate": 4.308703938443148e-05, + "loss": 2.5496, + "step": 29280 + }, + { + "epoch": 0.8682798090324111, + "grad_norm": 0.08359664678573608, + "learning_rate": 4.30679337240536e-05, + "loss": 2.5618, + "step": 29281 + }, + { + "epoch": 0.8683094623847226, + "grad_norm": 0.08287537842988968, + "learning_rate": 4.3048832109887584e-05, + "loss": 2.5677, + "step": 29282 + }, + { + "epoch": 0.868339115737034, + "grad_norm": 0.07905042171478271, + "learning_rate": 4.302973454210263e-05, + "loss": 2.5936, + "step": 29283 + }, + { + "epoch": 0.8683687690893456, + "grad_norm": 0.0849449411034584, + "learning_rate": 4.301064102086777e-05, + "loss": 2.5751, + "step": 29284 + }, + { + "epoch": 0.868398422441657, + "grad_norm": 0.07823657244443893, + "learning_rate": 4.299155154635215e-05, + "loss": 2.5499, + "step": 29285 + }, + { + "epoch": 0.8684280757939685, + "grad_norm": 0.07959774136543274, + "learning_rate": 4.297246611872474e-05, + "loss": 2.5944, + "step": 29286 + }, + { + "epoch": 0.8684577291462799, + "grad_norm": 0.08159727603197098, + "learning_rate": 4.295338473815463e-05, + "loss": 2.575, + "step": 29287 + }, + { + "epoch": 0.8684873824985915, + "grad_norm": 0.08507060259580612, + "learning_rate": 4.2934307404810746e-05, + "loss": 2.6104, + "step": 29288 + }, + { + "epoch": 0.8685170358509029, + "grad_norm": 0.08150451630353928, + "learning_rate": 4.2915234118862004e-05, + "loss": 2.5629, + "step": 29289 + }, + { + "epoch": 0.8685466892032144, + "grad_norm": 0.07705709338188171, + "learning_rate": 4.289616488047737e-05, + "loss": 2.5743, + "step": 29290 + }, + { + "epoch": 0.8685763425555258, + "grad_norm": 0.0836503878235817, + "learning_rate": 4.287709968982562e-05, + "loss": 2.5591, + "step": 29291 + }, + { + "epoch": 0.8686059959078374, + "grad_norm": 0.0852164477109909, + "learning_rate": 4.2858038547075595e-05, + "loss": 2.6016, + "step": 29292 + }, + { + "epoch": 0.8686356492601489, + "grad_norm": 0.07963922619819641, + "learning_rate": 4.2838981452396234e-05, + "loss": 2.5691, + "step": 29293 + }, + { + "epoch": 0.8686653026124603, + "grad_norm": 0.08440124988555908, + "learning_rate": 4.2819928405956054e-05, + "loss": 2.5594, + "step": 29294 + }, + { + "epoch": 0.8686949559647719, + "grad_norm": 0.08662157505750656, + "learning_rate": 4.280087940792388e-05, + "loss": 2.5959, + "step": 29295 + }, + { + "epoch": 0.8687246093170833, + "grad_norm": 0.07570160925388336, + "learning_rate": 4.278183445846834e-05, + "loss": 2.5469, + "step": 29296 + }, + { + "epoch": 0.8687542626693948, + "grad_norm": 0.08529766649007797, + "learning_rate": 4.2762793557758196e-05, + "loss": 2.5877, + "step": 29297 + }, + { + "epoch": 0.8687839160217062, + "grad_norm": 0.0787290707230568, + "learning_rate": 4.274375670596198e-05, + "loss": 2.5638, + "step": 29298 + }, + { + "epoch": 0.8688135693740178, + "grad_norm": 0.08846615254878998, + "learning_rate": 4.272472390324833e-05, + "loss": 2.602, + "step": 29299 + }, + { + "epoch": 0.8688432227263292, + "grad_norm": 0.08895433694124222, + "learning_rate": 4.270569514978573e-05, + "loss": 2.5576, + "step": 29300 + }, + { + "epoch": 0.8688728760786407, + "grad_norm": 0.08891677856445312, + "learning_rate": 4.268667044574265e-05, + "loss": 2.5619, + "step": 29301 + }, + { + "epoch": 0.8689025294309521, + "grad_norm": 0.08086952567100525, + "learning_rate": 4.266764979128762e-05, + "loss": 2.5916, + "step": 29302 + }, + { + "epoch": 0.8689321827832637, + "grad_norm": 0.09411272406578064, + "learning_rate": 4.264863318658907e-05, + "loss": 2.5767, + "step": 29303 + }, + { + "epoch": 0.8689618361355751, + "grad_norm": 0.07693277299404144, + "learning_rate": 4.2629620631815466e-05, + "loss": 2.5408, + "step": 29304 + }, + { + "epoch": 0.8689914894878866, + "grad_norm": 0.07714156061410904, + "learning_rate": 4.2610612127134964e-05, + "loss": 2.5588, + "step": 29305 + }, + { + "epoch": 0.869021142840198, + "grad_norm": 0.07782896608114243, + "learning_rate": 4.2591607672716026e-05, + "loss": 2.5773, + "step": 29306 + }, + { + "epoch": 0.8690507961925096, + "grad_norm": 0.08017498254776001, + "learning_rate": 4.2572607268726914e-05, + "loss": 2.5755, + "step": 29307 + }, + { + "epoch": 0.869080449544821, + "grad_norm": 0.07901471853256226, + "learning_rate": 4.255361091533588e-05, + "loss": 2.5964, + "step": 29308 + }, + { + "epoch": 0.8691101028971325, + "grad_norm": 0.08072496950626373, + "learning_rate": 4.2534618612711074e-05, + "loss": 2.5991, + "step": 29309 + }, + { + "epoch": 0.869139756249444, + "grad_norm": 0.07565592229366302, + "learning_rate": 4.2515630361020796e-05, + "loss": 2.5563, + "step": 29310 + }, + { + "epoch": 0.8691694096017555, + "grad_norm": 0.08584984391927719, + "learning_rate": 4.249664616043314e-05, + "loss": 2.5597, + "step": 29311 + }, + { + "epoch": 0.8691990629540669, + "grad_norm": 0.07989461719989777, + "learning_rate": 4.247766601111619e-05, + "loss": 2.5753, + "step": 29312 + }, + { + "epoch": 0.8692287163063784, + "grad_norm": 0.07851724326610565, + "learning_rate": 4.245868991323809e-05, + "loss": 2.5496, + "step": 29313 + }, + { + "epoch": 0.86925836965869, + "grad_norm": 0.090727798640728, + "learning_rate": 4.243971786696688e-05, + "loss": 2.5915, + "step": 29314 + }, + { + "epoch": 0.8692880230110014, + "grad_norm": 0.08051453530788422, + "learning_rate": 4.2420749872470464e-05, + "loss": 2.5894, + "step": 29315 + }, + { + "epoch": 0.8693176763633129, + "grad_norm": 0.07684326171875, + "learning_rate": 4.240178592991678e-05, + "loss": 2.5813, + "step": 29316 + }, + { + "epoch": 0.8693473297156243, + "grad_norm": 0.08404310047626495, + "learning_rate": 4.238282603947385e-05, + "loss": 2.5623, + "step": 29317 + }, + { + "epoch": 0.8693769830679359, + "grad_norm": 0.08359583467245102, + "learning_rate": 4.2363870201309605e-05, + "loss": 2.5944, + "step": 29318 + }, + { + "epoch": 0.8694066364202473, + "grad_norm": 0.08296803385019302, + "learning_rate": 4.2344918415591794e-05, + "loss": 2.5485, + "step": 29319 + }, + { + "epoch": 0.8694362897725588, + "grad_norm": 0.07879733294248581, + "learning_rate": 4.232597068248828e-05, + "loss": 2.5551, + "step": 29320 + }, + { + "epoch": 0.8694659431248702, + "grad_norm": 0.0864049643278122, + "learning_rate": 4.2307027002166873e-05, + "loss": 2.58, + "step": 29321 + }, + { + "epoch": 0.8694955964771818, + "grad_norm": 0.07455264031887054, + "learning_rate": 4.228808737479517e-05, + "loss": 2.5599, + "step": 29322 + }, + { + "epoch": 0.8695252498294932, + "grad_norm": 0.07457245141267776, + "learning_rate": 4.2269151800541136e-05, + "loss": 2.5558, + "step": 29323 + }, + { + "epoch": 0.8695549031818047, + "grad_norm": 0.07833973318338394, + "learning_rate": 4.225022027957248e-05, + "loss": 2.5865, + "step": 29324 + }, + { + "epoch": 0.8695845565341161, + "grad_norm": 0.07855874300003052, + "learning_rate": 4.223129281205651e-05, + "loss": 2.5665, + "step": 29325 + }, + { + "epoch": 0.8696142098864277, + "grad_norm": 0.08072582632303238, + "learning_rate": 4.221236939816109e-05, + "loss": 2.5877, + "step": 29326 + }, + { + "epoch": 0.8696438632387391, + "grad_norm": 0.0800234004855156, + "learning_rate": 4.219345003805369e-05, + "loss": 2.5523, + "step": 29327 + }, + { + "epoch": 0.8696735165910506, + "grad_norm": 0.07909145951271057, + "learning_rate": 4.217453473190186e-05, + "loss": 2.5739, + "step": 29328 + }, + { + "epoch": 0.869703169943362, + "grad_norm": 0.0780433863401413, + "learning_rate": 4.2155623479873116e-05, + "loss": 2.5289, + "step": 29329 + }, + { + "epoch": 0.8697328232956736, + "grad_norm": 0.07508132606744766, + "learning_rate": 4.213671628213489e-05, + "loss": 2.5802, + "step": 29330 + }, + { + "epoch": 0.869762476647985, + "grad_norm": 0.07601757347583771, + "learning_rate": 4.2117813138854654e-05, + "loss": 2.5953, + "step": 29331 + }, + { + "epoch": 0.8697921300002965, + "grad_norm": 0.07664512097835541, + "learning_rate": 4.209891405019983e-05, + "loss": 2.5577, + "step": 29332 + }, + { + "epoch": 0.869821783352608, + "grad_norm": 0.07912422716617584, + "learning_rate": 4.2080019016337675e-05, + "loss": 2.5846, + "step": 29333 + }, + { + "epoch": 0.8698514367049195, + "grad_norm": 0.08024299889802933, + "learning_rate": 4.20611280374355e-05, + "loss": 2.5799, + "step": 29334 + }, + { + "epoch": 0.869881090057231, + "grad_norm": 0.0758143961429596, + "learning_rate": 4.204224111366073e-05, + "loss": 2.5467, + "step": 29335 + }, + { + "epoch": 0.8699107434095424, + "grad_norm": 0.08103533834218979, + "learning_rate": 4.202335824518044e-05, + "loss": 2.5454, + "step": 29336 + }, + { + "epoch": 0.869940396761854, + "grad_norm": 0.07900989055633545, + "learning_rate": 4.2004479432162014e-05, + "loss": 2.6128, + "step": 29337 + }, + { + "epoch": 0.8699700501141654, + "grad_norm": 0.08378010988235474, + "learning_rate": 4.198560467477247e-05, + "loss": 2.5902, + "step": 29338 + }, + { + "epoch": 0.8699997034664769, + "grad_norm": 0.08113382756710052, + "learning_rate": 4.196673397317902e-05, + "loss": 2.5991, + "step": 29339 + }, + { + "epoch": 0.8700293568187883, + "grad_norm": 0.08050980418920517, + "learning_rate": 4.1947867327548794e-05, + "loss": 2.5775, + "step": 29340 + }, + { + "epoch": 0.8700590101710999, + "grad_norm": 0.07691359519958496, + "learning_rate": 4.1929004738048784e-05, + "loss": 2.5805, + "step": 29341 + }, + { + "epoch": 0.8700886635234113, + "grad_norm": 0.08453614264726639, + "learning_rate": 4.1910146204846126e-05, + "loss": 2.5568, + "step": 29342 + }, + { + "epoch": 0.8701183168757228, + "grad_norm": 0.0803346186876297, + "learning_rate": 4.1891291728107684e-05, + "loss": 2.5692, + "step": 29343 + }, + { + "epoch": 0.8701479702280343, + "grad_norm": 0.07771068066358566, + "learning_rate": 4.18724413080005e-05, + "loss": 2.5583, + "step": 29344 + }, + { + "epoch": 0.8701776235803458, + "grad_norm": 0.07944564521312714, + "learning_rate": 4.185359494469149e-05, + "loss": 2.5922, + "step": 29345 + }, + { + "epoch": 0.8702072769326572, + "grad_norm": 0.08345197141170502, + "learning_rate": 4.183475263834757e-05, + "loss": 2.552, + "step": 29346 + }, + { + "epoch": 0.8702369302849687, + "grad_norm": 0.08144093304872513, + "learning_rate": 4.1815914389135514e-05, + "loss": 2.5525, + "step": 29347 + }, + { + "epoch": 0.8702665836372802, + "grad_norm": 0.08075488358736038, + "learning_rate": 4.179708019722223e-05, + "loss": 2.5593, + "step": 29348 + }, + { + "epoch": 0.8702962369895917, + "grad_norm": 0.07975845038890839, + "learning_rate": 4.1778250062774414e-05, + "loss": 2.5557, + "step": 29349 + }, + { + "epoch": 0.8703258903419031, + "grad_norm": 0.08132990449666977, + "learning_rate": 4.175942398595889e-05, + "loss": 2.5221, + "step": 29350 + }, + { + "epoch": 0.8703555436942146, + "grad_norm": 0.0812583640217781, + "learning_rate": 4.174060196694224e-05, + "loss": 2.5473, + "step": 29351 + }, + { + "epoch": 0.8703851970465261, + "grad_norm": 0.07409750670194626, + "learning_rate": 4.172178400589127e-05, + "loss": 2.5757, + "step": 29352 + }, + { + "epoch": 0.8704148503988376, + "grad_norm": 0.0843663141131401, + "learning_rate": 4.1702970102972584e-05, + "loss": 2.6221, + "step": 29353 + }, + { + "epoch": 0.870444503751149, + "grad_norm": 0.0836699828505516, + "learning_rate": 4.1684160258352754e-05, + "loss": 2.6017, + "step": 29354 + }, + { + "epoch": 0.8704741571034605, + "grad_norm": 0.08162804692983627, + "learning_rate": 4.166535447219833e-05, + "loss": 2.5794, + "step": 29355 + }, + { + "epoch": 0.8705038104557721, + "grad_norm": 0.07707098871469498, + "learning_rate": 4.1646552744675884e-05, + "loss": 2.5646, + "step": 29356 + }, + { + "epoch": 0.8705334638080835, + "grad_norm": 0.08317290246486664, + "learning_rate": 4.16277550759519e-05, + "loss": 2.5693, + "step": 29357 + }, + { + "epoch": 0.870563117160395, + "grad_norm": 0.08128045499324799, + "learning_rate": 4.1608961466192807e-05, + "loss": 2.5885, + "step": 29358 + }, + { + "epoch": 0.8705927705127064, + "grad_norm": 0.0744086503982544, + "learning_rate": 4.1590171915565076e-05, + "loss": 2.5881, + "step": 29359 + }, + { + "epoch": 0.870622423865018, + "grad_norm": 0.07465723156929016, + "learning_rate": 4.157138642423502e-05, + "loss": 2.5734, + "step": 29360 + }, + { + "epoch": 0.8706520772173294, + "grad_norm": 0.08329985290765762, + "learning_rate": 4.155260499236896e-05, + "loss": 2.579, + "step": 29361 + }, + { + "epoch": 0.8706817305696409, + "grad_norm": 0.07743048667907715, + "learning_rate": 4.153382762013336e-05, + "loss": 2.52, + "step": 29362 + }, + { + "epoch": 0.8707113839219524, + "grad_norm": 0.07851914316415787, + "learning_rate": 4.151505430769437e-05, + "loss": 2.6011, + "step": 29363 + }, + { + "epoch": 0.8707410372742639, + "grad_norm": 0.07651517540216446, + "learning_rate": 4.14962850552183e-05, + "loss": 2.5687, + "step": 29364 + }, + { + "epoch": 0.8707706906265753, + "grad_norm": 0.0909510999917984, + "learning_rate": 4.147751986287135e-05, + "loss": 2.5464, + "step": 29365 + }, + { + "epoch": 0.8708003439788868, + "grad_norm": 0.0772520899772644, + "learning_rate": 4.1458758730819616e-05, + "loss": 2.6027, + "step": 29366 + }, + { + "epoch": 0.8708299973311983, + "grad_norm": 0.08248371630907059, + "learning_rate": 4.144000165922929e-05, + "loss": 2.5564, + "step": 29367 + }, + { + "epoch": 0.8708596506835098, + "grad_norm": 0.08278291672468185, + "learning_rate": 4.142124864826646e-05, + "loss": 2.5843, + "step": 29368 + }, + { + "epoch": 0.8708893040358212, + "grad_norm": 0.07819882035255432, + "learning_rate": 4.140249969809729e-05, + "loss": 2.5587, + "step": 29369 + }, + { + "epoch": 0.8709189573881327, + "grad_norm": 0.07859045267105103, + "learning_rate": 4.138375480888762e-05, + "loss": 2.5721, + "step": 29370 + }, + { + "epoch": 0.8709486107404442, + "grad_norm": 0.08091282099485397, + "learning_rate": 4.13650139808035e-05, + "loss": 2.5655, + "step": 29371 + }, + { + "epoch": 0.8709782640927557, + "grad_norm": 0.08120404928922653, + "learning_rate": 4.134627721401096e-05, + "loss": 2.5708, + "step": 29372 + }, + { + "epoch": 0.8710079174450671, + "grad_norm": 0.07957463711500168, + "learning_rate": 4.132754450867582e-05, + "loss": 2.5348, + "step": 29373 + }, + { + "epoch": 0.8710375707973786, + "grad_norm": 0.08372563868761063, + "learning_rate": 4.130881586496388e-05, + "loss": 2.5506, + "step": 29374 + }, + { + "epoch": 0.8710672241496901, + "grad_norm": 0.09125541895627975, + "learning_rate": 4.129009128304123e-05, + "loss": 2.5896, + "step": 29375 + }, + { + "epoch": 0.8710968775020016, + "grad_norm": 0.08134488016366959, + "learning_rate": 4.127137076307358e-05, + "loss": 2.6011, + "step": 29376 + }, + { + "epoch": 0.8711265308543131, + "grad_norm": 0.07794231921434402, + "learning_rate": 4.1252654305226676e-05, + "loss": 2.599, + "step": 29377 + }, + { + "epoch": 0.8711561842066246, + "grad_norm": 0.0894184336066246, + "learning_rate": 4.1233941909666226e-05, + "loss": 2.5607, + "step": 29378 + }, + { + "epoch": 0.8711858375589361, + "grad_norm": 0.08596192300319672, + "learning_rate": 4.121523357655799e-05, + "loss": 2.5794, + "step": 29379 + }, + { + "epoch": 0.8712154909112475, + "grad_norm": 0.07967184484004974, + "learning_rate": 4.119652930606771e-05, + "loss": 2.5969, + "step": 29380 + }, + { + "epoch": 0.871245144263559, + "grad_norm": 0.07994580268859863, + "learning_rate": 4.117782909836082e-05, + "loss": 2.5622, + "step": 29381 + }, + { + "epoch": 0.8712747976158705, + "grad_norm": 0.08415132761001587, + "learning_rate": 4.115913295360302e-05, + "loss": 2.5288, + "step": 29382 + }, + { + "epoch": 0.871304450968182, + "grad_norm": 0.07920393347740173, + "learning_rate": 4.114044087195984e-05, + "loss": 2.6033, + "step": 29383 + }, + { + "epoch": 0.8713341043204934, + "grad_norm": 0.08051787316799164, + "learning_rate": 4.1121752853596814e-05, + "loss": 2.5989, + "step": 29384 + }, + { + "epoch": 0.8713637576728049, + "grad_norm": 0.07727142423391342, + "learning_rate": 4.110306889867943e-05, + "loss": 2.5772, + "step": 29385 + }, + { + "epoch": 0.8713934110251164, + "grad_norm": 0.07832235842943192, + "learning_rate": 4.108438900737316e-05, + "loss": 2.5712, + "step": 29386 + }, + { + "epoch": 0.8714230643774279, + "grad_norm": 0.08408264070749283, + "learning_rate": 4.106571317984325e-05, + "loss": 2.5479, + "step": 29387 + }, + { + "epoch": 0.8714527177297393, + "grad_norm": 0.08234681934118271, + "learning_rate": 4.104704141625537e-05, + "loss": 2.5873, + "step": 29388 + }, + { + "epoch": 0.8714823710820508, + "grad_norm": 0.07482070475816727, + "learning_rate": 4.1028373716774704e-05, + "loss": 2.548, + "step": 29389 + }, + { + "epoch": 0.8715120244343623, + "grad_norm": 0.08597382158041, + "learning_rate": 4.100971008156667e-05, + "loss": 2.5655, + "step": 29390 + }, + { + "epoch": 0.8715416777866738, + "grad_norm": 0.08611099421977997, + "learning_rate": 4.0991050510796325e-05, + "loss": 2.591, + "step": 29391 + }, + { + "epoch": 0.8715713311389852, + "grad_norm": 0.0809415802359581, + "learning_rate": 4.0972395004629015e-05, + "loss": 2.5738, + "step": 29392 + }, + { + "epoch": 0.8716009844912967, + "grad_norm": 0.08339385688304901, + "learning_rate": 4.095374356322995e-05, + "loss": 2.5731, + "step": 29393 + }, + { + "epoch": 0.8716306378436082, + "grad_norm": 0.07646913826465607, + "learning_rate": 4.093509618676433e-05, + "loss": 2.5682, + "step": 29394 + }, + { + "epoch": 0.8716602911959197, + "grad_norm": 0.08800478279590607, + "learning_rate": 4.0916452875397135e-05, + "loss": 2.5947, + "step": 29395 + }, + { + "epoch": 0.8716899445482312, + "grad_norm": 0.08234412223100662, + "learning_rate": 4.089781362929362e-05, + "loss": 2.567, + "step": 29396 + }, + { + "epoch": 0.8717195979005427, + "grad_norm": 0.0777609720826149, + "learning_rate": 4.087917844861877e-05, + "loss": 2.5487, + "step": 29397 + }, + { + "epoch": 0.8717492512528542, + "grad_norm": 0.07833196222782135, + "learning_rate": 4.0860547333537554e-05, + "loss": 2.6204, + "step": 29398 + }, + { + "epoch": 0.8717789046051656, + "grad_norm": 0.08240728080272675, + "learning_rate": 4.084192028421496e-05, + "loss": 2.5976, + "step": 29399 + }, + { + "epoch": 0.8718085579574771, + "grad_norm": 0.08205673843622208, + "learning_rate": 4.082329730081613e-05, + "loss": 2.5434, + "step": 29400 + }, + { + "epoch": 0.8718382113097886, + "grad_norm": 0.07892535626888275, + "learning_rate": 4.080467838350571e-05, + "loss": 2.5475, + "step": 29401 + }, + { + "epoch": 0.8718678646621001, + "grad_norm": 0.08136657625436783, + "learning_rate": 4.078606353244874e-05, + "loss": 2.5971, + "step": 29402 + }, + { + "epoch": 0.8718975180144115, + "grad_norm": 0.07511559128761292, + "learning_rate": 4.0767452747809964e-05, + "loss": 2.546, + "step": 29403 + }, + { + "epoch": 0.871927171366723, + "grad_norm": 0.08278602361679077, + "learning_rate": 4.0748846029754205e-05, + "loss": 2.5694, + "step": 29404 + }, + { + "epoch": 0.8719568247190345, + "grad_norm": 0.07688599079847336, + "learning_rate": 4.0730243378446273e-05, + "loss": 2.5791, + "step": 29405 + }, + { + "epoch": 0.871986478071346, + "grad_norm": 0.07846172153949738, + "learning_rate": 4.071164479405082e-05, + "loss": 2.5485, + "step": 29406 + }, + { + "epoch": 0.8720161314236574, + "grad_norm": 0.08167227357625961, + "learning_rate": 4.0693050276732645e-05, + "loss": 2.5729, + "step": 29407 + }, + { + "epoch": 0.872045784775969, + "grad_norm": 0.08306622505187988, + "learning_rate": 4.0674459826656295e-05, + "loss": 2.5579, + "step": 29408 + }, + { + "epoch": 0.8720754381282804, + "grad_norm": 0.07908008247613907, + "learning_rate": 4.065587344398653e-05, + "loss": 2.5851, + "step": 29409 + }, + { + "epoch": 0.8721050914805919, + "grad_norm": 0.07837066799402237, + "learning_rate": 4.063729112888775e-05, + "loss": 2.558, + "step": 29410 + }, + { + "epoch": 0.8721347448329033, + "grad_norm": 0.07945314794778824, + "learning_rate": 4.061871288152469e-05, + "loss": 2.5983, + "step": 29411 + }, + { + "epoch": 0.8721643981852149, + "grad_norm": 0.0860256478190422, + "learning_rate": 4.060013870206175e-05, + "loss": 2.5818, + "step": 29412 + }, + { + "epoch": 0.8721940515375263, + "grad_norm": 0.08345936983823776, + "learning_rate": 4.058156859066342e-05, + "loss": 2.5838, + "step": 29413 + }, + { + "epoch": 0.8722237048898378, + "grad_norm": 0.08186917752027512, + "learning_rate": 4.0563002547494175e-05, + "loss": 2.583, + "step": 29414 + }, + { + "epoch": 0.8722533582421492, + "grad_norm": 0.08149018883705139, + "learning_rate": 4.054444057271839e-05, + "loss": 2.5965, + "step": 29415 + }, + { + "epoch": 0.8722830115944608, + "grad_norm": 0.07764416188001633, + "learning_rate": 4.0525882666500426e-05, + "loss": 2.5777, + "step": 29416 + }, + { + "epoch": 0.8723126649467723, + "grad_norm": 0.07896671444177628, + "learning_rate": 4.0507328829004706e-05, + "loss": 2.5644, + "step": 29417 + }, + { + "epoch": 0.8723423182990837, + "grad_norm": 0.08372984081506729, + "learning_rate": 4.048877906039539e-05, + "loss": 2.5685, + "step": 29418 + }, + { + "epoch": 0.8723719716513952, + "grad_norm": 0.08345927298069, + "learning_rate": 4.047023336083683e-05, + "loss": 2.5968, + "step": 29419 + }, + { + "epoch": 0.8724016250037067, + "grad_norm": 0.0792466551065445, + "learning_rate": 4.045169173049329e-05, + "loss": 2.5499, + "step": 29420 + }, + { + "epoch": 0.8724312783560182, + "grad_norm": 0.08152985572814941, + "learning_rate": 4.0433154169528805e-05, + "loss": 2.5917, + "step": 29421 + }, + { + "epoch": 0.8724609317083296, + "grad_norm": 0.08260113000869751, + "learning_rate": 4.0414620678107695e-05, + "loss": 2.5361, + "step": 29422 + }, + { + "epoch": 0.8724905850606411, + "grad_norm": 0.08063182979822159, + "learning_rate": 4.0396091256393986e-05, + "loss": 2.5839, + "step": 29423 + }, + { + "epoch": 0.8725202384129526, + "grad_norm": 0.08309075236320496, + "learning_rate": 4.037756590455188e-05, + "loss": 2.5812, + "step": 29424 + }, + { + "epoch": 0.8725498917652641, + "grad_norm": 0.08246025443077087, + "learning_rate": 4.035904462274509e-05, + "loss": 2.6066, + "step": 29425 + }, + { + "epoch": 0.8725795451175755, + "grad_norm": 0.08370174467563629, + "learning_rate": 4.034052741113803e-05, + "loss": 2.5826, + "step": 29426 + }, + { + "epoch": 0.872609198469887, + "grad_norm": 0.0850130170583725, + "learning_rate": 4.03220142698944e-05, + "loss": 2.5676, + "step": 29427 + }, + { + "epoch": 0.8726388518221985, + "grad_norm": 0.08232108503580093, + "learning_rate": 4.0303505199178305e-05, + "loss": 2.5522, + "step": 29428 + }, + { + "epoch": 0.87266850517451, + "grad_norm": 0.08507618308067322, + "learning_rate": 4.0285000199153544e-05, + "loss": 2.6209, + "step": 29429 + }, + { + "epoch": 0.8726981585268214, + "grad_norm": 0.08253535628318787, + "learning_rate": 4.026649926998405e-05, + "loss": 2.6057, + "step": 29430 + }, + { + "epoch": 0.872727811879133, + "grad_norm": 0.08167658746242523, + "learning_rate": 4.024800241183363e-05, + "loss": 2.5383, + "step": 29431 + }, + { + "epoch": 0.8727574652314444, + "grad_norm": 0.08883627504110336, + "learning_rate": 4.022950962486599e-05, + "loss": 2.5724, + "step": 29432 + }, + { + "epoch": 0.8727871185837559, + "grad_norm": 0.08478628844022751, + "learning_rate": 4.0211020909245e-05, + "loss": 2.5766, + "step": 29433 + }, + { + "epoch": 0.8728167719360673, + "grad_norm": 0.08386062830686569, + "learning_rate": 4.019253626513436e-05, + "loss": 2.5604, + "step": 29434 + }, + { + "epoch": 0.8728464252883789, + "grad_norm": 0.10110557824373245, + "learning_rate": 4.017405569269783e-05, + "loss": 2.5564, + "step": 29435 + }, + { + "epoch": 0.8728760786406903, + "grad_norm": 0.07784773409366608, + "learning_rate": 4.0155579192098834e-05, + "loss": 2.5682, + "step": 29436 + }, + { + "epoch": 0.8729057319930018, + "grad_norm": 0.08265480399131775, + "learning_rate": 4.013710676350113e-05, + "loss": 2.588, + "step": 29437 + }, + { + "epoch": 0.8729353853453133, + "grad_norm": 0.09074196964502335, + "learning_rate": 4.0118638407068195e-05, + "loss": 2.5794, + "step": 29438 + }, + { + "epoch": 0.8729650386976248, + "grad_norm": 0.0857352539896965, + "learning_rate": 4.0100174122963795e-05, + "loss": 2.5414, + "step": 29439 + }, + { + "epoch": 0.8729946920499363, + "grad_norm": 0.0843232199549675, + "learning_rate": 4.008171391135124e-05, + "loss": 2.5958, + "step": 29440 + }, + { + "epoch": 0.8730243454022477, + "grad_norm": 0.09097126871347427, + "learning_rate": 4.006325777239406e-05, + "loss": 2.5841, + "step": 29441 + }, + { + "epoch": 0.8730539987545592, + "grad_norm": 0.07965458184480667, + "learning_rate": 4.0044805706255685e-05, + "loss": 2.5345, + "step": 29442 + }, + { + "epoch": 0.8730836521068707, + "grad_norm": 0.07863109558820724, + "learning_rate": 4.0026357713099484e-05, + "loss": 2.5544, + "step": 29443 + }, + { + "epoch": 0.8731133054591822, + "grad_norm": 0.08547933399677277, + "learning_rate": 4.000791379308882e-05, + "loss": 2.5305, + "step": 29444 + }, + { + "epoch": 0.8731429588114936, + "grad_norm": 0.07550948858261108, + "learning_rate": 3.998947394638719e-05, + "loss": 2.5508, + "step": 29445 + }, + { + "epoch": 0.8731726121638052, + "grad_norm": 0.07573853433132172, + "learning_rate": 3.9971038173157615e-05, + "loss": 2.5581, + "step": 29446 + }, + { + "epoch": 0.8732022655161166, + "grad_norm": 0.08984193205833435, + "learning_rate": 3.995260647356341e-05, + "loss": 2.5688, + "step": 29447 + }, + { + "epoch": 0.8732319188684281, + "grad_norm": 0.07334094494581223, + "learning_rate": 3.99341788477679e-05, + "loss": 2.5725, + "step": 29448 + }, + { + "epoch": 0.8732615722207395, + "grad_norm": 0.07858843356370926, + "learning_rate": 3.991575529593422e-05, + "loss": 2.5543, + "step": 29449 + }, + { + "epoch": 0.873291225573051, + "grad_norm": 0.08096504956483841, + "learning_rate": 3.989733581822547e-05, + "loss": 2.5638, + "step": 29450 + }, + { + "epoch": 0.8733208789253625, + "grad_norm": 0.07992719113826752, + "learning_rate": 3.9878920414804675e-05, + "loss": 2.5556, + "step": 29451 + }, + { + "epoch": 0.873350532277674, + "grad_norm": 0.0784640908241272, + "learning_rate": 3.986050908583511e-05, + "loss": 2.5808, + "step": 29452 + }, + { + "epoch": 0.8733801856299854, + "grad_norm": 0.08101300895214081, + "learning_rate": 3.984210183147974e-05, + "loss": 2.5701, + "step": 29453 + }, + { + "epoch": 0.873409838982297, + "grad_norm": 0.07835006713867188, + "learning_rate": 3.982369865190155e-05, + "loss": 2.5806, + "step": 29454 + }, + { + "epoch": 0.8734394923346084, + "grad_norm": 0.08336719870567322, + "learning_rate": 3.9805299547263475e-05, + "loss": 2.5984, + "step": 29455 + }, + { + "epoch": 0.8734691456869199, + "grad_norm": 0.0864809900522232, + "learning_rate": 3.9786904517728595e-05, + "loss": 2.5683, + "step": 29456 + }, + { + "epoch": 0.8734987990392313, + "grad_norm": 0.08113645017147064, + "learning_rate": 3.9768513563459563e-05, + "loss": 2.5647, + "step": 29457 + }, + { + "epoch": 0.8735284523915429, + "grad_norm": 0.07853315025568008, + "learning_rate": 3.97501266846193e-05, + "loss": 2.6029, + "step": 29458 + }, + { + "epoch": 0.8735581057438544, + "grad_norm": 0.07883051782846451, + "learning_rate": 3.973174388137068e-05, + "loss": 2.5535, + "step": 29459 + }, + { + "epoch": 0.8735877590961658, + "grad_norm": 0.08087429404258728, + "learning_rate": 3.971336515387652e-05, + "loss": 2.5967, + "step": 29460 + }, + { + "epoch": 0.8736174124484773, + "grad_norm": 0.07591798156499863, + "learning_rate": 3.969499050229947e-05, + "loss": 2.5721, + "step": 29461 + }, + { + "epoch": 0.8736470658007888, + "grad_norm": 0.08112318813800812, + "learning_rate": 3.967661992680233e-05, + "loss": 2.5695, + "step": 29462 + }, + { + "epoch": 0.8736767191531003, + "grad_norm": 0.07854209840297699, + "learning_rate": 3.96582534275477e-05, + "loss": 2.5746, + "step": 29463 + }, + { + "epoch": 0.8737063725054117, + "grad_norm": 0.0775836706161499, + "learning_rate": 3.963989100469817e-05, + "loss": 2.5705, + "step": 29464 + }, + { + "epoch": 0.8737360258577233, + "grad_norm": 0.0806153416633606, + "learning_rate": 3.9621532658416505e-05, + "loss": 2.5877, + "step": 29465 + }, + { + "epoch": 0.8737656792100347, + "grad_norm": 0.08269979804754257, + "learning_rate": 3.960317838886529e-05, + "loss": 2.5555, + "step": 29466 + }, + { + "epoch": 0.8737953325623462, + "grad_norm": 0.08029064536094666, + "learning_rate": 3.95848281962069e-05, + "loss": 2.5859, + "step": 29467 + }, + { + "epoch": 0.8738249859146576, + "grad_norm": 0.0837855264544487, + "learning_rate": 3.956648208060382e-05, + "loss": 2.5647, + "step": 29468 + }, + { + "epoch": 0.8738546392669692, + "grad_norm": 0.08262243866920471, + "learning_rate": 3.954814004221863e-05, + "loss": 2.563, + "step": 29469 + }, + { + "epoch": 0.8738842926192806, + "grad_norm": 0.0783780887722969, + "learning_rate": 3.952980208121365e-05, + "loss": 2.5569, + "step": 29470 + }, + { + "epoch": 0.8739139459715921, + "grad_norm": 0.07973989844322205, + "learning_rate": 3.951146819775136e-05, + "loss": 2.5899, + "step": 29471 + }, + { + "epoch": 0.8739435993239035, + "grad_norm": 0.08726031333208084, + "learning_rate": 3.9493138391994034e-05, + "loss": 2.5653, + "step": 29472 + }, + { + "epoch": 0.8739732526762151, + "grad_norm": 0.08240877091884613, + "learning_rate": 3.947481266410402e-05, + "loss": 2.5754, + "step": 29473 + }, + { + "epoch": 0.8740029060285265, + "grad_norm": 0.0788586214184761, + "learning_rate": 3.945649101424359e-05, + "loss": 2.5801, + "step": 29474 + }, + { + "epoch": 0.874032559380838, + "grad_norm": 0.0827777311205864, + "learning_rate": 3.9438173442575e-05, + "loss": 2.5483, + "step": 29475 + }, + { + "epoch": 0.8740622127331494, + "grad_norm": 0.08382564783096313, + "learning_rate": 3.94198599492604e-05, + "loss": 2.5396, + "step": 29476 + }, + { + "epoch": 0.874091866085461, + "grad_norm": 0.08595933765172958, + "learning_rate": 3.940155053446198e-05, + "loss": 2.5753, + "step": 29477 + }, + { + "epoch": 0.8741215194377724, + "grad_norm": 0.08280524611473083, + "learning_rate": 3.938324519834197e-05, + "loss": 2.5709, + "step": 29478 + }, + { + "epoch": 0.8741511727900839, + "grad_norm": 0.07640329003334045, + "learning_rate": 3.936494394106232e-05, + "loss": 2.5487, + "step": 29479 + }, + { + "epoch": 0.8741808261423955, + "grad_norm": 0.08297266811132431, + "learning_rate": 3.934664676278515e-05, + "loss": 2.555, + "step": 29480 + }, + { + "epoch": 0.8742104794947069, + "grad_norm": 0.07964449375867844, + "learning_rate": 3.9328353663672546e-05, + "loss": 2.5511, + "step": 29481 + }, + { + "epoch": 0.8742401328470184, + "grad_norm": 0.08033331483602524, + "learning_rate": 3.931006464388642e-05, + "loss": 2.5876, + "step": 29482 + }, + { + "epoch": 0.8742697861993298, + "grad_norm": 0.07713285088539124, + "learning_rate": 3.929177970358877e-05, + "loss": 2.5544, + "step": 29483 + }, + { + "epoch": 0.8742994395516414, + "grad_norm": 0.0841417983174324, + "learning_rate": 3.927349884294146e-05, + "loss": 2.5609, + "step": 29484 + }, + { + "epoch": 0.8743290929039528, + "grad_norm": 0.07965591549873352, + "learning_rate": 3.925522206210641e-05, + "loss": 2.5405, + "step": 29485 + }, + { + "epoch": 0.8743587462562643, + "grad_norm": 0.09170281887054443, + "learning_rate": 3.923694936124544e-05, + "loss": 2.5567, + "step": 29486 + }, + { + "epoch": 0.8743883996085757, + "grad_norm": 0.08098511397838593, + "learning_rate": 3.921868074052037e-05, + "loss": 2.5705, + "step": 29487 + }, + { + "epoch": 0.8744180529608873, + "grad_norm": 0.0794607624411583, + "learning_rate": 3.9200416200093016e-05, + "loss": 2.5586, + "step": 29488 + }, + { + "epoch": 0.8744477063131987, + "grad_norm": 0.08923453837633133, + "learning_rate": 3.918215574012501e-05, + "loss": 2.5591, + "step": 29489 + }, + { + "epoch": 0.8744773596655102, + "grad_norm": 0.08279384672641754, + "learning_rate": 3.916389936077819e-05, + "loss": 2.5877, + "step": 29490 + }, + { + "epoch": 0.8745070130178216, + "grad_norm": 0.086859330534935, + "learning_rate": 3.914564706221407e-05, + "loss": 2.5578, + "step": 29491 + }, + { + "epoch": 0.8745366663701332, + "grad_norm": 0.0883328765630722, + "learning_rate": 3.912739884459443e-05, + "loss": 2.5843, + "step": 29492 + }, + { + "epoch": 0.8745663197224446, + "grad_norm": 0.07641506940126419, + "learning_rate": 3.9109154708080684e-05, + "loss": 2.6036, + "step": 29493 + }, + { + "epoch": 0.8745959730747561, + "grad_norm": 0.08636175841093063, + "learning_rate": 3.9090914652834544e-05, + "loss": 2.6001, + "step": 29494 + }, + { + "epoch": 0.8746256264270675, + "grad_norm": 0.08226755261421204, + "learning_rate": 3.907267867901748e-05, + "loss": 2.5179, + "step": 29495 + }, + { + "epoch": 0.8746552797793791, + "grad_norm": 0.07953605055809021, + "learning_rate": 3.905444678679093e-05, + "loss": 2.5803, + "step": 29496 + }, + { + "epoch": 0.8746849331316905, + "grad_norm": 0.0784781277179718, + "learning_rate": 3.903621897631637e-05, + "loss": 2.5875, + "step": 29497 + }, + { + "epoch": 0.874714586484002, + "grad_norm": 0.08080276846885681, + "learning_rate": 3.901799524775523e-05, + "loss": 2.5587, + "step": 29498 + }, + { + "epoch": 0.8747442398363134, + "grad_norm": 0.07409953325986862, + "learning_rate": 3.899977560126888e-05, + "loss": 2.5878, + "step": 29499 + }, + { + "epoch": 0.874773893188625, + "grad_norm": 0.07626157253980637, + "learning_rate": 3.8981560037018745e-05, + "loss": 2.5615, + "step": 29500 + }, + { + "epoch": 0.8748035465409365, + "grad_norm": 0.08143588155508041, + "learning_rate": 3.896334855516587e-05, + "loss": 2.5947, + "step": 29501 + }, + { + "epoch": 0.8748331998932479, + "grad_norm": 0.0799066573381424, + "learning_rate": 3.8945141155871775e-05, + "loss": 2.5518, + "step": 29502 + }, + { + "epoch": 0.8748628532455595, + "grad_norm": 0.07877671718597412, + "learning_rate": 3.892693783929746e-05, + "loss": 2.578, + "step": 29503 + }, + { + "epoch": 0.8748925065978709, + "grad_norm": 0.07865479588508606, + "learning_rate": 3.8908738605604345e-05, + "loss": 2.5808, + "step": 29504 + }, + { + "epoch": 0.8749221599501824, + "grad_norm": 0.07655937224626541, + "learning_rate": 3.8890543454953465e-05, + "loss": 2.5655, + "step": 29505 + }, + { + "epoch": 0.8749518133024938, + "grad_norm": 0.07578036934137344, + "learning_rate": 3.887235238750597e-05, + "loss": 2.5168, + "step": 29506 + }, + { + "epoch": 0.8749814666548054, + "grad_norm": 0.07852508872747421, + "learning_rate": 3.885416540342301e-05, + "loss": 2.5906, + "step": 29507 + }, + { + "epoch": 0.8750111200071168, + "grad_norm": 0.07846028357744217, + "learning_rate": 3.883598250286552e-05, + "loss": 2.5624, + "step": 29508 + }, + { + "epoch": 0.8750407733594283, + "grad_norm": 0.0734008178114891, + "learning_rate": 3.881780368599458e-05, + "loss": 2.5655, + "step": 29509 + }, + { + "epoch": 0.8750704267117397, + "grad_norm": 0.0843057930469513, + "learning_rate": 3.8799628952971124e-05, + "loss": 2.5758, + "step": 29510 + }, + { + "epoch": 0.8751000800640513, + "grad_norm": 0.0773833692073822, + "learning_rate": 3.878145830395624e-05, + "loss": 2.5524, + "step": 29511 + }, + { + "epoch": 0.8751297334163627, + "grad_norm": 0.08218546211719513, + "learning_rate": 3.8763291739110586e-05, + "loss": 2.5566, + "step": 29512 + }, + { + "epoch": 0.8751593867686742, + "grad_norm": 0.08412870764732361, + "learning_rate": 3.874512925859519e-05, + "loss": 2.5833, + "step": 29513 + }, + { + "epoch": 0.8751890401209856, + "grad_norm": 0.08146274089813232, + "learning_rate": 3.872697086257082e-05, + "loss": 2.5678, + "step": 29514 + }, + { + "epoch": 0.8752186934732972, + "grad_norm": 0.08094072341918945, + "learning_rate": 3.8708816551198344e-05, + "loss": 2.5498, + "step": 29515 + }, + { + "epoch": 0.8752483468256086, + "grad_norm": 0.07677364349365234, + "learning_rate": 3.869066632463836e-05, + "loss": 2.5425, + "step": 29516 + }, + { + "epoch": 0.8752780001779201, + "grad_norm": 0.08998946100473404, + "learning_rate": 3.867252018305178e-05, + "loss": 2.614, + "step": 29517 + }, + { + "epoch": 0.8753076535302315, + "grad_norm": 0.0843786895275116, + "learning_rate": 3.865437812659922e-05, + "loss": 2.5699, + "step": 29518 + }, + { + "epoch": 0.8753373068825431, + "grad_norm": 0.07765360176563263, + "learning_rate": 3.863624015544137e-05, + "loss": 2.5549, + "step": 29519 + }, + { + "epoch": 0.8753669602348545, + "grad_norm": 0.08038423955440521, + "learning_rate": 3.861810626973877e-05, + "loss": 2.5926, + "step": 29520 + }, + { + "epoch": 0.875396613587166, + "grad_norm": 0.07844556868076324, + "learning_rate": 3.859997646965213e-05, + "loss": 2.5638, + "step": 29521 + }, + { + "epoch": 0.8754262669394776, + "grad_norm": 0.07993072271347046, + "learning_rate": 3.858185075534182e-05, + "loss": 2.5811, + "step": 29522 + }, + { + "epoch": 0.875455920291789, + "grad_norm": 0.0795438140630722, + "learning_rate": 3.856372912696837e-05, + "loss": 2.5691, + "step": 29523 + }, + { + "epoch": 0.8754855736441005, + "grad_norm": 0.07919266819953918, + "learning_rate": 3.8545611584692385e-05, + "loss": 2.5714, + "step": 29524 + }, + { + "epoch": 0.8755152269964119, + "grad_norm": 0.07558475434780121, + "learning_rate": 3.852749812867418e-05, + "loss": 2.5481, + "step": 29525 + }, + { + "epoch": 0.8755448803487235, + "grad_norm": 0.07845304161310196, + "learning_rate": 3.850938875907417e-05, + "loss": 2.5454, + "step": 29526 + }, + { + "epoch": 0.8755745337010349, + "grad_norm": 0.08255975693464279, + "learning_rate": 3.849128347605274e-05, + "loss": 2.5986, + "step": 29527 + }, + { + "epoch": 0.8756041870533464, + "grad_norm": 0.08227849751710892, + "learning_rate": 3.847318227977026e-05, + "loss": 2.5633, + "step": 29528 + }, + { + "epoch": 0.8756338404056578, + "grad_norm": 0.0796273723244667, + "learning_rate": 3.845508517038682e-05, + "loss": 2.5987, + "step": 29529 + }, + { + "epoch": 0.8756634937579694, + "grad_norm": 0.08398343622684479, + "learning_rate": 3.8436992148062955e-05, + "loss": 2.6061, + "step": 29530 + }, + { + "epoch": 0.8756931471102808, + "grad_norm": 0.07772161066532135, + "learning_rate": 3.8418903212958714e-05, + "loss": 2.5817, + "step": 29531 + }, + { + "epoch": 0.8757228004625923, + "grad_norm": 0.0740651786327362, + "learning_rate": 3.840081836523446e-05, + "loss": 2.571, + "step": 29532 + }, + { + "epoch": 0.8757524538149037, + "grad_norm": 0.08667036145925522, + "learning_rate": 3.838273760505007e-05, + "loss": 2.5694, + "step": 29533 + }, + { + "epoch": 0.8757821071672153, + "grad_norm": 0.07589522004127502, + "learning_rate": 3.8364660932565744e-05, + "loss": 2.528, + "step": 29534 + }, + { + "epoch": 0.8758117605195267, + "grad_norm": 0.07977736741304398, + "learning_rate": 3.8346588347941636e-05, + "loss": 2.5566, + "step": 29535 + }, + { + "epoch": 0.8758414138718382, + "grad_norm": 0.08295801281929016, + "learning_rate": 3.832851985133767e-05, + "loss": 2.5324, + "step": 29536 + }, + { + "epoch": 0.8758710672241496, + "grad_norm": 0.08517195284366608, + "learning_rate": 3.831045544291395e-05, + "loss": 2.5506, + "step": 29537 + }, + { + "epoch": 0.8759007205764612, + "grad_norm": 0.07843641936779022, + "learning_rate": 3.8292395122830395e-05, + "loss": 2.5623, + "step": 29538 + }, + { + "epoch": 0.8759303739287726, + "grad_norm": 0.08108248561620712, + "learning_rate": 3.8274338891246926e-05, + "loss": 2.5722, + "step": 29539 + }, + { + "epoch": 0.8759600272810841, + "grad_norm": 0.08322776108980179, + "learning_rate": 3.825628674832343e-05, + "loss": 2.528, + "step": 29540 + }, + { + "epoch": 0.8759896806333956, + "grad_norm": 0.0750085785984993, + "learning_rate": 3.823823869421983e-05, + "loss": 2.5519, + "step": 29541 + }, + { + "epoch": 0.8760193339857071, + "grad_norm": 0.08044591546058655, + "learning_rate": 3.822019472909583e-05, + "loss": 2.5699, + "step": 29542 + }, + { + "epoch": 0.8760489873380186, + "grad_norm": 0.0806213766336441, + "learning_rate": 3.8202154853111234e-05, + "loss": 2.5616, + "step": 29543 + }, + { + "epoch": 0.87607864069033, + "grad_norm": 0.08076401054859161, + "learning_rate": 3.818411906642588e-05, + "loss": 2.5657, + "step": 29544 + }, + { + "epoch": 0.8761082940426416, + "grad_norm": 0.07642021775245667, + "learning_rate": 3.8166087369199407e-05, + "loss": 2.5738, + "step": 29545 + }, + { + "epoch": 0.876137947394953, + "grad_norm": 0.08053021132946014, + "learning_rate": 3.8148059761591524e-05, + "loss": 2.5708, + "step": 29546 + }, + { + "epoch": 0.8761676007472645, + "grad_norm": 0.08337254822254181, + "learning_rate": 3.8130036243761826e-05, + "loss": 2.5786, + "step": 29547 + }, + { + "epoch": 0.8761972540995759, + "grad_norm": 0.08384981751441956, + "learning_rate": 3.8112016815869955e-05, + "loss": 2.5868, + "step": 29548 + }, + { + "epoch": 0.8762269074518875, + "grad_norm": 0.08221954107284546, + "learning_rate": 3.8094001478075405e-05, + "loss": 2.5893, + "step": 29549 + }, + { + "epoch": 0.8762565608041989, + "grad_norm": 0.08110090345144272, + "learning_rate": 3.8075990230537816e-05, + "loss": 2.5799, + "step": 29550 + }, + { + "epoch": 0.8762862141565104, + "grad_norm": 0.08876857161521912, + "learning_rate": 3.805798307341657e-05, + "loss": 2.5847, + "step": 29551 + }, + { + "epoch": 0.8763158675088218, + "grad_norm": 0.08036946505308151, + "learning_rate": 3.80399800068712e-05, + "loss": 2.5647, + "step": 29552 + }, + { + "epoch": 0.8763455208611334, + "grad_norm": 0.07953214645385742, + "learning_rate": 3.8021981031061136e-05, + "loss": 2.5633, + "step": 29553 + }, + { + "epoch": 0.8763751742134448, + "grad_norm": 0.0820005014538765, + "learning_rate": 3.8003986146145695e-05, + "loss": 2.5601, + "step": 29554 + }, + { + "epoch": 0.8764048275657563, + "grad_norm": 0.07736220955848694, + "learning_rate": 3.7985995352284254e-05, + "loss": 2.5423, + "step": 29555 + }, + { + "epoch": 0.8764344809180677, + "grad_norm": 0.08694154769182205, + "learning_rate": 3.796800864963612e-05, + "loss": 2.5741, + "step": 29556 + }, + { + "epoch": 0.8764641342703793, + "grad_norm": 0.0824923887848854, + "learning_rate": 3.7950026038360616e-05, + "loss": 2.5691, + "step": 29557 + }, + { + "epoch": 0.8764937876226907, + "grad_norm": 0.07742772251367569, + "learning_rate": 3.79320475186169e-05, + "loss": 2.588, + "step": 29558 + }, + { + "epoch": 0.8765234409750022, + "grad_norm": 0.08122646808624268, + "learning_rate": 3.791407309056422e-05, + "loss": 2.5728, + "step": 29559 + }, + { + "epoch": 0.8765530943273137, + "grad_norm": 0.08987174183130264, + "learning_rate": 3.7896102754361725e-05, + "loss": 2.616, + "step": 29560 + }, + { + "epoch": 0.8765827476796252, + "grad_norm": 0.08360885828733444, + "learning_rate": 3.787813651016858e-05, + "loss": 2.5789, + "step": 29561 + }, + { + "epoch": 0.8766124010319366, + "grad_norm": 0.07938025891780853, + "learning_rate": 3.7860174358143876e-05, + "loss": 2.6253, + "step": 29562 + }, + { + "epoch": 0.8766420543842481, + "grad_norm": 0.07948341220617294, + "learning_rate": 3.7842216298446643e-05, + "loss": 2.5561, + "step": 29563 + }, + { + "epoch": 0.8766717077365597, + "grad_norm": 0.08342637866735458, + "learning_rate": 3.7824262331235926e-05, + "loss": 2.5916, + "step": 29564 + }, + { + "epoch": 0.8767013610888711, + "grad_norm": 0.08406786620616913, + "learning_rate": 3.7806312456670655e-05, + "loss": 2.5756, + "step": 29565 + }, + { + "epoch": 0.8767310144411826, + "grad_norm": 0.09150604903697968, + "learning_rate": 3.778836667490992e-05, + "loss": 2.5602, + "step": 29566 + }, + { + "epoch": 0.876760667793494, + "grad_norm": 0.08174962550401688, + "learning_rate": 3.77704249861125e-05, + "loss": 2.5838, + "step": 29567 + }, + { + "epoch": 0.8767903211458056, + "grad_norm": 0.09270472079515457, + "learning_rate": 3.775248739043719e-05, + "loss": 2.5751, + "step": 29568 + }, + { + "epoch": 0.876819974498117, + "grad_norm": 0.09573248773813248, + "learning_rate": 3.773455388804303e-05, + "loss": 2.5719, + "step": 29569 + }, + { + "epoch": 0.8768496278504285, + "grad_norm": 0.08301510661840439, + "learning_rate": 3.771662447908874e-05, + "loss": 2.5589, + "step": 29570 + }, + { + "epoch": 0.87687928120274, + "grad_norm": 0.08740530163049698, + "learning_rate": 3.7698699163733084e-05, + "loss": 2.5578, + "step": 29571 + }, + { + "epoch": 0.8769089345550515, + "grad_norm": 0.0884070098400116, + "learning_rate": 3.7680777942134815e-05, + "loss": 2.5818, + "step": 29572 + }, + { + "epoch": 0.8769385879073629, + "grad_norm": 0.08509867638349533, + "learning_rate": 3.766286081445258e-05, + "loss": 2.5646, + "step": 29573 + }, + { + "epoch": 0.8769682412596744, + "grad_norm": 0.08064932376146317, + "learning_rate": 3.764494778084509e-05, + "loss": 2.5529, + "step": 29574 + }, + { + "epoch": 0.8769978946119859, + "grad_norm": 0.08042910695075989, + "learning_rate": 3.762703884147095e-05, + "loss": 2.5521, + "step": 29575 + }, + { + "epoch": 0.8770275479642974, + "grad_norm": 0.0811176672577858, + "learning_rate": 3.760913399648885e-05, + "loss": 2.5661, + "step": 29576 + }, + { + "epoch": 0.8770572013166088, + "grad_norm": 0.08383055031299591, + "learning_rate": 3.759123324605712e-05, + "loss": 2.5864, + "step": 29577 + }, + { + "epoch": 0.8770868546689203, + "grad_norm": 0.08047888427972794, + "learning_rate": 3.757333659033441e-05, + "loss": 2.5601, + "step": 29578 + }, + { + "epoch": 0.8771165080212318, + "grad_norm": 0.07712908089160919, + "learning_rate": 3.755544402947919e-05, + "loss": 2.5508, + "step": 29579 + }, + { + "epoch": 0.8771461613735433, + "grad_norm": 0.07914792746305466, + "learning_rate": 3.7537555563649796e-05, + "loss": 2.5408, + "step": 29580 + }, + { + "epoch": 0.8771758147258547, + "grad_norm": 0.08174659311771393, + "learning_rate": 3.751967119300481e-05, + "loss": 2.5349, + "step": 29581 + }, + { + "epoch": 0.8772054680781662, + "grad_norm": 0.08226492255926132, + "learning_rate": 3.7501790917702505e-05, + "loss": 2.5423, + "step": 29582 + }, + { + "epoch": 0.8772351214304777, + "grad_norm": 0.07621779292821884, + "learning_rate": 3.748391473790125e-05, + "loss": 2.573, + "step": 29583 + }, + { + "epoch": 0.8772647747827892, + "grad_norm": 0.07779750227928162, + "learning_rate": 3.746604265375936e-05, + "loss": 2.609, + "step": 29584 + }, + { + "epoch": 0.8772944281351007, + "grad_norm": 0.08670701831579208, + "learning_rate": 3.7448174665434986e-05, + "loss": 2.5543, + "step": 29585 + }, + { + "epoch": 0.8773240814874121, + "grad_norm": 0.07930289953947067, + "learning_rate": 3.743031077308645e-05, + "loss": 2.5344, + "step": 29586 + }, + { + "epoch": 0.8773537348397237, + "grad_norm": 0.07764945179224014, + "learning_rate": 3.7412450976871956e-05, + "loss": 2.5321, + "step": 29587 + }, + { + "epoch": 0.8773833881920351, + "grad_norm": 0.08707456290721893, + "learning_rate": 3.739459527694961e-05, + "loss": 2.5884, + "step": 29588 + }, + { + "epoch": 0.8774130415443466, + "grad_norm": 0.0826447382569313, + "learning_rate": 3.737674367347743e-05, + "loss": 2.5592, + "step": 29589 + }, + { + "epoch": 0.877442694896658, + "grad_norm": 0.07779194414615631, + "learning_rate": 3.7358896166613645e-05, + "loss": 2.5778, + "step": 29590 + }, + { + "epoch": 0.8774723482489696, + "grad_norm": 0.08295117318630219, + "learning_rate": 3.734105275651628e-05, + "loss": 2.5637, + "step": 29591 + }, + { + "epoch": 0.877502001601281, + "grad_norm": 0.0879640206694603, + "learning_rate": 3.7323213443343276e-05, + "loss": 2.6012, + "step": 29592 + }, + { + "epoch": 0.8775316549535925, + "grad_norm": 0.07730193436145782, + "learning_rate": 3.730537822725249e-05, + "loss": 2.5709, + "step": 29593 + }, + { + "epoch": 0.877561308305904, + "grad_norm": 0.07965973019599915, + "learning_rate": 3.728754710840215e-05, + "loss": 2.5623, + "step": 29594 + }, + { + "epoch": 0.8775909616582155, + "grad_norm": 0.08007129281759262, + "learning_rate": 3.726972008695001e-05, + "loss": 2.5622, + "step": 29595 + }, + { + "epoch": 0.8776206150105269, + "grad_norm": 0.08096804469823837, + "learning_rate": 3.725189716305388e-05, + "loss": 2.5919, + "step": 29596 + }, + { + "epoch": 0.8776502683628384, + "grad_norm": 0.08097571134567261, + "learning_rate": 3.7234078336871755e-05, + "loss": 2.5237, + "step": 29597 + }, + { + "epoch": 0.8776799217151499, + "grad_norm": 0.09268245100975037, + "learning_rate": 3.7216263608561226e-05, + "loss": 2.548, + "step": 29598 + }, + { + "epoch": 0.8777095750674614, + "grad_norm": 0.07577645033597946, + "learning_rate": 3.719845297828006e-05, + "loss": 2.5342, + "step": 29599 + }, + { + "epoch": 0.8777392284197728, + "grad_norm": 0.07738074660301208, + "learning_rate": 3.718064644618607e-05, + "loss": 2.5772, + "step": 29600 + }, + { + "epoch": 0.8777688817720843, + "grad_norm": 0.09042931348085403, + "learning_rate": 3.716284401243691e-05, + "loss": 2.5291, + "step": 29601 + }, + { + "epoch": 0.8777985351243958, + "grad_norm": 0.08093300461769104, + "learning_rate": 3.7145045677190184e-05, + "loss": 2.5697, + "step": 29602 + }, + { + "epoch": 0.8778281884767073, + "grad_norm": 0.07656164467334747, + "learning_rate": 3.7127251440603526e-05, + "loss": 2.6036, + "step": 29603 + }, + { + "epoch": 0.8778578418290188, + "grad_norm": 0.08246973901987076, + "learning_rate": 3.710946130283455e-05, + "loss": 2.5347, + "step": 29604 + }, + { + "epoch": 0.8778874951813302, + "grad_norm": 0.08216174691915512, + "learning_rate": 3.709167526404072e-05, + "loss": 2.5626, + "step": 29605 + }, + { + "epoch": 0.8779171485336418, + "grad_norm": 0.08193901926279068, + "learning_rate": 3.707389332437949e-05, + "loss": 2.5572, + "step": 29606 + }, + { + "epoch": 0.8779468018859532, + "grad_norm": 0.08102954924106598, + "learning_rate": 3.705611548400844e-05, + "loss": 2.6018, + "step": 29607 + }, + { + "epoch": 0.8779764552382647, + "grad_norm": 0.08063849806785583, + "learning_rate": 3.703834174308507e-05, + "loss": 2.6005, + "step": 29608 + }, + { + "epoch": 0.8780061085905762, + "grad_norm": 0.08550971746444702, + "learning_rate": 3.702057210176657e-05, + "loss": 2.5819, + "step": 29609 + }, + { + "epoch": 0.8780357619428877, + "grad_norm": 0.07726788520812988, + "learning_rate": 3.700280656021032e-05, + "loss": 2.5553, + "step": 29610 + }, + { + "epoch": 0.8780654152951991, + "grad_norm": 0.0870903804898262, + "learning_rate": 3.698504511857376e-05, + "loss": 2.5739, + "step": 29611 + }, + { + "epoch": 0.8780950686475106, + "grad_norm": 0.08184301108121872, + "learning_rate": 3.696728777701408e-05, + "loss": 2.5889, + "step": 29612 + }, + { + "epoch": 0.878124721999822, + "grad_norm": 0.07507095485925674, + "learning_rate": 3.69495345356885e-05, + "loss": 2.5412, + "step": 29613 + }, + { + "epoch": 0.8781543753521336, + "grad_norm": 0.07742886990308762, + "learning_rate": 3.6931785394754335e-05, + "loss": 2.5653, + "step": 29614 + }, + { + "epoch": 0.878184028704445, + "grad_norm": 0.07556544989347458, + "learning_rate": 3.691404035436863e-05, + "loss": 2.5614, + "step": 29615 + }, + { + "epoch": 0.8782136820567565, + "grad_norm": 0.07922924309968948, + "learning_rate": 3.689629941468864e-05, + "loss": 2.5863, + "step": 29616 + }, + { + "epoch": 0.878243335409068, + "grad_norm": 0.07450995594263077, + "learning_rate": 3.687856257587141e-05, + "loss": 2.5716, + "step": 29617 + }, + { + "epoch": 0.8782729887613795, + "grad_norm": 0.0797533318400383, + "learning_rate": 3.6860829838073985e-05, + "loss": 2.5717, + "step": 29618 + }, + { + "epoch": 0.8783026421136909, + "grad_norm": 0.07962127029895782, + "learning_rate": 3.68431012014534e-05, + "loss": 2.5802, + "step": 29619 + }, + { + "epoch": 0.8783322954660024, + "grad_norm": 0.08103621006011963, + "learning_rate": 3.682537666616664e-05, + "loss": 2.6137, + "step": 29620 + }, + { + "epoch": 0.8783619488183139, + "grad_norm": 0.07464452087879181, + "learning_rate": 3.68076562323707e-05, + "loss": 2.5471, + "step": 29621 + }, + { + "epoch": 0.8783916021706254, + "grad_norm": 0.0811830461025238, + "learning_rate": 3.67899399002225e-05, + "loss": 2.5798, + "step": 29622 + }, + { + "epoch": 0.8784212555229368, + "grad_norm": 0.07619937509298325, + "learning_rate": 3.6772227669878867e-05, + "loss": 2.5693, + "step": 29623 + }, + { + "epoch": 0.8784509088752483, + "grad_norm": 0.07769323140382767, + "learning_rate": 3.675451954149661e-05, + "loss": 2.5886, + "step": 29624 + }, + { + "epoch": 0.8784805622275599, + "grad_norm": 0.07616204768419266, + "learning_rate": 3.673681551523267e-05, + "loss": 2.5369, + "step": 29625 + }, + { + "epoch": 0.8785102155798713, + "grad_norm": 0.08161573857069016, + "learning_rate": 3.671911559124375e-05, + "loss": 2.5809, + "step": 29626 + }, + { + "epoch": 0.8785398689321828, + "grad_norm": 0.07918516546487808, + "learning_rate": 3.670141976968655e-05, + "loss": 2.5771, + "step": 29627 + }, + { + "epoch": 0.8785695222844943, + "grad_norm": 0.07578453421592712, + "learning_rate": 3.668372805071779e-05, + "loss": 2.5517, + "step": 29628 + }, + { + "epoch": 0.8785991756368058, + "grad_norm": 0.07749764621257782, + "learning_rate": 3.666604043449418e-05, + "loss": 2.5512, + "step": 29629 + }, + { + "epoch": 0.8786288289891172, + "grad_norm": 0.07956306636333466, + "learning_rate": 3.66483569211723e-05, + "loss": 2.556, + "step": 29630 + }, + { + "epoch": 0.8786584823414287, + "grad_norm": 0.07487700134515762, + "learning_rate": 3.663067751090882e-05, + "loss": 2.5905, + "step": 29631 + }, + { + "epoch": 0.8786881356937402, + "grad_norm": 0.07304894179105759, + "learning_rate": 3.661300220386004e-05, + "loss": 2.568, + "step": 29632 + }, + { + "epoch": 0.8787177890460517, + "grad_norm": 0.07628285139799118, + "learning_rate": 3.6595331000182807e-05, + "loss": 2.5429, + "step": 29633 + }, + { + "epoch": 0.8787474423983631, + "grad_norm": 0.07928763329982758, + "learning_rate": 3.657766390003342e-05, + "loss": 2.5684, + "step": 29634 + }, + { + "epoch": 0.8787770957506746, + "grad_norm": 0.07387015968561172, + "learning_rate": 3.656000090356837e-05, + "loss": 2.5298, + "step": 29635 + }, + { + "epoch": 0.8788067491029861, + "grad_norm": 0.07784366607666016, + "learning_rate": 3.6542342010944085e-05, + "loss": 2.5527, + "step": 29636 + }, + { + "epoch": 0.8788364024552976, + "grad_norm": 0.08187751471996307, + "learning_rate": 3.6524687222316886e-05, + "loss": 2.6266, + "step": 29637 + }, + { + "epoch": 0.878866055807609, + "grad_norm": 0.07895029336214066, + "learning_rate": 3.650703653784315e-05, + "loss": 2.5413, + "step": 29638 + }, + { + "epoch": 0.8788957091599205, + "grad_norm": 0.07888707518577576, + "learning_rate": 3.64893899576792e-05, + "loss": 2.5704, + "step": 29639 + }, + { + "epoch": 0.878925362512232, + "grad_norm": 0.0778719112277031, + "learning_rate": 3.6471747481981175e-05, + "loss": 2.574, + "step": 29640 + }, + { + "epoch": 0.8789550158645435, + "grad_norm": 0.07775581628084183, + "learning_rate": 3.6454109110905465e-05, + "loss": 2.5547, + "step": 29641 + }, + { + "epoch": 0.8789846692168549, + "grad_norm": 0.0797712504863739, + "learning_rate": 3.6436474844608215e-05, + "loss": 2.5693, + "step": 29642 + }, + { + "epoch": 0.8790143225691665, + "grad_norm": 0.08372920751571655, + "learning_rate": 3.6418844683245525e-05, + "loss": 2.5697, + "step": 29643 + }, + { + "epoch": 0.8790439759214779, + "grad_norm": 0.07882726937532425, + "learning_rate": 3.6401218626973485e-05, + "loss": 2.56, + "step": 29644 + }, + { + "epoch": 0.8790736292737894, + "grad_norm": 0.09099198132753372, + "learning_rate": 3.638359667594815e-05, + "loss": 2.5323, + "step": 29645 + }, + { + "epoch": 0.8791032826261009, + "grad_norm": 0.07800710946321487, + "learning_rate": 3.6365978830325775e-05, + "loss": 2.5808, + "step": 29646 + }, + { + "epoch": 0.8791329359784124, + "grad_norm": 0.08215833455324173, + "learning_rate": 3.634836509026218e-05, + "loss": 2.5703, + "step": 29647 + }, + { + "epoch": 0.8791625893307239, + "grad_norm": 0.09060525894165039, + "learning_rate": 3.633075545591347e-05, + "loss": 2.6043, + "step": 29648 + }, + { + "epoch": 0.8791922426830353, + "grad_norm": 0.08204971253871918, + "learning_rate": 3.631314992743545e-05, + "loss": 2.5816, + "step": 29649 + }, + { + "epoch": 0.8792218960353468, + "grad_norm": 0.08111616969108582, + "learning_rate": 3.6295548504984065e-05, + "loss": 2.6028, + "step": 29650 + }, + { + "epoch": 0.8792515493876583, + "grad_norm": 0.091075100004673, + "learning_rate": 3.627795118871524e-05, + "loss": 2.5925, + "step": 29651 + }, + { + "epoch": 0.8792812027399698, + "grad_norm": 0.08475785702466965, + "learning_rate": 3.6260357978784794e-05, + "loss": 2.5531, + "step": 29652 + }, + { + "epoch": 0.8793108560922812, + "grad_norm": 0.08018151670694351, + "learning_rate": 3.624276887534844e-05, + "loss": 2.601, + "step": 29653 + }, + { + "epoch": 0.8793405094445927, + "grad_norm": 0.08863486349582672, + "learning_rate": 3.622518387856194e-05, + "loss": 2.5735, + "step": 29654 + }, + { + "epoch": 0.8793701627969042, + "grad_norm": 0.08637935668230057, + "learning_rate": 3.620760298858106e-05, + "loss": 2.5913, + "step": 29655 + }, + { + "epoch": 0.8793998161492157, + "grad_norm": 0.08198041468858719, + "learning_rate": 3.619002620556144e-05, + "loss": 2.5661, + "step": 29656 + }, + { + "epoch": 0.8794294695015271, + "grad_norm": 0.07945974171161652, + "learning_rate": 3.617245352965875e-05, + "loss": 2.5743, + "step": 29657 + }, + { + "epoch": 0.8794591228538386, + "grad_norm": 0.08336766064167023, + "learning_rate": 3.615488496102859e-05, + "loss": 2.545, + "step": 29658 + }, + { + "epoch": 0.8794887762061501, + "grad_norm": 0.0778244212269783, + "learning_rate": 3.6137320499826544e-05, + "loss": 2.5936, + "step": 29659 + }, + { + "epoch": 0.8795184295584616, + "grad_norm": 0.08461033552885056, + "learning_rate": 3.6119760146208156e-05, + "loss": 2.5935, + "step": 29660 + }, + { + "epoch": 0.879548082910773, + "grad_norm": 0.08362039178609848, + "learning_rate": 3.610220390032892e-05, + "loss": 2.5453, + "step": 29661 + }, + { + "epoch": 0.8795777362630846, + "grad_norm": 0.08144375681877136, + "learning_rate": 3.608465176234432e-05, + "loss": 2.5536, + "step": 29662 + }, + { + "epoch": 0.879607389615396, + "grad_norm": 0.0800250843167305, + "learning_rate": 3.606710373240985e-05, + "loss": 2.5822, + "step": 29663 + }, + { + "epoch": 0.8796370429677075, + "grad_norm": 0.08490481227636337, + "learning_rate": 3.60495598106807e-05, + "loss": 2.5819, + "step": 29664 + }, + { + "epoch": 0.8796666963200189, + "grad_norm": 0.0848560482263565, + "learning_rate": 3.6032019997312315e-05, + "loss": 2.5879, + "step": 29665 + }, + { + "epoch": 0.8796963496723305, + "grad_norm": 0.08654285222291946, + "learning_rate": 3.601448429246007e-05, + "loss": 2.55, + "step": 29666 + }, + { + "epoch": 0.879726003024642, + "grad_norm": 0.07985174655914307, + "learning_rate": 3.599695269627917e-05, + "loss": 2.5613, + "step": 29667 + }, + { + "epoch": 0.8797556563769534, + "grad_norm": 0.08018283545970917, + "learning_rate": 3.5979425208924944e-05, + "loss": 2.5871, + "step": 29668 + }, + { + "epoch": 0.8797853097292649, + "grad_norm": 0.09226848185062408, + "learning_rate": 3.596190183055248e-05, + "loss": 2.5973, + "step": 29669 + }, + { + "epoch": 0.8798149630815764, + "grad_norm": 0.08369152992963791, + "learning_rate": 3.5944382561317104e-05, + "loss": 2.5683, + "step": 29670 + }, + { + "epoch": 0.8798446164338879, + "grad_norm": 0.08200166374444962, + "learning_rate": 3.5926867401373744e-05, + "loss": 2.5906, + "step": 29671 + }, + { + "epoch": 0.8798742697861993, + "grad_norm": 0.08271133899688721, + "learning_rate": 3.590935635087777e-05, + "loss": 2.5914, + "step": 29672 + }, + { + "epoch": 0.8799039231385108, + "grad_norm": 0.08237670361995697, + "learning_rate": 3.5891849409984135e-05, + "loss": 2.5481, + "step": 29673 + }, + { + "epoch": 0.8799335764908223, + "grad_norm": 0.08589410781860352, + "learning_rate": 3.5874346578847804e-05, + "loss": 2.5899, + "step": 29674 + }, + { + "epoch": 0.8799632298431338, + "grad_norm": 0.07700175046920776, + "learning_rate": 3.585684785762372e-05, + "loss": 2.5814, + "step": 29675 + }, + { + "epoch": 0.8799928831954452, + "grad_norm": 0.08382688462734222, + "learning_rate": 3.5839353246466976e-05, + "loss": 2.5769, + "step": 29676 + }, + { + "epoch": 0.8800225365477568, + "grad_norm": 0.0810326412320137, + "learning_rate": 3.582186274553245e-05, + "loss": 2.5974, + "step": 29677 + }, + { + "epoch": 0.8800521899000682, + "grad_norm": 0.09321948140859604, + "learning_rate": 3.580437635497497e-05, + "loss": 2.5712, + "step": 29678 + }, + { + "epoch": 0.8800818432523797, + "grad_norm": 0.07891480624675751, + "learning_rate": 3.57868940749494e-05, + "loss": 2.5768, + "step": 29679 + }, + { + "epoch": 0.8801114966046911, + "grad_norm": 0.08163585513830185, + "learning_rate": 3.576941590561061e-05, + "loss": 2.5519, + "step": 29680 + }, + { + "epoch": 0.8801411499570027, + "grad_norm": 0.09175144881010056, + "learning_rate": 3.575194184711328e-05, + "loss": 2.5352, + "step": 29681 + }, + { + "epoch": 0.8801708033093141, + "grad_norm": 0.08492793887853622, + "learning_rate": 3.573447189961221e-05, + "loss": 2.5926, + "step": 29682 + }, + { + "epoch": 0.8802004566616256, + "grad_norm": 0.08283984661102295, + "learning_rate": 3.571700606326211e-05, + "loss": 2.5718, + "step": 29683 + }, + { + "epoch": 0.880230110013937, + "grad_norm": 0.08420205116271973, + "learning_rate": 3.569954433821759e-05, + "loss": 2.5642, + "step": 29684 + }, + { + "epoch": 0.8802597633662486, + "grad_norm": 0.07699668407440186, + "learning_rate": 3.56820867246333e-05, + "loss": 2.5466, + "step": 29685 + }, + { + "epoch": 0.88028941671856, + "grad_norm": 0.08650543540716171, + "learning_rate": 3.5664633222663834e-05, + "loss": 2.6042, + "step": 29686 + }, + { + "epoch": 0.8803190700708715, + "grad_norm": 0.08165848255157471, + "learning_rate": 3.5647183832463737e-05, + "loss": 2.5619, + "step": 29687 + }, + { + "epoch": 0.880348723423183, + "grad_norm": 0.08244331181049347, + "learning_rate": 3.5629738554187494e-05, + "loss": 2.5758, + "step": 29688 + }, + { + "epoch": 0.8803783767754945, + "grad_norm": 0.08515925705432892, + "learning_rate": 3.561229738798971e-05, + "loss": 2.5675, + "step": 29689 + }, + { + "epoch": 0.880408030127806, + "grad_norm": 0.08565118908882141, + "learning_rate": 3.55948603340247e-05, + "loss": 2.5594, + "step": 29690 + }, + { + "epoch": 0.8804376834801174, + "grad_norm": 0.0750875324010849, + "learning_rate": 3.55774273924469e-05, + "loss": 2.5383, + "step": 29691 + }, + { + "epoch": 0.880467336832429, + "grad_norm": 0.08114387094974518, + "learning_rate": 3.5559998563410686e-05, + "loss": 2.5712, + "step": 29692 + }, + { + "epoch": 0.8804969901847404, + "grad_norm": 0.0807531476020813, + "learning_rate": 3.5542573847070434e-05, + "loss": 2.5694, + "step": 29693 + }, + { + "epoch": 0.8805266435370519, + "grad_norm": 0.08172193914651871, + "learning_rate": 3.5525153243580466e-05, + "loss": 2.5654, + "step": 29694 + }, + { + "epoch": 0.8805562968893633, + "grad_norm": 0.07755084335803986, + "learning_rate": 3.550773675309493e-05, + "loss": 2.5746, + "step": 29695 + }, + { + "epoch": 0.8805859502416749, + "grad_norm": 0.08507160097360611, + "learning_rate": 3.54903243757681e-05, + "loss": 2.6127, + "step": 29696 + }, + { + "epoch": 0.8806156035939863, + "grad_norm": 0.07929946482181549, + "learning_rate": 3.547291611175418e-05, + "loss": 2.5699, + "step": 29697 + }, + { + "epoch": 0.8806452569462978, + "grad_norm": 0.08573903888463974, + "learning_rate": 3.545551196120739e-05, + "loss": 2.5858, + "step": 29698 + }, + { + "epoch": 0.8806749102986092, + "grad_norm": 0.08388320356607437, + "learning_rate": 3.54381119242817e-05, + "loss": 2.5902, + "step": 29699 + }, + { + "epoch": 0.8807045636509208, + "grad_norm": 0.08532597869634628, + "learning_rate": 3.542071600113134e-05, + "loss": 2.524, + "step": 29700 + }, + { + "epoch": 0.8807342170032322, + "grad_norm": 0.08886322379112244, + "learning_rate": 3.540332419191022e-05, + "loss": 2.5545, + "step": 29701 + }, + { + "epoch": 0.8807638703555437, + "grad_norm": 0.08655089884996414, + "learning_rate": 3.5385936496772465e-05, + "loss": 2.555, + "step": 29702 + }, + { + "epoch": 0.8807935237078551, + "grad_norm": 0.0833781510591507, + "learning_rate": 3.536855291587193e-05, + "loss": 2.5642, + "step": 29703 + }, + { + "epoch": 0.8808231770601667, + "grad_norm": 0.077440544962883, + "learning_rate": 3.5351173449362675e-05, + "loss": 2.5739, + "step": 29704 + }, + { + "epoch": 0.8808528304124781, + "grad_norm": 0.07675280421972275, + "learning_rate": 3.533379809739851e-05, + "loss": 2.5407, + "step": 29705 + }, + { + "epoch": 0.8808824837647896, + "grad_norm": 0.08214408159255981, + "learning_rate": 3.531642686013331e-05, + "loss": 2.6137, + "step": 29706 + }, + { + "epoch": 0.880912137117101, + "grad_norm": 0.08064094185829163, + "learning_rate": 3.529905973772091e-05, + "loss": 2.6069, + "step": 29707 + }, + { + "epoch": 0.8809417904694126, + "grad_norm": 0.07852574437856674, + "learning_rate": 3.528169673031523e-05, + "loss": 2.5658, + "step": 29708 + }, + { + "epoch": 0.8809714438217241, + "grad_norm": 0.07600022852420807, + "learning_rate": 3.526433783806976e-05, + "loss": 2.5487, + "step": 29709 + }, + { + "epoch": 0.8810010971740355, + "grad_norm": 0.0800580307841301, + "learning_rate": 3.524698306113827e-05, + "loss": 2.5965, + "step": 29710 + }, + { + "epoch": 0.881030750526347, + "grad_norm": 0.0792623907327652, + "learning_rate": 3.522963239967464e-05, + "loss": 2.6043, + "step": 29711 + }, + { + "epoch": 0.8810604038786585, + "grad_norm": 0.0754546970129013, + "learning_rate": 3.5212285853832346e-05, + "loss": 2.6023, + "step": 29712 + }, + { + "epoch": 0.88109005723097, + "grad_norm": 0.08108897507190704, + "learning_rate": 3.5194943423765056e-05, + "loss": 2.5705, + "step": 29713 + }, + { + "epoch": 0.8811197105832814, + "grad_norm": 0.0789218619465828, + "learning_rate": 3.517760510962631e-05, + "loss": 2.5749, + "step": 29714 + }, + { + "epoch": 0.881149363935593, + "grad_norm": 0.07830219715833664, + "learning_rate": 3.51602709115697e-05, + "loss": 2.5621, + "step": 29715 + }, + { + "epoch": 0.8811790172879044, + "grad_norm": 0.07253644615411758, + "learning_rate": 3.514294082974867e-05, + "loss": 2.5856, + "step": 29716 + }, + { + "epoch": 0.8812086706402159, + "grad_norm": 0.07493532449007034, + "learning_rate": 3.512561486431665e-05, + "loss": 2.5613, + "step": 29717 + }, + { + "epoch": 0.8812383239925273, + "grad_norm": 0.08019285649061203, + "learning_rate": 3.5108293015427226e-05, + "loss": 2.5698, + "step": 29718 + }, + { + "epoch": 0.8812679773448389, + "grad_norm": 0.0777859315276146, + "learning_rate": 3.509097528323357e-05, + "loss": 2.578, + "step": 29719 + }, + { + "epoch": 0.8812976306971503, + "grad_norm": 0.07390002906322479, + "learning_rate": 3.5073661667889114e-05, + "loss": 2.5642, + "step": 29720 + }, + { + "epoch": 0.8813272840494618, + "grad_norm": 0.08076594769954681, + "learning_rate": 3.5056352169547225e-05, + "loss": 2.5492, + "step": 29721 + }, + { + "epoch": 0.8813569374017732, + "grad_norm": 0.07869990170001984, + "learning_rate": 3.5039046788361117e-05, + "loss": 2.6063, + "step": 29722 + }, + { + "epoch": 0.8813865907540848, + "grad_norm": 0.08066052198410034, + "learning_rate": 3.502174552448401e-05, + "loss": 2.584, + "step": 29723 + }, + { + "epoch": 0.8814162441063962, + "grad_norm": 0.07642874866724014, + "learning_rate": 3.500444837806921e-05, + "loss": 2.5807, + "step": 29724 + }, + { + "epoch": 0.8814458974587077, + "grad_norm": 0.0784929171204567, + "learning_rate": 3.498715534926983e-05, + "loss": 2.608, + "step": 29725 + }, + { + "epoch": 0.8814755508110191, + "grad_norm": 0.0797901377081871, + "learning_rate": 3.4969866438239017e-05, + "loss": 2.6206, + "step": 29726 + }, + { + "epoch": 0.8815052041633307, + "grad_norm": 0.08251137286424637, + "learning_rate": 3.495258164512982e-05, + "loss": 2.5801, + "step": 29727 + }, + { + "epoch": 0.8815348575156421, + "grad_norm": 0.07314827293157578, + "learning_rate": 3.4935300970095455e-05, + "loss": 2.5471, + "step": 29728 + }, + { + "epoch": 0.8815645108679536, + "grad_norm": 0.07996945083141327, + "learning_rate": 3.491802441328879e-05, + "loss": 2.5784, + "step": 29729 + }, + { + "epoch": 0.8815941642202652, + "grad_norm": 0.07548754662275314, + "learning_rate": 3.490075197486276e-05, + "loss": 2.5763, + "step": 29730 + }, + { + "epoch": 0.8816238175725766, + "grad_norm": 0.08091078698635101, + "learning_rate": 3.488348365497046e-05, + "loss": 2.5458, + "step": 29731 + }, + { + "epoch": 0.8816534709248881, + "grad_norm": 0.08374226093292236, + "learning_rate": 3.4866219453764725e-05, + "loss": 2.5575, + "step": 29732 + }, + { + "epoch": 0.8816831242771995, + "grad_norm": 0.07804802805185318, + "learning_rate": 3.484895937139848e-05, + "loss": 2.5607, + "step": 29733 + }, + { + "epoch": 0.8817127776295111, + "grad_norm": 0.08313092589378357, + "learning_rate": 3.483170340802455e-05, + "loss": 2.5912, + "step": 29734 + }, + { + "epoch": 0.8817424309818225, + "grad_norm": 0.07779596000909805, + "learning_rate": 3.4814451563795703e-05, + "loss": 2.5552, + "step": 29735 + }, + { + "epoch": 0.881772084334134, + "grad_norm": 0.07441669702529907, + "learning_rate": 3.4797203838864644e-05, + "loss": 2.5533, + "step": 29736 + }, + { + "epoch": 0.8818017376864454, + "grad_norm": 0.07506747543811798, + "learning_rate": 3.477996023338431e-05, + "loss": 2.5219, + "step": 29737 + }, + { + "epoch": 0.881831391038757, + "grad_norm": 0.08073066920042038, + "learning_rate": 3.4762720747507247e-05, + "loss": 2.578, + "step": 29738 + }, + { + "epoch": 0.8818610443910684, + "grad_norm": 0.07774733752012253, + "learning_rate": 3.4745485381386275e-05, + "loss": 2.5632, + "step": 29739 + }, + { + "epoch": 0.8818906977433799, + "grad_norm": 0.07884693145751953, + "learning_rate": 3.472825413517378e-05, + "loss": 2.5503, + "step": 29740 + }, + { + "epoch": 0.8819203510956913, + "grad_norm": 0.0846155509352684, + "learning_rate": 3.4711027009022453e-05, + "loss": 2.5237, + "step": 29741 + }, + { + "epoch": 0.8819500044480029, + "grad_norm": 0.07458838820457458, + "learning_rate": 3.469380400308486e-05, + "loss": 2.5738, + "step": 29742 + }, + { + "epoch": 0.8819796578003143, + "grad_norm": 0.08170212805271149, + "learning_rate": 3.467658511751348e-05, + "loss": 2.5539, + "step": 29743 + }, + { + "epoch": 0.8820093111526258, + "grad_norm": 0.08591634035110474, + "learning_rate": 3.465937035246086e-05, + "loss": 2.5975, + "step": 29744 + }, + { + "epoch": 0.8820389645049372, + "grad_norm": 0.08075155317783356, + "learning_rate": 3.464215970807938e-05, + "loss": 2.5535, + "step": 29745 + }, + { + "epoch": 0.8820686178572488, + "grad_norm": 0.08651361614465714, + "learning_rate": 3.462495318452141e-05, + "loss": 2.5726, + "step": 29746 + }, + { + "epoch": 0.8820982712095602, + "grad_norm": 0.08672605454921722, + "learning_rate": 3.4607750781939394e-05, + "loss": 2.5605, + "step": 29747 + }, + { + "epoch": 0.8821279245618717, + "grad_norm": 0.08557042479515076, + "learning_rate": 3.4590552500485594e-05, + "loss": 2.5828, + "step": 29748 + }, + { + "epoch": 0.8821575779141831, + "grad_norm": 0.08201506733894348, + "learning_rate": 3.457335834031239e-05, + "loss": 2.5699, + "step": 29749 + }, + { + "epoch": 0.8821872312664947, + "grad_norm": 0.08210072666406631, + "learning_rate": 3.455616830157193e-05, + "loss": 2.5627, + "step": 29750 + }, + { + "epoch": 0.8822168846188062, + "grad_norm": 0.07915377616882324, + "learning_rate": 3.453898238441655e-05, + "loss": 2.596, + "step": 29751 + }, + { + "epoch": 0.8822465379711176, + "grad_norm": 0.08348901569843292, + "learning_rate": 3.4521800588998345e-05, + "loss": 2.5686, + "step": 29752 + }, + { + "epoch": 0.8822761913234292, + "grad_norm": 0.07838141918182373, + "learning_rate": 3.4504622915469464e-05, + "loss": 2.5638, + "step": 29753 + }, + { + "epoch": 0.8823058446757406, + "grad_norm": 0.08575697988271713, + "learning_rate": 3.448744936398207e-05, + "loss": 2.6032, + "step": 29754 + }, + { + "epoch": 0.8823354980280521, + "grad_norm": 0.08158395439386368, + "learning_rate": 3.4470279934688264e-05, + "loss": 2.585, + "step": 29755 + }, + { + "epoch": 0.8823651513803635, + "grad_norm": 0.07861949503421783, + "learning_rate": 3.445311462773998e-05, + "loss": 2.5702, + "step": 29756 + }, + { + "epoch": 0.8823948047326751, + "grad_norm": 0.07512760907411575, + "learning_rate": 3.443595344328931e-05, + "loss": 2.5544, + "step": 29757 + }, + { + "epoch": 0.8824244580849865, + "grad_norm": 0.08089425414800644, + "learning_rate": 3.441879638148815e-05, + "loss": 2.567, + "step": 29758 + }, + { + "epoch": 0.882454111437298, + "grad_norm": 0.08035394549369812, + "learning_rate": 3.440164344248847e-05, + "loss": 2.592, + "step": 29759 + }, + { + "epoch": 0.8824837647896094, + "grad_norm": 0.07482586055994034, + "learning_rate": 3.438449462644222e-05, + "loss": 2.5929, + "step": 29760 + }, + { + "epoch": 0.882513418141921, + "grad_norm": 0.0771978572010994, + "learning_rate": 3.43673499335011e-05, + "loss": 2.5859, + "step": 29761 + }, + { + "epoch": 0.8825430714942324, + "grad_norm": 0.07783140242099762, + "learning_rate": 3.435020936381711e-05, + "loss": 2.5457, + "step": 29762 + }, + { + "epoch": 0.8825727248465439, + "grad_norm": 0.07564213126897812, + "learning_rate": 3.4333072917541896e-05, + "loss": 2.5951, + "step": 29763 + }, + { + "epoch": 0.8826023781988553, + "grad_norm": 0.07285595685243607, + "learning_rate": 3.431594059482723e-05, + "loss": 2.561, + "step": 29764 + }, + { + "epoch": 0.8826320315511669, + "grad_norm": 0.07998741418123245, + "learning_rate": 3.429881239582488e-05, + "loss": 2.5617, + "step": 29765 + }, + { + "epoch": 0.8826616849034783, + "grad_norm": 0.07637627422809601, + "learning_rate": 3.428168832068646e-05, + "loss": 2.5459, + "step": 29766 + }, + { + "epoch": 0.8826913382557898, + "grad_norm": 0.07776489108800888, + "learning_rate": 3.4264568369563656e-05, + "loss": 2.5868, + "step": 29767 + }, + { + "epoch": 0.8827209916081012, + "grad_norm": 0.08120035380125046, + "learning_rate": 3.424745254260803e-05, + "loss": 2.5706, + "step": 29768 + }, + { + "epoch": 0.8827506449604128, + "grad_norm": 0.0773918479681015, + "learning_rate": 3.423034083997112e-05, + "loss": 2.5596, + "step": 29769 + }, + { + "epoch": 0.8827802983127242, + "grad_norm": 0.07825983315706253, + "learning_rate": 3.421323326180453e-05, + "loss": 2.5753, + "step": 29770 + }, + { + "epoch": 0.8828099516650357, + "grad_norm": 0.080112025141716, + "learning_rate": 3.4196129808259705e-05, + "loss": 2.5785, + "step": 29771 + }, + { + "epoch": 0.8828396050173473, + "grad_norm": 0.08390658348798752, + "learning_rate": 3.4179030479488114e-05, + "loss": 2.5685, + "step": 29772 + }, + { + "epoch": 0.8828692583696587, + "grad_norm": 0.08159539103507996, + "learning_rate": 3.416193527564121e-05, + "loss": 2.5804, + "step": 29773 + }, + { + "epoch": 0.8828989117219702, + "grad_norm": 0.07645585387945175, + "learning_rate": 3.4144844196870195e-05, + "loss": 2.5732, + "step": 29774 + }, + { + "epoch": 0.8829285650742816, + "grad_norm": 0.07459893822669983, + "learning_rate": 3.412775724332662e-05, + "loss": 2.6158, + "step": 29775 + }, + { + "epoch": 0.8829582184265932, + "grad_norm": 0.07761374115943909, + "learning_rate": 3.411067441516175e-05, + "loss": 2.5833, + "step": 29776 + }, + { + "epoch": 0.8829878717789046, + "grad_norm": 0.08547224849462509, + "learning_rate": 3.409359571252679e-05, + "loss": 2.5875, + "step": 29777 + }, + { + "epoch": 0.8830175251312161, + "grad_norm": 0.07900479435920715, + "learning_rate": 3.4076521135573026e-05, + "loss": 2.5725, + "step": 29778 + }, + { + "epoch": 0.8830471784835275, + "grad_norm": 0.08569008111953735, + "learning_rate": 3.405945068445165e-05, + "loss": 2.5969, + "step": 29779 + }, + { + "epoch": 0.8830768318358391, + "grad_norm": 0.0813426673412323, + "learning_rate": 3.404238435931378e-05, + "loss": 2.5788, + "step": 29780 + }, + { + "epoch": 0.8831064851881505, + "grad_norm": 0.08125723898410797, + "learning_rate": 3.402532216031062e-05, + "loss": 2.5278, + "step": 29781 + }, + { + "epoch": 0.883136138540462, + "grad_norm": 0.08216480910778046, + "learning_rate": 3.400826408759322e-05, + "loss": 2.5768, + "step": 29782 + }, + { + "epoch": 0.8831657918927734, + "grad_norm": 0.08109119534492493, + "learning_rate": 3.399121014131257e-05, + "loss": 2.5538, + "step": 29783 + }, + { + "epoch": 0.883195445245085, + "grad_norm": 0.08443150669336319, + "learning_rate": 3.3974160321619875e-05, + "loss": 2.5936, + "step": 29784 + }, + { + "epoch": 0.8832250985973964, + "grad_norm": 0.08188676834106445, + "learning_rate": 3.395711462866591e-05, + "loss": 2.5695, + "step": 29785 + }, + { + "epoch": 0.8832547519497079, + "grad_norm": 0.08992531150579453, + "learning_rate": 3.394007306260166e-05, + "loss": 2.5556, + "step": 29786 + }, + { + "epoch": 0.8832844053020193, + "grad_norm": 0.0866900235414505, + "learning_rate": 3.3923035623578015e-05, + "loss": 2.6056, + "step": 29787 + }, + { + "epoch": 0.8833140586543309, + "grad_norm": 0.0825582817196846, + "learning_rate": 3.39060023117459e-05, + "loss": 2.5622, + "step": 29788 + }, + { + "epoch": 0.8833437120066423, + "grad_norm": 0.08236119151115417, + "learning_rate": 3.38889731272562e-05, + "loss": 2.5543, + "step": 29789 + }, + { + "epoch": 0.8833733653589538, + "grad_norm": 0.08438065648078918, + "learning_rate": 3.3871948070259616e-05, + "loss": 2.5791, + "step": 29790 + }, + { + "epoch": 0.8834030187112653, + "grad_norm": 0.07853052020072937, + "learning_rate": 3.385492714090699e-05, + "loss": 2.5419, + "step": 29791 + }, + { + "epoch": 0.8834326720635768, + "grad_norm": 0.08417605608701706, + "learning_rate": 3.383791033934896e-05, + "loss": 2.5535, + "step": 29792 + }, + { + "epoch": 0.8834623254158883, + "grad_norm": 0.08172192424535751, + "learning_rate": 3.3820897665736263e-05, + "loss": 2.5602, + "step": 29793 + }, + { + "epoch": 0.8834919787681997, + "grad_norm": 0.08351288735866547, + "learning_rate": 3.380388912021959e-05, + "loss": 2.5492, + "step": 29794 + }, + { + "epoch": 0.8835216321205113, + "grad_norm": 0.08731073886156082, + "learning_rate": 3.378688470294944e-05, + "loss": 2.5625, + "step": 29795 + }, + { + "epoch": 0.8835512854728227, + "grad_norm": 0.0827484056353569, + "learning_rate": 3.376988441407647e-05, + "loss": 2.5411, + "step": 29796 + }, + { + "epoch": 0.8835809388251342, + "grad_norm": 0.0843304991722107, + "learning_rate": 3.375288825375117e-05, + "loss": 2.5612, + "step": 29797 + }, + { + "epoch": 0.8836105921774456, + "grad_norm": 0.0862342119216919, + "learning_rate": 3.373589622212408e-05, + "loss": 2.5538, + "step": 29798 + }, + { + "epoch": 0.8836402455297572, + "grad_norm": 0.08580055087804794, + "learning_rate": 3.371890831934565e-05, + "loss": 2.572, + "step": 29799 + }, + { + "epoch": 0.8836698988820686, + "grad_norm": 0.09114275127649307, + "learning_rate": 3.370192454556631e-05, + "loss": 2.575, + "step": 29800 + }, + { + "epoch": 0.8836995522343801, + "grad_norm": 0.08049749583005905, + "learning_rate": 3.3684944900936485e-05, + "loss": 2.5518, + "step": 29801 + }, + { + "epoch": 0.8837292055866915, + "grad_norm": 0.08458653837442398, + "learning_rate": 3.366796938560651e-05, + "loss": 2.5775, + "step": 29802 + }, + { + "epoch": 0.8837588589390031, + "grad_norm": 0.0859130322933197, + "learning_rate": 3.365099799972671e-05, + "loss": 2.6002, + "step": 29803 + }, + { + "epoch": 0.8837885122913145, + "grad_norm": 0.0823369100689888, + "learning_rate": 3.3634030743447505e-05, + "loss": 2.5683, + "step": 29804 + }, + { + "epoch": 0.883818165643626, + "grad_norm": 0.08323470503091812, + "learning_rate": 3.361706761691891e-05, + "loss": 2.5853, + "step": 29805 + }, + { + "epoch": 0.8838478189959375, + "grad_norm": 0.08353034406900406, + "learning_rate": 3.360010862029117e-05, + "loss": 2.5923, + "step": 29806 + }, + { + "epoch": 0.883877472348249, + "grad_norm": 0.08053708076477051, + "learning_rate": 3.358315375371457e-05, + "loss": 2.5592, + "step": 29807 + }, + { + "epoch": 0.8839071257005604, + "grad_norm": 0.08443379402160645, + "learning_rate": 3.3566203017339204e-05, + "loss": 2.5819, + "step": 29808 + }, + { + "epoch": 0.8839367790528719, + "grad_norm": 0.08103667944669724, + "learning_rate": 3.3549256411315175e-05, + "loss": 2.5759, + "step": 29809 + }, + { + "epoch": 0.8839664324051834, + "grad_norm": 0.0819956436753273, + "learning_rate": 3.3532313935792594e-05, + "loss": 2.5685, + "step": 29810 + }, + { + "epoch": 0.8839960857574949, + "grad_norm": 0.08130038529634476, + "learning_rate": 3.351537559092138e-05, + "loss": 2.5639, + "step": 29811 + }, + { + "epoch": 0.8840257391098064, + "grad_norm": 0.08175403624773026, + "learning_rate": 3.3498441376851595e-05, + "loss": 2.56, + "step": 29812 + }, + { + "epoch": 0.8840553924621178, + "grad_norm": 0.08129052817821503, + "learning_rate": 3.3481511293733114e-05, + "loss": 2.5845, + "step": 29813 + }, + { + "epoch": 0.8840850458144294, + "grad_norm": 0.08138313889503479, + "learning_rate": 3.346458534171598e-05, + "loss": 2.5777, + "step": 29814 + }, + { + "epoch": 0.8841146991667408, + "grad_norm": 0.08143854886293411, + "learning_rate": 3.344766352095013e-05, + "loss": 2.5553, + "step": 29815 + }, + { + "epoch": 0.8841443525190523, + "grad_norm": 0.07560229301452637, + "learning_rate": 3.343074583158523e-05, + "loss": 2.5706, + "step": 29816 + }, + { + "epoch": 0.8841740058713637, + "grad_norm": 0.07573439180850983, + "learning_rate": 3.341383227377115e-05, + "loss": 2.5483, + "step": 29817 + }, + { + "epoch": 0.8842036592236753, + "grad_norm": 0.08313498646020889, + "learning_rate": 3.3396922847657663e-05, + "loss": 2.5572, + "step": 29818 + }, + { + "epoch": 0.8842333125759867, + "grad_norm": 0.08100885152816772, + "learning_rate": 3.338001755339454e-05, + "loss": 2.5812, + "step": 29819 + }, + { + "epoch": 0.8842629659282982, + "grad_norm": 0.08052262663841248, + "learning_rate": 3.336311639113143e-05, + "loss": 2.5922, + "step": 29820 + }, + { + "epoch": 0.8842926192806096, + "grad_norm": 0.07765066623687744, + "learning_rate": 3.334621936101801e-05, + "loss": 2.5307, + "step": 29821 + }, + { + "epoch": 0.8843222726329212, + "grad_norm": 0.08395862579345703, + "learning_rate": 3.332932646320397e-05, + "loss": 2.5896, + "step": 29822 + }, + { + "epoch": 0.8843519259852326, + "grad_norm": 0.07853948324918747, + "learning_rate": 3.331243769783876e-05, + "loss": 2.5702, + "step": 29823 + }, + { + "epoch": 0.8843815793375441, + "grad_norm": 0.07835908979177475, + "learning_rate": 3.329555306507209e-05, + "loss": 2.5718, + "step": 29824 + }, + { + "epoch": 0.8844112326898556, + "grad_norm": 0.08494903892278671, + "learning_rate": 3.32786725650534e-05, + "loss": 2.564, + "step": 29825 + }, + { + "epoch": 0.8844408860421671, + "grad_norm": 0.08220010250806808, + "learning_rate": 3.326179619793218e-05, + "loss": 2.5799, + "step": 29826 + }, + { + "epoch": 0.8844705393944785, + "grad_norm": 0.07686702907085419, + "learning_rate": 3.3244923963857866e-05, + "loss": 2.5703, + "step": 29827 + }, + { + "epoch": 0.88450019274679, + "grad_norm": 0.07460179179906845, + "learning_rate": 3.322805586297983e-05, + "loss": 2.5552, + "step": 29828 + }, + { + "epoch": 0.8845298460991015, + "grad_norm": 0.07804961502552032, + "learning_rate": 3.321119189544752e-05, + "loss": 2.547, + "step": 29829 + }, + { + "epoch": 0.884559499451413, + "grad_norm": 0.08423896133899689, + "learning_rate": 3.31943320614102e-05, + "loss": 2.6001, + "step": 29830 + }, + { + "epoch": 0.8845891528037244, + "grad_norm": 0.07951369136571884, + "learning_rate": 3.317747636101725e-05, + "loss": 2.5658, + "step": 29831 + }, + { + "epoch": 0.8846188061560359, + "grad_norm": 0.07859440892934799, + "learning_rate": 3.316062479441784e-05, + "loss": 2.5656, + "step": 29832 + }, + { + "epoch": 0.8846484595083475, + "grad_norm": 0.07620479166507721, + "learning_rate": 3.3143777361761216e-05, + "loss": 2.5881, + "step": 29833 + }, + { + "epoch": 0.8846781128606589, + "grad_norm": 0.07683829963207245, + "learning_rate": 3.312693406319661e-05, + "loss": 2.5646, + "step": 29834 + }, + { + "epoch": 0.8847077662129704, + "grad_norm": 0.075346939265728, + "learning_rate": 3.311009489887312e-05, + "loss": 2.5824, + "step": 29835 + }, + { + "epoch": 0.8847374195652818, + "grad_norm": 0.07481562346220016, + "learning_rate": 3.3093259868939853e-05, + "loss": 2.5632, + "step": 29836 + }, + { + "epoch": 0.8847670729175934, + "grad_norm": 0.07572975754737854, + "learning_rate": 3.3076428973545955e-05, + "loss": 2.5547, + "step": 29837 + }, + { + "epoch": 0.8847967262699048, + "grad_norm": 0.07966921478509903, + "learning_rate": 3.3059602212840436e-05, + "loss": 2.5267, + "step": 29838 + }, + { + "epoch": 0.8848263796222163, + "grad_norm": 0.07136774063110352, + "learning_rate": 3.3042779586972274e-05, + "loss": 2.5842, + "step": 29839 + }, + { + "epoch": 0.8848560329745278, + "grad_norm": 0.07905011624097824, + "learning_rate": 3.3025961096090404e-05, + "loss": 2.5588, + "step": 29840 + }, + { + "epoch": 0.8848856863268393, + "grad_norm": 0.0779813900589943, + "learning_rate": 3.300914674034383e-05, + "loss": 2.5455, + "step": 29841 + }, + { + "epoch": 0.8849153396791507, + "grad_norm": 0.08499523997306824, + "learning_rate": 3.2992336519881424e-05, + "loss": 2.5827, + "step": 29842 + }, + { + "epoch": 0.8849449930314622, + "grad_norm": 0.07663703709840775, + "learning_rate": 3.297553043485208e-05, + "loss": 2.5922, + "step": 29843 + }, + { + "epoch": 0.8849746463837737, + "grad_norm": 0.07706379890441895, + "learning_rate": 3.2958728485404546e-05, + "loss": 2.5489, + "step": 29844 + }, + { + "epoch": 0.8850042997360852, + "grad_norm": 0.08048789203166962, + "learning_rate": 3.2941930671687606e-05, + "loss": 2.5804, + "step": 29845 + }, + { + "epoch": 0.8850339530883966, + "grad_norm": 0.07753986865282059, + "learning_rate": 3.292513699385008e-05, + "loss": 2.5711, + "step": 29846 + }, + { + "epoch": 0.8850636064407081, + "grad_norm": 0.07555574923753738, + "learning_rate": 3.290834745204063e-05, + "loss": 2.5646, + "step": 29847 + }, + { + "epoch": 0.8850932597930196, + "grad_norm": 0.08139339834451675, + "learning_rate": 3.289156204640798e-05, + "loss": 2.5677, + "step": 29848 + }, + { + "epoch": 0.8851229131453311, + "grad_norm": 0.08429069817066193, + "learning_rate": 3.287478077710071e-05, + "loss": 2.5422, + "step": 29849 + }, + { + "epoch": 0.8851525664976425, + "grad_norm": 0.07846295088529587, + "learning_rate": 3.285800364426744e-05, + "loss": 2.538, + "step": 29850 + }, + { + "epoch": 0.885182219849954, + "grad_norm": 0.0850575864315033, + "learning_rate": 3.284123064805666e-05, + "loss": 2.5661, + "step": 29851 + }, + { + "epoch": 0.8852118732022655, + "grad_norm": 0.08373227715492249, + "learning_rate": 3.282446178861698e-05, + "loss": 2.5546, + "step": 29852 + }, + { + "epoch": 0.885241526554577, + "grad_norm": 0.08262225240468979, + "learning_rate": 3.2807697066096874e-05, + "loss": 2.5792, + "step": 29853 + }, + { + "epoch": 0.8852711799068885, + "grad_norm": 0.07614535093307495, + "learning_rate": 3.279093648064485e-05, + "loss": 2.5824, + "step": 29854 + }, + { + "epoch": 0.8853008332592, + "grad_norm": 0.08263183385133743, + "learning_rate": 3.2774180032409284e-05, + "loss": 2.5584, + "step": 29855 + }, + { + "epoch": 0.8853304866115115, + "grad_norm": 0.08542276918888092, + "learning_rate": 3.2757427721538504e-05, + "loss": 2.5811, + "step": 29856 + }, + { + "epoch": 0.8853601399638229, + "grad_norm": 0.07889977842569351, + "learning_rate": 3.274067954818094e-05, + "loss": 2.5562, + "step": 29857 + }, + { + "epoch": 0.8853897933161344, + "grad_norm": 0.07774557173252106, + "learning_rate": 3.272393551248487e-05, + "loss": 2.5459, + "step": 29858 + }, + { + "epoch": 0.8854194466684459, + "grad_norm": 0.07899695634841919, + "learning_rate": 3.270719561459856e-05, + "loss": 2.6155, + "step": 29859 + }, + { + "epoch": 0.8854491000207574, + "grad_norm": 0.07361490279436111, + "learning_rate": 3.269045985467029e-05, + "loss": 2.5349, + "step": 29860 + }, + { + "epoch": 0.8854787533730688, + "grad_norm": 0.07844394445419312, + "learning_rate": 3.2673728232848146e-05, + "loss": 2.5729, + "step": 29861 + }, + { + "epoch": 0.8855084067253803, + "grad_norm": 0.07602745294570923, + "learning_rate": 3.2657000749280354e-05, + "loss": 2.5488, + "step": 29862 + }, + { + "epoch": 0.8855380600776918, + "grad_norm": 0.08062828332185745, + "learning_rate": 3.264027740411507e-05, + "loss": 2.5635, + "step": 29863 + }, + { + "epoch": 0.8855677134300033, + "grad_norm": 0.07397951930761337, + "learning_rate": 3.262355819750029e-05, + "loss": 2.5607, + "step": 29864 + }, + { + "epoch": 0.8855973667823147, + "grad_norm": 0.07467442750930786, + "learning_rate": 3.260684312958412e-05, + "loss": 2.5506, + "step": 29865 + }, + { + "epoch": 0.8856270201346262, + "grad_norm": 0.07278218865394592, + "learning_rate": 3.25901322005146e-05, + "loss": 2.5652, + "step": 29866 + }, + { + "epoch": 0.8856566734869377, + "grad_norm": 0.07364589720964432, + "learning_rate": 3.2573425410439725e-05, + "loss": 2.5581, + "step": 29867 + }, + { + "epoch": 0.8856863268392492, + "grad_norm": 0.07098346948623657, + "learning_rate": 3.2556722759507386e-05, + "loss": 2.5485, + "step": 29868 + }, + { + "epoch": 0.8857159801915606, + "grad_norm": 0.07511646300554276, + "learning_rate": 3.2540024247865506e-05, + "loss": 2.5485, + "step": 29869 + }, + { + "epoch": 0.8857456335438721, + "grad_norm": 0.07767892628908157, + "learning_rate": 3.252332987566203e-05, + "loss": 2.5517, + "step": 29870 + }, + { + "epoch": 0.8857752868961836, + "grad_norm": 0.07635532319545746, + "learning_rate": 3.250663964304462e-05, + "loss": 2.5957, + "step": 29871 + }, + { + "epoch": 0.8858049402484951, + "grad_norm": 0.07572384178638458, + "learning_rate": 3.2489953550161154e-05, + "loss": 2.6135, + "step": 29872 + }, + { + "epoch": 0.8858345936008065, + "grad_norm": 0.07315772771835327, + "learning_rate": 3.247327159715941e-05, + "loss": 2.5563, + "step": 29873 + }, + { + "epoch": 0.885864246953118, + "grad_norm": 0.07890970259904861, + "learning_rate": 3.2456593784187085e-05, + "loss": 2.573, + "step": 29874 + }, + { + "epoch": 0.8858939003054296, + "grad_norm": 0.0829431489109993, + "learning_rate": 3.243992011139191e-05, + "loss": 2.5816, + "step": 29875 + }, + { + "epoch": 0.885923553657741, + "grad_norm": 0.0726037323474884, + "learning_rate": 3.242325057892143e-05, + "loss": 2.5918, + "step": 29876 + }, + { + "epoch": 0.8859532070100525, + "grad_norm": 0.07554920017719269, + "learning_rate": 3.240658518692341e-05, + "loss": 2.537, + "step": 29877 + }, + { + "epoch": 0.885982860362364, + "grad_norm": 0.07664404064416885, + "learning_rate": 3.238992393554518e-05, + "loss": 2.578, + "step": 29878 + }, + { + "epoch": 0.8860125137146755, + "grad_norm": 0.07603754103183746, + "learning_rate": 3.237326682493458e-05, + "loss": 2.5629, + "step": 29879 + }, + { + "epoch": 0.8860421670669869, + "grad_norm": 0.0801701620221138, + "learning_rate": 3.2356613855239026e-05, + "loss": 2.5691, + "step": 29880 + }, + { + "epoch": 0.8860718204192984, + "grad_norm": 0.07092103362083435, + "learning_rate": 3.23399650266058e-05, + "loss": 2.572, + "step": 29881 + }, + { + "epoch": 0.8861014737716099, + "grad_norm": 0.07346901297569275, + "learning_rate": 3.23233203391825e-05, + "loss": 2.5689, + "step": 29882 + }, + { + "epoch": 0.8861311271239214, + "grad_norm": 0.07463204860687256, + "learning_rate": 3.2306679793116464e-05, + "loss": 2.5822, + "step": 29883 + }, + { + "epoch": 0.8861607804762328, + "grad_norm": 0.07315022498369217, + "learning_rate": 3.229004338855512e-05, + "loss": 2.5624, + "step": 29884 + }, + { + "epoch": 0.8861904338285443, + "grad_norm": 0.077296681702137, + "learning_rate": 3.2273411125645634e-05, + "loss": 2.5536, + "step": 29885 + }, + { + "epoch": 0.8862200871808558, + "grad_norm": 0.07408900558948517, + "learning_rate": 3.225678300453544e-05, + "loss": 2.5801, + "step": 29886 + }, + { + "epoch": 0.8862497405331673, + "grad_norm": 0.07499704509973526, + "learning_rate": 3.22401590253717e-05, + "loss": 2.6033, + "step": 29887 + }, + { + "epoch": 0.8862793938854787, + "grad_norm": 0.07577154785394669, + "learning_rate": 3.222353918830162e-05, + "loss": 2.5765, + "step": 29888 + }, + { + "epoch": 0.8863090472377902, + "grad_norm": 0.07655680924654007, + "learning_rate": 3.220692349347237e-05, + "loss": 2.5991, + "step": 29889 + }, + { + "epoch": 0.8863387005901017, + "grad_norm": 0.07869245111942291, + "learning_rate": 3.219031194103117e-05, + "loss": 2.5795, + "step": 29890 + }, + { + "epoch": 0.8863683539424132, + "grad_norm": 0.08040576428174973, + "learning_rate": 3.217370453112506e-05, + "loss": 2.5318, + "step": 29891 + }, + { + "epoch": 0.8863980072947246, + "grad_norm": 0.07572034746408463, + "learning_rate": 3.2157101263901036e-05, + "loss": 2.6039, + "step": 29892 + }, + { + "epoch": 0.8864276606470362, + "grad_norm": 0.07607979327440262, + "learning_rate": 3.21405021395062e-05, + "loss": 2.5862, + "step": 29893 + }, + { + "epoch": 0.8864573139993476, + "grad_norm": 0.0771794244647026, + "learning_rate": 3.212390715808755e-05, + "loss": 2.594, + "step": 29894 + }, + { + "epoch": 0.8864869673516591, + "grad_norm": 0.08229604363441467, + "learning_rate": 3.2107316319792025e-05, + "loss": 2.5239, + "step": 29895 + }, + { + "epoch": 0.8865166207039706, + "grad_norm": 0.08651596307754517, + "learning_rate": 3.20907296247665e-05, + "loss": 2.5763, + "step": 29896 + }, + { + "epoch": 0.8865462740562821, + "grad_norm": 0.08459398150444031, + "learning_rate": 3.207414707315786e-05, + "loss": 2.5687, + "step": 29897 + }, + { + "epoch": 0.8865759274085936, + "grad_norm": 0.08573821187019348, + "learning_rate": 3.2057568665113e-05, + "loss": 2.5708, + "step": 29898 + }, + { + "epoch": 0.886605580760905, + "grad_norm": 0.08369683474302292, + "learning_rate": 3.204099440077868e-05, + "loss": 2.5766, + "step": 29899 + }, + { + "epoch": 0.8866352341132165, + "grad_norm": 0.07839476317167282, + "learning_rate": 3.2024424280301725e-05, + "loss": 2.574, + "step": 29900 + }, + { + "epoch": 0.886664887465528, + "grad_norm": 0.07878772169351578, + "learning_rate": 3.200785830382874e-05, + "loss": 2.5741, + "step": 29901 + }, + { + "epoch": 0.8866945408178395, + "grad_norm": 0.08551236242055893, + "learning_rate": 3.199129647150656e-05, + "loss": 2.587, + "step": 29902 + }, + { + "epoch": 0.8867241941701509, + "grad_norm": 0.08709544688463211, + "learning_rate": 3.197473878348173e-05, + "loss": 2.5735, + "step": 29903 + }, + { + "epoch": 0.8867538475224624, + "grad_norm": 0.07552142441272736, + "learning_rate": 3.195818523990096e-05, + "loss": 2.5578, + "step": 29904 + }, + { + "epoch": 0.8867835008747739, + "grad_norm": 0.07959694415330887, + "learning_rate": 3.194163584091081e-05, + "loss": 2.5417, + "step": 29905 + }, + { + "epoch": 0.8868131542270854, + "grad_norm": 0.08339089155197144, + "learning_rate": 3.192509058665777e-05, + "loss": 2.5922, + "step": 29906 + }, + { + "epoch": 0.8868428075793968, + "grad_norm": 0.08367042988538742, + "learning_rate": 3.190854947728844e-05, + "loss": 2.5771, + "step": 29907 + }, + { + "epoch": 0.8868724609317084, + "grad_norm": 0.08367598801851273, + "learning_rate": 3.1892012512949206e-05, + "loss": 2.546, + "step": 29908 + }, + { + "epoch": 0.8869021142840198, + "grad_norm": 0.08177122473716736, + "learning_rate": 3.187547969378663e-05, + "loss": 2.5844, + "step": 29909 + }, + { + "epoch": 0.8869317676363313, + "grad_norm": 0.08792179077863693, + "learning_rate": 3.1858951019946955e-05, + "loss": 2.58, + "step": 29910 + }, + { + "epoch": 0.8869614209886427, + "grad_norm": 0.07964622229337692, + "learning_rate": 3.184242649157665e-05, + "loss": 2.5606, + "step": 29911 + }, + { + "epoch": 0.8869910743409543, + "grad_norm": 0.07901649922132492, + "learning_rate": 3.182590610882202e-05, + "loss": 2.5712, + "step": 29912 + }, + { + "epoch": 0.8870207276932657, + "grad_norm": 0.08052527159452438, + "learning_rate": 3.180938987182935e-05, + "loss": 2.5595, + "step": 29913 + }, + { + "epoch": 0.8870503810455772, + "grad_norm": 0.07791285216808319, + "learning_rate": 3.179287778074491e-05, + "loss": 2.5617, + "step": 29914 + }, + { + "epoch": 0.8870800343978886, + "grad_norm": 0.0870530977845192, + "learning_rate": 3.1776369835714966e-05, + "loss": 2.6049, + "step": 29915 + }, + { + "epoch": 0.8871096877502002, + "grad_norm": 0.0862588956952095, + "learning_rate": 3.175986603688552e-05, + "loss": 2.5684, + "step": 29916 + }, + { + "epoch": 0.8871393411025117, + "grad_norm": 0.07960102707147598, + "learning_rate": 3.1743366384402835e-05, + "loss": 2.5772, + "step": 29917 + }, + { + "epoch": 0.8871689944548231, + "grad_norm": 0.08978909999132156, + "learning_rate": 3.1726870878413025e-05, + "loss": 2.5692, + "step": 29918 + }, + { + "epoch": 0.8871986478071346, + "grad_norm": 0.08563869446516037, + "learning_rate": 3.171037951906219e-05, + "loss": 2.5316, + "step": 29919 + }, + { + "epoch": 0.8872283011594461, + "grad_norm": 0.08091729879379272, + "learning_rate": 3.169389230649633e-05, + "loss": 2.5653, + "step": 29920 + }, + { + "epoch": 0.8872579545117576, + "grad_norm": 0.07847879081964493, + "learning_rate": 3.167740924086143e-05, + "loss": 2.5662, + "step": 29921 + }, + { + "epoch": 0.887287607864069, + "grad_norm": 0.07890938967466354, + "learning_rate": 3.166093032230344e-05, + "loss": 2.5762, + "step": 29922 + }, + { + "epoch": 0.8873172612163805, + "grad_norm": 0.08598645776510239, + "learning_rate": 3.164445555096829e-05, + "loss": 2.5525, + "step": 29923 + }, + { + "epoch": 0.887346914568692, + "grad_norm": 0.07804425060749054, + "learning_rate": 3.1627984927001916e-05, + "loss": 2.5581, + "step": 29924 + }, + { + "epoch": 0.8873765679210035, + "grad_norm": 0.07697021961212158, + "learning_rate": 3.161151845055021e-05, + "loss": 2.594, + "step": 29925 + }, + { + "epoch": 0.8874062212733149, + "grad_norm": 0.08011528104543686, + "learning_rate": 3.159505612175878e-05, + "loss": 2.5964, + "step": 29926 + }, + { + "epoch": 0.8874358746256265, + "grad_norm": 0.07785283774137497, + "learning_rate": 3.1578597940773556e-05, + "loss": 2.5742, + "step": 29927 + }, + { + "epoch": 0.8874655279779379, + "grad_norm": 0.07608919590711594, + "learning_rate": 3.156214390774026e-05, + "loss": 2.5903, + "step": 29928 + }, + { + "epoch": 0.8874951813302494, + "grad_norm": 0.0777268335223198, + "learning_rate": 3.15456940228046e-05, + "loss": 2.5605, + "step": 29929 + }, + { + "epoch": 0.8875248346825608, + "grad_norm": 0.07776334881782532, + "learning_rate": 3.1529248286112145e-05, + "loss": 2.5516, + "step": 29930 + }, + { + "epoch": 0.8875544880348724, + "grad_norm": 0.07887352257966995, + "learning_rate": 3.151280669780865e-05, + "loss": 2.5447, + "step": 29931 + }, + { + "epoch": 0.8875841413871838, + "grad_norm": 0.07903946191072464, + "learning_rate": 3.1496369258039725e-05, + "loss": 2.5512, + "step": 29932 + }, + { + "epoch": 0.8876137947394953, + "grad_norm": 0.07958335429430008, + "learning_rate": 3.147993596695081e-05, + "loss": 2.5578, + "step": 29933 + }, + { + "epoch": 0.8876434480918067, + "grad_norm": 0.08216991275548935, + "learning_rate": 3.146350682468752e-05, + "loss": 2.5586, + "step": 29934 + }, + { + "epoch": 0.8876731014441183, + "grad_norm": 0.08115696161985397, + "learning_rate": 3.1447081831395276e-05, + "loss": 2.5602, + "step": 29935 + }, + { + "epoch": 0.8877027547964297, + "grad_norm": 0.07761597633361816, + "learning_rate": 3.143066098721964e-05, + "loss": 2.5357, + "step": 29936 + }, + { + "epoch": 0.8877324081487412, + "grad_norm": 0.07810928672552109, + "learning_rate": 3.141424429230583e-05, + "loss": 2.5993, + "step": 29937 + }, + { + "epoch": 0.8877620615010527, + "grad_norm": 0.08049719780683517, + "learning_rate": 3.1397831746799335e-05, + "loss": 2.5797, + "step": 29938 + }, + { + "epoch": 0.8877917148533642, + "grad_norm": 0.08560843765735626, + "learning_rate": 3.1381423350845486e-05, + "loss": 2.5712, + "step": 29939 + }, + { + "epoch": 0.8878213682056757, + "grad_norm": 0.08073890954256058, + "learning_rate": 3.1365019104589555e-05, + "loss": 2.5651, + "step": 29940 + }, + { + "epoch": 0.8878510215579871, + "grad_norm": 0.0832332968711853, + "learning_rate": 3.1348619008176814e-05, + "loss": 2.5768, + "step": 29941 + }, + { + "epoch": 0.8878806749102987, + "grad_norm": 0.072596475481987, + "learning_rate": 3.133222306175254e-05, + "loss": 2.5577, + "step": 29942 + }, + { + "epoch": 0.8879103282626101, + "grad_norm": 0.08488380163908005, + "learning_rate": 3.1315831265461726e-05, + "loss": 2.537, + "step": 29943 + }, + { + "epoch": 0.8879399816149216, + "grad_norm": 0.08171413838863373, + "learning_rate": 3.129944361944981e-05, + "loss": 2.5747, + "step": 29944 + }, + { + "epoch": 0.887969634967233, + "grad_norm": 0.07678471505641937, + "learning_rate": 3.1283060123861725e-05, + "loss": 2.5717, + "step": 29945 + }, + { + "epoch": 0.8879992883195446, + "grad_norm": 0.08491561561822891, + "learning_rate": 3.1266680778842704e-05, + "loss": 2.5524, + "step": 29946 + }, + { + "epoch": 0.888028941671856, + "grad_norm": 0.08265207707881927, + "learning_rate": 3.125030558453762e-05, + "loss": 2.582, + "step": 29947 + }, + { + "epoch": 0.8880585950241675, + "grad_norm": 0.07376690208911896, + "learning_rate": 3.1233934541091524e-05, + "loss": 2.5665, + "step": 29948 + }, + { + "epoch": 0.8880882483764789, + "grad_norm": 0.07630614191293716, + "learning_rate": 3.1217567648649415e-05, + "loss": 2.6033, + "step": 29949 + }, + { + "epoch": 0.8881179017287905, + "grad_norm": 0.08322194963693619, + "learning_rate": 3.1201204907356174e-05, + "loss": 2.5778, + "step": 29950 + }, + { + "epoch": 0.8881475550811019, + "grad_norm": 0.0798473134636879, + "learning_rate": 3.118484631735674e-05, + "loss": 2.5802, + "step": 29951 + }, + { + "epoch": 0.8881772084334134, + "grad_norm": 0.07944085448980331, + "learning_rate": 3.1168491878796e-05, + "loss": 2.5515, + "step": 29952 + }, + { + "epoch": 0.8882068617857248, + "grad_norm": 0.08852750062942505, + "learning_rate": 3.115214159181873e-05, + "loss": 2.525, + "step": 29953 + }, + { + "epoch": 0.8882365151380364, + "grad_norm": 0.0843040943145752, + "learning_rate": 3.113579545656969e-05, + "loss": 2.5781, + "step": 29954 + }, + { + "epoch": 0.8882661684903478, + "grad_norm": 0.07989788055419922, + "learning_rate": 3.111945347319356e-05, + "loss": 2.589, + "step": 29955 + }, + { + "epoch": 0.8882958218426593, + "grad_norm": 0.08362135291099548, + "learning_rate": 3.1103115641835324e-05, + "loss": 2.5865, + "step": 29956 + }, + { + "epoch": 0.8883254751949707, + "grad_norm": 0.08337799459695816, + "learning_rate": 3.108678196263948e-05, + "loss": 2.5903, + "step": 29957 + }, + { + "epoch": 0.8883551285472823, + "grad_norm": 0.07623454183340073, + "learning_rate": 3.107045243575063e-05, + "loss": 2.5533, + "step": 29958 + }, + { + "epoch": 0.8883847818995938, + "grad_norm": 0.08888638764619827, + "learning_rate": 3.1054127061313386e-05, + "loss": 2.556, + "step": 29959 + }, + { + "epoch": 0.8884144352519052, + "grad_norm": 0.07897058874368668, + "learning_rate": 3.1037805839472355e-05, + "loss": 2.5801, + "step": 29960 + }, + { + "epoch": 0.8884440886042168, + "grad_norm": 0.07680923491716385, + "learning_rate": 3.102148877037203e-05, + "loss": 2.5184, + "step": 29961 + }, + { + "epoch": 0.8884737419565282, + "grad_norm": 0.08056818693876266, + "learning_rate": 3.1005175854156966e-05, + "loss": 2.5843, + "step": 29962 + }, + { + "epoch": 0.8885033953088397, + "grad_norm": 0.08459468185901642, + "learning_rate": 3.098886709097154e-05, + "loss": 2.5577, + "step": 29963 + }, + { + "epoch": 0.8885330486611511, + "grad_norm": 0.07878929376602173, + "learning_rate": 3.09725624809602e-05, + "loss": 2.5837, + "step": 29964 + }, + { + "epoch": 0.8885627020134627, + "grad_norm": 0.07416465133428574, + "learning_rate": 3.095626202426732e-05, + "loss": 2.59, + "step": 29965 + }, + { + "epoch": 0.8885923553657741, + "grad_norm": 0.0759606584906578, + "learning_rate": 3.09399657210373e-05, + "loss": 2.5756, + "step": 29966 + }, + { + "epoch": 0.8886220087180856, + "grad_norm": 0.08049052953720093, + "learning_rate": 3.0923673571414345e-05, + "loss": 2.5976, + "step": 29967 + }, + { + "epoch": 0.888651662070397, + "grad_norm": 0.07448700070381165, + "learning_rate": 3.090738557554279e-05, + "loss": 2.5701, + "step": 29968 + }, + { + "epoch": 0.8886813154227086, + "grad_norm": 0.07324545085430145, + "learning_rate": 3.089110173356685e-05, + "loss": 2.5418, + "step": 29969 + }, + { + "epoch": 0.88871096877502, + "grad_norm": 0.07434756308794022, + "learning_rate": 3.087482204563075e-05, + "loss": 2.5443, + "step": 29970 + }, + { + "epoch": 0.8887406221273315, + "grad_norm": 0.07544202357530594, + "learning_rate": 3.085854651187864e-05, + "loss": 2.5685, + "step": 29971 + }, + { + "epoch": 0.8887702754796429, + "grad_norm": 0.07006411254405975, + "learning_rate": 3.084227513245458e-05, + "loss": 2.5554, + "step": 29972 + }, + { + "epoch": 0.8887999288319545, + "grad_norm": 0.07630261033773422, + "learning_rate": 3.082600790750273e-05, + "loss": 2.5575, + "step": 29973 + }, + { + "epoch": 0.8888295821842659, + "grad_norm": 0.07719350606203079, + "learning_rate": 3.0809744837167085e-05, + "loss": 2.5754, + "step": 29974 + }, + { + "epoch": 0.8888592355365774, + "grad_norm": 0.0760759636759758, + "learning_rate": 3.07934859215917e-05, + "loss": 2.5601, + "step": 29975 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.0785847008228302, + "learning_rate": 3.077723116092052e-05, + "loss": 2.5623, + "step": 29976 + }, + { + "epoch": 0.8889185422412004, + "grad_norm": 0.07766448706388474, + "learning_rate": 3.0760980555297525e-05, + "loss": 2.527, + "step": 29977 + }, + { + "epoch": 0.8889481955935118, + "grad_norm": 0.07839062064886093, + "learning_rate": 3.074473410486661e-05, + "loss": 2.5815, + "step": 29978 + }, + { + "epoch": 0.8889778489458233, + "grad_norm": 0.080376535654068, + "learning_rate": 3.072849180977155e-05, + "loss": 2.5397, + "step": 29979 + }, + { + "epoch": 0.8890075022981349, + "grad_norm": 0.08018537610769272, + "learning_rate": 3.07122536701564e-05, + "loss": 2.554, + "step": 29980 + }, + { + "epoch": 0.8890371556504463, + "grad_norm": 0.0780290737748146, + "learning_rate": 3.069601968616459e-05, + "loss": 2.5583, + "step": 29981 + }, + { + "epoch": 0.8890668090027578, + "grad_norm": 0.0820697620511055, + "learning_rate": 3.067978985794018e-05, + "loss": 2.5496, + "step": 29982 + }, + { + "epoch": 0.8890964623550692, + "grad_norm": 0.08003745973110199, + "learning_rate": 3.0663564185626766e-05, + "loss": 2.5224, + "step": 29983 + }, + { + "epoch": 0.8891261157073808, + "grad_norm": 0.07687169313430786, + "learning_rate": 3.064734266936809e-05, + "loss": 2.5553, + "step": 29984 + }, + { + "epoch": 0.8891557690596922, + "grad_norm": 0.08663106709718704, + "learning_rate": 3.063112530930773e-05, + "loss": 2.5915, + "step": 29985 + }, + { + "epoch": 0.8891854224120037, + "grad_norm": 0.07793353497982025, + "learning_rate": 3.061491210558936e-05, + "loss": 2.5584, + "step": 29986 + }, + { + "epoch": 0.8892150757643151, + "grad_norm": 0.07683267444372177, + "learning_rate": 3.0598703058356434e-05, + "loss": 2.5739, + "step": 29987 + }, + { + "epoch": 0.8892447291166267, + "grad_norm": 0.0817977786064148, + "learning_rate": 3.058249816775266e-05, + "loss": 2.5734, + "step": 29988 + }, + { + "epoch": 0.8892743824689381, + "grad_norm": 0.07832508534193039, + "learning_rate": 3.056629743392136e-05, + "loss": 2.5394, + "step": 29989 + }, + { + "epoch": 0.8893040358212496, + "grad_norm": 0.07861657440662384, + "learning_rate": 3.05501008570061e-05, + "loss": 2.5847, + "step": 29990 + }, + { + "epoch": 0.889333689173561, + "grad_norm": 0.0770471841096878, + "learning_rate": 3.053390843715037e-05, + "loss": 2.5582, + "step": 29991 + }, + { + "epoch": 0.8893633425258726, + "grad_norm": 0.07895506918430328, + "learning_rate": 3.051772017449739e-05, + "loss": 2.5542, + "step": 29992 + }, + { + "epoch": 0.889392995878184, + "grad_norm": 0.07933629304170609, + "learning_rate": 3.0501536069190538e-05, + "loss": 2.5698, + "step": 29993 + }, + { + "epoch": 0.8894226492304955, + "grad_norm": 0.07384762167930603, + "learning_rate": 3.0485356121373154e-05, + "loss": 2.5633, + "step": 29994 + }, + { + "epoch": 0.8894523025828069, + "grad_norm": 0.07469680160284042, + "learning_rate": 3.0469180331188563e-05, + "loss": 2.5663, + "step": 29995 + }, + { + "epoch": 0.8894819559351185, + "grad_norm": 0.08029437065124512, + "learning_rate": 3.0453008698779982e-05, + "loss": 2.6078, + "step": 29996 + }, + { + "epoch": 0.8895116092874299, + "grad_norm": 0.08081572502851486, + "learning_rate": 3.0436841224290633e-05, + "loss": 2.6399, + "step": 29997 + }, + { + "epoch": 0.8895412626397414, + "grad_norm": 0.07815579324960709, + "learning_rate": 3.0420677907863627e-05, + "loss": 2.6049, + "step": 29998 + }, + { + "epoch": 0.8895709159920528, + "grad_norm": 0.07582855969667435, + "learning_rate": 3.0404518749642118e-05, + "loss": 2.5872, + "step": 29999 + }, + { + "epoch": 0.8896005693443644, + "grad_norm": 0.07698275148868561, + "learning_rate": 3.0388363749769222e-05, + "loss": 2.598, + "step": 30000 + }, + { + "epoch": 0.8896302226966759, + "grad_norm": 0.0731915533542633, + "learning_rate": 3.0372212908388043e-05, + "loss": 2.5853, + "step": 30001 + }, + { + "epoch": 0.8896598760489873, + "grad_norm": 0.07995448261499405, + "learning_rate": 3.0356066225641465e-05, + "loss": 2.5545, + "step": 30002 + }, + { + "epoch": 0.8896895294012989, + "grad_norm": 0.08031432330608368, + "learning_rate": 3.033992370167249e-05, + "loss": 2.5607, + "step": 30003 + }, + { + "epoch": 0.8897191827536103, + "grad_norm": 0.07560437172651291, + "learning_rate": 3.0323785336624166e-05, + "loss": 2.5672, + "step": 30004 + }, + { + "epoch": 0.8897488361059218, + "grad_norm": 0.07670677453279495, + "learning_rate": 3.0307651130639325e-05, + "loss": 2.5906, + "step": 30005 + }, + { + "epoch": 0.8897784894582332, + "grad_norm": 0.07998127490282059, + "learning_rate": 3.0291521083860795e-05, + "loss": 2.5681, + "step": 30006 + }, + { + "epoch": 0.8898081428105448, + "grad_norm": 0.079302579164505, + "learning_rate": 3.0275395196431465e-05, + "loss": 2.5746, + "step": 30007 + }, + { + "epoch": 0.8898377961628562, + "grad_norm": 0.07903562486171722, + "learning_rate": 3.0259273468494163e-05, + "loss": 2.5609, + "step": 30008 + }, + { + "epoch": 0.8898674495151677, + "grad_norm": 0.08490739017724991, + "learning_rate": 3.0243155900191667e-05, + "loss": 2.5676, + "step": 30009 + }, + { + "epoch": 0.8898971028674791, + "grad_norm": 0.07877564430236816, + "learning_rate": 3.0227042491666636e-05, + "loss": 2.5679, + "step": 30010 + }, + { + "epoch": 0.8899267562197907, + "grad_norm": 0.07638344168663025, + "learning_rate": 3.021093324306179e-05, + "loss": 2.5568, + "step": 30011 + }, + { + "epoch": 0.8899564095721021, + "grad_norm": 0.07720698416233063, + "learning_rate": 3.0194828154519847e-05, + "loss": 2.5765, + "step": 30012 + }, + { + "epoch": 0.8899860629244136, + "grad_norm": 0.07865665853023529, + "learning_rate": 3.0178727226183255e-05, + "loss": 2.5505, + "step": 30013 + }, + { + "epoch": 0.890015716276725, + "grad_norm": 0.07180585712194443, + "learning_rate": 3.0162630458194674e-05, + "loss": 2.5535, + "step": 30014 + }, + { + "epoch": 0.8900453696290366, + "grad_norm": 0.08194272965192795, + "learning_rate": 3.0146537850696654e-05, + "loss": 2.5842, + "step": 30015 + }, + { + "epoch": 0.890075022981348, + "grad_norm": 0.07465632259845734, + "learning_rate": 3.0130449403831695e-05, + "loss": 2.5625, + "step": 30016 + }, + { + "epoch": 0.8901046763336595, + "grad_norm": 0.07431965321302414, + "learning_rate": 3.0114365117742237e-05, + "loss": 2.5584, + "step": 30017 + }, + { + "epoch": 0.890134329685971, + "grad_norm": 0.07908628135919571, + "learning_rate": 3.0098284992570777e-05, + "loss": 2.5547, + "step": 30018 + }, + { + "epoch": 0.8901639830382825, + "grad_norm": 0.07760202139616013, + "learning_rate": 3.0082209028459595e-05, + "loss": 2.5665, + "step": 30019 + }, + { + "epoch": 0.890193636390594, + "grad_norm": 0.07290363311767578, + "learning_rate": 3.0066137225551126e-05, + "loss": 2.5808, + "step": 30020 + }, + { + "epoch": 0.8902232897429054, + "grad_norm": 0.08035094290971756, + "learning_rate": 3.005006958398765e-05, + "loss": 2.5657, + "step": 30021 + }, + { + "epoch": 0.890252943095217, + "grad_norm": 0.07815977185964584, + "learning_rate": 3.0034006103911604e-05, + "loss": 2.5555, + "step": 30022 + }, + { + "epoch": 0.8902825964475284, + "grad_norm": 0.08957897871732712, + "learning_rate": 3.0017946785465045e-05, + "loss": 2.5786, + "step": 30023 + }, + { + "epoch": 0.8903122497998399, + "grad_norm": 0.07254524528980255, + "learning_rate": 3.000189162879019e-05, + "loss": 2.5539, + "step": 30024 + }, + { + "epoch": 0.8903419031521513, + "grad_norm": 0.08491314947605133, + "learning_rate": 2.9985840634029314e-05, + "loss": 2.544, + "step": 30025 + }, + { + "epoch": 0.8903715565044629, + "grad_norm": 0.08397669345140457, + "learning_rate": 2.996979380132442e-05, + "loss": 2.5757, + "step": 30026 + }, + { + "epoch": 0.8904012098567743, + "grad_norm": 0.08198906481266022, + "learning_rate": 2.9953751130817775e-05, + "loss": 2.5918, + "step": 30027 + }, + { + "epoch": 0.8904308632090858, + "grad_norm": 0.07834398746490479, + "learning_rate": 2.993771262265127e-05, + "loss": 2.555, + "step": 30028 + }, + { + "epoch": 0.8904605165613972, + "grad_norm": 0.08276473730802536, + "learning_rate": 2.9921678276967012e-05, + "loss": 2.576, + "step": 30029 + }, + { + "epoch": 0.8904901699137088, + "grad_norm": 0.077034592628479, + "learning_rate": 2.9905648093907e-05, + "loss": 2.6043, + "step": 30030 + }, + { + "epoch": 0.8905198232660202, + "grad_norm": 0.07698648422956467, + "learning_rate": 2.9889622073613176e-05, + "loss": 2.5841, + "step": 30031 + }, + { + "epoch": 0.8905494766183317, + "grad_norm": 0.07509104907512665, + "learning_rate": 2.9873600216227425e-05, + "loss": 2.5655, + "step": 30032 + }, + { + "epoch": 0.8905791299706431, + "grad_norm": 0.08007514476776123, + "learning_rate": 2.9857582521891637e-05, + "loss": 2.5776, + "step": 30033 + }, + { + "epoch": 0.8906087833229547, + "grad_norm": 0.07559316605329514, + "learning_rate": 2.984156899074769e-05, + "loss": 2.5754, + "step": 30034 + }, + { + "epoch": 0.8906384366752661, + "grad_norm": 0.08037376403808594, + "learning_rate": 2.9825559622937315e-05, + "loss": 2.5651, + "step": 30035 + }, + { + "epoch": 0.8906680900275776, + "grad_norm": 0.07895541191101074, + "learning_rate": 2.9809554418602335e-05, + "loss": 2.5649, + "step": 30036 + }, + { + "epoch": 0.890697743379889, + "grad_norm": 0.07764429599046707, + "learning_rate": 2.979355337788442e-05, + "loss": 2.5443, + "step": 30037 + }, + { + "epoch": 0.8907273967322006, + "grad_norm": 0.08254849165678024, + "learning_rate": 2.977755650092534e-05, + "loss": 2.5625, + "step": 30038 + }, + { + "epoch": 0.890757050084512, + "grad_norm": 0.07902427017688751, + "learning_rate": 2.976156378786671e-05, + "loss": 2.5734, + "step": 30039 + }, + { + "epoch": 0.8907867034368235, + "grad_norm": 0.0757942944765091, + "learning_rate": 2.9745575238850132e-05, + "loss": 2.5584, + "step": 30040 + }, + { + "epoch": 0.8908163567891351, + "grad_norm": 0.08187293261289597, + "learning_rate": 2.972959085401722e-05, + "loss": 2.5538, + "step": 30041 + }, + { + "epoch": 0.8908460101414465, + "grad_norm": 0.09067659080028534, + "learning_rate": 2.971361063350947e-05, + "loss": 2.6017, + "step": 30042 + }, + { + "epoch": 0.890875663493758, + "grad_norm": 0.07921198755502701, + "learning_rate": 2.9697634577468437e-05, + "loss": 2.5754, + "step": 30043 + }, + { + "epoch": 0.8909053168460694, + "grad_norm": 0.07485666126012802, + "learning_rate": 2.968166268603556e-05, + "loss": 2.501, + "step": 30044 + }, + { + "epoch": 0.890934970198381, + "grad_norm": 0.08367510885000229, + "learning_rate": 2.966569495935234e-05, + "loss": 2.5629, + "step": 30045 + }, + { + "epoch": 0.8909646235506924, + "grad_norm": 0.07994123548269272, + "learning_rate": 2.9649731397560108e-05, + "loss": 2.5285, + "step": 30046 + }, + { + "epoch": 0.8909942769030039, + "grad_norm": 0.0784824788570404, + "learning_rate": 2.9633772000800196e-05, + "loss": 2.6051, + "step": 30047 + }, + { + "epoch": 0.8910239302553153, + "grad_norm": 0.07317714393138885, + "learning_rate": 2.961781676921399e-05, + "loss": 2.5219, + "step": 30048 + }, + { + "epoch": 0.8910535836076269, + "grad_norm": 0.07734285295009613, + "learning_rate": 2.9601865702942766e-05, + "loss": 2.5748, + "step": 30049 + }, + { + "epoch": 0.8910832369599383, + "grad_norm": 0.07774117588996887, + "learning_rate": 2.9585918802127743e-05, + "loss": 2.5337, + "step": 30050 + }, + { + "epoch": 0.8911128903122498, + "grad_norm": 0.07421641051769257, + "learning_rate": 2.95699760669102e-05, + "loss": 2.5642, + "step": 30051 + }, + { + "epoch": 0.8911425436645612, + "grad_norm": 0.07758128643035889, + "learning_rate": 2.955403749743124e-05, + "loss": 2.5802, + "step": 30052 + }, + { + "epoch": 0.8911721970168728, + "grad_norm": 0.07388191670179367, + "learning_rate": 2.9538103093832036e-05, + "loss": 2.5906, + "step": 30053 + }, + { + "epoch": 0.8912018503691842, + "grad_norm": 0.07475607842206955, + "learning_rate": 2.9522172856253636e-05, + "loss": 2.5786, + "step": 30054 + }, + { + "epoch": 0.8912315037214957, + "grad_norm": 0.07998379319906235, + "learning_rate": 2.950624678483721e-05, + "loss": 2.5413, + "step": 30055 + }, + { + "epoch": 0.8912611570738072, + "grad_norm": 0.07874701172113419, + "learning_rate": 2.9490324879723808e-05, + "loss": 2.5309, + "step": 30056 + }, + { + "epoch": 0.8912908104261187, + "grad_norm": 0.0776640996336937, + "learning_rate": 2.9474407141054204e-05, + "loss": 2.5647, + "step": 30057 + }, + { + "epoch": 0.8913204637784301, + "grad_norm": 0.08278965950012207, + "learning_rate": 2.9458493568969514e-05, + "loss": 2.5433, + "step": 30058 + }, + { + "epoch": 0.8913501171307416, + "grad_norm": 0.08551249653100967, + "learning_rate": 2.944258416361062e-05, + "loss": 2.5612, + "step": 30059 + }, + { + "epoch": 0.8913797704830531, + "grad_norm": 0.07690222561359406, + "learning_rate": 2.942667892511841e-05, + "loss": 2.5583, + "step": 30060 + }, + { + "epoch": 0.8914094238353646, + "grad_norm": 0.07584017515182495, + "learning_rate": 2.9410777853633773e-05, + "loss": 2.5455, + "step": 30061 + }, + { + "epoch": 0.8914390771876761, + "grad_norm": 0.08038067072629929, + "learning_rate": 2.939488094929743e-05, + "loss": 2.5593, + "step": 30062 + }, + { + "epoch": 0.8914687305399875, + "grad_norm": 0.08329490572214127, + "learning_rate": 2.9378988212250212e-05, + "loss": 2.5948, + "step": 30063 + }, + { + "epoch": 0.8914983838922991, + "grad_norm": 0.07738931477069855, + "learning_rate": 2.9363099642632897e-05, + "loss": 2.5737, + "step": 30064 + }, + { + "epoch": 0.8915280372446105, + "grad_norm": 0.07615220546722412, + "learning_rate": 2.9347215240586034e-05, + "loss": 2.5571, + "step": 30065 + }, + { + "epoch": 0.891557690596922, + "grad_norm": 0.08314070850610733, + "learning_rate": 2.9331335006250402e-05, + "loss": 2.5753, + "step": 30066 + }, + { + "epoch": 0.8915873439492334, + "grad_norm": 0.0790814459323883, + "learning_rate": 2.931545893976667e-05, + "loss": 2.5479, + "step": 30067 + }, + { + "epoch": 0.891616997301545, + "grad_norm": 0.07600224018096924, + "learning_rate": 2.9299587041275276e-05, + "loss": 2.5808, + "step": 30068 + }, + { + "epoch": 0.8916466506538564, + "grad_norm": 0.0807494968175888, + "learning_rate": 2.928371931091678e-05, + "loss": 2.5732, + "step": 30069 + }, + { + "epoch": 0.8916763040061679, + "grad_norm": 0.0833902433514595, + "learning_rate": 2.9267855748831784e-05, + "loss": 2.5279, + "step": 30070 + }, + { + "epoch": 0.8917059573584794, + "grad_norm": 0.07912398874759674, + "learning_rate": 2.9251996355160738e-05, + "loss": 2.5504, + "step": 30071 + }, + { + "epoch": 0.8917356107107909, + "grad_norm": 0.0740668997168541, + "learning_rate": 2.923614113004397e-05, + "loss": 2.5903, + "step": 30072 + }, + { + "epoch": 0.8917652640631023, + "grad_norm": 0.07993021607398987, + "learning_rate": 2.9220290073622035e-05, + "loss": 2.5528, + "step": 30073 + }, + { + "epoch": 0.8917949174154138, + "grad_norm": 0.08141244947910309, + "learning_rate": 2.9204443186035268e-05, + "loss": 2.5829, + "step": 30074 + }, + { + "epoch": 0.8918245707677253, + "grad_norm": 0.07262815535068512, + "learning_rate": 2.9188600467424e-05, + "loss": 2.5933, + "step": 30075 + }, + { + "epoch": 0.8918542241200368, + "grad_norm": 0.07220176607370377, + "learning_rate": 2.917276191792845e-05, + "loss": 2.5861, + "step": 30076 + }, + { + "epoch": 0.8918838774723482, + "grad_norm": 0.08137102425098419, + "learning_rate": 2.915692753768895e-05, + "loss": 2.5848, + "step": 30077 + }, + { + "epoch": 0.8919135308246597, + "grad_norm": 0.07517945766448975, + "learning_rate": 2.9141097326845667e-05, + "loss": 2.5561, + "step": 30078 + }, + { + "epoch": 0.8919431841769712, + "grad_norm": 0.07686592638492584, + "learning_rate": 2.912527128553877e-05, + "loss": 2.5522, + "step": 30079 + }, + { + "epoch": 0.8919728375292827, + "grad_norm": 0.07667325437068939, + "learning_rate": 2.9109449413908416e-05, + "loss": 2.5625, + "step": 30080 + }, + { + "epoch": 0.8920024908815941, + "grad_norm": 0.07696565240621567, + "learning_rate": 2.909363171209467e-05, + "loss": 2.5838, + "step": 30081 + }, + { + "epoch": 0.8920321442339056, + "grad_norm": 0.07449743151664734, + "learning_rate": 2.9077818180237692e-05, + "loss": 2.5775, + "step": 30082 + }, + { + "epoch": 0.8920617975862172, + "grad_norm": 0.0807376429438591, + "learning_rate": 2.906200881847748e-05, + "loss": 2.5222, + "step": 30083 + }, + { + "epoch": 0.8920914509385286, + "grad_norm": 0.082481250166893, + "learning_rate": 2.9046203626953982e-05, + "loss": 2.5613, + "step": 30084 + }, + { + "epoch": 0.8921211042908401, + "grad_norm": 0.08896179497241974, + "learning_rate": 2.9030402605807137e-05, + "loss": 2.5877, + "step": 30085 + }, + { + "epoch": 0.8921507576431515, + "grad_norm": 0.08064022660255432, + "learning_rate": 2.901460575517695e-05, + "loss": 2.5958, + "step": 30086 + }, + { + "epoch": 0.8921804109954631, + "grad_norm": 0.07404576987028122, + "learning_rate": 2.89988130752033e-05, + "loss": 2.5492, + "step": 30087 + }, + { + "epoch": 0.8922100643477745, + "grad_norm": 0.07780750840902328, + "learning_rate": 2.8983024566026083e-05, + "loss": 2.5369, + "step": 30088 + }, + { + "epoch": 0.892239717700086, + "grad_norm": 0.08330926299095154, + "learning_rate": 2.8967240227784963e-05, + "loss": 2.5395, + "step": 30089 + }, + { + "epoch": 0.8922693710523975, + "grad_norm": 0.07489854842424393, + "learning_rate": 2.8951460060619827e-05, + "loss": 2.6027, + "step": 30090 + }, + { + "epoch": 0.892299024404709, + "grad_norm": 0.0807594284415245, + "learning_rate": 2.8935684064670286e-05, + "loss": 2.577, + "step": 30091 + }, + { + "epoch": 0.8923286777570204, + "grad_norm": 0.08398639410734177, + "learning_rate": 2.8919912240076175e-05, + "loss": 2.5949, + "step": 30092 + }, + { + "epoch": 0.8923583311093319, + "grad_norm": 0.07305147498846054, + "learning_rate": 2.8904144586977045e-05, + "loss": 2.5248, + "step": 30093 + }, + { + "epoch": 0.8923879844616434, + "grad_norm": 0.07786285877227783, + "learning_rate": 2.8888381105512618e-05, + "loss": 2.5781, + "step": 30094 + }, + { + "epoch": 0.8924176378139549, + "grad_norm": 0.08357693254947662, + "learning_rate": 2.8872621795822453e-05, + "loss": 2.5698, + "step": 30095 + }, + { + "epoch": 0.8924472911662663, + "grad_norm": 0.08251713961362839, + "learning_rate": 2.88568666580461e-05, + "loss": 2.5775, + "step": 30096 + }, + { + "epoch": 0.8924769445185778, + "grad_norm": 0.07126723229885101, + "learning_rate": 2.8841115692323005e-05, + "loss": 2.564, + "step": 30097 + }, + { + "epoch": 0.8925065978708893, + "grad_norm": 0.08106476068496704, + "learning_rate": 2.8825368898792724e-05, + "loss": 2.5695, + "step": 30098 + }, + { + "epoch": 0.8925362512232008, + "grad_norm": 0.0807444304227829, + "learning_rate": 2.8809626277594703e-05, + "loss": 2.5213, + "step": 30099 + }, + { + "epoch": 0.8925659045755122, + "grad_norm": 0.08017682284116745, + "learning_rate": 2.8793887828868326e-05, + "loss": 2.5826, + "step": 30100 + }, + { + "epoch": 0.8925955579278237, + "grad_norm": 0.07164643704891205, + "learning_rate": 2.877815355275293e-05, + "loss": 2.5822, + "step": 30101 + }, + { + "epoch": 0.8926252112801352, + "grad_norm": 0.07886286079883575, + "learning_rate": 2.8762423449387842e-05, + "loss": 2.5831, + "step": 30102 + }, + { + "epoch": 0.8926548646324467, + "grad_norm": 0.07998815923929214, + "learning_rate": 2.8746697518912403e-05, + "loss": 2.5797, + "step": 30103 + }, + { + "epoch": 0.8926845179847582, + "grad_norm": 0.07416358590126038, + "learning_rate": 2.8730975761465884e-05, + "loss": 2.5345, + "step": 30104 + }, + { + "epoch": 0.8927141713370697, + "grad_norm": 0.07620131969451904, + "learning_rate": 2.8715258177187452e-05, + "loss": 2.5717, + "step": 30105 + }, + { + "epoch": 0.8927438246893812, + "grad_norm": 0.1905415654182434, + "learning_rate": 2.869954476621628e-05, + "loss": 2.5497, + "step": 30106 + }, + { + "epoch": 0.8927734780416926, + "grad_norm": 0.07570847123861313, + "learning_rate": 2.8683835528691525e-05, + "loss": 2.5841, + "step": 30107 + }, + { + "epoch": 0.8928031313940041, + "grad_norm": 0.07105818390846252, + "learning_rate": 2.8668130464752307e-05, + "loss": 2.5914, + "step": 30108 + }, + { + "epoch": 0.8928327847463156, + "grad_norm": 0.07760617882013321, + "learning_rate": 2.865242957453773e-05, + "loss": 2.5932, + "step": 30109 + }, + { + "epoch": 0.8928624380986271, + "grad_norm": 0.07996181398630142, + "learning_rate": 2.86367328581868e-05, + "loss": 2.5938, + "step": 30110 + }, + { + "epoch": 0.8928920914509385, + "grad_norm": 0.07114145904779434, + "learning_rate": 2.862104031583851e-05, + "loss": 2.522, + "step": 30111 + }, + { + "epoch": 0.89292174480325, + "grad_norm": 0.07898867130279541, + "learning_rate": 2.8605351947631752e-05, + "loss": 2.5568, + "step": 30112 + }, + { + "epoch": 0.8929513981555615, + "grad_norm": 0.07629439979791641, + "learning_rate": 2.8589667753705585e-05, + "loss": 2.5447, + "step": 30113 + }, + { + "epoch": 0.892981051507873, + "grad_norm": 0.0797618106007576, + "learning_rate": 2.8573987734198837e-05, + "loss": 2.5736, + "step": 30114 + }, + { + "epoch": 0.8930107048601844, + "grad_norm": 0.07318723201751709, + "learning_rate": 2.8558311889250286e-05, + "loss": 2.5654, + "step": 30115 + }, + { + "epoch": 0.8930403582124959, + "grad_norm": 0.07797754555940628, + "learning_rate": 2.8542640218998826e-05, + "loss": 2.6186, + "step": 30116 + }, + { + "epoch": 0.8930700115648074, + "grad_norm": 0.07829711586236954, + "learning_rate": 2.8526972723583235e-05, + "loss": 2.6063, + "step": 30117 + }, + { + "epoch": 0.8930996649171189, + "grad_norm": 0.07706081122159958, + "learning_rate": 2.8511309403142228e-05, + "loss": 2.5565, + "step": 30118 + }, + { + "epoch": 0.8931293182694303, + "grad_norm": 0.07734808325767517, + "learning_rate": 2.8495650257814476e-05, + "loss": 2.5864, + "step": 30119 + }, + { + "epoch": 0.8931589716217418, + "grad_norm": 0.07736121863126755, + "learning_rate": 2.8479995287738703e-05, + "loss": 2.5784, + "step": 30120 + }, + { + "epoch": 0.8931886249740533, + "grad_norm": 0.07230371236801147, + "learning_rate": 2.8464344493053462e-05, + "loss": 2.5675, + "step": 30121 + }, + { + "epoch": 0.8932182783263648, + "grad_norm": 0.07964234799146652, + "learning_rate": 2.844869787389753e-05, + "loss": 2.5589, + "step": 30122 + }, + { + "epoch": 0.8932479316786762, + "grad_norm": 0.0777890607714653, + "learning_rate": 2.8433055430409195e-05, + "loss": 2.5224, + "step": 30123 + }, + { + "epoch": 0.8932775850309878, + "grad_norm": 0.07488834857940674, + "learning_rate": 2.8417417162727054e-05, + "loss": 2.5832, + "step": 30124 + }, + { + "epoch": 0.8933072383832993, + "grad_norm": 0.07441078126430511, + "learning_rate": 2.8401783070989672e-05, + "loss": 2.5423, + "step": 30125 + }, + { + "epoch": 0.8933368917356107, + "grad_norm": 0.07363195717334747, + "learning_rate": 2.8386153155335493e-05, + "loss": 2.5664, + "step": 30126 + }, + { + "epoch": 0.8933665450879222, + "grad_norm": 0.07584438472986221, + "learning_rate": 2.8370527415902846e-05, + "loss": 2.5511, + "step": 30127 + }, + { + "epoch": 0.8933961984402337, + "grad_norm": 0.07687883079051971, + "learning_rate": 2.8354905852830128e-05, + "loss": 2.5517, + "step": 30128 + }, + { + "epoch": 0.8934258517925452, + "grad_norm": 0.07312821596860886, + "learning_rate": 2.8339288466255664e-05, + "loss": 2.5666, + "step": 30129 + }, + { + "epoch": 0.8934555051448566, + "grad_norm": 0.08180668205022812, + "learning_rate": 2.8323675256317794e-05, + "loss": 2.6043, + "step": 30130 + }, + { + "epoch": 0.8934851584971681, + "grad_norm": 0.07443635165691376, + "learning_rate": 2.8308066223154738e-05, + "loss": 2.5838, + "step": 30131 + }, + { + "epoch": 0.8935148118494796, + "grad_norm": 0.07820961624383926, + "learning_rate": 2.8292461366904776e-05, + "loss": 2.5566, + "step": 30132 + }, + { + "epoch": 0.8935444652017911, + "grad_norm": 0.07213138788938522, + "learning_rate": 2.8276860687705963e-05, + "loss": 2.5522, + "step": 30133 + }, + { + "epoch": 0.8935741185541025, + "grad_norm": 0.07362072914838791, + "learning_rate": 2.8261264185696522e-05, + "loss": 2.5858, + "step": 30134 + }, + { + "epoch": 0.893603771906414, + "grad_norm": 0.08071960508823395, + "learning_rate": 2.8245671861014565e-05, + "loss": 2.5705, + "step": 30135 + }, + { + "epoch": 0.8936334252587255, + "grad_norm": 0.07350078970193863, + "learning_rate": 2.8230083713798093e-05, + "loss": 2.5284, + "step": 30136 + }, + { + "epoch": 0.893663078611037, + "grad_norm": 0.07604411989450455, + "learning_rate": 2.8214499744185275e-05, + "loss": 2.5672, + "step": 30137 + }, + { + "epoch": 0.8936927319633484, + "grad_norm": 0.0745701938867569, + "learning_rate": 2.819891995231405e-05, + "loss": 2.5324, + "step": 30138 + }, + { + "epoch": 0.89372238531566, + "grad_norm": 0.07565104216337204, + "learning_rate": 2.818334433832237e-05, + "loss": 2.5555, + "step": 30139 + }, + { + "epoch": 0.8937520386679714, + "grad_norm": 0.0787450298666954, + "learning_rate": 2.8167772902348178e-05, + "loss": 2.5257, + "step": 30140 + }, + { + "epoch": 0.8937816920202829, + "grad_norm": 0.07153908163309097, + "learning_rate": 2.8152205644529306e-05, + "loss": 2.5207, + "step": 30141 + }, + { + "epoch": 0.8938113453725943, + "grad_norm": 0.07805448025465012, + "learning_rate": 2.8136642565003647e-05, + "loss": 2.5118, + "step": 30142 + }, + { + "epoch": 0.8938409987249059, + "grad_norm": 0.08140800893306732, + "learning_rate": 2.812108366390914e-05, + "loss": 2.5803, + "step": 30143 + }, + { + "epoch": 0.8938706520772173, + "grad_norm": 0.07758524268865585, + "learning_rate": 2.8105528941383297e-05, + "loss": 2.5925, + "step": 30144 + }, + { + "epoch": 0.8939003054295288, + "grad_norm": 0.07865726202726364, + "learning_rate": 2.8089978397564052e-05, + "loss": 2.5116, + "step": 30145 + }, + { + "epoch": 0.8939299587818403, + "grad_norm": 0.08213800191879272, + "learning_rate": 2.8074432032589024e-05, + "loss": 2.5351, + "step": 30146 + }, + { + "epoch": 0.8939596121341518, + "grad_norm": 0.07998201251029968, + "learning_rate": 2.805888984659588e-05, + "loss": 2.5567, + "step": 30147 + }, + { + "epoch": 0.8939892654864633, + "grad_norm": 0.07193414866924286, + "learning_rate": 2.8043351839722286e-05, + "loss": 2.5968, + "step": 30148 + }, + { + "epoch": 0.8940189188387747, + "grad_norm": 0.07405679672956467, + "learning_rate": 2.8027818012105743e-05, + "loss": 2.6053, + "step": 30149 + }, + { + "epoch": 0.8940485721910862, + "grad_norm": 0.0795278325676918, + "learning_rate": 2.8012288363883975e-05, + "loss": 2.551, + "step": 30150 + }, + { + "epoch": 0.8940782255433977, + "grad_norm": 0.07911445945501328, + "learning_rate": 2.7996762895194426e-05, + "loss": 2.5671, + "step": 30151 + }, + { + "epoch": 0.8941078788957092, + "grad_norm": 0.07175378501415253, + "learning_rate": 2.7981241606174546e-05, + "loss": 2.5914, + "step": 30152 + }, + { + "epoch": 0.8941375322480206, + "grad_norm": 0.07266746461391449, + "learning_rate": 2.7965724496961885e-05, + "loss": 2.5433, + "step": 30153 + }, + { + "epoch": 0.8941671856003321, + "grad_norm": 0.07659182697534561, + "learning_rate": 2.7950211567693616e-05, + "loss": 2.5422, + "step": 30154 + }, + { + "epoch": 0.8941968389526436, + "grad_norm": 0.07626765221357346, + "learning_rate": 2.7934702818507298e-05, + "loss": 2.5872, + "step": 30155 + }, + { + "epoch": 0.8942264923049551, + "grad_norm": 0.07275763899087906, + "learning_rate": 2.7919198249540202e-05, + "loss": 2.5673, + "step": 30156 + }, + { + "epoch": 0.8942561456572665, + "grad_norm": 0.07532661408185959, + "learning_rate": 2.7903697860929665e-05, + "loss": 2.617, + "step": 30157 + }, + { + "epoch": 0.894285799009578, + "grad_norm": 0.07369472831487656, + "learning_rate": 2.788820165281292e-05, + "loss": 2.5506, + "step": 30158 + }, + { + "epoch": 0.8943154523618895, + "grad_norm": 0.07408886402845383, + "learning_rate": 2.7872709625327175e-05, + "loss": 2.561, + "step": 30159 + }, + { + "epoch": 0.894345105714201, + "grad_norm": 0.08399367332458496, + "learning_rate": 2.7857221778609608e-05, + "loss": 2.5745, + "step": 30160 + }, + { + "epoch": 0.8943747590665124, + "grad_norm": 0.07896079123020172, + "learning_rate": 2.7841738112797387e-05, + "loss": 2.5641, + "step": 30161 + }, + { + "epoch": 0.894404412418824, + "grad_norm": 0.07625603675842285, + "learning_rate": 2.7826258628027513e-05, + "loss": 2.5591, + "step": 30162 + }, + { + "epoch": 0.8944340657711354, + "grad_norm": 0.07618573307991028, + "learning_rate": 2.7810783324437317e-05, + "loss": 2.5721, + "step": 30163 + }, + { + "epoch": 0.8944637191234469, + "grad_norm": 0.08234621584415436, + "learning_rate": 2.7795312202163692e-05, + "loss": 2.5701, + "step": 30164 + }, + { + "epoch": 0.8944933724757583, + "grad_norm": 0.07917618751525879, + "learning_rate": 2.7779845261343584e-05, + "loss": 2.559, + "step": 30165 + }, + { + "epoch": 0.8945230258280699, + "grad_norm": 0.08170400559902191, + "learning_rate": 2.7764382502113994e-05, + "loss": 2.5854, + "step": 30166 + }, + { + "epoch": 0.8945526791803814, + "grad_norm": 0.07490373402833939, + "learning_rate": 2.774892392461187e-05, + "loss": 2.5993, + "step": 30167 + }, + { + "epoch": 0.8945823325326928, + "grad_norm": 0.07692089676856995, + "learning_rate": 2.7733469528974098e-05, + "loss": 2.6043, + "step": 30168 + }, + { + "epoch": 0.8946119858850043, + "grad_norm": 0.08295925706624985, + "learning_rate": 2.771801931533752e-05, + "loss": 2.5808, + "step": 30169 + }, + { + "epoch": 0.8946416392373158, + "grad_norm": 0.08212930709123611, + "learning_rate": 2.7702573283838905e-05, + "loss": 2.5406, + "step": 30170 + }, + { + "epoch": 0.8946712925896273, + "grad_norm": 0.07590029388666153, + "learning_rate": 2.7687131434615098e-05, + "loss": 2.6123, + "step": 30171 + }, + { + "epoch": 0.8947009459419387, + "grad_norm": 0.07708438485860825, + "learning_rate": 2.767169376780282e-05, + "loss": 2.5646, + "step": 30172 + }, + { + "epoch": 0.8947305992942503, + "grad_norm": 0.07875663042068481, + "learning_rate": 2.7656260283538738e-05, + "loss": 2.5877, + "step": 30173 + }, + { + "epoch": 0.8947602526465617, + "grad_norm": 0.08128994703292847, + "learning_rate": 2.7640830981959573e-05, + "loss": 2.5376, + "step": 30174 + }, + { + "epoch": 0.8947899059988732, + "grad_norm": 0.07612315565347672, + "learning_rate": 2.762540586320189e-05, + "loss": 2.5651, + "step": 30175 + }, + { + "epoch": 0.8948195593511846, + "grad_norm": 0.08056751638650894, + "learning_rate": 2.7609984927402355e-05, + "loss": 2.561, + "step": 30176 + }, + { + "epoch": 0.8948492127034962, + "grad_norm": 0.07675724476575851, + "learning_rate": 2.759456817469752e-05, + "loss": 2.5824, + "step": 30177 + }, + { + "epoch": 0.8948788660558076, + "grad_norm": 0.07419131696224213, + "learning_rate": 2.7579155605223837e-05, + "loss": 2.5716, + "step": 30178 + }, + { + "epoch": 0.8949085194081191, + "grad_norm": 0.07651673257350922, + "learning_rate": 2.7563747219117808e-05, + "loss": 2.5765, + "step": 30179 + }, + { + "epoch": 0.8949381727604305, + "grad_norm": 0.07899853587150574, + "learning_rate": 2.7548343016515932e-05, + "loss": 2.5807, + "step": 30180 + }, + { + "epoch": 0.8949678261127421, + "grad_norm": 0.07207614183425903, + "learning_rate": 2.753294299755449e-05, + "loss": 2.582, + "step": 30181 + }, + { + "epoch": 0.8949974794650535, + "grad_norm": 0.07824767380952835, + "learning_rate": 2.7517547162370037e-05, + "loss": 2.5854, + "step": 30182 + }, + { + "epoch": 0.895027132817365, + "grad_norm": 0.07412613183259964, + "learning_rate": 2.7502155511098748e-05, + "loss": 2.5557, + "step": 30183 + }, + { + "epoch": 0.8950567861696764, + "grad_norm": 0.07302341610193253, + "learning_rate": 2.7486768043876952e-05, + "loss": 2.5369, + "step": 30184 + }, + { + "epoch": 0.895086439521988, + "grad_norm": 0.07666124403476715, + "learning_rate": 2.747138476084099e-05, + "loss": 2.5474, + "step": 30185 + }, + { + "epoch": 0.8951160928742994, + "grad_norm": 0.07521552592515945, + "learning_rate": 2.745600566212697e-05, + "loss": 2.5689, + "step": 30186 + }, + { + "epoch": 0.8951457462266109, + "grad_norm": 0.07056281715631485, + "learning_rate": 2.7440630747871174e-05, + "loss": 2.5043, + "step": 30187 + }, + { + "epoch": 0.8951753995789224, + "grad_norm": 0.07327863574028015, + "learning_rate": 2.7425260018209718e-05, + "loss": 2.5636, + "step": 30188 + }, + { + "epoch": 0.8952050529312339, + "grad_norm": 0.07544022798538208, + "learning_rate": 2.7409893473278657e-05, + "loss": 2.5749, + "step": 30189 + }, + { + "epoch": 0.8952347062835454, + "grad_norm": 0.07614163309335709, + "learning_rate": 2.7394531113214103e-05, + "loss": 2.5828, + "step": 30190 + }, + { + "epoch": 0.8952643596358568, + "grad_norm": 0.07744981348514557, + "learning_rate": 2.737917293815212e-05, + "loss": 2.5673, + "step": 30191 + }, + { + "epoch": 0.8952940129881684, + "grad_norm": 0.07438185811042786, + "learning_rate": 2.73638189482287e-05, + "loss": 2.6019, + "step": 30192 + }, + { + "epoch": 0.8953236663404798, + "grad_norm": 0.07573140412569046, + "learning_rate": 2.7348469143579802e-05, + "loss": 2.5955, + "step": 30193 + }, + { + "epoch": 0.8953533196927913, + "grad_norm": 0.07768384367227554, + "learning_rate": 2.7333123524341306e-05, + "loss": 2.5977, + "step": 30194 + }, + { + "epoch": 0.8953829730451027, + "grad_norm": 0.07482430338859558, + "learning_rate": 2.7317782090649112e-05, + "loss": 2.5322, + "step": 30195 + }, + { + "epoch": 0.8954126263974143, + "grad_norm": 0.07242663204669952, + "learning_rate": 2.7302444842639162e-05, + "loss": 2.5617, + "step": 30196 + }, + { + "epoch": 0.8954422797497257, + "grad_norm": 0.07684220373630524, + "learning_rate": 2.7287111780447127e-05, + "loss": 2.5973, + "step": 30197 + }, + { + "epoch": 0.8954719331020372, + "grad_norm": 0.07616207748651505, + "learning_rate": 2.727178290420895e-05, + "loss": 2.5627, + "step": 30198 + }, + { + "epoch": 0.8955015864543486, + "grad_norm": 0.07373605668544769, + "learning_rate": 2.7256458214060253e-05, + "loss": 2.5804, + "step": 30199 + }, + { + "epoch": 0.8955312398066602, + "grad_norm": 0.07380011677742004, + "learning_rate": 2.72411377101367e-05, + "loss": 2.574, + "step": 30200 + }, + { + "epoch": 0.8955608931589716, + "grad_norm": 0.07692261040210724, + "learning_rate": 2.722582139257401e-05, + "loss": 2.5244, + "step": 30201 + }, + { + "epoch": 0.8955905465112831, + "grad_norm": 0.0763530507683754, + "learning_rate": 2.7210509261507864e-05, + "loss": 2.5944, + "step": 30202 + }, + { + "epoch": 0.8956201998635945, + "grad_norm": 0.07436877489089966, + "learning_rate": 2.719520131707376e-05, + "loss": 2.5623, + "step": 30203 + }, + { + "epoch": 0.8956498532159061, + "grad_norm": 0.07797271013259888, + "learning_rate": 2.7179897559407364e-05, + "loss": 2.5425, + "step": 30204 + }, + { + "epoch": 0.8956795065682175, + "grad_norm": 0.07817276567220688, + "learning_rate": 2.7164597988644123e-05, + "loss": 2.6028, + "step": 30205 + }, + { + "epoch": 0.895709159920529, + "grad_norm": 0.07088495045900345, + "learning_rate": 2.7149302604919547e-05, + "loss": 2.566, + "step": 30206 + }, + { + "epoch": 0.8957388132728404, + "grad_norm": 0.07695077359676361, + "learning_rate": 2.713401140836902e-05, + "loss": 2.5817, + "step": 30207 + }, + { + "epoch": 0.895768466625152, + "grad_norm": 0.07537178695201874, + "learning_rate": 2.7118724399128102e-05, + "loss": 2.5835, + "step": 30208 + }, + { + "epoch": 0.8957981199774635, + "grad_norm": 0.07495284825563431, + "learning_rate": 2.7103441577331966e-05, + "loss": 2.5728, + "step": 30209 + }, + { + "epoch": 0.8958277733297749, + "grad_norm": 0.07261212915182114, + "learning_rate": 2.7088162943116003e-05, + "loss": 2.5598, + "step": 30210 + }, + { + "epoch": 0.8958574266820865, + "grad_norm": 0.07816337049007416, + "learning_rate": 2.7072888496615546e-05, + "loss": 2.5498, + "step": 30211 + }, + { + "epoch": 0.8958870800343979, + "grad_norm": 0.07510756701231003, + "learning_rate": 2.7057618237965818e-05, + "loss": 2.5315, + "step": 30212 + }, + { + "epoch": 0.8959167333867094, + "grad_norm": 0.06853660941123962, + "learning_rate": 2.7042352167302108e-05, + "loss": 2.5906, + "step": 30213 + }, + { + "epoch": 0.8959463867390208, + "grad_norm": 0.0715278908610344, + "learning_rate": 2.7027090284759416e-05, + "loss": 2.5815, + "step": 30214 + }, + { + "epoch": 0.8959760400913324, + "grad_norm": 0.07886723428964615, + "learning_rate": 2.7011832590473127e-05, + "loss": 2.5688, + "step": 30215 + }, + { + "epoch": 0.8960056934436438, + "grad_norm": 0.07881621271371841, + "learning_rate": 2.6996579084578253e-05, + "loss": 2.5733, + "step": 30216 + }, + { + "epoch": 0.8960353467959553, + "grad_norm": 0.07043206691741943, + "learning_rate": 2.6981329767209905e-05, + "loss": 2.5778, + "step": 30217 + }, + { + "epoch": 0.8960650001482667, + "grad_norm": 0.07405418157577515, + "learning_rate": 2.696608463850303e-05, + "loss": 2.5837, + "step": 30218 + }, + { + "epoch": 0.8960946535005783, + "grad_norm": 0.07530760765075684, + "learning_rate": 2.695084369859274e-05, + "loss": 2.6055, + "step": 30219 + }, + { + "epoch": 0.8961243068528897, + "grad_norm": 0.0736699104309082, + "learning_rate": 2.6935606947613824e-05, + "loss": 2.5306, + "step": 30220 + }, + { + "epoch": 0.8961539602052012, + "grad_norm": 0.07523320615291595, + "learning_rate": 2.692037438570133e-05, + "loss": 2.5413, + "step": 30221 + }, + { + "epoch": 0.8961836135575126, + "grad_norm": 0.072579026222229, + "learning_rate": 2.6905146012990155e-05, + "loss": 2.5353, + "step": 30222 + }, + { + "epoch": 0.8962132669098242, + "grad_norm": 0.07884292304515839, + "learning_rate": 2.6889921829615082e-05, + "loss": 2.5771, + "step": 30223 + }, + { + "epoch": 0.8962429202621356, + "grad_norm": 0.07560275495052338, + "learning_rate": 2.6874701835711e-05, + "loss": 2.5642, + "step": 30224 + }, + { + "epoch": 0.8962725736144471, + "grad_norm": 0.08346167951822281, + "learning_rate": 2.6859486031412638e-05, + "loss": 2.5303, + "step": 30225 + }, + { + "epoch": 0.8963022269667585, + "grad_norm": 0.07843032479286194, + "learning_rate": 2.684427441685472e-05, + "loss": 2.5179, + "step": 30226 + }, + { + "epoch": 0.8963318803190701, + "grad_norm": 0.0756566971540451, + "learning_rate": 2.6829066992171857e-05, + "loss": 2.5582, + "step": 30227 + }, + { + "epoch": 0.8963615336713816, + "grad_norm": 0.07381647825241089, + "learning_rate": 2.6813863757498945e-05, + "loss": 2.5436, + "step": 30228 + }, + { + "epoch": 0.896391187023693, + "grad_norm": 0.08321903645992279, + "learning_rate": 2.6798664712970545e-05, + "loss": 2.5862, + "step": 30229 + }, + { + "epoch": 0.8964208403760046, + "grad_norm": 0.08241698890924454, + "learning_rate": 2.6783469858721155e-05, + "loss": 2.559, + "step": 30230 + }, + { + "epoch": 0.896450493728316, + "grad_norm": 0.07767452299594879, + "learning_rate": 2.6768279194885335e-05, + "loss": 2.5617, + "step": 30231 + }, + { + "epoch": 0.8964801470806275, + "grad_norm": 0.0750826969742775, + "learning_rate": 2.675309272159765e-05, + "loss": 2.5689, + "step": 30232 + }, + { + "epoch": 0.8965098004329389, + "grad_norm": 0.07713280618190765, + "learning_rate": 2.6737910438992542e-05, + "loss": 2.614, + "step": 30233 + }, + { + "epoch": 0.8965394537852505, + "grad_norm": 0.07552904635667801, + "learning_rate": 2.6722732347204516e-05, + "loss": 2.5464, + "step": 30234 + }, + { + "epoch": 0.8965691071375619, + "grad_norm": 0.07918272167444229, + "learning_rate": 2.6707558446367854e-05, + "loss": 2.5636, + "step": 30235 + }, + { + "epoch": 0.8965987604898734, + "grad_norm": 0.07695473730564117, + "learning_rate": 2.669238873661706e-05, + "loss": 2.6011, + "step": 30236 + }, + { + "epoch": 0.8966284138421848, + "grad_norm": 0.08087317645549774, + "learning_rate": 2.6677223218086412e-05, + "loss": 2.6058, + "step": 30237 + }, + { + "epoch": 0.8966580671944964, + "grad_norm": 0.07698296755552292, + "learning_rate": 2.6662061890910138e-05, + "loss": 2.557, + "step": 30238 + }, + { + "epoch": 0.8966877205468078, + "grad_norm": 0.07487457245588303, + "learning_rate": 2.664690475522258e-05, + "loss": 2.5415, + "step": 30239 + }, + { + "epoch": 0.8967173738991193, + "grad_norm": 0.07274258881807327, + "learning_rate": 2.6631751811157955e-05, + "loss": 2.5502, + "step": 30240 + }, + { + "epoch": 0.8967470272514307, + "grad_norm": 0.08389359712600708, + "learning_rate": 2.661660305885044e-05, + "loss": 2.5564, + "step": 30241 + }, + { + "epoch": 0.8967766806037423, + "grad_norm": 0.08101990073919296, + "learning_rate": 2.6601458498434096e-05, + "loss": 2.5678, + "step": 30242 + }, + { + "epoch": 0.8968063339560537, + "grad_norm": 0.0791015475988388, + "learning_rate": 2.658631813004314e-05, + "loss": 2.5774, + "step": 30243 + }, + { + "epoch": 0.8968359873083652, + "grad_norm": 0.07754168659448624, + "learning_rate": 2.6571181953811586e-05, + "loss": 2.6071, + "step": 30244 + }, + { + "epoch": 0.8968656406606766, + "grad_norm": 0.07616660743951797, + "learning_rate": 2.6556049969873486e-05, + "loss": 2.5748, + "step": 30245 + }, + { + "epoch": 0.8968952940129882, + "grad_norm": 0.07904677093029022, + "learning_rate": 2.6540922178362793e-05, + "loss": 2.5911, + "step": 30246 + }, + { + "epoch": 0.8969249473652996, + "grad_norm": 0.07914314419031143, + "learning_rate": 2.6525798579413508e-05, + "loss": 2.581, + "step": 30247 + }, + { + "epoch": 0.8969546007176111, + "grad_norm": 0.07446782290935516, + "learning_rate": 2.651067917315958e-05, + "loss": 2.5404, + "step": 30248 + }, + { + "epoch": 0.8969842540699227, + "grad_norm": 0.08252907544374466, + "learning_rate": 2.6495563959734848e-05, + "loss": 2.5684, + "step": 30249 + }, + { + "epoch": 0.8970139074222341, + "grad_norm": 0.08301275968551636, + "learning_rate": 2.6480452939273202e-05, + "loss": 2.5981, + "step": 30250 + }, + { + "epoch": 0.8970435607745456, + "grad_norm": 0.07906140387058258, + "learning_rate": 2.646534611190837e-05, + "loss": 2.5548, + "step": 30251 + }, + { + "epoch": 0.897073214126857, + "grad_norm": 0.08165033161640167, + "learning_rate": 2.6450243477774193e-05, + "loss": 2.5492, + "step": 30252 + }, + { + "epoch": 0.8971028674791686, + "grad_norm": 0.07619045674800873, + "learning_rate": 2.643514503700445e-05, + "loss": 2.5764, + "step": 30253 + }, + { + "epoch": 0.89713252083148, + "grad_norm": 0.0780256912112236, + "learning_rate": 2.6420050789732698e-05, + "loss": 2.5882, + "step": 30254 + }, + { + "epoch": 0.8971621741837915, + "grad_norm": 0.07734543085098267, + "learning_rate": 2.640496073609272e-05, + "loss": 2.5757, + "step": 30255 + }, + { + "epoch": 0.8971918275361029, + "grad_norm": 0.07880721986293793, + "learning_rate": 2.6389874876218133e-05, + "loss": 2.544, + "step": 30256 + }, + { + "epoch": 0.8972214808884145, + "grad_norm": 0.07670464366674423, + "learning_rate": 2.63747932102425e-05, + "loss": 2.5812, + "step": 30257 + }, + { + "epoch": 0.8972511342407259, + "grad_norm": 0.07783503830432892, + "learning_rate": 2.6359715738299316e-05, + "loss": 2.5509, + "step": 30258 + }, + { + "epoch": 0.8972807875930374, + "grad_norm": 0.0785563513636589, + "learning_rate": 2.6344642460522205e-05, + "loss": 2.5489, + "step": 30259 + }, + { + "epoch": 0.8973104409453488, + "grad_norm": 0.07632192224264145, + "learning_rate": 2.632957337704456e-05, + "loss": 2.5568, + "step": 30260 + }, + { + "epoch": 0.8973400942976604, + "grad_norm": 0.07412847131490707, + "learning_rate": 2.6314508487999823e-05, + "loss": 2.5891, + "step": 30261 + }, + { + "epoch": 0.8973697476499718, + "grad_norm": 0.07260569930076599, + "learning_rate": 2.6299447793521447e-05, + "loss": 2.5202, + "step": 30262 + }, + { + "epoch": 0.8973994010022833, + "grad_norm": 0.07752621173858643, + "learning_rate": 2.628439129374277e-05, + "loss": 2.58, + "step": 30263 + }, + { + "epoch": 0.8974290543545947, + "grad_norm": 0.07618530839681625, + "learning_rate": 2.6269338988797186e-05, + "loss": 2.5478, + "step": 30264 + }, + { + "epoch": 0.8974587077069063, + "grad_norm": 0.07510815560817719, + "learning_rate": 2.6254290878817865e-05, + "loss": 2.5802, + "step": 30265 + }, + { + "epoch": 0.8974883610592177, + "grad_norm": 0.0745285302400589, + "learning_rate": 2.6239246963938036e-05, + "loss": 2.5455, + "step": 30266 + }, + { + "epoch": 0.8975180144115292, + "grad_norm": 0.07450976222753525, + "learning_rate": 2.6224207244291086e-05, + "loss": 2.5782, + "step": 30267 + }, + { + "epoch": 0.8975476677638407, + "grad_norm": 0.0736801028251648, + "learning_rate": 2.6209171720010084e-05, + "loss": 2.5414, + "step": 30268 + }, + { + "epoch": 0.8975773211161522, + "grad_norm": 0.07806869596242905, + "learning_rate": 2.6194140391228194e-05, + "loss": 2.5691, + "step": 30269 + }, + { + "epoch": 0.8976069744684637, + "grad_norm": 0.07556895911693573, + "learning_rate": 2.617911325807848e-05, + "loss": 2.5409, + "step": 30270 + }, + { + "epoch": 0.8976366278207751, + "grad_norm": 0.07514394074678421, + "learning_rate": 2.6164090320694113e-05, + "loss": 2.5851, + "step": 30271 + }, + { + "epoch": 0.8976662811730867, + "grad_norm": 0.07354360818862915, + "learning_rate": 2.6149071579207984e-05, + "loss": 2.5531, + "step": 30272 + }, + { + "epoch": 0.8976959345253981, + "grad_norm": 0.0754735916852951, + "learning_rate": 2.6134057033753213e-05, + "loss": 2.5504, + "step": 30273 + }, + { + "epoch": 0.8977255878777096, + "grad_norm": 0.08303289860486984, + "learning_rate": 2.61190466844628e-05, + "loss": 2.5789, + "step": 30274 + }, + { + "epoch": 0.897755241230021, + "grad_norm": 0.07633538544178009, + "learning_rate": 2.6104040531469477e-05, + "loss": 2.576, + "step": 30275 + }, + { + "epoch": 0.8977848945823326, + "grad_norm": 0.0740451067686081, + "learning_rate": 2.608903857490619e-05, + "loss": 2.5709, + "step": 30276 + }, + { + "epoch": 0.897814547934644, + "grad_norm": 0.08196365833282471, + "learning_rate": 2.6074040814905832e-05, + "loss": 2.5222, + "step": 30277 + }, + { + "epoch": 0.8978442012869555, + "grad_norm": 0.08247308433055878, + "learning_rate": 2.6059047251601187e-05, + "loss": 2.5887, + "step": 30278 + }, + { + "epoch": 0.8978738546392669, + "grad_norm": 0.07248659431934357, + "learning_rate": 2.6044057885124926e-05, + "loss": 2.5955, + "step": 30279 + }, + { + "epoch": 0.8979035079915785, + "grad_norm": 0.07611466199159622, + "learning_rate": 2.6029072715610003e-05, + "loss": 2.5886, + "step": 30280 + }, + { + "epoch": 0.8979331613438899, + "grad_norm": 0.08047651499509811, + "learning_rate": 2.6014091743189026e-05, + "loss": 2.5615, + "step": 30281 + }, + { + "epoch": 0.8979628146962014, + "grad_norm": 0.07563245296478271, + "learning_rate": 2.5999114967994563e-05, + "loss": 2.5754, + "step": 30282 + }, + { + "epoch": 0.8979924680485128, + "grad_norm": 0.07976590096950531, + "learning_rate": 2.5984142390159336e-05, + "loss": 2.5621, + "step": 30283 + }, + { + "epoch": 0.8980221214008244, + "grad_norm": 0.07240626215934753, + "learning_rate": 2.596917400981591e-05, + "loss": 2.5358, + "step": 30284 + }, + { + "epoch": 0.8980517747531358, + "grad_norm": 0.07376360893249512, + "learning_rate": 2.5954209827096788e-05, + "loss": 2.5897, + "step": 30285 + }, + { + "epoch": 0.8980814281054473, + "grad_norm": 0.0790523812174797, + "learning_rate": 2.5939249842134527e-05, + "loss": 2.5452, + "step": 30286 + }, + { + "epoch": 0.8981110814577588, + "grad_norm": 0.07793106883764267, + "learning_rate": 2.5924294055061525e-05, + "loss": 2.5779, + "step": 30287 + }, + { + "epoch": 0.8981407348100703, + "grad_norm": 0.07340060919523239, + "learning_rate": 2.5909342466010288e-05, + "loss": 2.5829, + "step": 30288 + }, + { + "epoch": 0.8981703881623817, + "grad_norm": 0.07892558723688126, + "learning_rate": 2.5894395075113263e-05, + "loss": 2.604, + "step": 30289 + }, + { + "epoch": 0.8982000415146932, + "grad_norm": 0.07350338250398636, + "learning_rate": 2.587945188250268e-05, + "loss": 2.5917, + "step": 30290 + }, + { + "epoch": 0.8982296948670048, + "grad_norm": 0.07891817390918732, + "learning_rate": 2.5864512888310932e-05, + "loss": 2.5434, + "step": 30291 + }, + { + "epoch": 0.8982593482193162, + "grad_norm": 0.07802422344684601, + "learning_rate": 2.5849578092670247e-05, + "loss": 2.5666, + "step": 30292 + }, + { + "epoch": 0.8982890015716277, + "grad_norm": 0.07888558506965637, + "learning_rate": 2.5834647495713015e-05, + "loss": 2.5898, + "step": 30293 + }, + { + "epoch": 0.8983186549239391, + "grad_norm": 0.07695778459310532, + "learning_rate": 2.581972109757136e-05, + "loss": 2.5521, + "step": 30294 + }, + { + "epoch": 0.8983483082762507, + "grad_norm": 0.08148019015789032, + "learning_rate": 2.580479889837756e-05, + "loss": 2.5606, + "step": 30295 + }, + { + "epoch": 0.8983779616285621, + "grad_norm": 0.07113690674304962, + "learning_rate": 2.5789880898263564e-05, + "loss": 2.5628, + "step": 30296 + }, + { + "epoch": 0.8984076149808736, + "grad_norm": 0.07546529173851013, + "learning_rate": 2.5774967097361602e-05, + "loss": 2.5647, + "step": 30297 + }, + { + "epoch": 0.898437268333185, + "grad_norm": 0.08146696537733078, + "learning_rate": 2.5760057495803678e-05, + "loss": 2.5634, + "step": 30298 + }, + { + "epoch": 0.8984669216854966, + "grad_norm": 0.08002298325300217, + "learning_rate": 2.574515209372186e-05, + "loss": 2.5919, + "step": 30299 + }, + { + "epoch": 0.898496575037808, + "grad_norm": 0.0759071484208107, + "learning_rate": 2.573025089124814e-05, + "loss": 2.5773, + "step": 30300 + }, + { + "epoch": 0.8985262283901195, + "grad_norm": 0.07742859423160553, + "learning_rate": 2.5715353888514427e-05, + "loss": 2.5977, + "step": 30301 + }, + { + "epoch": 0.898555881742431, + "grad_norm": 0.08607657253742218, + "learning_rate": 2.570046108565266e-05, + "loss": 2.5596, + "step": 30302 + }, + { + "epoch": 0.8985855350947425, + "grad_norm": 0.07805678248405457, + "learning_rate": 2.568557248279474e-05, + "loss": 2.5439, + "step": 30303 + }, + { + "epoch": 0.8986151884470539, + "grad_norm": 0.0738312378525734, + "learning_rate": 2.5670688080072503e-05, + "loss": 2.5631, + "step": 30304 + }, + { + "epoch": 0.8986448417993654, + "grad_norm": 0.07677682489156723, + "learning_rate": 2.5655807877617676e-05, + "loss": 2.5567, + "step": 30305 + }, + { + "epoch": 0.8986744951516769, + "grad_norm": 0.07761579006910324, + "learning_rate": 2.5640931875562157e-05, + "loss": 2.5744, + "step": 30306 + }, + { + "epoch": 0.8987041485039884, + "grad_norm": 0.08360078930854797, + "learning_rate": 2.562606007403756e-05, + "loss": 2.5235, + "step": 30307 + }, + { + "epoch": 0.8987338018562998, + "grad_norm": 0.07190172374248505, + "learning_rate": 2.5611192473175672e-05, + "loss": 2.5765, + "step": 30308 + }, + { + "epoch": 0.8987634552086113, + "grad_norm": 0.09554867446422577, + "learning_rate": 2.5596329073108105e-05, + "loss": 2.5308, + "step": 30309 + }, + { + "epoch": 0.8987931085609228, + "grad_norm": 0.07886000722646713, + "learning_rate": 2.5581469873966424e-05, + "loss": 2.5935, + "step": 30310 + }, + { + "epoch": 0.8988227619132343, + "grad_norm": 0.07359010726213455, + "learning_rate": 2.55666148758823e-05, + "loss": 2.5382, + "step": 30311 + }, + { + "epoch": 0.8988524152655458, + "grad_norm": 0.08057156950235367, + "learning_rate": 2.5551764078987238e-05, + "loss": 2.5851, + "step": 30312 + }, + { + "epoch": 0.8988820686178572, + "grad_norm": 0.07904011756181717, + "learning_rate": 2.5536917483412748e-05, + "loss": 2.5601, + "step": 30313 + }, + { + "epoch": 0.8989117219701688, + "grad_norm": 0.07526329904794693, + "learning_rate": 2.5522075089290275e-05, + "loss": 2.5704, + "step": 30314 + }, + { + "epoch": 0.8989413753224802, + "grad_norm": 0.0743308886885643, + "learning_rate": 2.5507236896751275e-05, + "loss": 2.5723, + "step": 30315 + }, + { + "epoch": 0.8989710286747917, + "grad_norm": 0.07440663874149323, + "learning_rate": 2.5492402905927137e-05, + "loss": 2.5994, + "step": 30316 + }, + { + "epoch": 0.8990006820271031, + "grad_norm": 0.07616136968135834, + "learning_rate": 2.5477573116949203e-05, + "loss": 2.5593, + "step": 30317 + }, + { + "epoch": 0.8990303353794147, + "grad_norm": 0.07693564891815186, + "learning_rate": 2.5462747529948814e-05, + "loss": 2.5676, + "step": 30318 + }, + { + "epoch": 0.8990599887317261, + "grad_norm": 0.07796476781368256, + "learning_rate": 2.544792614505731e-05, + "loss": 2.57, + "step": 30319 + }, + { + "epoch": 0.8990896420840376, + "grad_norm": 0.07280125468969345, + "learning_rate": 2.5433108962405805e-05, + "loss": 2.5697, + "step": 30320 + }, + { + "epoch": 0.899119295436349, + "grad_norm": 0.0723600760102272, + "learning_rate": 2.5418295982125585e-05, + "loss": 2.5738, + "step": 30321 + }, + { + "epoch": 0.8991489487886606, + "grad_norm": 0.07449717819690704, + "learning_rate": 2.540348720434782e-05, + "loss": 2.5622, + "step": 30322 + }, + { + "epoch": 0.899178602140972, + "grad_norm": 0.0784822553396225, + "learning_rate": 2.5388682629203687e-05, + "loss": 2.5651, + "step": 30323 + }, + { + "epoch": 0.8992082554932835, + "grad_norm": 0.07740321010351181, + "learning_rate": 2.5373882256824186e-05, + "loss": 2.5648, + "step": 30324 + }, + { + "epoch": 0.899237908845595, + "grad_norm": 0.07324381917715073, + "learning_rate": 2.5359086087340445e-05, + "loss": 2.5965, + "step": 30325 + }, + { + "epoch": 0.8992675621979065, + "grad_norm": 0.07162994146347046, + "learning_rate": 2.534429412088346e-05, + "loss": 2.5987, + "step": 30326 + }, + { + "epoch": 0.8992972155502179, + "grad_norm": 0.07068468630313873, + "learning_rate": 2.532950635758424e-05, + "loss": 2.5768, + "step": 30327 + }, + { + "epoch": 0.8993268689025294, + "grad_norm": 0.07939743995666504, + "learning_rate": 2.5314722797573687e-05, + "loss": 2.5731, + "step": 30328 + }, + { + "epoch": 0.8993565222548409, + "grad_norm": 0.07347430288791656, + "learning_rate": 2.5299943440982797e-05, + "loss": 2.5717, + "step": 30329 + }, + { + "epoch": 0.8993861756071524, + "grad_norm": 0.07446053624153137, + "learning_rate": 2.5285168287942307e-05, + "loss": 2.5639, + "step": 30330 + }, + { + "epoch": 0.8994158289594638, + "grad_norm": 0.08356712758541107, + "learning_rate": 2.527039733858316e-05, + "loss": 2.5752, + "step": 30331 + }, + { + "epoch": 0.8994454823117753, + "grad_norm": 0.07934848219156265, + "learning_rate": 2.525563059303615e-05, + "loss": 2.5716, + "step": 30332 + }, + { + "epoch": 0.8994751356640869, + "grad_norm": 0.07614444941282272, + "learning_rate": 2.5240868051432054e-05, + "loss": 2.5734, + "step": 30333 + }, + { + "epoch": 0.8995047890163983, + "grad_norm": 0.07015243172645569, + "learning_rate": 2.522610971390149e-05, + "loss": 2.5784, + "step": 30334 + }, + { + "epoch": 0.8995344423687098, + "grad_norm": 0.07282188534736633, + "learning_rate": 2.5211355580575302e-05, + "loss": 2.5404, + "step": 30335 + }, + { + "epoch": 0.8995640957210213, + "grad_norm": 0.07594040036201477, + "learning_rate": 2.5196605651583993e-05, + "loss": 2.5768, + "step": 30336 + }, + { + "epoch": 0.8995937490733328, + "grad_norm": 0.0738387480378151, + "learning_rate": 2.5181859927058236e-05, + "loss": 2.5788, + "step": 30337 + }, + { + "epoch": 0.8996234024256442, + "grad_norm": 0.07559286057949066, + "learning_rate": 2.5167118407128654e-05, + "loss": 2.5941, + "step": 30338 + }, + { + "epoch": 0.8996530557779557, + "grad_norm": 0.07388146221637726, + "learning_rate": 2.5152381091925692e-05, + "loss": 2.5485, + "step": 30339 + }, + { + "epoch": 0.8996827091302672, + "grad_norm": 0.07065536826848984, + "learning_rate": 2.5137647981580024e-05, + "loss": 2.5412, + "step": 30340 + }, + { + "epoch": 0.8997123624825787, + "grad_norm": 0.06966810673475266, + "learning_rate": 2.5122919076221884e-05, + "loss": 2.5303, + "step": 30341 + }, + { + "epoch": 0.8997420158348901, + "grad_norm": 0.0728895515203476, + "learning_rate": 2.510819437598183e-05, + "loss": 2.5777, + "step": 30342 + }, + { + "epoch": 0.8997716691872016, + "grad_norm": 0.07229018211364746, + "learning_rate": 2.5093473880990148e-05, + "loss": 2.5524, + "step": 30343 + }, + { + "epoch": 0.8998013225395131, + "grad_norm": 0.07242868095636368, + "learning_rate": 2.5078757591377343e-05, + "loss": 2.5489, + "step": 30344 + }, + { + "epoch": 0.8998309758918246, + "grad_norm": 0.07439283281564713, + "learning_rate": 2.5064045507273703e-05, + "loss": 2.5516, + "step": 30345 + }, + { + "epoch": 0.899860629244136, + "grad_norm": 0.07428327947854996, + "learning_rate": 2.5049337628809398e-05, + "loss": 2.5824, + "step": 30346 + }, + { + "epoch": 0.8998902825964475, + "grad_norm": 0.07130023837089539, + "learning_rate": 2.503463395611477e-05, + "loss": 2.5805, + "step": 30347 + }, + { + "epoch": 0.899919935948759, + "grad_norm": 0.0723395124077797, + "learning_rate": 2.501993448931994e-05, + "loss": 2.6095, + "step": 30348 + }, + { + "epoch": 0.8999495893010705, + "grad_norm": 0.07375239580869675, + "learning_rate": 2.5005239228555133e-05, + "loss": 2.5944, + "step": 30349 + }, + { + "epoch": 0.8999792426533819, + "grad_norm": 0.07240903377532959, + "learning_rate": 2.4990548173950578e-05, + "loss": 2.5504, + "step": 30350 + }, + { + "epoch": 0.9000088960056934, + "grad_norm": 0.07381266355514526, + "learning_rate": 2.4975861325636174e-05, + "loss": 2.5778, + "step": 30351 + }, + { + "epoch": 0.9000385493580049, + "grad_norm": 0.07299502193927765, + "learning_rate": 2.4961178683742035e-05, + "loss": 2.54, + "step": 30352 + }, + { + "epoch": 0.9000682027103164, + "grad_norm": 0.07220815867185593, + "learning_rate": 2.4946500248398174e-05, + "loss": 2.5537, + "step": 30353 + }, + { + "epoch": 0.9000978560626279, + "grad_norm": 0.07126958668231964, + "learning_rate": 2.493182601973465e-05, + "loss": 2.568, + "step": 30354 + }, + { + "epoch": 0.9001275094149394, + "grad_norm": 0.07241392135620117, + "learning_rate": 2.4917155997881302e-05, + "loss": 2.5759, + "step": 30355 + }, + { + "epoch": 0.9001571627672509, + "grad_norm": 0.07210587710142136, + "learning_rate": 2.490249018296803e-05, + "loss": 2.537, + "step": 30356 + }, + { + "epoch": 0.9001868161195623, + "grad_norm": 0.07382148504257202, + "learning_rate": 2.4887828575124837e-05, + "loss": 2.5592, + "step": 30357 + }, + { + "epoch": 0.9002164694718738, + "grad_norm": 0.07660490274429321, + "learning_rate": 2.4873171174481457e-05, + "loss": 2.5702, + "step": 30358 + }, + { + "epoch": 0.9002461228241853, + "grad_norm": 0.07568731904029846, + "learning_rate": 2.485851798116773e-05, + "loss": 2.5942, + "step": 30359 + }, + { + "epoch": 0.9002757761764968, + "grad_norm": 0.07499508559703827, + "learning_rate": 2.4843868995313322e-05, + "loss": 2.5899, + "step": 30360 + }, + { + "epoch": 0.9003054295288082, + "grad_norm": 0.07367749512195587, + "learning_rate": 2.4829224217048142e-05, + "loss": 2.5574, + "step": 30361 + }, + { + "epoch": 0.9003350828811197, + "grad_norm": 0.07366128265857697, + "learning_rate": 2.4814583646501686e-05, + "loss": 2.5538, + "step": 30362 + }, + { + "epoch": 0.9003647362334312, + "grad_norm": 0.07493369281291962, + "learning_rate": 2.4799947283803635e-05, + "loss": 2.5656, + "step": 30363 + }, + { + "epoch": 0.9003943895857427, + "grad_norm": 0.07544522732496262, + "learning_rate": 2.478531512908361e-05, + "loss": 2.5527, + "step": 30364 + }, + { + "epoch": 0.9004240429380541, + "grad_norm": 0.07894135266542435, + "learning_rate": 2.4770687182471162e-05, + "loss": 2.573, + "step": 30365 + }, + { + "epoch": 0.9004536962903656, + "grad_norm": 0.0779532864689827, + "learning_rate": 2.4756063444095868e-05, + "loss": 2.5712, + "step": 30366 + }, + { + "epoch": 0.9004833496426771, + "grad_norm": 0.07793867588043213, + "learning_rate": 2.4741443914087224e-05, + "loss": 2.591, + "step": 30367 + }, + { + "epoch": 0.9005130029949886, + "grad_norm": 0.07573826611042023, + "learning_rate": 2.472682859257469e-05, + "loss": 2.568, + "step": 30368 + }, + { + "epoch": 0.9005426563473, + "grad_norm": 0.07833313941955566, + "learning_rate": 2.471221747968755e-05, + "loss": 2.5864, + "step": 30369 + }, + { + "epoch": 0.9005723096996116, + "grad_norm": 0.08578681945800781, + "learning_rate": 2.4697610575555418e-05, + "loss": 2.5807, + "step": 30370 + }, + { + "epoch": 0.900601963051923, + "grad_norm": 0.07721926271915436, + "learning_rate": 2.4683007880307583e-05, + "loss": 2.5655, + "step": 30371 + }, + { + "epoch": 0.9006316164042345, + "grad_norm": 0.07257939875125885, + "learning_rate": 2.4668409394073223e-05, + "loss": 2.5631, + "step": 30372 + }, + { + "epoch": 0.9006612697565459, + "grad_norm": 0.07409888505935669, + "learning_rate": 2.465381511698167e-05, + "loss": 2.5542, + "step": 30373 + }, + { + "epoch": 0.9006909231088575, + "grad_norm": 0.07521755248308182, + "learning_rate": 2.4639225049162217e-05, + "loss": 2.601, + "step": 30374 + }, + { + "epoch": 0.900720576461169, + "grad_norm": 0.07885001599788666, + "learning_rate": 2.462463919074398e-05, + "loss": 2.5876, + "step": 30375 + }, + { + "epoch": 0.9007502298134804, + "grad_norm": 0.070746511220932, + "learning_rate": 2.461005754185619e-05, + "loss": 2.5617, + "step": 30376 + }, + { + "epoch": 0.9007798831657919, + "grad_norm": 0.08190660178661346, + "learning_rate": 2.4595480102627966e-05, + "loss": 2.5606, + "step": 30377 + }, + { + "epoch": 0.9008095365181034, + "grad_norm": 0.07681542634963989, + "learning_rate": 2.4580906873188312e-05, + "loss": 2.585, + "step": 30378 + }, + { + "epoch": 0.9008391898704149, + "grad_norm": 0.07699950039386749, + "learning_rate": 2.4566337853666354e-05, + "loss": 2.5693, + "step": 30379 + }, + { + "epoch": 0.9008688432227263, + "grad_norm": 0.0757179856300354, + "learning_rate": 2.4551773044191095e-05, + "loss": 2.562, + "step": 30380 + }, + { + "epoch": 0.9008984965750378, + "grad_norm": 0.08048447966575623, + "learning_rate": 2.4537212444891488e-05, + "loss": 2.5282, + "step": 30381 + }, + { + "epoch": 0.9009281499273493, + "grad_norm": 0.07399271428585052, + "learning_rate": 2.452265605589643e-05, + "loss": 2.5569, + "step": 30382 + }, + { + "epoch": 0.9009578032796608, + "grad_norm": 0.07802722603082657, + "learning_rate": 2.450810387733493e-05, + "loss": 2.551, + "step": 30383 + }, + { + "epoch": 0.9009874566319722, + "grad_norm": 0.07404922693967819, + "learning_rate": 2.4493555909335774e-05, + "loss": 2.5971, + "step": 30384 + }, + { + "epoch": 0.9010171099842837, + "grad_norm": 0.07703981548547745, + "learning_rate": 2.44790121520278e-05, + "loss": 2.5742, + "step": 30385 + }, + { + "epoch": 0.9010467633365952, + "grad_norm": 0.07618043571710587, + "learning_rate": 2.4464472605539744e-05, + "loss": 2.5699, + "step": 30386 + }, + { + "epoch": 0.9010764166889067, + "grad_norm": 0.07323319464921951, + "learning_rate": 2.44499372700005e-05, + "loss": 2.5572, + "step": 30387 + }, + { + "epoch": 0.9011060700412181, + "grad_norm": 0.07703125476837158, + "learning_rate": 2.443540614553863e-05, + "loss": 2.5572, + "step": 30388 + }, + { + "epoch": 0.9011357233935297, + "grad_norm": 0.07424197345972061, + "learning_rate": 2.4420879232282866e-05, + "loss": 2.5579, + "step": 30389 + }, + { + "epoch": 0.9011653767458411, + "grad_norm": 0.07539702951908112, + "learning_rate": 2.4406356530361884e-05, + "loss": 2.5777, + "step": 30390 + }, + { + "epoch": 0.9011950300981526, + "grad_norm": 0.07114074379205704, + "learning_rate": 2.439183803990419e-05, + "loss": 2.5865, + "step": 30391 + }, + { + "epoch": 0.901224683450464, + "grad_norm": 0.07168971002101898, + "learning_rate": 2.437732376103846e-05, + "loss": 2.6008, + "step": 30392 + }, + { + "epoch": 0.9012543368027756, + "grad_norm": 0.0771336555480957, + "learning_rate": 2.436281369389315e-05, + "loss": 2.5755, + "step": 30393 + }, + { + "epoch": 0.901283990155087, + "grad_norm": 0.07306607067584991, + "learning_rate": 2.434830783859676e-05, + "loss": 2.5548, + "step": 30394 + }, + { + "epoch": 0.9013136435073985, + "grad_norm": 0.07169364392757416, + "learning_rate": 2.433380619527775e-05, + "loss": 2.5599, + "step": 30395 + }, + { + "epoch": 0.90134329685971, + "grad_norm": 0.07856462150812149, + "learning_rate": 2.4319308764064574e-05, + "loss": 2.5311, + "step": 30396 + }, + { + "epoch": 0.9013729502120215, + "grad_norm": 0.07942403107881546, + "learning_rate": 2.430481554508551e-05, + "loss": 2.5706, + "step": 30397 + }, + { + "epoch": 0.901402603564333, + "grad_norm": 0.07830523699522018, + "learning_rate": 2.4290326538468966e-05, + "loss": 2.5774, + "step": 30398 + }, + { + "epoch": 0.9014322569166444, + "grad_norm": 0.07161811739206314, + "learning_rate": 2.4275841744343275e-05, + "loss": 2.5569, + "step": 30399 + }, + { + "epoch": 0.901461910268956, + "grad_norm": 0.07726255059242249, + "learning_rate": 2.4261361162836613e-05, + "loss": 2.587, + "step": 30400 + }, + { + "epoch": 0.9014915636212674, + "grad_norm": 0.0736946165561676, + "learning_rate": 2.4246884794077274e-05, + "loss": 2.5617, + "step": 30401 + }, + { + "epoch": 0.9015212169735789, + "grad_norm": 0.07764124125242233, + "learning_rate": 2.423241263819348e-05, + "loss": 2.5757, + "step": 30402 + }, + { + "epoch": 0.9015508703258903, + "grad_norm": 0.07241436839103699, + "learning_rate": 2.4217944695313244e-05, + "loss": 2.5476, + "step": 30403 + }, + { + "epoch": 0.9015805236782019, + "grad_norm": 0.0779481753706932, + "learning_rate": 2.420348096556485e-05, + "loss": 2.598, + "step": 30404 + }, + { + "epoch": 0.9016101770305133, + "grad_norm": 0.07469968497753143, + "learning_rate": 2.4189021449076366e-05, + "loss": 2.5739, + "step": 30405 + }, + { + "epoch": 0.9016398303828248, + "grad_norm": 0.07733090966939926, + "learning_rate": 2.4174566145975684e-05, + "loss": 2.5619, + "step": 30406 + }, + { + "epoch": 0.9016694837351362, + "grad_norm": 0.07311666756868362, + "learning_rate": 2.4160115056390873e-05, + "loss": 2.5134, + "step": 30407 + }, + { + "epoch": 0.9016991370874478, + "grad_norm": 0.07637450844049454, + "learning_rate": 2.4145668180449887e-05, + "loss": 2.5286, + "step": 30408 + }, + { + "epoch": 0.9017287904397592, + "grad_norm": 0.07235852628946304, + "learning_rate": 2.413122551828073e-05, + "loss": 2.5637, + "step": 30409 + }, + { + "epoch": 0.9017584437920707, + "grad_norm": 0.07215031236410141, + "learning_rate": 2.4116787070011248e-05, + "loss": 2.5865, + "step": 30410 + }, + { + "epoch": 0.9017880971443821, + "grad_norm": 0.07952994108200073, + "learning_rate": 2.4102352835769337e-05, + "loss": 2.5255, + "step": 30411 + }, + { + "epoch": 0.9018177504966937, + "grad_norm": 0.07336407154798508, + "learning_rate": 2.4087922815682727e-05, + "loss": 2.5565, + "step": 30412 + }, + { + "epoch": 0.9018474038490051, + "grad_norm": 0.08376690000295639, + "learning_rate": 2.4073497009879265e-05, + "loss": 2.5785, + "step": 30413 + }, + { + "epoch": 0.9018770572013166, + "grad_norm": 0.07689805328845978, + "learning_rate": 2.405907541848673e-05, + "loss": 2.578, + "step": 30414 + }, + { + "epoch": 0.901906710553628, + "grad_norm": 0.07470058649778366, + "learning_rate": 2.4044658041632695e-05, + "loss": 2.565, + "step": 30415 + }, + { + "epoch": 0.9019363639059396, + "grad_norm": 0.07277233153581619, + "learning_rate": 2.4030244879445052e-05, + "loss": 2.5554, + "step": 30416 + }, + { + "epoch": 0.9019660172582511, + "grad_norm": 0.07427512854337692, + "learning_rate": 2.40158359320512e-05, + "loss": 2.5803, + "step": 30417 + }, + { + "epoch": 0.9019956706105625, + "grad_norm": 0.078964002430439, + "learning_rate": 2.4001431199578816e-05, + "loss": 2.5778, + "step": 30418 + }, + { + "epoch": 0.902025323962874, + "grad_norm": 0.08042360097169876, + "learning_rate": 2.3987030682155465e-05, + "loss": 2.54, + "step": 30419 + }, + { + "epoch": 0.9020549773151855, + "grad_norm": 0.0723670944571495, + "learning_rate": 2.3972634379908654e-05, + "loss": 2.5579, + "step": 30420 + }, + { + "epoch": 0.902084630667497, + "grad_norm": 0.07836519181728363, + "learning_rate": 2.395824229296578e-05, + "loss": 2.5357, + "step": 30421 + }, + { + "epoch": 0.9021142840198084, + "grad_norm": 0.0773969367146492, + "learning_rate": 2.394385442145447e-05, + "loss": 2.5953, + "step": 30422 + }, + { + "epoch": 0.90214393737212, + "grad_norm": 0.07689537107944489, + "learning_rate": 2.392947076550206e-05, + "loss": 2.5344, + "step": 30423 + }, + { + "epoch": 0.9021735907244314, + "grad_norm": 0.08273264765739441, + "learning_rate": 2.3915091325235894e-05, + "loss": 2.5446, + "step": 30424 + }, + { + "epoch": 0.9022032440767429, + "grad_norm": 0.0754769816994667, + "learning_rate": 2.3900716100783314e-05, + "loss": 2.5097, + "step": 30425 + }, + { + "epoch": 0.9022328974290543, + "grad_norm": 0.07625705748796463, + "learning_rate": 2.3886345092271722e-05, + "loss": 2.5643, + "step": 30426 + }, + { + "epoch": 0.9022625507813659, + "grad_norm": 0.08072497695684433, + "learning_rate": 2.3871978299828123e-05, + "loss": 2.5599, + "step": 30427 + }, + { + "epoch": 0.9022922041336773, + "grad_norm": 0.07234097272157669, + "learning_rate": 2.385761572357992e-05, + "loss": 2.568, + "step": 30428 + }, + { + "epoch": 0.9023218574859888, + "grad_norm": 0.0799955278635025, + "learning_rate": 2.3843257363654282e-05, + "loss": 2.5742, + "step": 30429 + }, + { + "epoch": 0.9023515108383002, + "grad_norm": 0.07480164617300034, + "learning_rate": 2.3828903220178334e-05, + "loss": 2.575, + "step": 30430 + }, + { + "epoch": 0.9023811641906118, + "grad_norm": 0.07826276123523712, + "learning_rate": 2.381455329327914e-05, + "loss": 2.5759, + "step": 30431 + }, + { + "epoch": 0.9024108175429232, + "grad_norm": 0.07212710380554199, + "learning_rate": 2.3800207583083822e-05, + "loss": 2.591, + "step": 30432 + }, + { + "epoch": 0.9024404708952347, + "grad_norm": 0.07885774970054626, + "learning_rate": 2.3785866089719443e-05, + "loss": 2.5631, + "step": 30433 + }, + { + "epoch": 0.9024701242475461, + "grad_norm": 0.08028415590524673, + "learning_rate": 2.3771528813312848e-05, + "loss": 2.5887, + "step": 30434 + }, + { + "epoch": 0.9024997775998577, + "grad_norm": 0.076873779296875, + "learning_rate": 2.3757195753991155e-05, + "loss": 2.5316, + "step": 30435 + }, + { + "epoch": 0.9025294309521692, + "grad_norm": 0.07840200513601303, + "learning_rate": 2.3742866911881322e-05, + "loss": 2.5755, + "step": 30436 + }, + { + "epoch": 0.9025590843044806, + "grad_norm": 0.07739999145269394, + "learning_rate": 2.3728542287110132e-05, + "loss": 2.5901, + "step": 30437 + }, + { + "epoch": 0.9025887376567922, + "grad_norm": 0.0721811056137085, + "learning_rate": 2.3714221879804433e-05, + "loss": 2.5742, + "step": 30438 + }, + { + "epoch": 0.9026183910091036, + "grad_norm": 0.07519584894180298, + "learning_rate": 2.369990569009106e-05, + "loss": 2.6038, + "step": 30439 + }, + { + "epoch": 0.9026480443614151, + "grad_norm": 0.07847204059362411, + "learning_rate": 2.3685593718096755e-05, + "loss": 2.5974, + "step": 30440 + }, + { + "epoch": 0.9026776977137265, + "grad_norm": 0.07537230849266052, + "learning_rate": 2.3671285963948296e-05, + "loss": 2.5629, + "step": 30441 + }, + { + "epoch": 0.902707351066038, + "grad_norm": 0.0781882032752037, + "learning_rate": 2.3656982427772365e-05, + "loss": 2.5651, + "step": 30442 + }, + { + "epoch": 0.9027370044183495, + "grad_norm": 0.07996896654367447, + "learning_rate": 2.3642683109695585e-05, + "loss": 2.5758, + "step": 30443 + }, + { + "epoch": 0.902766657770661, + "grad_norm": 0.06977120041847229, + "learning_rate": 2.3628388009844626e-05, + "loss": 2.5464, + "step": 30444 + }, + { + "epoch": 0.9027963111229724, + "grad_norm": 0.07062000781297684, + "learning_rate": 2.3614097128346057e-05, + "loss": 2.5969, + "step": 30445 + }, + { + "epoch": 0.902825964475284, + "grad_norm": 0.07294756174087524, + "learning_rate": 2.3599810465326445e-05, + "loss": 2.5802, + "step": 30446 + }, + { + "epoch": 0.9028556178275954, + "grad_norm": 0.07391710579395294, + "learning_rate": 2.358552802091224e-05, + "loss": 2.5611, + "step": 30447 + }, + { + "epoch": 0.9028852711799069, + "grad_norm": 0.07825431227684021, + "learning_rate": 2.357124979523001e-05, + "loss": 2.5954, + "step": 30448 + }, + { + "epoch": 0.9029149245322183, + "grad_norm": 0.07437688857316971, + "learning_rate": 2.3556975788406097e-05, + "loss": 2.5252, + "step": 30449 + }, + { + "epoch": 0.9029445778845299, + "grad_norm": 0.07765952497720718, + "learning_rate": 2.354270600056696e-05, + "loss": 2.6002, + "step": 30450 + }, + { + "epoch": 0.9029742312368413, + "grad_norm": 0.08011570572853088, + "learning_rate": 2.3528440431838992e-05, + "loss": 2.5647, + "step": 30451 + }, + { + "epoch": 0.9030038845891528, + "grad_norm": 0.0769728496670723, + "learning_rate": 2.351417908234843e-05, + "loss": 2.5482, + "step": 30452 + }, + { + "epoch": 0.9030335379414642, + "grad_norm": 0.07741337269544601, + "learning_rate": 2.3499921952221614e-05, + "loss": 2.5672, + "step": 30453 + }, + { + "epoch": 0.9030631912937758, + "grad_norm": 0.0765516608953476, + "learning_rate": 2.3485669041584723e-05, + "loss": 2.5841, + "step": 30454 + }, + { + "epoch": 0.9030928446460872, + "grad_norm": 0.07720834761857986, + "learning_rate": 2.34714203505641e-05, + "loss": 2.543, + "step": 30455 + }, + { + "epoch": 0.9031224979983987, + "grad_norm": 0.07410524785518646, + "learning_rate": 2.345717587928581e-05, + "loss": 2.5824, + "step": 30456 + }, + { + "epoch": 0.9031521513507103, + "grad_norm": 0.07796862721443176, + "learning_rate": 2.3442935627876028e-05, + "loss": 2.5716, + "step": 30457 + }, + { + "epoch": 0.9031818047030217, + "grad_norm": 0.07461903989315033, + "learning_rate": 2.3428699596460824e-05, + "loss": 2.6134, + "step": 30458 + }, + { + "epoch": 0.9032114580553332, + "grad_norm": 0.0771600678563118, + "learning_rate": 2.3414467785166315e-05, + "loss": 2.5738, + "step": 30459 + }, + { + "epoch": 0.9032411114076446, + "grad_norm": 0.07867366075515747, + "learning_rate": 2.3400240194118518e-05, + "loss": 2.6056, + "step": 30460 + }, + { + "epoch": 0.9032707647599562, + "grad_norm": 0.07685063034296036, + "learning_rate": 2.3386016823443378e-05, + "loss": 2.5512, + "step": 30461 + }, + { + "epoch": 0.9033004181122676, + "grad_norm": 0.0745268240571022, + "learning_rate": 2.337179767326686e-05, + "loss": 2.5708, + "step": 30462 + }, + { + "epoch": 0.9033300714645791, + "grad_norm": 0.0770842581987381, + "learning_rate": 2.335758274371491e-05, + "loss": 2.5926, + "step": 30463 + }, + { + "epoch": 0.9033597248168905, + "grad_norm": 0.07787945866584778, + "learning_rate": 2.334337203491338e-05, + "loss": 2.5731, + "step": 30464 + }, + { + "epoch": 0.9033893781692021, + "grad_norm": 0.07334453612565994, + "learning_rate": 2.332916554698805e-05, + "loss": 2.5609, + "step": 30465 + }, + { + "epoch": 0.9034190315215135, + "grad_norm": 0.07406379282474518, + "learning_rate": 2.331496328006483e-05, + "loss": 2.5589, + "step": 30466 + }, + { + "epoch": 0.903448684873825, + "grad_norm": 0.07242856919765472, + "learning_rate": 2.3300765234269438e-05, + "loss": 2.5594, + "step": 30467 + }, + { + "epoch": 0.9034783382261364, + "grad_norm": 0.07269736379384995, + "learning_rate": 2.3286571409727562e-05, + "loss": 2.5465, + "step": 30468 + }, + { + "epoch": 0.903507991578448, + "grad_norm": 0.07692873477935791, + "learning_rate": 2.3272381806564992e-05, + "loss": 2.5901, + "step": 30469 + }, + { + "epoch": 0.9035376449307594, + "grad_norm": 0.07478642463684082, + "learning_rate": 2.325819642490723e-05, + "loss": 2.5684, + "step": 30470 + }, + { + "epoch": 0.9035672982830709, + "grad_norm": 0.07555088400840759, + "learning_rate": 2.3244015264880068e-05, + "loss": 2.5787, + "step": 30471 + }, + { + "epoch": 0.9035969516353823, + "grad_norm": 0.07216010242700577, + "learning_rate": 2.322983832660891e-05, + "loss": 2.572, + "step": 30472 + }, + { + "epoch": 0.9036266049876939, + "grad_norm": 0.06910070031881332, + "learning_rate": 2.3215665610219316e-05, + "loss": 2.5698, + "step": 30473 + }, + { + "epoch": 0.9036562583400053, + "grad_norm": 0.07286547869443893, + "learning_rate": 2.3201497115836912e-05, + "loss": 2.5413, + "step": 30474 + }, + { + "epoch": 0.9036859116923168, + "grad_norm": 0.07417461276054382, + "learning_rate": 2.3187332843587093e-05, + "loss": 2.5724, + "step": 30475 + }, + { + "epoch": 0.9037155650446282, + "grad_norm": 0.07045848667621613, + "learning_rate": 2.3173172793595265e-05, + "loss": 2.5827, + "step": 30476 + }, + { + "epoch": 0.9037452183969398, + "grad_norm": 0.07477796822786331, + "learning_rate": 2.315901696598688e-05, + "loss": 2.5727, + "step": 30477 + }, + { + "epoch": 0.9037748717492513, + "grad_norm": 0.07231533527374268, + "learning_rate": 2.3144865360887224e-05, + "loss": 2.5844, + "step": 30478 + }, + { + "epoch": 0.9038045251015627, + "grad_norm": 0.07239524275064468, + "learning_rate": 2.313071797842159e-05, + "loss": 2.5537, + "step": 30479 + }, + { + "epoch": 0.9038341784538743, + "grad_norm": 0.07290218770503998, + "learning_rate": 2.3116574818715376e-05, + "loss": 2.541, + "step": 30480 + }, + { + "epoch": 0.9038638318061857, + "grad_norm": 0.07504794001579285, + "learning_rate": 2.3102435881893757e-05, + "loss": 2.5749, + "step": 30481 + }, + { + "epoch": 0.9038934851584972, + "grad_norm": 0.07479623705148697, + "learning_rate": 2.3088301168081915e-05, + "loss": 2.5845, + "step": 30482 + }, + { + "epoch": 0.9039231385108086, + "grad_norm": 0.0758330449461937, + "learning_rate": 2.3074170677404972e-05, + "loss": 2.55, + "step": 30483 + }, + { + "epoch": 0.9039527918631202, + "grad_norm": 0.07359524816274643, + "learning_rate": 2.3060044409988158e-05, + "loss": 2.5557, + "step": 30484 + }, + { + "epoch": 0.9039824452154316, + "grad_norm": 0.07583168148994446, + "learning_rate": 2.3045922365956484e-05, + "loss": 2.601, + "step": 30485 + }, + { + "epoch": 0.9040120985677431, + "grad_norm": 0.07711470872163773, + "learning_rate": 2.3031804545434963e-05, + "loss": 2.5536, + "step": 30486 + }, + { + "epoch": 0.9040417519200545, + "grad_norm": 0.07750508934259415, + "learning_rate": 2.3017690948548776e-05, + "loss": 2.5665, + "step": 30487 + }, + { + "epoch": 0.9040714052723661, + "grad_norm": 0.07419106364250183, + "learning_rate": 2.3003581575422816e-05, + "loss": 2.5594, + "step": 30488 + }, + { + "epoch": 0.9041010586246775, + "grad_norm": 0.0806984081864357, + "learning_rate": 2.2989476426181986e-05, + "loss": 2.5534, + "step": 30489 + }, + { + "epoch": 0.904130711976989, + "grad_norm": 0.07601408660411835, + "learning_rate": 2.2975375500951245e-05, + "loss": 2.5853, + "step": 30490 + }, + { + "epoch": 0.9041603653293004, + "grad_norm": 0.07790336012840271, + "learning_rate": 2.296127879985538e-05, + "loss": 2.5307, + "step": 30491 + }, + { + "epoch": 0.904190018681612, + "grad_norm": 0.08263733237981796, + "learning_rate": 2.2947186323019397e-05, + "loss": 2.5809, + "step": 30492 + }, + { + "epoch": 0.9042196720339234, + "grad_norm": 0.0771658644080162, + "learning_rate": 2.293309807056787e-05, + "loss": 2.582, + "step": 30493 + }, + { + "epoch": 0.9042493253862349, + "grad_norm": 0.07249173521995544, + "learning_rate": 2.2919014042625697e-05, + "loss": 2.5599, + "step": 30494 + }, + { + "epoch": 0.9042789787385463, + "grad_norm": 0.07608657330274582, + "learning_rate": 2.29049342393175e-05, + "loss": 2.5606, + "step": 30495 + }, + { + "epoch": 0.9043086320908579, + "grad_norm": 0.07433271408081055, + "learning_rate": 2.289085866076801e-05, + "loss": 2.528, + "step": 30496 + }, + { + "epoch": 0.9043382854431693, + "grad_norm": 0.07360635697841644, + "learning_rate": 2.2876787307101853e-05, + "loss": 2.5444, + "step": 30497 + }, + { + "epoch": 0.9043679387954808, + "grad_norm": 0.07368273288011551, + "learning_rate": 2.2862720178443596e-05, + "loss": 2.5591, + "step": 30498 + }, + { + "epoch": 0.9043975921477924, + "grad_norm": 0.07404550909996033, + "learning_rate": 2.2848657274917915e-05, + "loss": 2.5913, + "step": 30499 + }, + { + "epoch": 0.9044272455001038, + "grad_norm": 0.07696570456027985, + "learning_rate": 2.283459859664927e-05, + "loss": 2.5687, + "step": 30500 + }, + { + "epoch": 0.9044568988524153, + "grad_norm": 0.07702787220478058, + "learning_rate": 2.282054414376211e-05, + "loss": 2.5694, + "step": 30501 + }, + { + "epoch": 0.9044865522047267, + "grad_norm": 0.080658458173275, + "learning_rate": 2.2806493916381066e-05, + "loss": 2.5673, + "step": 30502 + }, + { + "epoch": 0.9045162055570383, + "grad_norm": 0.07417773455381393, + "learning_rate": 2.2792447914630365e-05, + "loss": 2.5531, + "step": 30503 + }, + { + "epoch": 0.9045458589093497, + "grad_norm": 0.07212765514850616, + "learning_rate": 2.277840613863441e-05, + "loss": 2.5692, + "step": 30504 + }, + { + "epoch": 0.9045755122616612, + "grad_norm": 0.07592741400003433, + "learning_rate": 2.27643685885176e-05, + "loss": 2.6006, + "step": 30505 + }, + { + "epoch": 0.9046051656139726, + "grad_norm": 0.0844971314072609, + "learning_rate": 2.2750335264404232e-05, + "loss": 2.5891, + "step": 30506 + }, + { + "epoch": 0.9046348189662842, + "grad_norm": 0.07647155225276947, + "learning_rate": 2.273630616641853e-05, + "loss": 2.5819, + "step": 30507 + }, + { + "epoch": 0.9046644723185956, + "grad_norm": 0.07217754423618317, + "learning_rate": 2.2722281294684787e-05, + "loss": 2.6005, + "step": 30508 + }, + { + "epoch": 0.9046941256709071, + "grad_norm": 0.07537366449832916, + "learning_rate": 2.2708260649327185e-05, + "loss": 2.5313, + "step": 30509 + }, + { + "epoch": 0.9047237790232185, + "grad_norm": 0.07306656986474991, + "learning_rate": 2.269424423046984e-05, + "loss": 2.5586, + "step": 30510 + }, + { + "epoch": 0.9047534323755301, + "grad_norm": 0.07967327535152435, + "learning_rate": 2.2680232038236827e-05, + "loss": 2.5548, + "step": 30511 + }, + { + "epoch": 0.9047830857278415, + "grad_norm": 0.0687485784292221, + "learning_rate": 2.2666224072752374e-05, + "loss": 2.5546, + "step": 30512 + }, + { + "epoch": 0.904812739080153, + "grad_norm": 0.07592886686325073, + "learning_rate": 2.2652220334140496e-05, + "loss": 2.5608, + "step": 30513 + }, + { + "epoch": 0.9048423924324644, + "grad_norm": 0.07988349348306656, + "learning_rate": 2.2638220822525036e-05, + "loss": 2.6215, + "step": 30514 + }, + { + "epoch": 0.904872045784776, + "grad_norm": 0.07514572143554688, + "learning_rate": 2.2624225538030118e-05, + "loss": 2.5666, + "step": 30515 + }, + { + "epoch": 0.9049016991370874, + "grad_norm": 0.07764244824647903, + "learning_rate": 2.261023448077959e-05, + "loss": 2.5575, + "step": 30516 + }, + { + "epoch": 0.9049313524893989, + "grad_norm": 0.07446211576461792, + "learning_rate": 2.2596247650897407e-05, + "loss": 2.5836, + "step": 30517 + }, + { + "epoch": 0.9049610058417104, + "grad_norm": 0.07832575589418411, + "learning_rate": 2.2582265048507354e-05, + "loss": 2.5473, + "step": 30518 + }, + { + "epoch": 0.9049906591940219, + "grad_norm": 0.07766367495059967, + "learning_rate": 2.256828667373334e-05, + "loss": 2.5743, + "step": 30519 + }, + { + "epoch": 0.9050203125463334, + "grad_norm": 0.07749415189027786, + "learning_rate": 2.2554312526699095e-05, + "loss": 2.5919, + "step": 30520 + }, + { + "epoch": 0.9050499658986448, + "grad_norm": 0.07378029823303223, + "learning_rate": 2.2540342607528354e-05, + "loss": 2.5394, + "step": 30521 + }, + { + "epoch": 0.9050796192509564, + "grad_norm": 0.07509433478116989, + "learning_rate": 2.2526376916344792e-05, + "loss": 2.5422, + "step": 30522 + }, + { + "epoch": 0.9051092726032678, + "grad_norm": 0.07557486742734909, + "learning_rate": 2.251241545327215e-05, + "loss": 2.5581, + "step": 30523 + }, + { + "epoch": 0.9051389259555793, + "grad_norm": 0.07712496817111969, + "learning_rate": 2.249845821843405e-05, + "loss": 2.5437, + "step": 30524 + }, + { + "epoch": 0.9051685793078907, + "grad_norm": 0.07363320887088776, + "learning_rate": 2.2484505211954052e-05, + "loss": 2.5259, + "step": 30525 + }, + { + "epoch": 0.9051982326602023, + "grad_norm": 0.07031359523534775, + "learning_rate": 2.2470556433955736e-05, + "loss": 2.5457, + "step": 30526 + }, + { + "epoch": 0.9052278860125137, + "grad_norm": 0.08410298079252243, + "learning_rate": 2.2456611884562607e-05, + "loss": 2.5336, + "step": 30527 + }, + { + "epoch": 0.9052575393648252, + "grad_norm": 0.07296343892812729, + "learning_rate": 2.244267156389812e-05, + "loss": 2.5689, + "step": 30528 + }, + { + "epoch": 0.9052871927171366, + "grad_norm": 0.07042324542999268, + "learning_rate": 2.2428735472085793e-05, + "loss": 2.5467, + "step": 30529 + }, + { + "epoch": 0.9053168460694482, + "grad_norm": 0.07465990632772446, + "learning_rate": 2.2414803609248914e-05, + "loss": 2.5553, + "step": 30530 + }, + { + "epoch": 0.9053464994217596, + "grad_norm": 0.07165588438510895, + "learning_rate": 2.2400875975510992e-05, + "loss": 2.5546, + "step": 30531 + }, + { + "epoch": 0.9053761527740711, + "grad_norm": 0.07761525362730026, + "learning_rate": 2.2386952570995267e-05, + "loss": 2.5698, + "step": 30532 + }, + { + "epoch": 0.9054058061263826, + "grad_norm": 0.07626229524612427, + "learning_rate": 2.237303339582508e-05, + "loss": 2.5303, + "step": 30533 + }, + { + "epoch": 0.9054354594786941, + "grad_norm": 0.07251553982496262, + "learning_rate": 2.2359118450123618e-05, + "loss": 2.5485, + "step": 30534 + }, + { + "epoch": 0.9054651128310055, + "grad_norm": 0.0797397717833519, + "learning_rate": 2.2345207734014215e-05, + "loss": 2.5797, + "step": 30535 + }, + { + "epoch": 0.905494766183317, + "grad_norm": 0.07364556193351746, + "learning_rate": 2.2331301247620006e-05, + "loss": 2.5741, + "step": 30536 + }, + { + "epoch": 0.9055244195356285, + "grad_norm": 0.07686247676610947, + "learning_rate": 2.2317398991063997e-05, + "loss": 2.5616, + "step": 30537 + }, + { + "epoch": 0.90555407288794, + "grad_norm": 0.07969719916582108, + "learning_rate": 2.2303500964469482e-05, + "loss": 2.5498, + "step": 30538 + }, + { + "epoch": 0.9055837262402514, + "grad_norm": 0.07469227910041809, + "learning_rate": 2.2289607167959413e-05, + "loss": 2.5796, + "step": 30539 + }, + { + "epoch": 0.9056133795925629, + "grad_norm": 0.076564259827137, + "learning_rate": 2.227571760165692e-05, + "loss": 2.5713, + "step": 30540 + }, + { + "epoch": 0.9056430329448745, + "grad_norm": 0.07439888268709183, + "learning_rate": 2.2261832265684957e-05, + "loss": 2.5789, + "step": 30541 + }, + { + "epoch": 0.9056726862971859, + "grad_norm": 0.07390491664409637, + "learning_rate": 2.224795116016648e-05, + "loss": 2.5783, + "step": 30542 + }, + { + "epoch": 0.9057023396494974, + "grad_norm": 0.0766000896692276, + "learning_rate": 2.223407428522434e-05, + "loss": 2.5972, + "step": 30543 + }, + { + "epoch": 0.9057319930018088, + "grad_norm": 0.07497303187847137, + "learning_rate": 2.222020164098154e-05, + "loss": 2.5632, + "step": 30544 + }, + { + "epoch": 0.9057616463541204, + "grad_norm": 0.07527756690979004, + "learning_rate": 2.2206333227560827e-05, + "loss": 2.5671, + "step": 30545 + }, + { + "epoch": 0.9057912997064318, + "grad_norm": 0.07225760817527771, + "learning_rate": 2.219246904508504e-05, + "loss": 2.5544, + "step": 30546 + }, + { + "epoch": 0.9058209530587433, + "grad_norm": 0.0736595168709755, + "learning_rate": 2.2178609093677083e-05, + "loss": 2.5728, + "step": 30547 + }, + { + "epoch": 0.9058506064110547, + "grad_norm": 0.08059950917959213, + "learning_rate": 2.216475337345941e-05, + "loss": 2.5841, + "step": 30548 + }, + { + "epoch": 0.9058802597633663, + "grad_norm": 0.07355285435914993, + "learning_rate": 2.215090188455493e-05, + "loss": 2.563, + "step": 30549 + }, + { + "epoch": 0.9059099131156777, + "grad_norm": 0.07261955738067627, + "learning_rate": 2.213705462708615e-05, + "loss": 2.5623, + "step": 30550 + }, + { + "epoch": 0.9059395664679892, + "grad_norm": 0.0736563503742218, + "learning_rate": 2.212321160117581e-05, + "loss": 2.5339, + "step": 30551 + }, + { + "epoch": 0.9059692198203007, + "grad_norm": 0.0746038630604744, + "learning_rate": 2.210937280694647e-05, + "loss": 2.5558, + "step": 30552 + }, + { + "epoch": 0.9059988731726122, + "grad_norm": 0.07751261442899704, + "learning_rate": 2.2095538244520708e-05, + "loss": 2.5638, + "step": 30553 + }, + { + "epoch": 0.9060285265249236, + "grad_norm": 0.07489176839590073, + "learning_rate": 2.2081707914020922e-05, + "loss": 2.5768, + "step": 30554 + }, + { + "epoch": 0.9060581798772351, + "grad_norm": 0.07176214456558228, + "learning_rate": 2.206788181556968e-05, + "loss": 2.5589, + "step": 30555 + }, + { + "epoch": 0.9060878332295466, + "grad_norm": 0.07943074405193329, + "learning_rate": 2.2054059949289384e-05, + "loss": 2.5695, + "step": 30556 + }, + { + "epoch": 0.9061174865818581, + "grad_norm": 0.07516169548034668, + "learning_rate": 2.2040242315302493e-05, + "loss": 2.5793, + "step": 30557 + }, + { + "epoch": 0.9061471399341695, + "grad_norm": 0.0717548131942749, + "learning_rate": 2.2026428913731245e-05, + "loss": 2.585, + "step": 30558 + }, + { + "epoch": 0.906176793286481, + "grad_norm": 0.07712177187204361, + "learning_rate": 2.2012619744697983e-05, + "loss": 2.604, + "step": 30559 + }, + { + "epoch": 0.9062064466387925, + "grad_norm": 0.07447699457406998, + "learning_rate": 2.1998814808324997e-05, + "loss": 2.565, + "step": 30560 + }, + { + "epoch": 0.906236099991104, + "grad_norm": 0.07794076204299927, + "learning_rate": 2.1985014104734637e-05, + "loss": 2.597, + "step": 30561 + }, + { + "epoch": 0.9062657533434155, + "grad_norm": 0.07281740754842758, + "learning_rate": 2.1971217634048966e-05, + "loss": 2.5853, + "step": 30562 + }, + { + "epoch": 0.906295406695727, + "grad_norm": 0.07528148591518402, + "learning_rate": 2.1957425396390117e-05, + "loss": 2.5507, + "step": 30563 + }, + { + "epoch": 0.9063250600480385, + "grad_norm": 0.07493797689676285, + "learning_rate": 2.1943637391880432e-05, + "loss": 2.5453, + "step": 30564 + }, + { + "epoch": 0.9063547134003499, + "grad_norm": 0.0724908858537674, + "learning_rate": 2.1929853620641928e-05, + "loss": 2.563, + "step": 30565 + }, + { + "epoch": 0.9063843667526614, + "grad_norm": 0.07297155261039734, + "learning_rate": 2.1916074082796556e-05, + "loss": 2.5384, + "step": 30566 + }, + { + "epoch": 0.9064140201049729, + "grad_norm": 0.07641077786684036, + "learning_rate": 2.1902298778466447e-05, + "loss": 2.5844, + "step": 30567 + }, + { + "epoch": 0.9064436734572844, + "grad_norm": 0.07503536343574524, + "learning_rate": 2.1888527707773608e-05, + "loss": 2.5893, + "step": 30568 + }, + { + "epoch": 0.9064733268095958, + "grad_norm": 0.07434257119894028, + "learning_rate": 2.1874760870839892e-05, + "loss": 2.5877, + "step": 30569 + }, + { + "epoch": 0.9065029801619073, + "grad_norm": 0.07367683202028275, + "learning_rate": 2.1860998267787204e-05, + "loss": 2.5725, + "step": 30570 + }, + { + "epoch": 0.9065326335142188, + "grad_norm": 0.07522465288639069, + "learning_rate": 2.1847239898737437e-05, + "loss": 2.5708, + "step": 30571 + }, + { + "epoch": 0.9065622868665303, + "grad_norm": 0.07030883431434631, + "learning_rate": 2.1833485763812444e-05, + "loss": 2.5442, + "step": 30572 + }, + { + "epoch": 0.9065919402188417, + "grad_norm": 0.07589530199766159, + "learning_rate": 2.1819735863134017e-05, + "loss": 2.5414, + "step": 30573 + }, + { + "epoch": 0.9066215935711532, + "grad_norm": 0.07179402559995651, + "learning_rate": 2.1805990196823887e-05, + "loss": 2.5333, + "step": 30574 + }, + { + "epoch": 0.9066512469234647, + "grad_norm": 0.07787145674228668, + "learning_rate": 2.1792248765003853e-05, + "loss": 2.5719, + "step": 30575 + }, + { + "epoch": 0.9066809002757762, + "grad_norm": 0.0701696053147316, + "learning_rate": 2.1778511567795422e-05, + "loss": 2.5663, + "step": 30576 + }, + { + "epoch": 0.9067105536280876, + "grad_norm": 0.08076100051403046, + "learning_rate": 2.1764778605320446e-05, + "loss": 2.5566, + "step": 30577 + }, + { + "epoch": 0.9067402069803991, + "grad_norm": 0.07593823969364166, + "learning_rate": 2.175104987770049e-05, + "loss": 2.5986, + "step": 30578 + }, + { + "epoch": 0.9067698603327106, + "grad_norm": 0.07422660291194916, + "learning_rate": 2.1737325385057015e-05, + "loss": 2.542, + "step": 30579 + }, + { + "epoch": 0.9067995136850221, + "grad_norm": 0.0731273666024208, + "learning_rate": 2.172360512751159e-05, + "loss": 2.6133, + "step": 30580 + }, + { + "epoch": 0.9068291670373335, + "grad_norm": 0.07283654808998108, + "learning_rate": 2.1709889105185788e-05, + "loss": 2.6053, + "step": 30581 + }, + { + "epoch": 0.906858820389645, + "grad_norm": 0.07780306041240692, + "learning_rate": 2.1696177318201006e-05, + "loss": 2.5428, + "step": 30582 + }, + { + "epoch": 0.9068884737419566, + "grad_norm": 0.08167896419763565, + "learning_rate": 2.1682469766678648e-05, + "loss": 2.5694, + "step": 30583 + }, + { + "epoch": 0.906918127094268, + "grad_norm": 0.07410933822393417, + "learning_rate": 2.1668766450740118e-05, + "loss": 2.5716, + "step": 30584 + }, + { + "epoch": 0.9069477804465795, + "grad_norm": 0.06911198049783707, + "learning_rate": 2.165506737050682e-05, + "loss": 2.5054, + "step": 30585 + }, + { + "epoch": 0.906977433798891, + "grad_norm": 0.0798603966832161, + "learning_rate": 2.1641372526099935e-05, + "loss": 2.5577, + "step": 30586 + }, + { + "epoch": 0.9070070871512025, + "grad_norm": 0.07479113340377808, + "learning_rate": 2.1627681917640863e-05, + "loss": 2.5218, + "step": 30587 + }, + { + "epoch": 0.9070367405035139, + "grad_norm": 0.07610999047756195, + "learning_rate": 2.1613995545250785e-05, + "loss": 2.592, + "step": 30588 + }, + { + "epoch": 0.9070663938558254, + "grad_norm": 0.0766298696398735, + "learning_rate": 2.160031340905083e-05, + "loss": 2.5474, + "step": 30589 + }, + { + "epoch": 0.9070960472081369, + "grad_norm": 0.07583831250667572, + "learning_rate": 2.1586635509162235e-05, + "loss": 2.5687, + "step": 30590 + }, + { + "epoch": 0.9071257005604484, + "grad_norm": 0.08016213774681091, + "learning_rate": 2.157296184570612e-05, + "loss": 2.5933, + "step": 30591 + }, + { + "epoch": 0.9071553539127598, + "grad_norm": 0.07704336196184158, + "learning_rate": 2.1559292418803555e-05, + "loss": 2.5833, + "step": 30592 + }, + { + "epoch": 0.9071850072650713, + "grad_norm": 0.07189342379570007, + "learning_rate": 2.154562722857556e-05, + "loss": 2.5522, + "step": 30593 + }, + { + "epoch": 0.9072146606173828, + "grad_norm": 0.07897444069385529, + "learning_rate": 2.15319662751432e-05, + "loss": 2.5764, + "step": 30594 + }, + { + "epoch": 0.9072443139696943, + "grad_norm": 0.07905738800764084, + "learning_rate": 2.151830955862738e-05, + "loss": 2.5566, + "step": 30595 + }, + { + "epoch": 0.9072739673220057, + "grad_norm": 0.07806840538978577, + "learning_rate": 2.150465707914906e-05, + "loss": 2.565, + "step": 30596 + }, + { + "epoch": 0.9073036206743172, + "grad_norm": 0.07635407894849777, + "learning_rate": 2.1491008836829083e-05, + "loss": 2.5874, + "step": 30597 + }, + { + "epoch": 0.9073332740266287, + "grad_norm": 0.0823843777179718, + "learning_rate": 2.1477364831788414e-05, + "loss": 2.5675, + "step": 30598 + }, + { + "epoch": 0.9073629273789402, + "grad_norm": 0.07442431151866913, + "learning_rate": 2.1463725064147842e-05, + "loss": 2.5641, + "step": 30599 + }, + { + "epoch": 0.9073925807312516, + "grad_norm": 0.07121928036212921, + "learning_rate": 2.145008953402805e-05, + "loss": 2.5721, + "step": 30600 + }, + { + "epoch": 0.9074222340835632, + "grad_norm": 0.07413529604673386, + "learning_rate": 2.143645824154994e-05, + "loss": 2.5482, + "step": 30601 + }, + { + "epoch": 0.9074518874358746, + "grad_norm": 0.07290101796388626, + "learning_rate": 2.142283118683408e-05, + "loss": 2.5324, + "step": 30602 + }, + { + "epoch": 0.9074815407881861, + "grad_norm": 0.08052915334701538, + "learning_rate": 2.140920837000121e-05, + "loss": 2.5218, + "step": 30603 + }, + { + "epoch": 0.9075111941404976, + "grad_norm": 0.07391946017742157, + "learning_rate": 2.1395589791171953e-05, + "loss": 2.5402, + "step": 30604 + }, + { + "epoch": 0.907540847492809, + "grad_norm": 0.07231765985488892, + "learning_rate": 2.1381975450466885e-05, + "loss": 2.5935, + "step": 30605 + }, + { + "epoch": 0.9075705008451206, + "grad_norm": 0.0817890390753746, + "learning_rate": 2.1368365348006567e-05, + "loss": 2.5724, + "step": 30606 + }, + { + "epoch": 0.907600154197432, + "grad_norm": 0.07778147608041763, + "learning_rate": 2.1354759483911578e-05, + "loss": 2.5196, + "step": 30607 + }, + { + "epoch": 0.9076298075497435, + "grad_norm": 0.07270654290914536, + "learning_rate": 2.1341157858302318e-05, + "loss": 2.5674, + "step": 30608 + }, + { + "epoch": 0.907659460902055, + "grad_norm": 0.07292641699314117, + "learning_rate": 2.1327560471299303e-05, + "loss": 2.5612, + "step": 30609 + }, + { + "epoch": 0.9076891142543665, + "grad_norm": 0.07931564003229141, + "learning_rate": 2.1313967323022875e-05, + "loss": 2.5827, + "step": 30610 + }, + { + "epoch": 0.9077187676066779, + "grad_norm": 0.07700816541910172, + "learning_rate": 2.130037841359339e-05, + "loss": 2.5555, + "step": 30611 + }, + { + "epoch": 0.9077484209589894, + "grad_norm": 0.0750005841255188, + "learning_rate": 2.128679374313136e-05, + "loss": 2.5674, + "step": 30612 + }, + { + "epoch": 0.9077780743113009, + "grad_norm": 0.07775597274303436, + "learning_rate": 2.12732133117568e-05, + "loss": 2.5752, + "step": 30613 + }, + { + "epoch": 0.9078077276636124, + "grad_norm": 0.07609614729881287, + "learning_rate": 2.125963711959017e-05, + "loss": 2.6248, + "step": 30614 + }, + { + "epoch": 0.9078373810159238, + "grad_norm": 0.07655907422304153, + "learning_rate": 2.1246065166751538e-05, + "loss": 2.5806, + "step": 30615 + }, + { + "epoch": 0.9078670343682353, + "grad_norm": 0.07289151847362518, + "learning_rate": 2.1232497453361255e-05, + "loss": 2.578, + "step": 30616 + }, + { + "epoch": 0.9078966877205468, + "grad_norm": 0.07863995432853699, + "learning_rate": 2.121893397953939e-05, + "loss": 2.6006, + "step": 30617 + }, + { + "epoch": 0.9079263410728583, + "grad_norm": 0.07370348274707794, + "learning_rate": 2.1205374745406013e-05, + "loss": 2.5495, + "step": 30618 + }, + { + "epoch": 0.9079559944251697, + "grad_norm": 0.0735168308019638, + "learning_rate": 2.1191819751081253e-05, + "loss": 2.554, + "step": 30619 + }, + { + "epoch": 0.9079856477774813, + "grad_norm": 0.0752900093793869, + "learning_rate": 2.1178268996685125e-05, + "loss": 2.5526, + "step": 30620 + }, + { + "epoch": 0.9080153011297927, + "grad_norm": 0.07439670711755753, + "learning_rate": 2.1164722482337583e-05, + "loss": 2.5481, + "step": 30621 + }, + { + "epoch": 0.9080449544821042, + "grad_norm": 0.07212875783443451, + "learning_rate": 2.115118020815865e-05, + "loss": 2.5807, + "step": 30622 + }, + { + "epoch": 0.9080746078344156, + "grad_norm": 0.07637540996074677, + "learning_rate": 2.1137642174268278e-05, + "loss": 2.563, + "step": 30623 + }, + { + "epoch": 0.9081042611867272, + "grad_norm": 0.07276659458875656, + "learning_rate": 2.1124108380786157e-05, + "loss": 2.6054, + "step": 30624 + }, + { + "epoch": 0.9081339145390387, + "grad_norm": 0.07154777646064758, + "learning_rate": 2.1110578827832293e-05, + "loss": 2.5659, + "step": 30625 + }, + { + "epoch": 0.9081635678913501, + "grad_norm": 0.07448137551546097, + "learning_rate": 2.1097053515526488e-05, + "loss": 2.5314, + "step": 30626 + }, + { + "epoch": 0.9081932212436616, + "grad_norm": 0.0766787976026535, + "learning_rate": 2.108353244398842e-05, + "loss": 2.5914, + "step": 30627 + }, + { + "epoch": 0.9082228745959731, + "grad_norm": 0.07242318242788315, + "learning_rate": 2.1070015613337822e-05, + "loss": 2.5601, + "step": 30628 + }, + { + "epoch": 0.9082525279482846, + "grad_norm": 0.07321550697088242, + "learning_rate": 2.105650302369455e-05, + "loss": 2.5632, + "step": 30629 + }, + { + "epoch": 0.908282181300596, + "grad_norm": 0.07690978795289993, + "learning_rate": 2.1042994675178116e-05, + "loss": 2.5901, + "step": 30630 + }, + { + "epoch": 0.9083118346529075, + "grad_norm": 0.07490396499633789, + "learning_rate": 2.102949056790815e-05, + "loss": 2.5922, + "step": 30631 + }, + { + "epoch": 0.908341488005219, + "grad_norm": 0.07635422050952911, + "learning_rate": 2.1015990702004328e-05, + "loss": 2.5552, + "step": 30632 + }, + { + "epoch": 0.9083711413575305, + "grad_norm": 0.07406076788902283, + "learning_rate": 2.1002495077586115e-05, + "loss": 2.5795, + "step": 30633 + }, + { + "epoch": 0.9084007947098419, + "grad_norm": 0.07380205392837524, + "learning_rate": 2.0989003694773022e-05, + "loss": 2.5752, + "step": 30634 + }, + { + "epoch": 0.9084304480621535, + "grad_norm": 0.07207590341567993, + "learning_rate": 2.0975516553684516e-05, + "loss": 2.5831, + "step": 30635 + }, + { + "epoch": 0.9084601014144649, + "grad_norm": 0.0767972469329834, + "learning_rate": 2.0962033654439993e-05, + "loss": 2.5669, + "step": 30636 + }, + { + "epoch": 0.9084897547667764, + "grad_norm": 0.07371754944324493, + "learning_rate": 2.094855499715892e-05, + "loss": 2.5335, + "step": 30637 + }, + { + "epoch": 0.9085194081190878, + "grad_norm": 0.07492607086896896, + "learning_rate": 2.093508058196064e-05, + "loss": 2.6418, + "step": 30638 + }, + { + "epoch": 0.9085490614713994, + "grad_norm": 0.06874340027570724, + "learning_rate": 2.0921610408964397e-05, + "loss": 2.5621, + "step": 30639 + }, + { + "epoch": 0.9085787148237108, + "grad_norm": 0.06971295177936554, + "learning_rate": 2.0908144478289592e-05, + "loss": 2.5426, + "step": 30640 + }, + { + "epoch": 0.9086083681760223, + "grad_norm": 0.07769648730754852, + "learning_rate": 2.0894682790055297e-05, + "loss": 2.5522, + "step": 30641 + }, + { + "epoch": 0.9086380215283337, + "grad_norm": 0.07265174388885498, + "learning_rate": 2.088122534438086e-05, + "loss": 2.5267, + "step": 30642 + }, + { + "epoch": 0.9086676748806453, + "grad_norm": 0.07490391284227371, + "learning_rate": 2.0867772141385466e-05, + "loss": 2.5788, + "step": 30643 + }, + { + "epoch": 0.9086973282329567, + "grad_norm": 0.07434624433517456, + "learning_rate": 2.085432318118824e-05, + "loss": 2.5652, + "step": 30644 + }, + { + "epoch": 0.9087269815852682, + "grad_norm": 0.08017078787088394, + "learning_rate": 2.0840878463908143e-05, + "loss": 2.5767, + "step": 30645 + }, + { + "epoch": 0.9087566349375797, + "grad_norm": 0.07409603893756866, + "learning_rate": 2.0827437989664355e-05, + "loss": 2.6127, + "step": 30646 + }, + { + "epoch": 0.9087862882898912, + "grad_norm": 0.07364881038665771, + "learning_rate": 2.0814001758575786e-05, + "loss": 2.5675, + "step": 30647 + }, + { + "epoch": 0.9088159416422027, + "grad_norm": 0.0721924901008606, + "learning_rate": 2.0800569770761558e-05, + "loss": 2.5496, + "step": 30648 + }, + { + "epoch": 0.9088455949945141, + "grad_norm": 0.07662159949541092, + "learning_rate": 2.0787142026340466e-05, + "loss": 2.5675, + "step": 30649 + }, + { + "epoch": 0.9088752483468256, + "grad_norm": 0.07521292567253113, + "learning_rate": 2.077371852543153e-05, + "loss": 2.581, + "step": 30650 + }, + { + "epoch": 0.9089049016991371, + "grad_norm": 0.07893197983503342, + "learning_rate": 2.0760299268153595e-05, + "loss": 2.5492, + "step": 30651 + }, + { + "epoch": 0.9089345550514486, + "grad_norm": 0.08066147565841675, + "learning_rate": 2.0746884254625452e-05, + "loss": 2.6106, + "step": 30652 + }, + { + "epoch": 0.90896420840376, + "grad_norm": 0.07353707402944565, + "learning_rate": 2.0733473484965902e-05, + "loss": 2.5591, + "step": 30653 + }, + { + "epoch": 0.9089938617560716, + "grad_norm": 0.07871494442224503, + "learning_rate": 2.0720066959293682e-05, + "loss": 2.5892, + "step": 30654 + }, + { + "epoch": 0.909023515108383, + "grad_norm": 0.07829154282808304, + "learning_rate": 2.0706664677727583e-05, + "loss": 2.5328, + "step": 30655 + }, + { + "epoch": 0.9090531684606945, + "grad_norm": 0.07434206455945969, + "learning_rate": 2.0693266640386233e-05, + "loss": 2.5795, + "step": 30656 + }, + { + "epoch": 0.9090828218130059, + "grad_norm": 0.07562950253486633, + "learning_rate": 2.067987284738826e-05, + "loss": 2.5449, + "step": 30657 + }, + { + "epoch": 0.9091124751653175, + "grad_norm": 0.07442259043455124, + "learning_rate": 2.066648329885229e-05, + "loss": 2.5733, + "step": 30658 + }, + { + "epoch": 0.9091421285176289, + "grad_norm": 0.07341775298118591, + "learning_rate": 2.0653097994896896e-05, + "loss": 2.5423, + "step": 30659 + }, + { + "epoch": 0.9091717818699404, + "grad_norm": 0.07388221472501755, + "learning_rate": 2.0639716935640595e-05, + "loss": 2.5552, + "step": 30660 + }, + { + "epoch": 0.9092014352222518, + "grad_norm": 0.0755770355463028, + "learning_rate": 2.062634012120185e-05, + "loss": 2.5612, + "step": 30661 + }, + { + "epoch": 0.9092310885745634, + "grad_norm": 0.07502400130033493, + "learning_rate": 2.061296755169917e-05, + "loss": 2.5909, + "step": 30662 + }, + { + "epoch": 0.9092607419268748, + "grad_norm": 0.07982000708580017, + "learning_rate": 2.0599599227250963e-05, + "loss": 2.527, + "step": 30663 + }, + { + "epoch": 0.9092903952791863, + "grad_norm": 0.07413342595100403, + "learning_rate": 2.0586235147975584e-05, + "loss": 2.5855, + "step": 30664 + }, + { + "epoch": 0.9093200486314978, + "grad_norm": 0.07376785576343536, + "learning_rate": 2.057287531399138e-05, + "loss": 2.5658, + "step": 30665 + }, + { + "epoch": 0.9093497019838093, + "grad_norm": 0.07581319659948349, + "learning_rate": 2.05595197254167e-05, + "loss": 2.5694, + "step": 30666 + }, + { + "epoch": 0.9093793553361208, + "grad_norm": 0.07510345429182053, + "learning_rate": 2.0546168382369723e-05, + "loss": 2.5544, + "step": 30667 + }, + { + "epoch": 0.9094090086884322, + "grad_norm": 0.0767221674323082, + "learning_rate": 2.0532821284968696e-05, + "loss": 2.5729, + "step": 30668 + }, + { + "epoch": 0.9094386620407438, + "grad_norm": 0.0744771808385849, + "learning_rate": 2.051947843333185e-05, + "loss": 2.558, + "step": 30669 + }, + { + "epoch": 0.9094683153930552, + "grad_norm": 0.07085762172937393, + "learning_rate": 2.0506139827577376e-05, + "loss": 2.5589, + "step": 30670 + }, + { + "epoch": 0.9094979687453667, + "grad_norm": 0.07231725007295609, + "learning_rate": 2.0492805467823285e-05, + "loss": 2.572, + "step": 30671 + }, + { + "epoch": 0.9095276220976781, + "grad_norm": 0.0779486820101738, + "learning_rate": 2.047947535418776e-05, + "loss": 2.5301, + "step": 30672 + }, + { + "epoch": 0.9095572754499897, + "grad_norm": 0.07901522517204285, + "learning_rate": 2.046614948678871e-05, + "loss": 2.581, + "step": 30673 + }, + { + "epoch": 0.9095869288023011, + "grad_norm": 0.07685913890600204, + "learning_rate": 2.045282786574426e-05, + "loss": 2.5861, + "step": 30674 + }, + { + "epoch": 0.9096165821546126, + "grad_norm": 0.07618936896324158, + "learning_rate": 2.0439510491172374e-05, + "loss": 2.5789, + "step": 30675 + }, + { + "epoch": 0.909646235506924, + "grad_norm": 0.07470650970935822, + "learning_rate": 2.0426197363190902e-05, + "loss": 2.5768, + "step": 30676 + }, + { + "epoch": 0.9096758888592356, + "grad_norm": 0.07871636003255844, + "learning_rate": 2.0412888481917745e-05, + "loss": 2.5812, + "step": 30677 + }, + { + "epoch": 0.909705542211547, + "grad_norm": 0.07721971720457077, + "learning_rate": 2.039958384747087e-05, + "loss": 2.5859, + "step": 30678 + }, + { + "epoch": 0.9097351955638585, + "grad_norm": 0.0708329975605011, + "learning_rate": 2.038628345996796e-05, + "loss": 2.5493, + "step": 30679 + }, + { + "epoch": 0.9097648489161699, + "grad_norm": 0.07529842853546143, + "learning_rate": 2.0372987319526748e-05, + "loss": 2.5474, + "step": 30680 + }, + { + "epoch": 0.9097945022684815, + "grad_norm": 0.07428930699825287, + "learning_rate": 2.035969542626509e-05, + "loss": 2.5411, + "step": 30681 + }, + { + "epoch": 0.9098241556207929, + "grad_norm": 0.07457185536623001, + "learning_rate": 2.0346407780300667e-05, + "loss": 2.5434, + "step": 30682 + }, + { + "epoch": 0.9098538089731044, + "grad_norm": 0.07567188143730164, + "learning_rate": 2.0333124381751165e-05, + "loss": 2.5433, + "step": 30683 + }, + { + "epoch": 0.9098834623254158, + "grad_norm": 0.07210704684257507, + "learning_rate": 2.0319845230734212e-05, + "loss": 2.5359, + "step": 30684 + }, + { + "epoch": 0.9099131156777274, + "grad_norm": 0.07548493146896362, + "learning_rate": 2.0306570327367325e-05, + "loss": 2.5903, + "step": 30685 + }, + { + "epoch": 0.9099427690300389, + "grad_norm": 0.07467514276504517, + "learning_rate": 2.0293299671768073e-05, + "loss": 2.5622, + "step": 30686 + }, + { + "epoch": 0.9099724223823503, + "grad_norm": 0.07683467119932175, + "learning_rate": 2.0280033264054033e-05, + "loss": 2.5485, + "step": 30687 + }, + { + "epoch": 0.9100020757346619, + "grad_norm": 0.07675249874591827, + "learning_rate": 2.0266771104342663e-05, + "loss": 2.5646, + "step": 30688 + }, + { + "epoch": 0.9100317290869733, + "grad_norm": 0.08052069693803787, + "learning_rate": 2.025351319275137e-05, + "loss": 2.5786, + "step": 30689 + }, + { + "epoch": 0.9100613824392848, + "grad_norm": 0.07645703107118607, + "learning_rate": 2.0240259529397508e-05, + "loss": 2.5791, + "step": 30690 + }, + { + "epoch": 0.9100910357915962, + "grad_norm": 0.08651558309793472, + "learning_rate": 2.0227010114398537e-05, + "loss": 2.5771, + "step": 30691 + }, + { + "epoch": 0.9101206891439078, + "grad_norm": 0.08115563541650772, + "learning_rate": 2.0213764947871692e-05, + "loss": 2.541, + "step": 30692 + }, + { + "epoch": 0.9101503424962192, + "grad_norm": 0.07788082957267761, + "learning_rate": 2.020052402993433e-05, + "loss": 2.5792, + "step": 30693 + }, + { + "epoch": 0.9101799958485307, + "grad_norm": 0.07517118752002716, + "learning_rate": 2.0187287360703743e-05, + "loss": 2.5785, + "step": 30694 + }, + { + "epoch": 0.9102096492008421, + "grad_norm": 0.07854954898357391, + "learning_rate": 2.017405494029706e-05, + "loss": 2.5622, + "step": 30695 + }, + { + "epoch": 0.9102393025531537, + "grad_norm": 0.08038142323493958, + "learning_rate": 2.0160826768831463e-05, + "loss": 2.5566, + "step": 30696 + }, + { + "epoch": 0.9102689559054651, + "grad_norm": 0.07730031758546829, + "learning_rate": 2.0147602846424085e-05, + "loss": 2.5354, + "step": 30697 + }, + { + "epoch": 0.9102986092577766, + "grad_norm": 0.07349027693271637, + "learning_rate": 2.0134383173192107e-05, + "loss": 2.5624, + "step": 30698 + }, + { + "epoch": 0.910328262610088, + "grad_norm": 0.07398207485675812, + "learning_rate": 2.0121167749252544e-05, + "loss": 2.5342, + "step": 30699 + }, + { + "epoch": 0.9103579159623996, + "grad_norm": 0.08359789103269577, + "learning_rate": 2.010795657472242e-05, + "loss": 2.5584, + "step": 30700 + }, + { + "epoch": 0.910387569314711, + "grad_norm": 0.07544378936290741, + "learning_rate": 2.0094749649718635e-05, + "loss": 2.5614, + "step": 30701 + }, + { + "epoch": 0.9104172226670225, + "grad_norm": 0.07831500470638275, + "learning_rate": 2.0081546974358266e-05, + "loss": 2.5684, + "step": 30702 + }, + { + "epoch": 0.9104468760193339, + "grad_norm": 0.08106128126382828, + "learning_rate": 2.0068348548758162e-05, + "loss": 2.5525, + "step": 30703 + }, + { + "epoch": 0.9104765293716455, + "grad_norm": 0.07991120219230652, + "learning_rate": 2.005515437303518e-05, + "loss": 2.5582, + "step": 30704 + }, + { + "epoch": 0.9105061827239569, + "grad_norm": 0.07584255933761597, + "learning_rate": 2.004196444730616e-05, + "loss": 2.5944, + "step": 30705 + }, + { + "epoch": 0.9105358360762684, + "grad_norm": 0.07571625709533691, + "learning_rate": 2.0028778771688015e-05, + "loss": 2.5393, + "step": 30706 + }, + { + "epoch": 0.91056548942858, + "grad_norm": 0.0781267061829567, + "learning_rate": 2.0015597346297376e-05, + "loss": 2.5975, + "step": 30707 + }, + { + "epoch": 0.9105951427808914, + "grad_norm": 0.07949908822774887, + "learning_rate": 2.0002420171251036e-05, + "loss": 2.5973, + "step": 30708 + }, + { + "epoch": 0.9106247961332029, + "grad_norm": 0.07835934311151505, + "learning_rate": 1.9989247246665678e-05, + "loss": 2.5334, + "step": 30709 + }, + { + "epoch": 0.9106544494855143, + "grad_norm": 0.07346510142087936, + "learning_rate": 1.997607857265793e-05, + "loss": 2.598, + "step": 30710 + }, + { + "epoch": 0.9106841028378259, + "grad_norm": 0.07461003959178925, + "learning_rate": 1.996291414934437e-05, + "loss": 2.5614, + "step": 30711 + }, + { + "epoch": 0.9107137561901373, + "grad_norm": 0.07869991660118103, + "learning_rate": 1.9949753976841568e-05, + "loss": 2.5527, + "step": 30712 + }, + { + "epoch": 0.9107434095424488, + "grad_norm": 0.07439074665307999, + "learning_rate": 1.993659805526615e-05, + "loss": 2.5617, + "step": 30713 + }, + { + "epoch": 0.9107730628947602, + "grad_norm": 0.0712166354060173, + "learning_rate": 1.9923446384734534e-05, + "loss": 2.5722, + "step": 30714 + }, + { + "epoch": 0.9108027162470718, + "grad_norm": 0.07509691268205643, + "learning_rate": 1.991029896536317e-05, + "loss": 2.5883, + "step": 30715 + }, + { + "epoch": 0.9108323695993832, + "grad_norm": 0.07297734171152115, + "learning_rate": 1.9897155797268586e-05, + "loss": 2.5839, + "step": 30716 + }, + { + "epoch": 0.9108620229516947, + "grad_norm": 0.07729621231555939, + "learning_rate": 1.9884016880567014e-05, + "loss": 2.5547, + "step": 30717 + }, + { + "epoch": 0.9108916763040061, + "grad_norm": 0.07647115737199783, + "learning_rate": 1.9870882215374865e-05, + "loss": 2.5769, + "step": 30718 + }, + { + "epoch": 0.9109213296563177, + "grad_norm": 0.07014798372983932, + "learning_rate": 1.9857751801808544e-05, + "loss": 2.5578, + "step": 30719 + }, + { + "epoch": 0.9109509830086291, + "grad_norm": 0.07281938940286636, + "learning_rate": 1.9844625639984293e-05, + "loss": 2.5917, + "step": 30720 + }, + { + "epoch": 0.9109806363609406, + "grad_norm": 0.07220438122749329, + "learning_rate": 1.9831503730018242e-05, + "loss": 2.5487, + "step": 30721 + }, + { + "epoch": 0.911010289713252, + "grad_norm": 0.07608604431152344, + "learning_rate": 1.981838607202663e-05, + "loss": 2.556, + "step": 30722 + }, + { + "epoch": 0.9110399430655636, + "grad_norm": 0.07588830590248108, + "learning_rate": 1.980527266612564e-05, + "loss": 2.5555, + "step": 30723 + }, + { + "epoch": 0.911069596417875, + "grad_norm": 0.07533636689186096, + "learning_rate": 1.9792163512431405e-05, + "loss": 2.5929, + "step": 30724 + }, + { + "epoch": 0.9110992497701865, + "grad_norm": 0.07831113785505295, + "learning_rate": 1.9779058611059943e-05, + "loss": 2.5554, + "step": 30725 + }, + { + "epoch": 0.911128903122498, + "grad_norm": 0.07407443225383759, + "learning_rate": 1.976595796212738e-05, + "loss": 2.5577, + "step": 30726 + }, + { + "epoch": 0.9111585564748095, + "grad_norm": 0.07220295816659927, + "learning_rate": 1.975286156574968e-05, + "loss": 2.5378, + "step": 30727 + }, + { + "epoch": 0.911188209827121, + "grad_norm": 0.07512567192316055, + "learning_rate": 1.9739769422042862e-05, + "loss": 2.5673, + "step": 30728 + }, + { + "epoch": 0.9112178631794324, + "grad_norm": 0.07647518813610077, + "learning_rate": 1.972668153112278e-05, + "loss": 2.5808, + "step": 30729 + }, + { + "epoch": 0.911247516531744, + "grad_norm": 0.07349131256341934, + "learning_rate": 1.9713597893105396e-05, + "loss": 2.5749, + "step": 30730 + }, + { + "epoch": 0.9112771698840554, + "grad_norm": 0.07539472728967667, + "learning_rate": 1.97005185081065e-05, + "loss": 2.551, + "step": 30731 + }, + { + "epoch": 0.9113068232363669, + "grad_norm": 0.07580332458019257, + "learning_rate": 1.9687443376242008e-05, + "loss": 2.5927, + "step": 30732 + }, + { + "epoch": 0.9113364765886783, + "grad_norm": 0.07462262362241745, + "learning_rate": 1.96743724976276e-05, + "loss": 2.5515, + "step": 30733 + }, + { + "epoch": 0.9113661299409899, + "grad_norm": 0.0742051750421524, + "learning_rate": 1.9661305872379075e-05, + "loss": 2.5254, + "step": 30734 + }, + { + "epoch": 0.9113957832933013, + "grad_norm": 0.07910394668579102, + "learning_rate": 1.9648243500612173e-05, + "loss": 2.5586, + "step": 30735 + }, + { + "epoch": 0.9114254366456128, + "grad_norm": 0.07597511261701584, + "learning_rate": 1.963518538244252e-05, + "loss": 2.5608, + "step": 30736 + }, + { + "epoch": 0.9114550899979242, + "grad_norm": 0.07855886965990067, + "learning_rate": 1.9622131517985697e-05, + "loss": 2.6318, + "step": 30737 + }, + { + "epoch": 0.9114847433502358, + "grad_norm": 0.07532983273267746, + "learning_rate": 1.960908190735744e-05, + "loss": 2.5506, + "step": 30738 + }, + { + "epoch": 0.9115143967025472, + "grad_norm": 0.07481840252876282, + "learning_rate": 1.9596036550673156e-05, + "loss": 2.5616, + "step": 30739 + }, + { + "epoch": 0.9115440500548587, + "grad_norm": 0.07402243465185165, + "learning_rate": 1.958299544804848e-05, + "loss": 2.5496, + "step": 30740 + }, + { + "epoch": 0.9115737034071701, + "grad_norm": 0.07487371563911438, + "learning_rate": 1.9569958599598813e-05, + "loss": 2.565, + "step": 30741 + }, + { + "epoch": 0.9116033567594817, + "grad_norm": 0.07581746578216553, + "learning_rate": 1.9556926005439622e-05, + "loss": 2.559, + "step": 30742 + }, + { + "epoch": 0.9116330101117931, + "grad_norm": 0.07406994700431824, + "learning_rate": 1.9543897665686317e-05, + "loss": 2.5629, + "step": 30743 + }, + { + "epoch": 0.9116626634641046, + "grad_norm": 0.07764722406864166, + "learning_rate": 1.9530873580454246e-05, + "loss": 2.5674, + "step": 30744 + }, + { + "epoch": 0.911692316816416, + "grad_norm": 0.07048135250806808, + "learning_rate": 1.9517853749858817e-05, + "loss": 2.5804, + "step": 30745 + }, + { + "epoch": 0.9117219701687276, + "grad_norm": 0.07517565786838531, + "learning_rate": 1.9504838174015215e-05, + "loss": 2.5581, + "step": 30746 + }, + { + "epoch": 0.911751623521039, + "grad_norm": 0.07181081920862198, + "learning_rate": 1.9491826853038797e-05, + "loss": 2.5371, + "step": 30747 + }, + { + "epoch": 0.9117812768733505, + "grad_norm": 0.07880154997110367, + "learning_rate": 1.9478819787044687e-05, + "loss": 2.5721, + "step": 30748 + }, + { + "epoch": 0.9118109302256621, + "grad_norm": 0.07020591199398041, + "learning_rate": 1.946581697614813e-05, + "loss": 2.5852, + "step": 30749 + }, + { + "epoch": 0.9118405835779735, + "grad_norm": 0.0718943253159523, + "learning_rate": 1.945281842046426e-05, + "loss": 2.5537, + "step": 30750 + }, + { + "epoch": 0.911870236930285, + "grad_norm": 0.07679470628499985, + "learning_rate": 1.943982412010814e-05, + "loss": 2.5758, + "step": 30751 + }, + { + "epoch": 0.9118998902825964, + "grad_norm": 0.08386798202991486, + "learning_rate": 1.9426834075194853e-05, + "loss": 2.5681, + "step": 30752 + }, + { + "epoch": 0.911929543634908, + "grad_norm": 0.07827074080705643, + "learning_rate": 1.9413848285839476e-05, + "loss": 2.5676, + "step": 30753 + }, + { + "epoch": 0.9119591969872194, + "grad_norm": 0.07091552764177322, + "learning_rate": 1.940086675215702e-05, + "loss": 2.5925, + "step": 30754 + }, + { + "epoch": 0.9119888503395309, + "grad_norm": 0.08277253806591034, + "learning_rate": 1.9387889474262286e-05, + "loss": 2.5601, + "step": 30755 + }, + { + "epoch": 0.9120185036918423, + "grad_norm": 0.07292837649583817, + "learning_rate": 1.9374916452270352e-05, + "loss": 2.5293, + "step": 30756 + }, + { + "epoch": 0.9120481570441539, + "grad_norm": 0.07456078380346298, + "learning_rate": 1.93619476862959e-05, + "loss": 2.583, + "step": 30757 + }, + { + "epoch": 0.9120778103964653, + "grad_norm": 0.07976004481315613, + "learning_rate": 1.9348983176454006e-05, + "loss": 2.5876, + "step": 30758 + }, + { + "epoch": 0.9121074637487768, + "grad_norm": 0.0799744725227356, + "learning_rate": 1.9336022922859353e-05, + "loss": 2.547, + "step": 30759 + }, + { + "epoch": 0.9121371171010882, + "grad_norm": 0.07817602157592773, + "learning_rate": 1.932306692562674e-05, + "loss": 2.5324, + "step": 30760 + }, + { + "epoch": 0.9121667704533998, + "grad_norm": 0.07535701245069504, + "learning_rate": 1.9310115184870857e-05, + "loss": 2.5258, + "step": 30761 + }, + { + "epoch": 0.9121964238057112, + "grad_norm": 0.07472462952136993, + "learning_rate": 1.9297167700706385e-05, + "loss": 2.5693, + "step": 30762 + }, + { + "epoch": 0.9122260771580227, + "grad_norm": 0.07254303991794586, + "learning_rate": 1.9284224473248068e-05, + "loss": 2.5406, + "step": 30763 + }, + { + "epoch": 0.9122557305103342, + "grad_norm": 0.0780973955988884, + "learning_rate": 1.9271285502610423e-05, + "loss": 2.5989, + "step": 30764 + }, + { + "epoch": 0.9122853838626457, + "grad_norm": 0.07611869275569916, + "learning_rate": 1.9258350788908142e-05, + "loss": 2.5462, + "step": 30765 + }, + { + "epoch": 0.9123150372149571, + "grad_norm": 0.07736316323280334, + "learning_rate": 1.924542033225557e-05, + "loss": 2.5596, + "step": 30766 + }, + { + "epoch": 0.9123446905672686, + "grad_norm": 0.07944692671298981, + "learning_rate": 1.9232494132767342e-05, + "loss": 2.6107, + "step": 30767 + }, + { + "epoch": 0.91237434391958, + "grad_norm": 0.07819093763828278, + "learning_rate": 1.9219572190557922e-05, + "loss": 2.59, + "step": 30768 + }, + { + "epoch": 0.9124039972718916, + "grad_norm": 0.0745905265212059, + "learning_rate": 1.9206654505741717e-05, + "loss": 2.5274, + "step": 30769 + }, + { + "epoch": 0.9124336506242031, + "grad_norm": 0.07503267377614975, + "learning_rate": 1.9193741078433026e-05, + "loss": 2.554, + "step": 30770 + }, + { + "epoch": 0.9124633039765145, + "grad_norm": 0.07360316812992096, + "learning_rate": 1.9180831908746364e-05, + "loss": 2.5704, + "step": 30771 + }, + { + "epoch": 0.9124929573288261, + "grad_norm": 0.0787699893116951, + "learning_rate": 1.916792699679598e-05, + "loss": 2.5715, + "step": 30772 + }, + { + "epoch": 0.9125226106811375, + "grad_norm": 0.07295161485671997, + "learning_rate": 1.9155026342696113e-05, + "loss": 2.5467, + "step": 30773 + }, + { + "epoch": 0.912552264033449, + "grad_norm": 0.07249525189399719, + "learning_rate": 1.914212994656106e-05, + "loss": 2.5752, + "step": 30774 + }, + { + "epoch": 0.9125819173857604, + "grad_norm": 0.07329729199409485, + "learning_rate": 1.9129237808505007e-05, + "loss": 2.5612, + "step": 30775 + }, + { + "epoch": 0.912611570738072, + "grad_norm": 0.07404734194278717, + "learning_rate": 1.9116349928642084e-05, + "loss": 2.5435, + "step": 30776 + }, + { + "epoch": 0.9126412240903834, + "grad_norm": 0.07989870756864548, + "learning_rate": 1.910346630708637e-05, + "loss": 2.5407, + "step": 30777 + }, + { + "epoch": 0.9126708774426949, + "grad_norm": 0.07656195759773254, + "learning_rate": 1.9090586943952048e-05, + "loss": 2.5842, + "step": 30778 + }, + { + "epoch": 0.9127005307950063, + "grad_norm": 0.07518458366394043, + "learning_rate": 1.9077711839353085e-05, + "loss": 2.5919, + "step": 30779 + }, + { + "epoch": 0.9127301841473179, + "grad_norm": 0.07349738478660583, + "learning_rate": 1.9064840993403554e-05, + "loss": 2.5977, + "step": 30780 + }, + { + "epoch": 0.9127598374996293, + "grad_norm": 0.07845134288072586, + "learning_rate": 1.9051974406217366e-05, + "loss": 2.557, + "step": 30781 + }, + { + "epoch": 0.9127894908519408, + "grad_norm": 0.07482367008924484, + "learning_rate": 1.9039112077908537e-05, + "loss": 2.5547, + "step": 30782 + }, + { + "epoch": 0.9128191442042523, + "grad_norm": 0.07415144145488739, + "learning_rate": 1.902625400859087e-05, + "loss": 2.551, + "step": 30783 + }, + { + "epoch": 0.9128487975565638, + "grad_norm": 0.07179775089025497, + "learning_rate": 1.9013400198378382e-05, + "loss": 2.5722, + "step": 30784 + }, + { + "epoch": 0.9128784509088752, + "grad_norm": 0.07680188119411469, + "learning_rate": 1.9000550647384763e-05, + "loss": 2.5485, + "step": 30785 + }, + { + "epoch": 0.9129081042611867, + "grad_norm": 0.0718844085931778, + "learning_rate": 1.898770535572386e-05, + "loss": 2.5729, + "step": 30786 + }, + { + "epoch": 0.9129377576134982, + "grad_norm": 0.07157532125711441, + "learning_rate": 1.897486432350931e-05, + "loss": 2.5662, + "step": 30787 + }, + { + "epoch": 0.9129674109658097, + "grad_norm": 0.07592303305864334, + "learning_rate": 1.8962027550854965e-05, + "loss": 2.5436, + "step": 30788 + }, + { + "epoch": 0.9129970643181211, + "grad_norm": 0.07544118911027908, + "learning_rate": 1.8949195037874402e-05, + "loss": 2.5615, + "step": 30789 + }, + { + "epoch": 0.9130267176704326, + "grad_norm": 0.07573064416646957, + "learning_rate": 1.89363667846813e-05, + "loss": 2.5802, + "step": 30790 + }, + { + "epoch": 0.9130563710227442, + "grad_norm": 0.07492867112159729, + "learning_rate": 1.8923542791389246e-05, + "loss": 2.5468, + "step": 30791 + }, + { + "epoch": 0.9130860243750556, + "grad_norm": 0.07872077077627182, + "learning_rate": 1.891072305811181e-05, + "loss": 2.5535, + "step": 30792 + }, + { + "epoch": 0.9131156777273671, + "grad_norm": 0.07354041188955307, + "learning_rate": 1.889790758496246e-05, + "loss": 2.5503, + "step": 30793 + }, + { + "epoch": 0.9131453310796785, + "grad_norm": 0.07631231099367142, + "learning_rate": 1.8885096372054766e-05, + "loss": 2.5682, + "step": 30794 + }, + { + "epoch": 0.9131749844319901, + "grad_norm": 0.0721110999584198, + "learning_rate": 1.8872289419502085e-05, + "loss": 2.5508, + "step": 30795 + }, + { + "epoch": 0.9132046377843015, + "grad_norm": 0.07684530317783356, + "learning_rate": 1.8859486727417885e-05, + "loss": 2.5619, + "step": 30796 + }, + { + "epoch": 0.913234291136613, + "grad_norm": 0.07695397734642029, + "learning_rate": 1.8846688295915515e-05, + "loss": 2.6033, + "step": 30797 + }, + { + "epoch": 0.9132639444889245, + "grad_norm": 0.07021850347518921, + "learning_rate": 1.8833894125108274e-05, + "loss": 2.5786, + "step": 30798 + }, + { + "epoch": 0.913293597841236, + "grad_norm": 0.07294157892465591, + "learning_rate": 1.882110421510952e-05, + "loss": 2.5801, + "step": 30799 + }, + { + "epoch": 0.9133232511935474, + "grad_norm": 0.07475097477436066, + "learning_rate": 1.880831856603249e-05, + "loss": 2.5934, + "step": 30800 + }, + { + "epoch": 0.9133529045458589, + "grad_norm": 0.08132854104042053, + "learning_rate": 1.879553717799043e-05, + "loss": 2.5741, + "step": 30801 + }, + { + "epoch": 0.9133825578981704, + "grad_norm": 0.0737188458442688, + "learning_rate": 1.8782760051096415e-05, + "loss": 2.559, + "step": 30802 + }, + { + "epoch": 0.9134122112504819, + "grad_norm": 0.07308529317378998, + "learning_rate": 1.8769987185463687e-05, + "loss": 2.5479, + "step": 30803 + }, + { + "epoch": 0.9134418646027933, + "grad_norm": 0.08126156032085419, + "learning_rate": 1.8757218581205328e-05, + "loss": 2.5805, + "step": 30804 + }, + { + "epoch": 0.9134715179551048, + "grad_norm": 0.07749639451503754, + "learning_rate": 1.8744454238434405e-05, + "loss": 2.5675, + "step": 30805 + }, + { + "epoch": 0.9135011713074163, + "grad_norm": 0.07179808616638184, + "learning_rate": 1.8731694157263944e-05, + "loss": 2.5674, + "step": 30806 + }, + { + "epoch": 0.9135308246597278, + "grad_norm": 0.07616424560546875, + "learning_rate": 1.8718938337806967e-05, + "loss": 2.5687, + "step": 30807 + }, + { + "epoch": 0.9135604780120392, + "grad_norm": 0.07729604095220566, + "learning_rate": 1.870618678017638e-05, + "loss": 2.5555, + "step": 30808 + }, + { + "epoch": 0.9135901313643507, + "grad_norm": 0.0769987478852272, + "learning_rate": 1.86934394844851e-05, + "loss": 2.6014, + "step": 30809 + }, + { + "epoch": 0.9136197847166622, + "grad_norm": 0.07698698341846466, + "learning_rate": 1.8680696450846023e-05, + "loss": 2.5786, + "step": 30810 + }, + { + "epoch": 0.9136494380689737, + "grad_norm": 0.0749945342540741, + "learning_rate": 1.8667957679372015e-05, + "loss": 2.5747, + "step": 30811 + }, + { + "epoch": 0.9136790914212852, + "grad_norm": 0.07357234507799149, + "learning_rate": 1.865522317017587e-05, + "loss": 2.6023, + "step": 30812 + }, + { + "epoch": 0.9137087447735966, + "grad_norm": 0.07620242238044739, + "learning_rate": 1.8642492923370336e-05, + "loss": 2.5696, + "step": 30813 + }, + { + "epoch": 0.9137383981259082, + "grad_norm": 0.07576481252908707, + "learning_rate": 1.8629766939068206e-05, + "loss": 2.5827, + "step": 30814 + }, + { + "epoch": 0.9137680514782196, + "grad_norm": 0.0772973895072937, + "learning_rate": 1.8617045217382056e-05, + "loss": 2.5635, + "step": 30815 + }, + { + "epoch": 0.9137977048305311, + "grad_norm": 0.07255716621875763, + "learning_rate": 1.8604327758424578e-05, + "loss": 2.5222, + "step": 30816 + }, + { + "epoch": 0.9138273581828426, + "grad_norm": 0.0750199407339096, + "learning_rate": 1.8591614562308458e-05, + "loss": 2.5602, + "step": 30817 + }, + { + "epoch": 0.9138570115351541, + "grad_norm": 0.06841602921485901, + "learning_rate": 1.8578905629146213e-05, + "loss": 2.5477, + "step": 30818 + }, + { + "epoch": 0.9138866648874655, + "grad_norm": 0.07653667032718658, + "learning_rate": 1.8566200959050373e-05, + "loss": 2.5546, + "step": 30819 + }, + { + "epoch": 0.913916318239777, + "grad_norm": 0.07139138132333755, + "learning_rate": 1.8553500552133505e-05, + "loss": 2.5507, + "step": 30820 + }, + { + "epoch": 0.9139459715920885, + "grad_norm": 0.07438499480485916, + "learning_rate": 1.8540804408508027e-05, + "loss": 2.5571, + "step": 30821 + }, + { + "epoch": 0.9139756249444, + "grad_norm": 0.07602871209383011, + "learning_rate": 1.852811252828629e-05, + "loss": 2.5655, + "step": 30822 + }, + { + "epoch": 0.9140052782967114, + "grad_norm": 0.07283969223499298, + "learning_rate": 1.8515424911580813e-05, + "loss": 2.5806, + "step": 30823 + }, + { + "epoch": 0.9140349316490229, + "grad_norm": 0.07138428837060928, + "learning_rate": 1.8502741558503842e-05, + "loss": 2.544, + "step": 30824 + }, + { + "epoch": 0.9140645850013344, + "grad_norm": 0.0734315812587738, + "learning_rate": 1.849006246916779e-05, + "loss": 2.5864, + "step": 30825 + }, + { + "epoch": 0.9140942383536459, + "grad_norm": 0.07286344468593597, + "learning_rate": 1.8477387643684895e-05, + "loss": 2.5795, + "step": 30826 + }, + { + "epoch": 0.9141238917059573, + "grad_norm": 0.07881774753332138, + "learning_rate": 1.8464717082167403e-05, + "loss": 2.5539, + "step": 30827 + }, + { + "epoch": 0.9141535450582688, + "grad_norm": 0.07500877231359482, + "learning_rate": 1.845205078472745e-05, + "loss": 2.5518, + "step": 30828 + }, + { + "epoch": 0.9141831984105803, + "grad_norm": 0.07239436358213425, + "learning_rate": 1.8439388751477272e-05, + "loss": 2.5691, + "step": 30829 + }, + { + "epoch": 0.9142128517628918, + "grad_norm": 0.06913107633590698, + "learning_rate": 1.8426730982529006e-05, + "loss": 2.5939, + "step": 30830 + }, + { + "epoch": 0.9142425051152032, + "grad_norm": 0.07313421368598938, + "learning_rate": 1.8414077477994618e-05, + "loss": 2.5393, + "step": 30831 + }, + { + "epoch": 0.9142721584675148, + "grad_norm": 0.07029549032449722, + "learning_rate": 1.8401428237986297e-05, + "loss": 2.5573, + "step": 30832 + }, + { + "epoch": 0.9143018118198263, + "grad_norm": 0.07185235619544983, + "learning_rate": 1.8388783262615948e-05, + "loss": 2.563, + "step": 30833 + }, + { + "epoch": 0.9143314651721377, + "grad_norm": 0.0708225667476654, + "learning_rate": 1.83761425519956e-05, + "loss": 2.5488, + "step": 30834 + }, + { + "epoch": 0.9143611185244492, + "grad_norm": 0.0720193013548851, + "learning_rate": 1.8363506106237106e-05, + "loss": 2.5792, + "step": 30835 + }, + { + "epoch": 0.9143907718767607, + "grad_norm": 0.0708811804652214, + "learning_rate": 1.835087392545254e-05, + "loss": 2.5645, + "step": 30836 + }, + { + "epoch": 0.9144204252290722, + "grad_norm": 0.07244853675365448, + "learning_rate": 1.8338246009753645e-05, + "loss": 2.5709, + "step": 30837 + }, + { + "epoch": 0.9144500785813836, + "grad_norm": 0.07112348079681396, + "learning_rate": 1.8325622359252226e-05, + "loss": 2.5213, + "step": 30838 + }, + { + "epoch": 0.9144797319336951, + "grad_norm": 0.07550410181283951, + "learning_rate": 1.8313002974060135e-05, + "loss": 2.6054, + "step": 30839 + }, + { + "epoch": 0.9145093852860066, + "grad_norm": 0.07545573264360428, + "learning_rate": 1.8300387854289058e-05, + "loss": 2.5917, + "step": 30840 + }, + { + "epoch": 0.9145390386383181, + "grad_norm": 0.07305021584033966, + "learning_rate": 1.8287777000050797e-05, + "loss": 2.5642, + "step": 30841 + }, + { + "epoch": 0.9145686919906295, + "grad_norm": 0.07694476842880249, + "learning_rate": 1.8275170411456875e-05, + "loss": 2.589, + "step": 30842 + }, + { + "epoch": 0.914598345342941, + "grad_norm": 0.07435674220323563, + "learning_rate": 1.8262568088619036e-05, + "loss": 2.5956, + "step": 30843 + }, + { + "epoch": 0.9146279986952525, + "grad_norm": 0.06915280967950821, + "learning_rate": 1.8249970031648855e-05, + "loss": 2.5413, + "step": 30844 + }, + { + "epoch": 0.914657652047564, + "grad_norm": 0.0711841881275177, + "learning_rate": 1.8237376240657856e-05, + "loss": 2.553, + "step": 30845 + }, + { + "epoch": 0.9146873053998754, + "grad_norm": 0.07473523914813995, + "learning_rate": 1.8224786715757613e-05, + "loss": 2.5486, + "step": 30846 + }, + { + "epoch": 0.914716958752187, + "grad_norm": 0.06859926879405975, + "learning_rate": 1.8212201457059542e-05, + "loss": 2.5642, + "step": 30847 + }, + { + "epoch": 0.9147466121044984, + "grad_norm": 0.07113750278949738, + "learning_rate": 1.8199620464675105e-05, + "loss": 2.5633, + "step": 30848 + }, + { + "epoch": 0.9147762654568099, + "grad_norm": 0.07201337814331055, + "learning_rate": 1.8187043738715768e-05, + "loss": 2.6128, + "step": 30849 + }, + { + "epoch": 0.9148059188091213, + "grad_norm": 0.06909092515707016, + "learning_rate": 1.8174471279292835e-05, + "loss": 2.5649, + "step": 30850 + }, + { + "epoch": 0.9148355721614329, + "grad_norm": 0.06998516619205475, + "learning_rate": 1.8161903086517773e-05, + "loss": 2.5847, + "step": 30851 + }, + { + "epoch": 0.9148652255137443, + "grad_norm": 0.07152152061462402, + "learning_rate": 1.8149339160501653e-05, + "loss": 2.5632, + "step": 30852 + }, + { + "epoch": 0.9148948788660558, + "grad_norm": 0.07277210801839828, + "learning_rate": 1.8136779501355893e-05, + "loss": 2.5779, + "step": 30853 + }, + { + "epoch": 0.9149245322183673, + "grad_norm": 0.06897364556789398, + "learning_rate": 1.812422410919162e-05, + "loss": 2.551, + "step": 30854 + }, + { + "epoch": 0.9149541855706788, + "grad_norm": 0.0697932317852974, + "learning_rate": 1.8111672984120088e-05, + "loss": 2.6084, + "step": 30855 + }, + { + "epoch": 0.9149838389229903, + "grad_norm": 0.07102685421705246, + "learning_rate": 1.8099126126252363e-05, + "loss": 2.5802, + "step": 30856 + }, + { + "epoch": 0.9150134922753017, + "grad_norm": 0.06974713504314423, + "learning_rate": 1.8086583535699642e-05, + "loss": 2.5668, + "step": 30857 + }, + { + "epoch": 0.9150431456276132, + "grad_norm": 0.0734783485531807, + "learning_rate": 1.8074045212572943e-05, + "loss": 2.5571, + "step": 30858 + }, + { + "epoch": 0.9150727989799247, + "grad_norm": 0.07103338837623596, + "learning_rate": 1.8061511156983235e-05, + "loss": 2.5178, + "step": 30859 + }, + { + "epoch": 0.9151024523322362, + "grad_norm": 0.07038478553295135, + "learning_rate": 1.8048981369041594e-05, + "loss": 2.561, + "step": 30860 + }, + { + "epoch": 0.9151321056845476, + "grad_norm": 0.07367251813411713, + "learning_rate": 1.803645584885899e-05, + "loss": 2.5498, + "step": 30861 + }, + { + "epoch": 0.9151617590368591, + "grad_norm": 0.07129838317632675, + "learning_rate": 1.8023934596546275e-05, + "loss": 2.5549, + "step": 30862 + }, + { + "epoch": 0.9151914123891706, + "grad_norm": 0.07207486033439636, + "learning_rate": 1.8011417612214365e-05, + "loss": 2.5838, + "step": 30863 + }, + { + "epoch": 0.9152210657414821, + "grad_norm": 0.07271672785282135, + "learning_rate": 1.7998904895974056e-05, + "loss": 2.5996, + "step": 30864 + }, + { + "epoch": 0.9152507190937935, + "grad_norm": 0.07250677794218063, + "learning_rate": 1.7986396447936203e-05, + "loss": 2.5543, + "step": 30865 + }, + { + "epoch": 0.915280372446105, + "grad_norm": 0.07222386449575424, + "learning_rate": 1.79738922682115e-05, + "loss": 2.5486, + "step": 30866 + }, + { + "epoch": 0.9153100257984165, + "grad_norm": 0.07268941402435303, + "learning_rate": 1.7961392356910745e-05, + "loss": 2.5831, + "step": 30867 + }, + { + "epoch": 0.915339679150728, + "grad_norm": 0.07112392783164978, + "learning_rate": 1.7948896714144624e-05, + "loss": 2.5674, + "step": 30868 + }, + { + "epoch": 0.9153693325030394, + "grad_norm": 0.07405398041009903, + "learning_rate": 1.793640534002372e-05, + "loss": 2.5535, + "step": 30869 + }, + { + "epoch": 0.915398985855351, + "grad_norm": 0.07254408299922943, + "learning_rate": 1.792391823465872e-05, + "loss": 2.5568, + "step": 30870 + }, + { + "epoch": 0.9154286392076624, + "grad_norm": 0.07049674540758133, + "learning_rate": 1.7911435398160202e-05, + "loss": 2.5638, + "step": 30871 + }, + { + "epoch": 0.9154582925599739, + "grad_norm": 0.07083026319742203, + "learning_rate": 1.7898956830638634e-05, + "loss": 2.5597, + "step": 30872 + }, + { + "epoch": 0.9154879459122854, + "grad_norm": 0.0719439908862114, + "learning_rate": 1.7886482532204597e-05, + "loss": 2.573, + "step": 30873 + }, + { + "epoch": 0.9155175992645969, + "grad_norm": 0.07849986106157303, + "learning_rate": 1.787401250296844e-05, + "loss": 2.5838, + "step": 30874 + }, + { + "epoch": 0.9155472526169084, + "grad_norm": 0.07039907574653625, + "learning_rate": 1.786154674304069e-05, + "loss": 2.5589, + "step": 30875 + }, + { + "epoch": 0.9155769059692198, + "grad_norm": 0.07363362610340118, + "learning_rate": 1.7849085252531707e-05, + "loss": 2.541, + "step": 30876 + }, + { + "epoch": 0.9156065593215313, + "grad_norm": 0.07343807816505432, + "learning_rate": 1.783662803155184e-05, + "loss": 2.5672, + "step": 30877 + }, + { + "epoch": 0.9156362126738428, + "grad_norm": 0.07300961762666702, + "learning_rate": 1.782417508021139e-05, + "loss": 2.5767, + "step": 30878 + }, + { + "epoch": 0.9156658660261543, + "grad_norm": 0.07261307537555695, + "learning_rate": 1.7811726398620666e-05, + "loss": 2.5493, + "step": 30879 + }, + { + "epoch": 0.9156955193784657, + "grad_norm": 0.07490301877260208, + "learning_rate": 1.779928198688979e-05, + "loss": 2.5534, + "step": 30880 + }, + { + "epoch": 0.9157251727307772, + "grad_norm": 0.07208778709173203, + "learning_rate": 1.778684184512913e-05, + "loss": 2.569, + "step": 30881 + }, + { + "epoch": 0.9157548260830887, + "grad_norm": 0.07337155193090439, + "learning_rate": 1.7774405973448706e-05, + "loss": 2.5735, + "step": 30882 + }, + { + "epoch": 0.9157844794354002, + "grad_norm": 0.07268473505973816, + "learning_rate": 1.77619743719587e-05, + "loss": 2.5676, + "step": 30883 + }, + { + "epoch": 0.9158141327877116, + "grad_norm": 0.08182284235954285, + "learning_rate": 1.77495470407692e-05, + "loss": 2.5931, + "step": 30884 + }, + { + "epoch": 0.9158437861400232, + "grad_norm": 0.07547035068273544, + "learning_rate": 1.773712397999028e-05, + "loss": 2.5827, + "step": 30885 + }, + { + "epoch": 0.9158734394923346, + "grad_norm": 0.06946563720703125, + "learning_rate": 1.7724705189731792e-05, + "loss": 2.573, + "step": 30886 + }, + { + "epoch": 0.9159030928446461, + "grad_norm": 0.075364850461483, + "learning_rate": 1.771229067010388e-05, + "loss": 2.5388, + "step": 30887 + }, + { + "epoch": 0.9159327461969575, + "grad_norm": 0.07411925494670868, + "learning_rate": 1.7699880421216398e-05, + "loss": 2.5459, + "step": 30888 + }, + { + "epoch": 0.9159623995492691, + "grad_norm": 0.07322216778993607, + "learning_rate": 1.768747444317925e-05, + "loss": 2.5919, + "step": 30889 + }, + { + "epoch": 0.9159920529015805, + "grad_norm": 0.07617821544408798, + "learning_rate": 1.767507273610236e-05, + "loss": 2.6087, + "step": 30890 + }, + { + "epoch": 0.916021706253892, + "grad_norm": 0.0710507482290268, + "learning_rate": 1.7662675300095467e-05, + "loss": 2.5777, + "step": 30891 + }, + { + "epoch": 0.9160513596062034, + "grad_norm": 0.06989865005016327, + "learning_rate": 1.765028213526837e-05, + "loss": 2.557, + "step": 30892 + }, + { + "epoch": 0.916081012958515, + "grad_norm": 0.07340792566537857, + "learning_rate": 1.763789324173082e-05, + "loss": 2.5745, + "step": 30893 + }, + { + "epoch": 0.9161106663108265, + "grad_norm": 0.07284864038228989, + "learning_rate": 1.76255086195925e-05, + "loss": 2.6037, + "step": 30894 + }, + { + "epoch": 0.9161403196631379, + "grad_norm": 0.07301656156778336, + "learning_rate": 1.7613128268963165e-05, + "loss": 2.5703, + "step": 30895 + }, + { + "epoch": 0.9161699730154494, + "grad_norm": 0.07113362103700638, + "learning_rate": 1.7600752189952385e-05, + "loss": 2.5728, + "step": 30896 + }, + { + "epoch": 0.9161996263677609, + "grad_norm": 0.07241018116474152, + "learning_rate": 1.758838038266969e-05, + "loss": 2.5364, + "step": 30897 + }, + { + "epoch": 0.9162292797200724, + "grad_norm": 0.07185453176498413, + "learning_rate": 1.757601284722471e-05, + "loss": 2.5324, + "step": 30898 + }, + { + "epoch": 0.9162589330723838, + "grad_norm": 0.07467161864042282, + "learning_rate": 1.756364958372686e-05, + "loss": 2.5781, + "step": 30899 + }, + { + "epoch": 0.9162885864246954, + "grad_norm": 0.07280628383159637, + "learning_rate": 1.7551290592285774e-05, + "loss": 2.5171, + "step": 30900 + }, + { + "epoch": 0.9163182397770068, + "grad_norm": 0.0745832547545433, + "learning_rate": 1.7538935873010863e-05, + "loss": 2.5514, + "step": 30901 + }, + { + "epoch": 0.9163478931293183, + "grad_norm": 0.07039618492126465, + "learning_rate": 1.752658542601143e-05, + "loss": 2.572, + "step": 30902 + }, + { + "epoch": 0.9163775464816297, + "grad_norm": 0.06983225047588348, + "learning_rate": 1.751423925139689e-05, + "loss": 2.5736, + "step": 30903 + }, + { + "epoch": 0.9164071998339413, + "grad_norm": 0.07708969712257385, + "learning_rate": 1.7501897349276653e-05, + "loss": 2.5499, + "step": 30904 + }, + { + "epoch": 0.9164368531862527, + "grad_norm": 0.07162386924028397, + "learning_rate": 1.748955971975985e-05, + "loss": 2.5681, + "step": 30905 + }, + { + "epoch": 0.9164665065385642, + "grad_norm": 0.0731382742524147, + "learning_rate": 1.7477226362955956e-05, + "loss": 2.5333, + "step": 30906 + }, + { + "epoch": 0.9164961598908756, + "grad_norm": 0.07402731478214264, + "learning_rate": 1.7464897278973935e-05, + "loss": 2.5823, + "step": 30907 + }, + { + "epoch": 0.9165258132431872, + "grad_norm": 0.06970434635877609, + "learning_rate": 1.745257246792309e-05, + "loss": 2.5552, + "step": 30908 + }, + { + "epoch": 0.9165554665954986, + "grad_norm": 0.07317665964365005, + "learning_rate": 1.7440251929912498e-05, + "loss": 2.578, + "step": 30909 + }, + { + "epoch": 0.9165851199478101, + "grad_norm": 0.07356256991624832, + "learning_rate": 1.7427935665051353e-05, + "loss": 2.5727, + "step": 30910 + }, + { + "epoch": 0.9166147733001215, + "grad_norm": 0.07336890697479248, + "learning_rate": 1.741562367344868e-05, + "loss": 2.589, + "step": 30911 + }, + { + "epoch": 0.9166444266524331, + "grad_norm": 0.07262039184570312, + "learning_rate": 1.7403315955213438e-05, + "loss": 2.5761, + "step": 30912 + }, + { + "epoch": 0.9166740800047445, + "grad_norm": 0.07380321621894836, + "learning_rate": 1.739101251045472e-05, + "loss": 2.5669, + "step": 30913 + }, + { + "epoch": 0.916703733357056, + "grad_norm": 0.06703846901655197, + "learning_rate": 1.737871333928137e-05, + "loss": 2.5458, + "step": 30914 + }, + { + "epoch": 0.9167333867093675, + "grad_norm": 0.07069145143032074, + "learning_rate": 1.7366418441802424e-05, + "loss": 2.5695, + "step": 30915 + }, + { + "epoch": 0.916763040061679, + "grad_norm": 0.07472551614046097, + "learning_rate": 1.7354127818126675e-05, + "loss": 2.5857, + "step": 30916 + }, + { + "epoch": 0.9167926934139905, + "grad_norm": 0.07750533521175385, + "learning_rate": 1.7341841468362985e-05, + "loss": 2.5653, + "step": 30917 + }, + { + "epoch": 0.9168223467663019, + "grad_norm": 0.07240626215934753, + "learning_rate": 1.7329559392620153e-05, + "loss": 2.5677, + "step": 30918 + }, + { + "epoch": 0.9168520001186135, + "grad_norm": 0.07621728628873825, + "learning_rate": 1.7317281591006874e-05, + "loss": 2.56, + "step": 30919 + }, + { + "epoch": 0.9168816534709249, + "grad_norm": 0.07418561726808548, + "learning_rate": 1.730500806363189e-05, + "loss": 2.5327, + "step": 30920 + }, + { + "epoch": 0.9169113068232364, + "grad_norm": 0.07381278276443481, + "learning_rate": 1.7292738810603946e-05, + "loss": 2.5699, + "step": 30921 + }, + { + "epoch": 0.9169409601755478, + "grad_norm": 0.07276619970798492, + "learning_rate": 1.728047383203163e-05, + "loss": 2.5638, + "step": 30922 + }, + { + "epoch": 0.9169706135278594, + "grad_norm": 0.07094889134168625, + "learning_rate": 1.7268213128023623e-05, + "loss": 2.541, + "step": 30923 + }, + { + "epoch": 0.9170002668801708, + "grad_norm": 0.0683925598859787, + "learning_rate": 1.7255956698688403e-05, + "loss": 2.5491, + "step": 30924 + }, + { + "epoch": 0.9170299202324823, + "grad_norm": 0.06934070587158203, + "learning_rate": 1.724370454413454e-05, + "loss": 2.5824, + "step": 30925 + }, + { + "epoch": 0.9170595735847937, + "grad_norm": 0.07391122728586197, + "learning_rate": 1.723145666447057e-05, + "loss": 2.5742, + "step": 30926 + }, + { + "epoch": 0.9170892269371053, + "grad_norm": 0.07598850876092911, + "learning_rate": 1.721921305980495e-05, + "loss": 2.5606, + "step": 30927 + }, + { + "epoch": 0.9171188802894167, + "grad_norm": 0.07527074217796326, + "learning_rate": 1.7206973730246046e-05, + "loss": 2.572, + "step": 30928 + }, + { + "epoch": 0.9171485336417282, + "grad_norm": 0.06887569278478622, + "learning_rate": 1.7194738675902267e-05, + "loss": 2.5712, + "step": 30929 + }, + { + "epoch": 0.9171781869940396, + "grad_norm": 0.07496706396341324, + "learning_rate": 1.7182507896881916e-05, + "loss": 2.5674, + "step": 30930 + }, + { + "epoch": 0.9172078403463512, + "grad_norm": 0.07274512946605682, + "learning_rate": 1.717028139329335e-05, + "loss": 2.5486, + "step": 30931 + }, + { + "epoch": 0.9172374936986626, + "grad_norm": 0.07146900147199631, + "learning_rate": 1.7158059165244766e-05, + "loss": 2.5614, + "step": 30932 + }, + { + "epoch": 0.9172671470509741, + "grad_norm": 0.07078509032726288, + "learning_rate": 1.7145841212844516e-05, + "loss": 2.5308, + "step": 30933 + }, + { + "epoch": 0.9172968004032855, + "grad_norm": 0.07264876365661621, + "learning_rate": 1.713362753620068e-05, + "loss": 2.5639, + "step": 30934 + }, + { + "epoch": 0.9173264537555971, + "grad_norm": 0.07399129867553711, + "learning_rate": 1.7121418135421508e-05, + "loss": 2.5693, + "step": 30935 + }, + { + "epoch": 0.9173561071079086, + "grad_norm": 0.07476401329040527, + "learning_rate": 1.710921301061502e-05, + "loss": 2.583, + "step": 30936 + }, + { + "epoch": 0.91738576046022, + "grad_norm": 0.07080378383398056, + "learning_rate": 1.7097012161889357e-05, + "loss": 2.5515, + "step": 30937 + }, + { + "epoch": 0.9174154138125316, + "grad_norm": 0.07610952109098434, + "learning_rate": 1.7084815589352542e-05, + "loss": 2.5829, + "step": 30938 + }, + { + "epoch": 0.917445067164843, + "grad_norm": 0.0705590695142746, + "learning_rate": 1.7072623293112542e-05, + "loss": 2.5506, + "step": 30939 + }, + { + "epoch": 0.9174747205171545, + "grad_norm": 0.07165753841400146, + "learning_rate": 1.7060435273277385e-05, + "loss": 2.5803, + "step": 30940 + }, + { + "epoch": 0.9175043738694659, + "grad_norm": 0.07252223044633865, + "learning_rate": 1.7048251529954983e-05, + "loss": 2.5734, + "step": 30941 + }, + { + "epoch": 0.9175340272217775, + "grad_norm": 0.07302665710449219, + "learning_rate": 1.7036072063253193e-05, + "loss": 2.5804, + "step": 30942 + }, + { + "epoch": 0.9175636805740889, + "grad_norm": 0.07317373901605606, + "learning_rate": 1.7023896873279876e-05, + "loss": 2.5924, + "step": 30943 + }, + { + "epoch": 0.9175933339264004, + "grad_norm": 0.0724177435040474, + "learning_rate": 1.701172596014283e-05, + "loss": 2.5692, + "step": 30944 + }, + { + "epoch": 0.9176229872787118, + "grad_norm": 0.07146614789962769, + "learning_rate": 1.699955932394992e-05, + "loss": 2.5702, + "step": 30945 + }, + { + "epoch": 0.9176526406310234, + "grad_norm": 0.07393307238817215, + "learning_rate": 1.6987396964808777e-05, + "loss": 2.5641, + "step": 30946 + }, + { + "epoch": 0.9176822939833348, + "grad_norm": 0.075550876557827, + "learning_rate": 1.6975238882827147e-05, + "loss": 2.5525, + "step": 30947 + }, + { + "epoch": 0.9177119473356463, + "grad_norm": 0.07579703629016876, + "learning_rate": 1.6963085078112673e-05, + "loss": 2.5702, + "step": 30948 + }, + { + "epoch": 0.9177416006879577, + "grad_norm": 0.07125738263130188, + "learning_rate": 1.695093555077304e-05, + "loss": 2.5487, + "step": 30949 + }, + { + "epoch": 0.9177712540402693, + "grad_norm": 0.06860281527042389, + "learning_rate": 1.693879030091572e-05, + "loss": 2.5859, + "step": 30950 + }, + { + "epoch": 0.9178009073925807, + "grad_norm": 0.07538603991270065, + "learning_rate": 1.6926649328648403e-05, + "loss": 2.5523, + "step": 30951 + }, + { + "epoch": 0.9178305607448922, + "grad_norm": 0.07501832395792007, + "learning_rate": 1.691451263407845e-05, + "loss": 2.5668, + "step": 30952 + }, + { + "epoch": 0.9178602140972036, + "grad_norm": 0.07521331310272217, + "learning_rate": 1.690238021731344e-05, + "loss": 2.5864, + "step": 30953 + }, + { + "epoch": 0.9178898674495152, + "grad_norm": 0.07276438921689987, + "learning_rate": 1.689025207846079e-05, + "loss": 2.577, + "step": 30954 + }, + { + "epoch": 0.9179195208018266, + "grad_norm": 0.073321633040905, + "learning_rate": 1.6878128217627908e-05, + "loss": 2.5361, + "step": 30955 + }, + { + "epoch": 0.9179491741541381, + "grad_norm": 0.07136835157871246, + "learning_rate": 1.686600863492205e-05, + "loss": 2.5744, + "step": 30956 + }, + { + "epoch": 0.9179788275064497, + "grad_norm": 0.07066450268030167, + "learning_rate": 1.6853893330450676e-05, + "loss": 2.5798, + "step": 30957 + }, + { + "epoch": 0.9180084808587611, + "grad_norm": 0.06963718682527542, + "learning_rate": 1.6841782304320984e-05, + "loss": 2.5541, + "step": 30958 + }, + { + "epoch": 0.9180381342110726, + "grad_norm": 0.07494033873081207, + "learning_rate": 1.682967555664022e-05, + "loss": 2.5763, + "step": 30959 + }, + { + "epoch": 0.918067787563384, + "grad_norm": 0.08051721006631851, + "learning_rate": 1.681757308751569e-05, + "loss": 2.5521, + "step": 30960 + }, + { + "epoch": 0.9180974409156956, + "grad_norm": 0.07289387285709381, + "learning_rate": 1.6805474897054474e-05, + "loss": 2.5475, + "step": 30961 + }, + { + "epoch": 0.918127094268007, + "grad_norm": 0.0700942650437355, + "learning_rate": 1.6793380985363703e-05, + "loss": 2.5476, + "step": 30962 + }, + { + "epoch": 0.9181567476203185, + "grad_norm": 0.07406145334243774, + "learning_rate": 1.6781291352550464e-05, + "loss": 2.5801, + "step": 30963 + }, + { + "epoch": 0.9181864009726299, + "grad_norm": 0.06880693137645721, + "learning_rate": 1.6769205998721727e-05, + "loss": 2.5611, + "step": 30964 + }, + { + "epoch": 0.9182160543249415, + "grad_norm": 0.07607267796993256, + "learning_rate": 1.6757124923984733e-05, + "loss": 2.5429, + "step": 30965 + }, + { + "epoch": 0.9182457076772529, + "grad_norm": 0.06852056831121445, + "learning_rate": 1.674504812844635e-05, + "loss": 2.5782, + "step": 30966 + }, + { + "epoch": 0.9182753610295644, + "grad_norm": 0.07209950685501099, + "learning_rate": 1.6732975612213485e-05, + "loss": 2.5762, + "step": 30967 + }, + { + "epoch": 0.9183050143818758, + "grad_norm": 0.07348497211933136, + "learning_rate": 1.6720907375393114e-05, + "loss": 2.5903, + "step": 30968 + }, + { + "epoch": 0.9183346677341874, + "grad_norm": 0.07774536311626434, + "learning_rate": 1.6708843418092033e-05, + "loss": 2.536, + "step": 30969 + }, + { + "epoch": 0.9183643210864988, + "grad_norm": 0.0759226381778717, + "learning_rate": 1.669678374041711e-05, + "loss": 2.5889, + "step": 30970 + }, + { + "epoch": 0.9183939744388103, + "grad_norm": 0.07184883207082748, + "learning_rate": 1.6684728342475085e-05, + "loss": 2.5884, + "step": 30971 + }, + { + "epoch": 0.9184236277911217, + "grad_norm": 0.07041338831186295, + "learning_rate": 1.667267722437288e-05, + "loss": 2.5276, + "step": 30972 + }, + { + "epoch": 0.9184532811434333, + "grad_norm": 0.06996523588895798, + "learning_rate": 1.6660630386216957e-05, + "loss": 2.6069, + "step": 30973 + }, + { + "epoch": 0.9184829344957447, + "grad_norm": 0.0712868794798851, + "learning_rate": 1.6648587828114127e-05, + "loss": 2.5244, + "step": 30974 + }, + { + "epoch": 0.9185125878480562, + "grad_norm": 0.07180985808372498, + "learning_rate": 1.663654955017102e-05, + "loss": 2.6146, + "step": 30975 + }, + { + "epoch": 0.9185422412003676, + "grad_norm": 0.06901255995035172, + "learning_rate": 1.662451555249428e-05, + "loss": 2.557, + "step": 30976 + }, + { + "epoch": 0.9185718945526792, + "grad_norm": 0.0763600692152977, + "learning_rate": 1.6612485835190315e-05, + "loss": 2.5413, + "step": 30977 + }, + { + "epoch": 0.9186015479049907, + "grad_norm": 0.07182007282972336, + "learning_rate": 1.6600460398365824e-05, + "loss": 2.5557, + "step": 30978 + }, + { + "epoch": 0.9186312012573021, + "grad_norm": 0.06950303912162781, + "learning_rate": 1.6588439242127274e-05, + "loss": 2.5478, + "step": 30979 + }, + { + "epoch": 0.9186608546096137, + "grad_norm": 0.07271788269281387, + "learning_rate": 1.6576422366581023e-05, + "loss": 2.5391, + "step": 30980 + }, + { + "epoch": 0.9186905079619251, + "grad_norm": 0.07096994668245316, + "learning_rate": 1.6564409771833543e-05, + "loss": 2.5642, + "step": 30981 + }, + { + "epoch": 0.9187201613142366, + "grad_norm": 0.06931500881910324, + "learning_rate": 1.6552401457991308e-05, + "loss": 2.5592, + "step": 30982 + }, + { + "epoch": 0.918749814666548, + "grad_norm": 0.07258745282888412, + "learning_rate": 1.6540397425160392e-05, + "loss": 2.5617, + "step": 30983 + }, + { + "epoch": 0.9187794680188596, + "grad_norm": 0.07138944417238235, + "learning_rate": 1.652839767344727e-05, + "loss": 2.6004, + "step": 30984 + }, + { + "epoch": 0.918809121371171, + "grad_norm": 0.07040759176015854, + "learning_rate": 1.6516402202958193e-05, + "loss": 2.5582, + "step": 30985 + }, + { + "epoch": 0.9188387747234825, + "grad_norm": 0.07469989359378815, + "learning_rate": 1.6504411013799404e-05, + "loss": 2.5532, + "step": 30986 + }, + { + "epoch": 0.9188684280757939, + "grad_norm": 0.06753391027450562, + "learning_rate": 1.6492424106076986e-05, + "loss": 2.5697, + "step": 30987 + }, + { + "epoch": 0.9188980814281055, + "grad_norm": 0.07455798983573914, + "learning_rate": 1.6480441479897136e-05, + "loss": 2.5598, + "step": 30988 + }, + { + "epoch": 0.9189277347804169, + "grad_norm": 0.0722198560833931, + "learning_rate": 1.6468463135365984e-05, + "loss": 2.5567, + "step": 30989 + }, + { + "epoch": 0.9189573881327284, + "grad_norm": 0.07395755499601364, + "learning_rate": 1.6456489072589565e-05, + "loss": 2.5588, + "step": 30990 + }, + { + "epoch": 0.9189870414850398, + "grad_norm": 0.06876837462186813, + "learning_rate": 1.6444519291673952e-05, + "loss": 2.595, + "step": 30991 + }, + { + "epoch": 0.9190166948373514, + "grad_norm": 0.07151474803686142, + "learning_rate": 1.643255379272518e-05, + "loss": 2.5722, + "step": 30992 + }, + { + "epoch": 0.9190463481896628, + "grad_norm": 0.07379617542028427, + "learning_rate": 1.6420592575849157e-05, + "loss": 2.5917, + "step": 30993 + }, + { + "epoch": 0.9190760015419743, + "grad_norm": 0.06994932144880295, + "learning_rate": 1.6408635641151747e-05, + "loss": 2.5547, + "step": 30994 + }, + { + "epoch": 0.9191056548942858, + "grad_norm": 0.07452253252267838, + "learning_rate": 1.6396682988738865e-05, + "loss": 2.6054, + "step": 30995 + }, + { + "epoch": 0.9191353082465973, + "grad_norm": 0.06816712766885757, + "learning_rate": 1.6384734618716367e-05, + "loss": 2.5657, + "step": 30996 + }, + { + "epoch": 0.9191649615989087, + "grad_norm": 0.07097382098436356, + "learning_rate": 1.637279053119006e-05, + "loss": 2.5438, + "step": 30997 + }, + { + "epoch": 0.9191946149512202, + "grad_norm": 0.0718395933508873, + "learning_rate": 1.6360850726265698e-05, + "loss": 2.549, + "step": 30998 + }, + { + "epoch": 0.9192242683035318, + "grad_norm": 0.07306617498397827, + "learning_rate": 1.634891520404902e-05, + "loss": 2.5735, + "step": 30999 + }, + { + "epoch": 0.9192539216558432, + "grad_norm": 0.07321731001138687, + "learning_rate": 1.6336983964645725e-05, + "loss": 2.5232, + "step": 31000 + }, + { + "epoch": 0.9192835750081547, + "grad_norm": 0.07347186654806137, + "learning_rate": 1.6325057008161447e-05, + "loss": 2.5331, + "step": 31001 + }, + { + "epoch": 0.9193132283604661, + "grad_norm": 0.07213523983955383, + "learning_rate": 1.6313134334701828e-05, + "loss": 2.5762, + "step": 31002 + }, + { + "epoch": 0.9193428817127777, + "grad_norm": 0.07220903038978577, + "learning_rate": 1.6301215944372395e-05, + "loss": 2.5877, + "step": 31003 + }, + { + "epoch": 0.9193725350650891, + "grad_norm": 0.07780463248491287, + "learning_rate": 1.6289301837278725e-05, + "loss": 2.5732, + "step": 31004 + }, + { + "epoch": 0.9194021884174006, + "grad_norm": 0.07499891519546509, + "learning_rate": 1.627739201352635e-05, + "loss": 2.5407, + "step": 31005 + }, + { + "epoch": 0.919431841769712, + "grad_norm": 0.07328219711780548, + "learning_rate": 1.6265486473220682e-05, + "loss": 2.5572, + "step": 31006 + }, + { + "epoch": 0.9194614951220236, + "grad_norm": 0.07242792099714279, + "learning_rate": 1.6253585216467136e-05, + "loss": 2.5762, + "step": 31007 + }, + { + "epoch": 0.919491148474335, + "grad_norm": 0.07878044992685318, + "learning_rate": 1.6241688243371188e-05, + "loss": 2.5691, + "step": 31008 + }, + { + "epoch": 0.9195208018266465, + "grad_norm": 0.07957103848457336, + "learning_rate": 1.6229795554038086e-05, + "loss": 2.5601, + "step": 31009 + }, + { + "epoch": 0.919550455178958, + "grad_norm": 0.07617147266864777, + "learning_rate": 1.6217907148573186e-05, + "loss": 2.5697, + "step": 31010 + }, + { + "epoch": 0.9195801085312695, + "grad_norm": 0.07437434792518616, + "learning_rate": 1.620602302708174e-05, + "loss": 2.5837, + "step": 31011 + }, + { + "epoch": 0.9196097618835809, + "grad_norm": 0.07314905524253845, + "learning_rate": 1.6194143189669053e-05, + "loss": 2.5285, + "step": 31012 + }, + { + "epoch": 0.9196394152358924, + "grad_norm": 0.07532216608524323, + "learning_rate": 1.6182267636440206e-05, + "loss": 2.55, + "step": 31013 + }, + { + "epoch": 0.9196690685882039, + "grad_norm": 0.07220494747161865, + "learning_rate": 1.617039636750045e-05, + "loss": 2.5714, + "step": 31014 + }, + { + "epoch": 0.9196987219405154, + "grad_norm": 0.0702357068657875, + "learning_rate": 1.6158529382954923e-05, + "loss": 2.5442, + "step": 31015 + }, + { + "epoch": 0.9197283752928268, + "grad_norm": 0.07056611031293869, + "learning_rate": 1.614666668290865e-05, + "loss": 2.5631, + "step": 31016 + }, + { + "epoch": 0.9197580286451383, + "grad_norm": 0.07226846367120743, + "learning_rate": 1.613480826746666e-05, + "loss": 2.5375, + "step": 31017 + }, + { + "epoch": 0.9197876819974498, + "grad_norm": 0.0781414806842804, + "learning_rate": 1.6122954136734037e-05, + "loss": 2.6131, + "step": 31018 + }, + { + "epoch": 0.9198173353497613, + "grad_norm": 0.07150160521268845, + "learning_rate": 1.611110429081569e-05, + "loss": 2.5656, + "step": 31019 + }, + { + "epoch": 0.9198469887020728, + "grad_norm": 0.0736604854464531, + "learning_rate": 1.6099258729816603e-05, + "loss": 2.5752, + "step": 31020 + }, + { + "epoch": 0.9198766420543842, + "grad_norm": 0.0768435075879097, + "learning_rate": 1.608741745384157e-05, + "loss": 2.5477, + "step": 31021 + }, + { + "epoch": 0.9199062954066958, + "grad_norm": 0.07671280950307846, + "learning_rate": 1.6075580462995566e-05, + "loss": 2.5727, + "step": 31022 + }, + { + "epoch": 0.9199359487590072, + "grad_norm": 0.0753810703754425, + "learning_rate": 1.6063747757383395e-05, + "loss": 2.5376, + "step": 31023 + }, + { + "epoch": 0.9199656021113187, + "grad_norm": 0.07768796384334564, + "learning_rate": 1.6051919337109755e-05, + "loss": 2.5905, + "step": 31024 + }, + { + "epoch": 0.9199952554636301, + "grad_norm": 0.07468042522668839, + "learning_rate": 1.604009520227945e-05, + "loss": 2.588, + "step": 31025 + }, + { + "epoch": 0.9200249088159417, + "grad_norm": 0.07235259562730789, + "learning_rate": 1.6028275352997168e-05, + "loss": 2.5621, + "step": 31026 + }, + { + "epoch": 0.9200545621682531, + "grad_norm": 0.07403057813644409, + "learning_rate": 1.6016459789367665e-05, + "loss": 2.586, + "step": 31027 + }, + { + "epoch": 0.9200842155205646, + "grad_norm": 0.07266884297132492, + "learning_rate": 1.600464851149541e-05, + "loss": 2.547, + "step": 31028 + }, + { + "epoch": 0.920113868872876, + "grad_norm": 0.07466977834701538, + "learning_rate": 1.599284151948499e-05, + "loss": 2.5698, + "step": 31029 + }, + { + "epoch": 0.9201435222251876, + "grad_norm": 0.07527843862771988, + "learning_rate": 1.5981038813441097e-05, + "loss": 2.5854, + "step": 31030 + }, + { + "epoch": 0.920173175577499, + "grad_norm": 0.07664614915847778, + "learning_rate": 1.59692403934682e-05, + "loss": 2.5538, + "step": 31031 + }, + { + "epoch": 0.9202028289298105, + "grad_norm": 0.072368323802948, + "learning_rate": 1.595744625967077e-05, + "loss": 2.5569, + "step": 31032 + }, + { + "epoch": 0.920232482282122, + "grad_norm": 0.07061830908060074, + "learning_rate": 1.594565641215323e-05, + "loss": 2.561, + "step": 31033 + }, + { + "epoch": 0.9202621356344335, + "grad_norm": 0.07243534177541733, + "learning_rate": 1.5933870851019994e-05, + "loss": 2.5288, + "step": 31034 + }, + { + "epoch": 0.9202917889867449, + "grad_norm": 0.07156870514154434, + "learning_rate": 1.5922089576375422e-05, + "loss": 2.5524, + "step": 31035 + }, + { + "epoch": 0.9203214423390564, + "grad_norm": 0.07378241419792175, + "learning_rate": 1.5910312588323873e-05, + "loss": 2.5706, + "step": 31036 + }, + { + "epoch": 0.9203510956913679, + "grad_norm": 0.07867678254842758, + "learning_rate": 1.58985398869696e-05, + "loss": 2.5687, + "step": 31037 + }, + { + "epoch": 0.9203807490436794, + "grad_norm": 0.08164722472429276, + "learning_rate": 1.5886771472416796e-05, + "loss": 2.5715, + "step": 31038 + }, + { + "epoch": 0.9204104023959908, + "grad_norm": 0.07004175335168839, + "learning_rate": 1.5875007344769764e-05, + "loss": 2.5925, + "step": 31039 + }, + { + "epoch": 0.9204400557483023, + "grad_norm": 0.07134267687797546, + "learning_rate": 1.5863247504132593e-05, + "loss": 2.5578, + "step": 31040 + }, + { + "epoch": 0.9204697091006139, + "grad_norm": 0.0827120766043663, + "learning_rate": 1.585149195060953e-05, + "loss": 2.5681, + "step": 31041 + }, + { + "epoch": 0.9204993624529253, + "grad_norm": 0.07798586040735245, + "learning_rate": 1.5839740684304494e-05, + "loss": 2.5639, + "step": 31042 + }, + { + "epoch": 0.9205290158052368, + "grad_norm": 0.0693531259894371, + "learning_rate": 1.582799370532173e-05, + "loss": 2.5608, + "step": 31043 + }, + { + "epoch": 0.9205586691575482, + "grad_norm": 0.07011672854423523, + "learning_rate": 1.5816251013765216e-05, + "loss": 2.5523, + "step": 31044 + }, + { + "epoch": 0.9205883225098598, + "grad_norm": 0.07384718209505081, + "learning_rate": 1.5804512609738863e-05, + "loss": 2.5531, + "step": 31045 + }, + { + "epoch": 0.9206179758621712, + "grad_norm": 0.07643873244524002, + "learning_rate": 1.5792778493346705e-05, + "loss": 2.5489, + "step": 31046 + }, + { + "epoch": 0.9206476292144827, + "grad_norm": 0.07095802575349808, + "learning_rate": 1.57810486646926e-05, + "loss": 2.5825, + "step": 31047 + }, + { + "epoch": 0.9206772825667942, + "grad_norm": 0.07144252955913544, + "learning_rate": 1.5769323123880464e-05, + "loss": 2.5532, + "step": 31048 + }, + { + "epoch": 0.9207069359191057, + "grad_norm": 0.0780944749712944, + "learning_rate": 1.5757601871014048e-05, + "loss": 2.5672, + "step": 31049 + }, + { + "epoch": 0.9207365892714171, + "grad_norm": 0.06916706264019012, + "learning_rate": 1.5745884906197163e-05, + "loss": 2.564, + "step": 31050 + }, + { + "epoch": 0.9207662426237286, + "grad_norm": 0.0739181786775589, + "learning_rate": 1.5734172229533605e-05, + "loss": 2.5686, + "step": 31051 + }, + { + "epoch": 0.9207958959760401, + "grad_norm": 0.0702071562409401, + "learning_rate": 1.5722463841127077e-05, + "loss": 2.5322, + "step": 31052 + }, + { + "epoch": 0.9208255493283516, + "grad_norm": 0.07014548033475876, + "learning_rate": 1.5710759741081214e-05, + "loss": 2.5267, + "step": 31053 + }, + { + "epoch": 0.920855202680663, + "grad_norm": 0.07347998768091202, + "learning_rate": 1.5699059929499714e-05, + "loss": 2.5754, + "step": 31054 + }, + { + "epoch": 0.9208848560329745, + "grad_norm": 0.07043056935071945, + "learning_rate": 1.568736440648616e-05, + "loss": 2.5419, + "step": 31055 + }, + { + "epoch": 0.920914509385286, + "grad_norm": 0.07156342267990112, + "learning_rate": 1.567567317214419e-05, + "loss": 2.5475, + "step": 31056 + }, + { + "epoch": 0.9209441627375975, + "grad_norm": 0.07696874439716339, + "learning_rate": 1.566398622657722e-05, + "loss": 2.6008, + "step": 31057 + }, + { + "epoch": 0.9209738160899089, + "grad_norm": 0.07200371474027634, + "learning_rate": 1.5652303569888836e-05, + "loss": 2.555, + "step": 31058 + }, + { + "epoch": 0.9210034694422204, + "grad_norm": 0.07670719921588898, + "learning_rate": 1.5640625202182457e-05, + "loss": 2.5716, + "step": 31059 + }, + { + "epoch": 0.9210331227945319, + "grad_norm": 0.07531585544347763, + "learning_rate": 1.5628951123561387e-05, + "loss": 2.5679, + "step": 31060 + }, + { + "epoch": 0.9210627761468434, + "grad_norm": 0.0713406428694725, + "learning_rate": 1.5617281334129153e-05, + "loss": 2.5789, + "step": 31061 + }, + { + "epoch": 0.9210924294991549, + "grad_norm": 0.06927640736103058, + "learning_rate": 1.5605615833989005e-05, + "loss": 2.565, + "step": 31062 + }, + { + "epoch": 0.9211220828514664, + "grad_norm": 0.07786011695861816, + "learning_rate": 1.559395462324431e-05, + "loss": 2.5195, + "step": 31063 + }, + { + "epoch": 0.9211517362037779, + "grad_norm": 0.07388579845428467, + "learning_rate": 1.558229770199826e-05, + "loss": 2.551, + "step": 31064 + }, + { + "epoch": 0.9211813895560893, + "grad_norm": 0.07746345549821854, + "learning_rate": 1.5570645070354163e-05, + "loss": 2.524, + "step": 31065 + }, + { + "epoch": 0.9212110429084008, + "grad_norm": 0.07518267631530762, + "learning_rate": 1.5558996728415097e-05, + "loss": 2.5353, + "step": 31066 + }, + { + "epoch": 0.9212406962607123, + "grad_norm": 0.07010872662067413, + "learning_rate": 1.5547352676284266e-05, + "loss": 2.5477, + "step": 31067 + }, + { + "epoch": 0.9212703496130238, + "grad_norm": 0.0763453021645546, + "learning_rate": 1.5535712914064804e-05, + "loss": 2.6021, + "step": 31068 + }, + { + "epoch": 0.9213000029653352, + "grad_norm": 0.07290547341108322, + "learning_rate": 1.5524077441859795e-05, + "loss": 2.5286, + "step": 31069 + }, + { + "epoch": 0.9213296563176467, + "grad_norm": 0.07088658958673477, + "learning_rate": 1.5512446259772218e-05, + "loss": 2.567, + "step": 31070 + }, + { + "epoch": 0.9213593096699582, + "grad_norm": 0.07521319389343262, + "learning_rate": 1.5500819367905096e-05, + "loss": 2.5488, + "step": 31071 + }, + { + "epoch": 0.9213889630222697, + "grad_norm": 0.07152297347784042, + "learning_rate": 1.5489196766361346e-05, + "loss": 2.5789, + "step": 31072 + }, + { + "epoch": 0.9214186163745811, + "grad_norm": 0.07128787785768509, + "learning_rate": 1.5477578455243945e-05, + "loss": 2.5213, + "step": 31073 + }, + { + "epoch": 0.9214482697268926, + "grad_norm": 0.07707535475492477, + "learning_rate": 1.5465964434655755e-05, + "loss": 2.5564, + "step": 31074 + }, + { + "epoch": 0.9214779230792041, + "grad_norm": 0.07395242899656296, + "learning_rate": 1.5454354704699635e-05, + "loss": 2.545, + "step": 31075 + }, + { + "epoch": 0.9215075764315156, + "grad_norm": 0.06987836211919785, + "learning_rate": 1.544274926547834e-05, + "loss": 2.5951, + "step": 31076 + }, + { + "epoch": 0.921537229783827, + "grad_norm": 0.0716811865568161, + "learning_rate": 1.543114811709473e-05, + "loss": 2.564, + "step": 31077 + }, + { + "epoch": 0.9215668831361385, + "grad_norm": 0.0747351199388504, + "learning_rate": 1.5419551259651445e-05, + "loss": 2.5609, + "step": 31078 + }, + { + "epoch": 0.92159653648845, + "grad_norm": 0.07992249727249146, + "learning_rate": 1.540795869325118e-05, + "loss": 2.5745, + "step": 31079 + }, + { + "epoch": 0.9216261898407615, + "grad_norm": 0.07546009123325348, + "learning_rate": 1.5396370417996687e-05, + "loss": 2.5654, + "step": 31080 + }, + { + "epoch": 0.921655843193073, + "grad_norm": 0.07082758098840714, + "learning_rate": 1.538478643399044e-05, + "loss": 2.5918, + "step": 31081 + }, + { + "epoch": 0.9216854965453845, + "grad_norm": 0.0687067061662674, + "learning_rate": 1.537320674133513e-05, + "loss": 2.5693, + "step": 31082 + }, + { + "epoch": 0.921715149897696, + "grad_norm": 0.06863828748464584, + "learning_rate": 1.5361631340133243e-05, + "loss": 2.551, + "step": 31083 + }, + { + "epoch": 0.9217448032500074, + "grad_norm": 0.06984692066907883, + "learning_rate": 1.535006023048735e-05, + "loss": 2.5847, + "step": 31084 + }, + { + "epoch": 0.9217744566023189, + "grad_norm": 0.0744519978761673, + "learning_rate": 1.533849341249982e-05, + "loss": 2.5453, + "step": 31085 + }, + { + "epoch": 0.9218041099546304, + "grad_norm": 0.0720784142613411, + "learning_rate": 1.5326930886273127e-05, + "loss": 2.5734, + "step": 31086 + }, + { + "epoch": 0.9218337633069419, + "grad_norm": 0.06738056987524033, + "learning_rate": 1.531537265190963e-05, + "loss": 2.5901, + "step": 31087 + }, + { + "epoch": 0.9218634166592533, + "grad_norm": 0.07306301593780518, + "learning_rate": 1.530381870951175e-05, + "loss": 2.559, + "step": 31088 + }, + { + "epoch": 0.9218930700115648, + "grad_norm": 0.07439655065536499, + "learning_rate": 1.529226905918174e-05, + "loss": 2.5873, + "step": 31089 + }, + { + "epoch": 0.9219227233638763, + "grad_norm": 0.07007009536027908, + "learning_rate": 1.528072370102185e-05, + "loss": 2.5576, + "step": 31090 + }, + { + "epoch": 0.9219523767161878, + "grad_norm": 0.07103601843118668, + "learning_rate": 1.526918263513438e-05, + "loss": 2.5567, + "step": 31091 + }, + { + "epoch": 0.9219820300684992, + "grad_norm": 0.07011949270963669, + "learning_rate": 1.5257645861621539e-05, + "loss": 2.5812, + "step": 31092 + }, + { + "epoch": 0.9220116834208107, + "grad_norm": 0.07459870725870132, + "learning_rate": 1.5246113380585347e-05, + "loss": 2.5526, + "step": 31093 + }, + { + "epoch": 0.9220413367731222, + "grad_norm": 0.07305007427930832, + "learning_rate": 1.5234585192128115e-05, + "loss": 2.5477, + "step": 31094 + }, + { + "epoch": 0.9220709901254337, + "grad_norm": 0.06808210164308548, + "learning_rate": 1.5223061296351814e-05, + "loss": 2.5497, + "step": 31095 + }, + { + "epoch": 0.9221006434777451, + "grad_norm": 0.07183997333049774, + "learning_rate": 1.5211541693358533e-05, + "loss": 2.5818, + "step": 31096 + }, + { + "epoch": 0.9221302968300567, + "grad_norm": 0.0717364102602005, + "learning_rate": 1.5200026383250243e-05, + "loss": 2.5486, + "step": 31097 + }, + { + "epoch": 0.9221599501823681, + "grad_norm": 0.07175755500793457, + "learning_rate": 1.5188515366128919e-05, + "loss": 2.5685, + "step": 31098 + }, + { + "epoch": 0.9221896035346796, + "grad_norm": 0.07305876910686493, + "learning_rate": 1.5177008642096535e-05, + "loss": 2.5873, + "step": 31099 + }, + { + "epoch": 0.922219256886991, + "grad_norm": 0.07022468000650406, + "learning_rate": 1.5165506211254954e-05, + "loss": 2.5413, + "step": 31100 + }, + { + "epoch": 0.9222489102393026, + "grad_norm": 0.07302835583686829, + "learning_rate": 1.5154008073706038e-05, + "loss": 2.5709, + "step": 31101 + }, + { + "epoch": 0.9222785635916141, + "grad_norm": 0.07050544023513794, + "learning_rate": 1.5142514229551596e-05, + "loss": 2.5736, + "step": 31102 + }, + { + "epoch": 0.9223082169439255, + "grad_norm": 0.07143249362707138, + "learning_rate": 1.513102467889349e-05, + "loss": 2.5778, + "step": 31103 + }, + { + "epoch": 0.922337870296237, + "grad_norm": 0.0742361918091774, + "learning_rate": 1.5119539421833306e-05, + "loss": 2.5811, + "step": 31104 + }, + { + "epoch": 0.9223675236485485, + "grad_norm": 0.07156247645616531, + "learning_rate": 1.5108058458472795e-05, + "loss": 2.5606, + "step": 31105 + }, + { + "epoch": 0.92239717700086, + "grad_norm": 0.06923483312129974, + "learning_rate": 1.5096581788913655e-05, + "loss": 2.5618, + "step": 31106 + }, + { + "epoch": 0.9224268303531714, + "grad_norm": 0.07222317904233932, + "learning_rate": 1.508510941325758e-05, + "loss": 2.5604, + "step": 31107 + }, + { + "epoch": 0.9224564837054829, + "grad_norm": 0.0740053579211235, + "learning_rate": 1.5073641331606048e-05, + "loss": 2.5566, + "step": 31108 + }, + { + "epoch": 0.9224861370577944, + "grad_norm": 0.07278456538915634, + "learning_rate": 1.5062177544060695e-05, + "loss": 2.5198, + "step": 31109 + }, + { + "epoch": 0.9225157904101059, + "grad_norm": 0.06845499575138092, + "learning_rate": 1.5050718050723e-05, + "loss": 2.5765, + "step": 31110 + }, + { + "epoch": 0.9225454437624173, + "grad_norm": 0.07100994884967804, + "learning_rate": 1.5039262851694435e-05, + "loss": 2.5014, + "step": 31111 + }, + { + "epoch": 0.9225750971147288, + "grad_norm": 0.07341506332159042, + "learning_rate": 1.5027811947076419e-05, + "loss": 2.5564, + "step": 31112 + }, + { + "epoch": 0.9226047504670403, + "grad_norm": 0.07128378003835678, + "learning_rate": 1.5016365336970428e-05, + "loss": 2.5545, + "step": 31113 + }, + { + "epoch": 0.9226344038193518, + "grad_norm": 0.07279084622859955, + "learning_rate": 1.5004923021477768e-05, + "loss": 2.5527, + "step": 31114 + }, + { + "epoch": 0.9226640571716632, + "grad_norm": 0.07367491722106934, + "learning_rate": 1.4993485000699692e-05, + "loss": 2.5488, + "step": 31115 + }, + { + "epoch": 0.9226937105239748, + "grad_norm": 0.07482770830392838, + "learning_rate": 1.4982051274737618e-05, + "loss": 2.5786, + "step": 31116 + }, + { + "epoch": 0.9227233638762862, + "grad_norm": 0.07307863235473633, + "learning_rate": 1.497062184369269e-05, + "loss": 2.5714, + "step": 31117 + }, + { + "epoch": 0.9227530172285977, + "grad_norm": 0.07357730716466904, + "learning_rate": 1.4959196707666156e-05, + "loss": 2.5463, + "step": 31118 + }, + { + "epoch": 0.9227826705809091, + "grad_norm": 0.0768711045384407, + "learning_rate": 1.4947775866759162e-05, + "loss": 2.5363, + "step": 31119 + }, + { + "epoch": 0.9228123239332207, + "grad_norm": 0.08012956380844116, + "learning_rate": 1.4936359321072902e-05, + "loss": 2.5794, + "step": 31120 + }, + { + "epoch": 0.9228419772855321, + "grad_norm": 0.07363878935575485, + "learning_rate": 1.492494707070846e-05, + "loss": 2.5929, + "step": 31121 + }, + { + "epoch": 0.9228716306378436, + "grad_norm": 0.06770119071006775, + "learning_rate": 1.4913539115766872e-05, + "loss": 2.5657, + "step": 31122 + }, + { + "epoch": 0.9229012839901551, + "grad_norm": 0.07316797971725464, + "learning_rate": 1.4902135456349165e-05, + "loss": 2.6289, + "step": 31123 + }, + { + "epoch": 0.9229309373424666, + "grad_norm": 0.07767882943153381, + "learning_rate": 1.4890736092556311e-05, + "loss": 2.5589, + "step": 31124 + }, + { + "epoch": 0.9229605906947781, + "grad_norm": 0.07862845808267593, + "learning_rate": 1.487934102448929e-05, + "loss": 2.5608, + "step": 31125 + }, + { + "epoch": 0.9229902440470895, + "grad_norm": 0.07274992763996124, + "learning_rate": 1.4867950252248908e-05, + "loss": 2.5759, + "step": 31126 + }, + { + "epoch": 0.923019897399401, + "grad_norm": 0.07319971174001694, + "learning_rate": 1.4856563775936139e-05, + "loss": 2.5809, + "step": 31127 + }, + { + "epoch": 0.9230495507517125, + "grad_norm": 0.06921283155679703, + "learning_rate": 1.4845181595651735e-05, + "loss": 2.5767, + "step": 31128 + }, + { + "epoch": 0.923079204104024, + "grad_norm": 0.07205460965633392, + "learning_rate": 1.4833803711496563e-05, + "loss": 2.5905, + "step": 31129 + }, + { + "epoch": 0.9231088574563354, + "grad_norm": 0.07036031037569046, + "learning_rate": 1.4822430123571318e-05, + "loss": 2.5218, + "step": 31130 + }, + { + "epoch": 0.923138510808647, + "grad_norm": 0.07011698186397552, + "learning_rate": 1.4811060831976698e-05, + "loss": 2.5679, + "step": 31131 + }, + { + "epoch": 0.9231681641609584, + "grad_norm": 0.07322920858860016, + "learning_rate": 1.4799695836813398e-05, + "loss": 2.5858, + "step": 31132 + }, + { + "epoch": 0.9231978175132699, + "grad_norm": 0.07001174986362457, + "learning_rate": 1.4788335138182174e-05, + "loss": 2.5782, + "step": 31133 + }, + { + "epoch": 0.9232274708655813, + "grad_norm": 0.07221799343824387, + "learning_rate": 1.47769787361835e-05, + "loss": 2.5569, + "step": 31134 + }, + { + "epoch": 0.9232571242178929, + "grad_norm": 0.06739550828933716, + "learning_rate": 1.4765626630917962e-05, + "loss": 2.5165, + "step": 31135 + }, + { + "epoch": 0.9232867775702043, + "grad_norm": 0.07307100296020508, + "learning_rate": 1.4754278822486088e-05, + "loss": 2.5618, + "step": 31136 + }, + { + "epoch": 0.9233164309225158, + "grad_norm": 0.07089605182409286, + "learning_rate": 1.474293531098836e-05, + "loss": 2.5812, + "step": 31137 + }, + { + "epoch": 0.9233460842748272, + "grad_norm": 0.07116957008838654, + "learning_rate": 1.473159609652519e-05, + "loss": 2.5633, + "step": 31138 + }, + { + "epoch": 0.9233757376271388, + "grad_norm": 0.0716434046626091, + "learning_rate": 1.4720261179197115e-05, + "loss": 2.5516, + "step": 31139 + }, + { + "epoch": 0.9234053909794502, + "grad_norm": 0.0724245086312294, + "learning_rate": 1.4708930559104383e-05, + "loss": 2.5647, + "step": 31140 + }, + { + "epoch": 0.9234350443317617, + "grad_norm": 0.0701451227068901, + "learning_rate": 1.4697604236347362e-05, + "loss": 2.5294, + "step": 31141 + }, + { + "epoch": 0.9234646976840731, + "grad_norm": 0.0698959231376648, + "learning_rate": 1.4686282211026359e-05, + "loss": 2.5729, + "step": 31142 + }, + { + "epoch": 0.9234943510363847, + "grad_norm": 0.07249341905117035, + "learning_rate": 1.4674964483241626e-05, + "loss": 2.5666, + "step": 31143 + }, + { + "epoch": 0.9235240043886962, + "grad_norm": 0.07626176625490189, + "learning_rate": 1.4663651053093363e-05, + "loss": 2.5817, + "step": 31144 + }, + { + "epoch": 0.9235536577410076, + "grad_norm": 0.07476073503494263, + "learning_rate": 1.4652341920681822e-05, + "loss": 2.5294, + "step": 31145 + }, + { + "epoch": 0.9235833110933191, + "grad_norm": 0.07429872453212738, + "learning_rate": 1.464103708610709e-05, + "loss": 2.5831, + "step": 31146 + }, + { + "epoch": 0.9236129644456306, + "grad_norm": 0.07212554663419724, + "learning_rate": 1.4629736549469307e-05, + "loss": 2.582, + "step": 31147 + }, + { + "epoch": 0.9236426177979421, + "grad_norm": 0.07723000645637512, + "learning_rate": 1.4618440310868452e-05, + "loss": 2.5664, + "step": 31148 + }, + { + "epoch": 0.9236722711502535, + "grad_norm": 0.07155898958444595, + "learning_rate": 1.4607148370404666e-05, + "loss": 2.5685, + "step": 31149 + }, + { + "epoch": 0.923701924502565, + "grad_norm": 0.06690644472837448, + "learning_rate": 1.4595860728177924e-05, + "loss": 2.5403, + "step": 31150 + }, + { + "epoch": 0.9237315778548765, + "grad_norm": 0.06911721080541611, + "learning_rate": 1.4584577384288145e-05, + "loss": 2.5604, + "step": 31151 + }, + { + "epoch": 0.923761231207188, + "grad_norm": 0.07029731571674347, + "learning_rate": 1.4573298338835194e-05, + "loss": 2.5366, + "step": 31152 + }, + { + "epoch": 0.9237908845594994, + "grad_norm": 0.07321926206350327, + "learning_rate": 1.456202359191905e-05, + "loss": 2.5777, + "step": 31153 + }, + { + "epoch": 0.923820537911811, + "grad_norm": 0.06918197125196457, + "learning_rate": 1.4550753143639516e-05, + "loss": 2.568, + "step": 31154 + }, + { + "epoch": 0.9238501912641224, + "grad_norm": 0.07000332325696945, + "learning_rate": 1.4539486994096407e-05, + "loss": 2.56, + "step": 31155 + }, + { + "epoch": 0.9238798446164339, + "grad_norm": 0.07310725003480911, + "learning_rate": 1.4528225143389418e-05, + "loss": 2.5334, + "step": 31156 + }, + { + "epoch": 0.9239094979687453, + "grad_norm": 0.07100512832403183, + "learning_rate": 1.4516967591618358e-05, + "loss": 2.5863, + "step": 31157 + }, + { + "epoch": 0.9239391513210569, + "grad_norm": 0.07272003591060638, + "learning_rate": 1.4505714338882924e-05, + "loss": 2.5651, + "step": 31158 + }, + { + "epoch": 0.9239688046733683, + "grad_norm": 0.0706985592842102, + "learning_rate": 1.449446538528265e-05, + "loss": 2.5718, + "step": 31159 + }, + { + "epoch": 0.9239984580256798, + "grad_norm": 0.07266484946012497, + "learning_rate": 1.4483220730917234e-05, + "loss": 2.5542, + "step": 31160 + }, + { + "epoch": 0.9240281113779912, + "grad_norm": 0.0659433901309967, + "learning_rate": 1.4471980375886263e-05, + "loss": 2.5721, + "step": 31161 + }, + { + "epoch": 0.9240577647303028, + "grad_norm": 0.07302948087453842, + "learning_rate": 1.4460744320289265e-05, + "loss": 2.558, + "step": 31162 + }, + { + "epoch": 0.9240874180826142, + "grad_norm": 0.0746818408370018, + "learning_rate": 1.4449512564225664e-05, + "loss": 2.5664, + "step": 31163 + }, + { + "epoch": 0.9241170714349257, + "grad_norm": 0.06808546185493469, + "learning_rate": 1.4438285107794991e-05, + "loss": 2.56, + "step": 31164 + }, + { + "epoch": 0.9241467247872373, + "grad_norm": 0.06905890256166458, + "learning_rate": 1.4427061951096665e-05, + "loss": 2.5728, + "step": 31165 + }, + { + "epoch": 0.9241763781395487, + "grad_norm": 0.07465609163045883, + "learning_rate": 1.4415843094230052e-05, + "loss": 2.5957, + "step": 31166 + }, + { + "epoch": 0.9242060314918602, + "grad_norm": 0.07264037430286407, + "learning_rate": 1.4404628537294461e-05, + "loss": 2.5663, + "step": 31167 + }, + { + "epoch": 0.9242356848441716, + "grad_norm": 0.07092835009098053, + "learning_rate": 1.4393418280389314e-05, + "loss": 2.5602, + "step": 31168 + }, + { + "epoch": 0.9242653381964832, + "grad_norm": 0.0723961815237999, + "learning_rate": 1.4382212323613752e-05, + "loss": 2.5303, + "step": 31169 + }, + { + "epoch": 0.9242949915487946, + "grad_norm": 0.06767244637012482, + "learning_rate": 1.4371010667067087e-05, + "loss": 2.5479, + "step": 31170 + }, + { + "epoch": 0.9243246449011061, + "grad_norm": 0.07115186750888824, + "learning_rate": 1.4359813310848347e-05, + "loss": 2.5962, + "step": 31171 + }, + { + "epoch": 0.9243542982534175, + "grad_norm": 0.07135027647018433, + "learning_rate": 1.4348620255056955e-05, + "loss": 2.563, + "step": 31172 + }, + { + "epoch": 0.9243839516057291, + "grad_norm": 0.06968382745981216, + "learning_rate": 1.433743149979183e-05, + "loss": 2.55, + "step": 31173 + }, + { + "epoch": 0.9244136049580405, + "grad_norm": 0.06997006386518478, + "learning_rate": 1.4326247045152174e-05, + "loss": 2.5785, + "step": 31174 + }, + { + "epoch": 0.924443258310352, + "grad_norm": 0.07330429553985596, + "learning_rate": 1.4315066891236905e-05, + "loss": 2.5513, + "step": 31175 + }, + { + "epoch": 0.9244729116626634, + "grad_norm": 0.07593151926994324, + "learning_rate": 1.4303891038145111e-05, + "loss": 2.5818, + "step": 31176 + }, + { + "epoch": 0.924502565014975, + "grad_norm": 0.06824658811092377, + "learning_rate": 1.4292719485975714e-05, + "loss": 2.5618, + "step": 31177 + }, + { + "epoch": 0.9245322183672864, + "grad_norm": 0.06985998898744583, + "learning_rate": 1.4281552234827688e-05, + "loss": 2.5334, + "step": 31178 + }, + { + "epoch": 0.9245618717195979, + "grad_norm": 0.06957278400659561, + "learning_rate": 1.42703892847999e-05, + "loss": 2.5808, + "step": 31179 + }, + { + "epoch": 0.9245915250719093, + "grad_norm": 0.073226198554039, + "learning_rate": 1.425923063599116e-05, + "loss": 2.5506, + "step": 31180 + }, + { + "epoch": 0.9246211784242209, + "grad_norm": 0.0751083642244339, + "learning_rate": 1.4248076288500334e-05, + "loss": 2.5689, + "step": 31181 + }, + { + "epoch": 0.9246508317765323, + "grad_norm": 0.07254280149936676, + "learning_rate": 1.4236926242426119e-05, + "loss": 2.5329, + "step": 31182 + }, + { + "epoch": 0.9246804851288438, + "grad_norm": 0.07063469290733337, + "learning_rate": 1.4225780497867324e-05, + "loss": 2.5787, + "step": 31183 + }, + { + "epoch": 0.9247101384811552, + "grad_norm": 0.07089125365018845, + "learning_rate": 1.4214639054922595e-05, + "loss": 2.5574, + "step": 31184 + }, + { + "epoch": 0.9247397918334668, + "grad_norm": 0.07113314419984818, + "learning_rate": 1.4203501913690686e-05, + "loss": 2.5409, + "step": 31185 + }, + { + "epoch": 0.9247694451857783, + "grad_norm": 0.07533426582813263, + "learning_rate": 1.4192369074270129e-05, + "loss": 2.5692, + "step": 31186 + }, + { + "epoch": 0.9247990985380897, + "grad_norm": 0.07184645533561707, + "learning_rate": 1.418124053675951e-05, + "loss": 2.5407, + "step": 31187 + }, + { + "epoch": 0.9248287518904013, + "grad_norm": 0.07705271244049072, + "learning_rate": 1.4170116301257419e-05, + "loss": 2.589, + "step": 31188 + }, + { + "epoch": 0.9248584052427127, + "grad_norm": 0.07126937806606293, + "learning_rate": 1.4158996367862387e-05, + "loss": 2.5685, + "step": 31189 + }, + { + "epoch": 0.9248880585950242, + "grad_norm": 0.07267095148563385, + "learning_rate": 1.414788073667278e-05, + "loss": 2.5751, + "step": 31190 + }, + { + "epoch": 0.9249177119473356, + "grad_norm": 0.07082200050354004, + "learning_rate": 1.4136769407787075e-05, + "loss": 2.5354, + "step": 31191 + }, + { + "epoch": 0.9249473652996472, + "grad_norm": 0.07090112566947937, + "learning_rate": 1.4125662381303694e-05, + "loss": 2.5543, + "step": 31192 + }, + { + "epoch": 0.9249770186519586, + "grad_norm": 0.07403626292943954, + "learning_rate": 1.4114559657320947e-05, + "loss": 2.5353, + "step": 31193 + }, + { + "epoch": 0.9250066720042701, + "grad_norm": 0.07581941783428192, + "learning_rate": 1.4103461235937199e-05, + "loss": 2.5664, + "step": 31194 + }, + { + "epoch": 0.9250363253565815, + "grad_norm": 0.0734255388379097, + "learning_rate": 1.4092367117250704e-05, + "loss": 2.5665, + "step": 31195 + }, + { + "epoch": 0.9250659787088931, + "grad_norm": 0.07341937720775604, + "learning_rate": 1.4081277301359663e-05, + "loss": 2.5677, + "step": 31196 + }, + { + "epoch": 0.9250956320612045, + "grad_norm": 0.07518494874238968, + "learning_rate": 1.407019178836233e-05, + "loss": 2.5817, + "step": 31197 + }, + { + "epoch": 0.925125285413516, + "grad_norm": 0.07524221390485764, + "learning_rate": 1.4059110578356849e-05, + "loss": 2.5628, + "step": 31198 + }, + { + "epoch": 0.9251549387658274, + "grad_norm": 0.07507931441068649, + "learning_rate": 1.4048033671441418e-05, + "loss": 2.5403, + "step": 31199 + }, + { + "epoch": 0.925184592118139, + "grad_norm": 0.07684652507305145, + "learning_rate": 1.4036961067714072e-05, + "loss": 2.5174, + "step": 31200 + }, + { + "epoch": 0.9252142454704504, + "grad_norm": 0.07446015626192093, + "learning_rate": 1.4025892767272785e-05, + "loss": 2.585, + "step": 31201 + }, + { + "epoch": 0.9252438988227619, + "grad_norm": 0.07178567349910736, + "learning_rate": 1.4014828770215704e-05, + "loss": 2.5537, + "step": 31202 + }, + { + "epoch": 0.9252735521750733, + "grad_norm": 0.07329261302947998, + "learning_rate": 1.4003769076640637e-05, + "loss": 2.5795, + "step": 31203 + }, + { + "epoch": 0.9253032055273849, + "grad_norm": 0.0751512199640274, + "learning_rate": 1.3992713686645674e-05, + "loss": 2.5668, + "step": 31204 + }, + { + "epoch": 0.9253328588796963, + "grad_norm": 0.07119301706552505, + "learning_rate": 1.3981662600328682e-05, + "loss": 2.5511, + "step": 31205 + }, + { + "epoch": 0.9253625122320078, + "grad_norm": 0.06914117187261581, + "learning_rate": 1.3970615817787413e-05, + "loss": 2.5713, + "step": 31206 + }, + { + "epoch": 0.9253921655843194, + "grad_norm": 0.0722968727350235, + "learning_rate": 1.3959573339119792e-05, + "loss": 2.5091, + "step": 31207 + }, + { + "epoch": 0.9254218189366308, + "grad_norm": 0.07678642123937607, + "learning_rate": 1.3948535164423626e-05, + "loss": 2.5679, + "step": 31208 + }, + { + "epoch": 0.9254514722889423, + "grad_norm": 0.07614766061306, + "learning_rate": 1.3937501293796562e-05, + "loss": 2.5863, + "step": 31209 + }, + { + "epoch": 0.9254811256412537, + "grad_norm": 0.07089854776859283, + "learning_rate": 1.3926471727336353e-05, + "loss": 2.5543, + "step": 31210 + }, + { + "epoch": 0.9255107789935653, + "grad_norm": 0.07779733836650848, + "learning_rate": 1.3915446465140702e-05, + "loss": 2.5716, + "step": 31211 + }, + { + "epoch": 0.9255404323458767, + "grad_norm": 0.07196635752916336, + "learning_rate": 1.3904425507307194e-05, + "loss": 2.592, + "step": 31212 + }, + { + "epoch": 0.9255700856981882, + "grad_norm": 0.07265911251306534, + "learning_rate": 1.389340885393342e-05, + "loss": 2.5802, + "step": 31213 + }, + { + "epoch": 0.9255997390504996, + "grad_norm": 0.07287188619375229, + "learning_rate": 1.3882396505116968e-05, + "loss": 2.56, + "step": 31214 + }, + { + "epoch": 0.9256293924028112, + "grad_norm": 0.07295488566160202, + "learning_rate": 1.3871388460955314e-05, + "loss": 2.57, + "step": 31215 + }, + { + "epoch": 0.9256590457551226, + "grad_norm": 0.0732068195939064, + "learning_rate": 1.3860384721545993e-05, + "loss": 2.54, + "step": 31216 + }, + { + "epoch": 0.9256886991074341, + "grad_norm": 0.07688616216182709, + "learning_rate": 1.384938528698637e-05, + "loss": 2.5436, + "step": 31217 + }, + { + "epoch": 0.9257183524597455, + "grad_norm": 0.0714365765452385, + "learning_rate": 1.3838390157373926e-05, + "loss": 2.5607, + "step": 31218 + }, + { + "epoch": 0.9257480058120571, + "grad_norm": 0.0734812319278717, + "learning_rate": 1.3827399332805968e-05, + "loss": 2.5626, + "step": 31219 + }, + { + "epoch": 0.9257776591643685, + "grad_norm": 0.07573749125003815, + "learning_rate": 1.3816412813379864e-05, + "loss": 2.5712, + "step": 31220 + }, + { + "epoch": 0.92580731251668, + "grad_norm": 0.07270186394453049, + "learning_rate": 1.3805430599192815e-05, + "loss": 2.5589, + "step": 31221 + }, + { + "epoch": 0.9258369658689914, + "grad_norm": 0.07030308246612549, + "learning_rate": 1.3794452690342186e-05, + "loss": 2.5617, + "step": 31222 + }, + { + "epoch": 0.925866619221303, + "grad_norm": 0.07111144810914993, + "learning_rate": 1.3783479086925122e-05, + "loss": 2.5607, + "step": 31223 + }, + { + "epoch": 0.9258962725736144, + "grad_norm": 0.07241128385066986, + "learning_rate": 1.3772509789038823e-05, + "loss": 2.586, + "step": 31224 + }, + { + "epoch": 0.9259259259259259, + "grad_norm": 0.07241225987672806, + "learning_rate": 1.3761544796780379e-05, + "loss": 2.5174, + "step": 31225 + }, + { + "epoch": 0.9259555792782374, + "grad_norm": 0.07581696659326553, + "learning_rate": 1.3750584110246933e-05, + "loss": 2.599, + "step": 31226 + }, + { + "epoch": 0.9259852326305489, + "grad_norm": 0.07362839579582214, + "learning_rate": 1.373962772953552e-05, + "loss": 2.5362, + "step": 31227 + }, + { + "epoch": 0.9260148859828604, + "grad_norm": 0.07010919600725174, + "learning_rate": 1.3728675654743173e-05, + "loss": 2.5838, + "step": 31228 + }, + { + "epoch": 0.9260445393351718, + "grad_norm": 0.06948433816432953, + "learning_rate": 1.3717727885966869e-05, + "loss": 2.576, + "step": 31229 + }, + { + "epoch": 0.9260741926874834, + "grad_norm": 0.0731092020869255, + "learning_rate": 1.3706784423303587e-05, + "loss": 2.5837, + "step": 31230 + }, + { + "epoch": 0.9261038460397948, + "grad_norm": 0.07339050620794296, + "learning_rate": 1.3695845266850137e-05, + "loss": 2.5777, + "step": 31231 + }, + { + "epoch": 0.9261334993921063, + "grad_norm": 0.07173532992601395, + "learning_rate": 1.36849104167035e-05, + "loss": 2.575, + "step": 31232 + }, + { + "epoch": 0.9261631527444177, + "grad_norm": 0.07349300384521484, + "learning_rate": 1.367397987296043e-05, + "loss": 2.5268, + "step": 31233 + }, + { + "epoch": 0.9261928060967293, + "grad_norm": 0.0785820260643959, + "learning_rate": 1.3663053635717793e-05, + "loss": 2.5845, + "step": 31234 + }, + { + "epoch": 0.9262224594490407, + "grad_norm": 0.0699305459856987, + "learning_rate": 1.3652131705072235e-05, + "loss": 2.5607, + "step": 31235 + }, + { + "epoch": 0.9262521128013522, + "grad_norm": 0.06914829462766647, + "learning_rate": 1.3641214081120512e-05, + "loss": 2.5513, + "step": 31236 + }, + { + "epoch": 0.9262817661536636, + "grad_norm": 0.07289417088031769, + "learning_rate": 1.363030076395938e-05, + "loss": 2.5642, + "step": 31237 + }, + { + "epoch": 0.9263114195059752, + "grad_norm": 0.07097698748111725, + "learning_rate": 1.3619391753685428e-05, + "loss": 2.5919, + "step": 31238 + }, + { + "epoch": 0.9263410728582866, + "grad_norm": 0.06856027245521545, + "learning_rate": 1.360848705039519e-05, + "loss": 2.5844, + "step": 31239 + }, + { + "epoch": 0.9263707262105981, + "grad_norm": 0.07145971059799194, + "learning_rate": 1.3597586654185312e-05, + "loss": 2.5499, + "step": 31240 + }, + { + "epoch": 0.9264003795629095, + "grad_norm": 0.07168599218130112, + "learning_rate": 1.3586690565152326e-05, + "loss": 2.5484, + "step": 31241 + }, + { + "epoch": 0.9264300329152211, + "grad_norm": 0.07051423192024231, + "learning_rate": 1.3575798783392657e-05, + "loss": 2.5857, + "step": 31242 + }, + { + "epoch": 0.9264596862675325, + "grad_norm": 0.07209119200706482, + "learning_rate": 1.356491130900278e-05, + "loss": 2.5721, + "step": 31243 + }, + { + "epoch": 0.926489339619844, + "grad_norm": 0.07004822790622711, + "learning_rate": 1.3554028142079122e-05, + "loss": 2.5863, + "step": 31244 + }, + { + "epoch": 0.9265189929721555, + "grad_norm": 0.07115931063890457, + "learning_rate": 1.3543149282718047e-05, + "loss": 2.5578, + "step": 31245 + }, + { + "epoch": 0.926548646324467, + "grad_norm": 0.07015345990657806, + "learning_rate": 1.3532274731015925e-05, + "loss": 2.5884, + "step": 31246 + }, + { + "epoch": 0.9265782996767784, + "grad_norm": 0.07430516928434372, + "learning_rate": 1.3521404487068956e-05, + "loss": 2.5993, + "step": 31247 + }, + { + "epoch": 0.9266079530290899, + "grad_norm": 0.06661964952945709, + "learning_rate": 1.3510538550973395e-05, + "loss": 2.5456, + "step": 31248 + }, + { + "epoch": 0.9266376063814015, + "grad_norm": 0.07032530754804611, + "learning_rate": 1.3499676922825555e-05, + "loss": 2.5844, + "step": 31249 + }, + { + "epoch": 0.9266672597337129, + "grad_norm": 0.07291273772716522, + "learning_rate": 1.3488819602721636e-05, + "loss": 2.5738, + "step": 31250 + }, + { + "epoch": 0.9266969130860244, + "grad_norm": 0.07304801791906357, + "learning_rate": 1.3477966590757673e-05, + "loss": 2.5858, + "step": 31251 + }, + { + "epoch": 0.9267265664383358, + "grad_norm": 0.06939789652824402, + "learning_rate": 1.3467117887029867e-05, + "loss": 2.6031, + "step": 31252 + }, + { + "epoch": 0.9267562197906474, + "grad_norm": 0.07769634574651718, + "learning_rate": 1.3456273491634251e-05, + "loss": 2.6089, + "step": 31253 + }, + { + "epoch": 0.9267858731429588, + "grad_norm": 0.07329632341861725, + "learning_rate": 1.3445433404666808e-05, + "loss": 2.5642, + "step": 31254 + }, + { + "epoch": 0.9268155264952703, + "grad_norm": 0.06823363900184631, + "learning_rate": 1.3434597626223622e-05, + "loss": 2.5549, + "step": 31255 + }, + { + "epoch": 0.9268451798475817, + "grad_norm": 0.0680246576666832, + "learning_rate": 1.3423766156400563e-05, + "loss": 2.5186, + "step": 31256 + }, + { + "epoch": 0.9268748331998933, + "grad_norm": 0.07112085074186325, + "learning_rate": 1.341293899529361e-05, + "loss": 2.5694, + "step": 31257 + }, + { + "epoch": 0.9269044865522047, + "grad_norm": 0.07096083462238312, + "learning_rate": 1.3402116142998522e-05, + "loss": 2.5344, + "step": 31258 + }, + { + "epoch": 0.9269341399045162, + "grad_norm": 0.0723515972495079, + "learning_rate": 1.3391297599611274e-05, + "loss": 2.5702, + "step": 31259 + }, + { + "epoch": 0.9269637932568277, + "grad_norm": 0.07002324610948563, + "learning_rate": 1.3380483365227625e-05, + "loss": 2.5546, + "step": 31260 + }, + { + "epoch": 0.9269934466091392, + "grad_norm": 0.07051000744104385, + "learning_rate": 1.336967343994322e-05, + "loss": 2.5962, + "step": 31261 + }, + { + "epoch": 0.9270230999614506, + "grad_norm": 0.07378170639276505, + "learning_rate": 1.3358867823853982e-05, + "loss": 2.5546, + "step": 31262 + }, + { + "epoch": 0.9270527533137621, + "grad_norm": 0.06939024478197098, + "learning_rate": 1.3348066517055446e-05, + "loss": 2.5468, + "step": 31263 + }, + { + "epoch": 0.9270824066660736, + "grad_norm": 0.07085588574409485, + "learning_rate": 1.3337269519643368e-05, + "loss": 2.5555, + "step": 31264 + }, + { + "epoch": 0.9271120600183851, + "grad_norm": 0.07050242274999619, + "learning_rate": 1.3326476831713341e-05, + "loss": 2.5521, + "step": 31265 + }, + { + "epoch": 0.9271417133706965, + "grad_norm": 0.07017139345407486, + "learning_rate": 1.3315688453360842e-05, + "loss": 2.5647, + "step": 31266 + }, + { + "epoch": 0.927171366723008, + "grad_norm": 0.07055871188640594, + "learning_rate": 1.330490438468146e-05, + "loss": 2.524, + "step": 31267 + }, + { + "epoch": 0.9272010200753195, + "grad_norm": 0.06931741535663605, + "learning_rate": 1.3294124625770677e-05, + "loss": 2.5751, + "step": 31268 + }, + { + "epoch": 0.927230673427631, + "grad_norm": 0.07322457432746887, + "learning_rate": 1.328334917672397e-05, + "loss": 2.5826, + "step": 31269 + }, + { + "epoch": 0.9272603267799425, + "grad_norm": 0.07830700278282166, + "learning_rate": 1.3272578037636763e-05, + "loss": 2.5796, + "step": 31270 + }, + { + "epoch": 0.9272899801322539, + "grad_norm": 0.0742802768945694, + "learning_rate": 1.3261811208604368e-05, + "loss": 2.5488, + "step": 31271 + }, + { + "epoch": 0.9273196334845655, + "grad_norm": 0.0696246474981308, + "learning_rate": 1.3251048689722266e-05, + "loss": 2.5554, + "step": 31272 + }, + { + "epoch": 0.9273492868368769, + "grad_norm": 0.07250957190990448, + "learning_rate": 1.3240290481085603e-05, + "loss": 2.5738, + "step": 31273 + }, + { + "epoch": 0.9273789401891884, + "grad_norm": 0.0775250494480133, + "learning_rate": 1.3229536582789748e-05, + "loss": 2.586, + "step": 31274 + }, + { + "epoch": 0.9274085935414998, + "grad_norm": 0.0702953040599823, + "learning_rate": 1.3218786994929899e-05, + "loss": 2.5919, + "step": 31275 + }, + { + "epoch": 0.9274382468938114, + "grad_norm": 0.07243186235427856, + "learning_rate": 1.320804171760126e-05, + "loss": 2.5734, + "step": 31276 + }, + { + "epoch": 0.9274679002461228, + "grad_norm": 0.07540691643953323, + "learning_rate": 1.3197300750898977e-05, + "loss": 2.5498, + "step": 31277 + }, + { + "epoch": 0.9274975535984343, + "grad_norm": 0.07066023349761963, + "learning_rate": 1.3186564094918141e-05, + "loss": 2.5528, + "step": 31278 + }, + { + "epoch": 0.9275272069507458, + "grad_norm": 0.0734967514872551, + "learning_rate": 1.3175831749753842e-05, + "loss": 2.6068, + "step": 31279 + }, + { + "epoch": 0.9275568603030573, + "grad_norm": 0.06988876312971115, + "learning_rate": 1.3165103715501114e-05, + "loss": 2.5553, + "step": 31280 + }, + { + "epoch": 0.9275865136553687, + "grad_norm": 0.07211852073669434, + "learning_rate": 1.3154379992254938e-05, + "loss": 2.599, + "step": 31281 + }, + { + "epoch": 0.9276161670076802, + "grad_norm": 0.07291395962238312, + "learning_rate": 1.314366058011035e-05, + "loss": 2.5491, + "step": 31282 + }, + { + "epoch": 0.9276458203599917, + "grad_norm": 0.0758601501584053, + "learning_rate": 1.3132945479162161e-05, + "loss": 2.5578, + "step": 31283 + }, + { + "epoch": 0.9276754737123032, + "grad_norm": 0.07184541970491409, + "learning_rate": 1.3122234689505296e-05, + "loss": 2.565, + "step": 31284 + }, + { + "epoch": 0.9277051270646146, + "grad_norm": 0.06936120241880417, + "learning_rate": 1.311152821123468e-05, + "loss": 2.5587, + "step": 31285 + }, + { + "epoch": 0.9277347804169261, + "grad_norm": 0.07252173870801926, + "learning_rate": 1.3100826044445014e-05, + "loss": 2.5448, + "step": 31286 + }, + { + "epoch": 0.9277644337692376, + "grad_norm": 0.07333964109420776, + "learning_rate": 1.309012818923111e-05, + "loss": 2.5646, + "step": 31287 + }, + { + "epoch": 0.9277940871215491, + "grad_norm": 0.07639946043491364, + "learning_rate": 1.3079434645687671e-05, + "loss": 2.5652, + "step": 31288 + }, + { + "epoch": 0.9278237404738606, + "grad_norm": 0.07214830815792084, + "learning_rate": 1.3068745413909455e-05, + "loss": 2.5843, + "step": 31289 + }, + { + "epoch": 0.927853393826172, + "grad_norm": 0.0734207034111023, + "learning_rate": 1.3058060493991053e-05, + "loss": 2.5548, + "step": 31290 + }, + { + "epoch": 0.9278830471784836, + "grad_norm": 0.07255431264638901, + "learning_rate": 1.3047379886027111e-05, + "loss": 2.5336, + "step": 31291 + }, + { + "epoch": 0.927912700530795, + "grad_norm": 0.07658113539218903, + "learning_rate": 1.303670359011222e-05, + "loss": 2.565, + "step": 31292 + }, + { + "epoch": 0.9279423538831065, + "grad_norm": 0.07011198997497559, + "learning_rate": 1.3026031606340915e-05, + "loss": 2.5591, + "step": 31293 + }, + { + "epoch": 0.927972007235418, + "grad_norm": 0.07461782544851303, + "learning_rate": 1.3015363934807678e-05, + "loss": 2.6213, + "step": 31294 + }, + { + "epoch": 0.9280016605877295, + "grad_norm": 0.0730162113904953, + "learning_rate": 1.3004700575606987e-05, + "loss": 2.5551, + "step": 31295 + }, + { + "epoch": 0.9280313139400409, + "grad_norm": 0.07181300222873688, + "learning_rate": 1.2994041528833267e-05, + "loss": 2.5794, + "step": 31296 + }, + { + "epoch": 0.9280609672923524, + "grad_norm": 0.07030940800905228, + "learning_rate": 1.2983386794580888e-05, + "loss": 2.5381, + "step": 31297 + }, + { + "epoch": 0.9280906206446639, + "grad_norm": 0.07236554473638535, + "learning_rate": 1.2972736372944216e-05, + "loss": 2.5485, + "step": 31298 + }, + { + "epoch": 0.9281202739969754, + "grad_norm": 0.07729268819093704, + "learning_rate": 1.2962090264017568e-05, + "loss": 2.558, + "step": 31299 + }, + { + "epoch": 0.9281499273492868, + "grad_norm": 0.07057848572731018, + "learning_rate": 1.2951448467895199e-05, + "loss": 2.5661, + "step": 31300 + }, + { + "epoch": 0.9281795807015983, + "grad_norm": 0.07095932960510254, + "learning_rate": 1.294081098467137e-05, + "loss": 2.5699, + "step": 31301 + }, + { + "epoch": 0.9282092340539098, + "grad_norm": 0.07190268486738205, + "learning_rate": 1.2930177814440225e-05, + "loss": 2.511, + "step": 31302 + }, + { + "epoch": 0.9282388874062213, + "grad_norm": 0.07241285592317581, + "learning_rate": 1.2919548957296024e-05, + "loss": 2.5564, + "step": 31303 + }, + { + "epoch": 0.9282685407585327, + "grad_norm": 0.0749180018901825, + "learning_rate": 1.2908924413332746e-05, + "loss": 2.5782, + "step": 31304 + }, + { + "epoch": 0.9282981941108442, + "grad_norm": 0.0701940730214119, + "learning_rate": 1.2898304182644594e-05, + "loss": 2.563, + "step": 31305 + }, + { + "epoch": 0.9283278474631557, + "grad_norm": 0.0733252540230751, + "learning_rate": 1.2887688265325604e-05, + "loss": 2.6042, + "step": 31306 + }, + { + "epoch": 0.9283575008154672, + "grad_norm": 0.07320728898048401, + "learning_rate": 1.2877076661469699e-05, + "loss": 2.5447, + "step": 31307 + }, + { + "epoch": 0.9283871541677786, + "grad_norm": 0.07230512797832489, + "learning_rate": 1.2866469371170864e-05, + "loss": 2.5663, + "step": 31308 + }, + { + "epoch": 0.9284168075200901, + "grad_norm": 0.07911679148674011, + "learning_rate": 1.285586639452313e-05, + "loss": 2.5642, + "step": 31309 + }, + { + "epoch": 0.9284464608724017, + "grad_norm": 0.07298639416694641, + "learning_rate": 1.2845267731620314e-05, + "loss": 2.5792, + "step": 31310 + }, + { + "epoch": 0.9284761142247131, + "grad_norm": 0.07399782538414001, + "learning_rate": 1.2834673382556227e-05, + "loss": 2.5515, + "step": 31311 + }, + { + "epoch": 0.9285057675770246, + "grad_norm": 0.07399877160787582, + "learning_rate": 1.2824083347424743e-05, + "loss": 2.5856, + "step": 31312 + }, + { + "epoch": 0.928535420929336, + "grad_norm": 0.07026924937963486, + "learning_rate": 1.2813497626319614e-05, + "loss": 2.5473, + "step": 31313 + }, + { + "epoch": 0.9285650742816476, + "grad_norm": 0.07590536773204803, + "learning_rate": 1.2802916219334604e-05, + "loss": 2.5729, + "step": 31314 + }, + { + "epoch": 0.928594727633959, + "grad_norm": 0.07399394363164902, + "learning_rate": 1.2792339126563358e-05, + "loss": 2.5678, + "step": 31315 + }, + { + "epoch": 0.9286243809862705, + "grad_norm": 0.07261496782302856, + "learning_rate": 1.2781766348099632e-05, + "loss": 2.5657, + "step": 31316 + }, + { + "epoch": 0.928654034338582, + "grad_norm": 0.07402976602315903, + "learning_rate": 1.2771197884036966e-05, + "loss": 2.5647, + "step": 31317 + }, + { + "epoch": 0.9286836876908935, + "grad_norm": 0.07313336431980133, + "learning_rate": 1.2760633734468951e-05, + "loss": 2.593, + "step": 31318 + }, + { + "epoch": 0.9287133410432049, + "grad_norm": 0.0732121467590332, + "learning_rate": 1.2750073899489179e-05, + "loss": 2.5798, + "step": 31319 + }, + { + "epoch": 0.9287429943955164, + "grad_norm": 0.07404683530330658, + "learning_rate": 1.2739518379191183e-05, + "loss": 2.5927, + "step": 31320 + }, + { + "epoch": 0.9287726477478279, + "grad_norm": 0.0730813518166542, + "learning_rate": 1.2728967173668393e-05, + "loss": 2.5197, + "step": 31321 + }, + { + "epoch": 0.9288023011001394, + "grad_norm": 0.07064629346132278, + "learning_rate": 1.2718420283014175e-05, + "loss": 2.5692, + "step": 31322 + }, + { + "epoch": 0.9288319544524508, + "grad_norm": 0.07052606344223022, + "learning_rate": 1.2707877707322013e-05, + "loss": 2.5737, + "step": 31323 + }, + { + "epoch": 0.9288616078047623, + "grad_norm": 0.06979499012231827, + "learning_rate": 1.2697339446685218e-05, + "loss": 2.5772, + "step": 31324 + }, + { + "epoch": 0.9288912611570738, + "grad_norm": 0.07119312882423401, + "learning_rate": 1.2686805501197163e-05, + "loss": 2.5577, + "step": 31325 + }, + { + "epoch": 0.9289209145093853, + "grad_norm": 0.07389084994792938, + "learning_rate": 1.2676275870950994e-05, + "loss": 2.554, + "step": 31326 + }, + { + "epoch": 0.9289505678616967, + "grad_norm": 0.07394661754369736, + "learning_rate": 1.2665750556040135e-05, + "loss": 2.5971, + "step": 31327 + }, + { + "epoch": 0.9289802212140083, + "grad_norm": 0.07034941017627716, + "learning_rate": 1.2655229556557734e-05, + "loss": 2.5805, + "step": 31328 + }, + { + "epoch": 0.9290098745663197, + "grad_norm": 0.07140347361564636, + "learning_rate": 1.2644712872596887e-05, + "loss": 2.5961, + "step": 31329 + }, + { + "epoch": 0.9290395279186312, + "grad_norm": 0.07531657814979553, + "learning_rate": 1.2634200504250736e-05, + "loss": 2.5589, + "step": 31330 + }, + { + "epoch": 0.9290691812709427, + "grad_norm": 0.0700526088476181, + "learning_rate": 1.262369245161249e-05, + "loss": 2.5624, + "step": 31331 + }, + { + "epoch": 0.9290988346232542, + "grad_norm": 0.07414255291223526, + "learning_rate": 1.2613188714775014e-05, + "loss": 2.5315, + "step": 31332 + }, + { + "epoch": 0.9291284879755657, + "grad_norm": 0.0734509602189064, + "learning_rate": 1.2602689293831405e-05, + "loss": 2.5205, + "step": 31333 + }, + { + "epoch": 0.9291581413278771, + "grad_norm": 0.06917718797922134, + "learning_rate": 1.2592194188874694e-05, + "loss": 2.5423, + "step": 31334 + }, + { + "epoch": 0.9291877946801886, + "grad_norm": 0.07250598073005676, + "learning_rate": 1.2581703399997757e-05, + "loss": 2.6176, + "step": 31335 + }, + { + "epoch": 0.9292174480325001, + "grad_norm": 0.07505233585834503, + "learning_rate": 1.257121692729346e-05, + "loss": 2.5704, + "step": 31336 + }, + { + "epoch": 0.9292471013848116, + "grad_norm": 0.07101701945066452, + "learning_rate": 1.2560734770854732e-05, + "loss": 2.5631, + "step": 31337 + }, + { + "epoch": 0.929276754737123, + "grad_norm": 0.07007481157779694, + "learning_rate": 1.2550256930774384e-05, + "loss": 2.5594, + "step": 31338 + }, + { + "epoch": 0.9293064080894345, + "grad_norm": 0.07114969193935394, + "learning_rate": 1.2539783407145067e-05, + "loss": 2.5529, + "step": 31339 + }, + { + "epoch": 0.929336061441746, + "grad_norm": 0.07072416692972183, + "learning_rate": 1.252931420005976e-05, + "loss": 2.6193, + "step": 31340 + }, + { + "epoch": 0.9293657147940575, + "grad_norm": 0.07016705721616745, + "learning_rate": 1.2518849309611058e-05, + "loss": 2.578, + "step": 31341 + }, + { + "epoch": 0.9293953681463689, + "grad_norm": 0.07145686447620392, + "learning_rate": 1.2508388735891607e-05, + "loss": 2.5804, + "step": 31342 + }, + { + "epoch": 0.9294250214986804, + "grad_norm": 0.07253672182559967, + "learning_rate": 1.2497932478994001e-05, + "loss": 2.5304, + "step": 31343 + }, + { + "epoch": 0.9294546748509919, + "grad_norm": 0.07280370593070984, + "learning_rate": 1.2487480539010887e-05, + "loss": 2.5696, + "step": 31344 + }, + { + "epoch": 0.9294843282033034, + "grad_norm": 0.06945808231830597, + "learning_rate": 1.2477032916034858e-05, + "loss": 2.5536, + "step": 31345 + }, + { + "epoch": 0.9295139815556148, + "grad_norm": 0.07038458436727524, + "learning_rate": 1.246658961015834e-05, + "loss": 2.5408, + "step": 31346 + }, + { + "epoch": 0.9295436349079264, + "grad_norm": 0.06642254441976547, + "learning_rate": 1.2456150621473872e-05, + "loss": 2.5441, + "step": 31347 + }, + { + "epoch": 0.9295732882602378, + "grad_norm": 0.06947416067123413, + "learning_rate": 1.2445715950073876e-05, + "loss": 2.5711, + "step": 31348 + }, + { + "epoch": 0.9296029416125493, + "grad_norm": 0.06959274411201477, + "learning_rate": 1.2435285596050783e-05, + "loss": 2.5559, + "step": 31349 + }, + { + "epoch": 0.9296325949648607, + "grad_norm": 0.06963727623224258, + "learning_rate": 1.2424859559496903e-05, + "loss": 2.5605, + "step": 31350 + }, + { + "epoch": 0.9296622483171723, + "grad_norm": 0.076178640127182, + "learning_rate": 1.2414437840504555e-05, + "loss": 2.5817, + "step": 31351 + }, + { + "epoch": 0.9296919016694838, + "grad_norm": 0.06905380636453629, + "learning_rate": 1.2404020439166053e-05, + "loss": 2.5776, + "step": 31352 + }, + { + "epoch": 0.9297215550217952, + "grad_norm": 0.0711008831858635, + "learning_rate": 1.239360735557371e-05, + "loss": 2.5564, + "step": 31353 + }, + { + "epoch": 0.9297512083741067, + "grad_norm": 0.0725887343287468, + "learning_rate": 1.2383198589819622e-05, + "loss": 2.6086, + "step": 31354 + }, + { + "epoch": 0.9297808617264182, + "grad_norm": 0.0717925876379013, + "learning_rate": 1.2372794141995991e-05, + "loss": 2.5462, + "step": 31355 + }, + { + "epoch": 0.9298105150787297, + "grad_norm": 0.07206364721059799, + "learning_rate": 1.2362394012195022e-05, + "loss": 2.5759, + "step": 31356 + }, + { + "epoch": 0.9298401684310411, + "grad_norm": 0.07232900708913803, + "learning_rate": 1.2351998200508697e-05, + "loss": 2.5505, + "step": 31357 + }, + { + "epoch": 0.9298698217833526, + "grad_norm": 0.07001178711652756, + "learning_rate": 1.2341606707029162e-05, + "loss": 2.5895, + "step": 31358 + }, + { + "epoch": 0.9298994751356641, + "grad_norm": 0.07098115980625153, + "learning_rate": 1.2331219531848403e-05, + "loss": 2.5593, + "step": 31359 + }, + { + "epoch": 0.9299291284879756, + "grad_norm": 0.07114805281162262, + "learning_rate": 1.23208366750584e-05, + "loss": 2.5637, + "step": 31360 + }, + { + "epoch": 0.929958781840287, + "grad_norm": 0.07413454353809357, + "learning_rate": 1.231045813675108e-05, + "loss": 2.5615, + "step": 31361 + }, + { + "epoch": 0.9299884351925986, + "grad_norm": 0.06791914254426956, + "learning_rate": 1.2300083917018422e-05, + "loss": 2.5469, + "step": 31362 + }, + { + "epoch": 0.93001808854491, + "grad_norm": 0.07192706316709518, + "learning_rate": 1.2289714015952192e-05, + "loss": 2.5735, + "step": 31363 + }, + { + "epoch": 0.9300477418972215, + "grad_norm": 0.07248866558074951, + "learning_rate": 1.2279348433644256e-05, + "loss": 2.5735, + "step": 31364 + }, + { + "epoch": 0.9300773952495329, + "grad_norm": 0.06721384823322296, + "learning_rate": 1.2268987170186375e-05, + "loss": 2.5324, + "step": 31365 + }, + { + "epoch": 0.9301070486018445, + "grad_norm": 0.07232566177845001, + "learning_rate": 1.2258630225670364e-05, + "loss": 2.5506, + "step": 31366 + }, + { + "epoch": 0.9301367019541559, + "grad_norm": 0.07424359768629074, + "learning_rate": 1.2248277600187929e-05, + "loss": 2.5725, + "step": 31367 + }, + { + "epoch": 0.9301663553064674, + "grad_norm": 0.07146301120519638, + "learning_rate": 1.2237929293830718e-05, + "loss": 2.5699, + "step": 31368 + }, + { + "epoch": 0.9301960086587788, + "grad_norm": 0.07220407575368881, + "learning_rate": 1.2227585306690325e-05, + "loss": 2.5447, + "step": 31369 + }, + { + "epoch": 0.9302256620110904, + "grad_norm": 0.07326182723045349, + "learning_rate": 1.2217245638858399e-05, + "loss": 2.5612, + "step": 31370 + }, + { + "epoch": 0.9302553153634018, + "grad_norm": 0.07679618149995804, + "learning_rate": 1.2206910290426477e-05, + "loss": 2.5856, + "step": 31371 + }, + { + "epoch": 0.9302849687157133, + "grad_norm": 0.07031376659870148, + "learning_rate": 1.2196579261486152e-05, + "loss": 2.5676, + "step": 31372 + }, + { + "epoch": 0.9303146220680248, + "grad_norm": 0.06940107047557831, + "learning_rate": 1.2186252552128795e-05, + "loss": 2.5691, + "step": 31373 + }, + { + "epoch": 0.9303442754203363, + "grad_norm": 0.07520674169063568, + "learning_rate": 1.2175930162445891e-05, + "loss": 2.5449, + "step": 31374 + }, + { + "epoch": 0.9303739287726478, + "grad_norm": 0.07417158037424088, + "learning_rate": 1.2165612092528922e-05, + "loss": 2.5643, + "step": 31375 + }, + { + "epoch": 0.9304035821249592, + "grad_norm": 0.07301551103591919, + "learning_rate": 1.2155298342469202e-05, + "loss": 2.5764, + "step": 31376 + }, + { + "epoch": 0.9304332354772707, + "grad_norm": 0.0767822265625, + "learning_rate": 1.214498891235799e-05, + "loss": 2.5513, + "step": 31377 + }, + { + "epoch": 0.9304628888295822, + "grad_norm": 0.07259368896484375, + "learning_rate": 1.2134683802286605e-05, + "loss": 2.5496, + "step": 31378 + }, + { + "epoch": 0.9304925421818937, + "grad_norm": 0.07343021780252457, + "learning_rate": 1.2124383012346419e-05, + "loss": 2.5735, + "step": 31379 + }, + { + "epoch": 0.9305221955342051, + "grad_norm": 0.0722104087471962, + "learning_rate": 1.2114086542628522e-05, + "loss": 2.5535, + "step": 31380 + }, + { + "epoch": 0.9305518488865167, + "grad_norm": 0.07045001536607742, + "learning_rate": 1.2103794393224122e-05, + "loss": 2.5748, + "step": 31381 + }, + { + "epoch": 0.9305815022388281, + "grad_norm": 0.07046640664339066, + "learning_rate": 1.2093506564224421e-05, + "loss": 2.5755, + "step": 31382 + }, + { + "epoch": 0.9306111555911396, + "grad_norm": 0.07157739996910095, + "learning_rate": 1.2083223055720405e-05, + "loss": 2.5588, + "step": 31383 + }, + { + "epoch": 0.930640808943451, + "grad_norm": 0.07442599534988403, + "learning_rate": 1.2072943867803222e-05, + "loss": 2.5482, + "step": 31384 + }, + { + "epoch": 0.9306704622957626, + "grad_norm": 0.07317469269037247, + "learning_rate": 1.2062669000563908e-05, + "loss": 2.5808, + "step": 31385 + }, + { + "epoch": 0.930700115648074, + "grad_norm": 0.07246823608875275, + "learning_rate": 1.205239845409345e-05, + "loss": 2.5683, + "step": 31386 + }, + { + "epoch": 0.9307297690003855, + "grad_norm": 0.06926063448190689, + "learning_rate": 1.204213222848266e-05, + "loss": 2.5708, + "step": 31387 + }, + { + "epoch": 0.9307594223526969, + "grad_norm": 0.07129865884780884, + "learning_rate": 1.203187032382258e-05, + "loss": 2.5361, + "step": 31388 + }, + { + "epoch": 0.9307890757050085, + "grad_norm": 0.06785444170236588, + "learning_rate": 1.202161274020408e-05, + "loss": 2.595, + "step": 31389 + }, + { + "epoch": 0.9308187290573199, + "grad_norm": 0.0730213150382042, + "learning_rate": 1.201135947771792e-05, + "loss": 2.5769, + "step": 31390 + }, + { + "epoch": 0.9308483824096314, + "grad_norm": 0.07374830543994904, + "learning_rate": 1.2001110536454917e-05, + "loss": 2.5597, + "step": 31391 + }, + { + "epoch": 0.9308780357619428, + "grad_norm": 0.07578586786985397, + "learning_rate": 1.1990865916505834e-05, + "loss": 2.5666, + "step": 31392 + }, + { + "epoch": 0.9309076891142544, + "grad_norm": 0.09614846855401993, + "learning_rate": 1.1980625617961427e-05, + "loss": 2.5745, + "step": 31393 + }, + { + "epoch": 0.9309373424665659, + "grad_norm": 0.06979165226221085, + "learning_rate": 1.1970389640912404e-05, + "loss": 2.6029, + "step": 31394 + }, + { + "epoch": 0.9309669958188773, + "grad_norm": 0.07123665511608124, + "learning_rate": 1.1960157985449305e-05, + "loss": 2.5959, + "step": 31395 + }, + { + "epoch": 0.9309966491711889, + "grad_norm": 0.06963920593261719, + "learning_rate": 1.1949930651662776e-05, + "loss": 2.5356, + "step": 31396 + }, + { + "epoch": 0.9310263025235003, + "grad_norm": 0.07079663127660751, + "learning_rate": 1.1939707639643416e-05, + "loss": 2.5756, + "step": 31397 + }, + { + "epoch": 0.9310559558758118, + "grad_norm": 0.07570625096559525, + "learning_rate": 1.1929488949481649e-05, + "loss": 2.5947, + "step": 31398 + }, + { + "epoch": 0.9310856092281232, + "grad_norm": 0.07263673841953278, + "learning_rate": 1.191927458126807e-05, + "loss": 2.544, + "step": 31399 + }, + { + "epoch": 0.9311152625804348, + "grad_norm": 0.07404949516057968, + "learning_rate": 1.1909064535093106e-05, + "loss": 2.5898, + "step": 31400 + }, + { + "epoch": 0.9311449159327462, + "grad_norm": 0.06893303990364075, + "learning_rate": 1.1898858811047131e-05, + "loss": 2.5646, + "step": 31401 + }, + { + "epoch": 0.9311745692850577, + "grad_norm": 0.07479596883058548, + "learning_rate": 1.1888657409220571e-05, + "loss": 2.5815, + "step": 31402 + }, + { + "epoch": 0.9312042226373691, + "grad_norm": 0.07047494500875473, + "learning_rate": 1.1878460329703745e-05, + "loss": 2.5624, + "step": 31403 + }, + { + "epoch": 0.9312338759896807, + "grad_norm": 0.07287587970495224, + "learning_rate": 1.1868267572586855e-05, + "loss": 2.5561, + "step": 31404 + }, + { + "epoch": 0.9312635293419921, + "grad_norm": 0.06806948035955429, + "learning_rate": 1.1858079137960276e-05, + "loss": 2.5542, + "step": 31405 + }, + { + "epoch": 0.9312931826943036, + "grad_norm": 0.06938512623310089, + "learning_rate": 1.1847895025914213e-05, + "loss": 2.5627, + "step": 31406 + }, + { + "epoch": 0.931322836046615, + "grad_norm": 0.0681891143321991, + "learning_rate": 1.1837715236538871e-05, + "loss": 2.5628, + "step": 31407 + }, + { + "epoch": 0.9313524893989266, + "grad_norm": 0.0717681273818016, + "learning_rate": 1.1827539769924345e-05, + "loss": 2.5787, + "step": 31408 + }, + { + "epoch": 0.931382142751238, + "grad_norm": 0.06906365603208542, + "learning_rate": 1.1817368626160674e-05, + "loss": 2.556, + "step": 31409 + }, + { + "epoch": 0.9314117961035495, + "grad_norm": 0.06998880952596664, + "learning_rate": 1.1807201805338064e-05, + "loss": 2.5839, + "step": 31410 + }, + { + "epoch": 0.9314414494558609, + "grad_norm": 0.07081819325685501, + "learning_rate": 1.1797039307546442e-05, + "loss": 2.5611, + "step": 31411 + }, + { + "epoch": 0.9314711028081725, + "grad_norm": 0.07206539064645767, + "learning_rate": 1.178688113287585e-05, + "loss": 2.5667, + "step": 31412 + }, + { + "epoch": 0.9315007561604839, + "grad_norm": 0.069037064909935, + "learning_rate": 1.1776727281416265e-05, + "loss": 2.56, + "step": 31413 + }, + { + "epoch": 0.9315304095127954, + "grad_norm": 0.06944017857313156, + "learning_rate": 1.1766577753257512e-05, + "loss": 2.5757, + "step": 31414 + }, + { + "epoch": 0.931560062865107, + "grad_norm": 0.07053609192371368, + "learning_rate": 1.1756432548489514e-05, + "loss": 2.5958, + "step": 31415 + }, + { + "epoch": 0.9315897162174184, + "grad_norm": 0.06697996705770493, + "learning_rate": 1.1746291667202147e-05, + "loss": 2.5632, + "step": 31416 + }, + { + "epoch": 0.9316193695697299, + "grad_norm": 0.0709361881017685, + "learning_rate": 1.1736155109485114e-05, + "loss": 2.5482, + "step": 31417 + }, + { + "epoch": 0.9316490229220413, + "grad_norm": 0.07087142020463943, + "learning_rate": 1.1726022875428288e-05, + "loss": 2.5794, + "step": 31418 + }, + { + "epoch": 0.9316786762743529, + "grad_norm": 0.07175975292921066, + "learning_rate": 1.1715894965121376e-05, + "loss": 2.5317, + "step": 31419 + }, + { + "epoch": 0.9317083296266643, + "grad_norm": 0.06929074972867966, + "learning_rate": 1.1705771378653973e-05, + "loss": 2.5676, + "step": 31420 + }, + { + "epoch": 0.9317379829789758, + "grad_norm": 0.07228066027164459, + "learning_rate": 1.1695652116115785e-05, + "loss": 2.5736, + "step": 31421 + }, + { + "epoch": 0.9317676363312872, + "grad_norm": 0.07001009583473206, + "learning_rate": 1.1685537177596405e-05, + "loss": 2.5339, + "step": 31422 + }, + { + "epoch": 0.9317972896835988, + "grad_norm": 0.06743910163640976, + "learning_rate": 1.1675426563185432e-05, + "loss": 2.5204, + "step": 31423 + }, + { + "epoch": 0.9318269430359102, + "grad_norm": 0.06902024894952774, + "learning_rate": 1.1665320272972347e-05, + "loss": 2.5586, + "step": 31424 + }, + { + "epoch": 0.9318565963882217, + "grad_norm": 0.06924908608198166, + "learning_rate": 1.165521830704669e-05, + "loss": 2.5789, + "step": 31425 + }, + { + "epoch": 0.9318862497405331, + "grad_norm": 0.06934987753629684, + "learning_rate": 1.1645120665497887e-05, + "loss": 2.5619, + "step": 31426 + }, + { + "epoch": 0.9319159030928447, + "grad_norm": 0.06915593892335892, + "learning_rate": 1.1635027348415427e-05, + "loss": 2.5802, + "step": 31427 + }, + { + "epoch": 0.9319455564451561, + "grad_norm": 0.07120808213949203, + "learning_rate": 1.162493835588857e-05, + "loss": 2.5911, + "step": 31428 + }, + { + "epoch": 0.9319752097974676, + "grad_norm": 0.07160495966672897, + "learning_rate": 1.1614853688006688e-05, + "loss": 2.5806, + "step": 31429 + }, + { + "epoch": 0.932004863149779, + "grad_norm": 0.06861793994903564, + "learning_rate": 1.1604773344859155e-05, + "loss": 2.548, + "step": 31430 + }, + { + "epoch": 0.9320345165020906, + "grad_norm": 0.074119932949543, + "learning_rate": 1.1594697326535175e-05, + "loss": 2.5609, + "step": 31431 + }, + { + "epoch": 0.932064169854402, + "grad_norm": 0.07299519330263138, + "learning_rate": 1.1584625633123957e-05, + "loss": 2.5457, + "step": 31432 + }, + { + "epoch": 0.9320938232067135, + "grad_norm": 0.07091303169727325, + "learning_rate": 1.1574558264714763e-05, + "loss": 2.5196, + "step": 31433 + }, + { + "epoch": 0.9321234765590249, + "grad_norm": 0.06814755499362946, + "learning_rate": 1.1564495221396686e-05, + "loss": 2.5727, + "step": 31434 + }, + { + "epoch": 0.9321531299113365, + "grad_norm": 0.06894565373659134, + "learning_rate": 1.1554436503258824e-05, + "loss": 2.5239, + "step": 31435 + }, + { + "epoch": 0.932182783263648, + "grad_norm": 0.06842587143182755, + "learning_rate": 1.1544382110390272e-05, + "loss": 2.5775, + "step": 31436 + }, + { + "epoch": 0.9322124366159594, + "grad_norm": 0.07027511298656464, + "learning_rate": 1.1534332042880013e-05, + "loss": 2.5786, + "step": 31437 + }, + { + "epoch": 0.932242089968271, + "grad_norm": 0.07013814896345139, + "learning_rate": 1.1524286300817143e-05, + "loss": 2.5779, + "step": 31438 + }, + { + "epoch": 0.9322717433205824, + "grad_norm": 0.07273095101118088, + "learning_rate": 1.1514244884290536e-05, + "loss": 2.5527, + "step": 31439 + }, + { + "epoch": 0.9323013966728939, + "grad_norm": 0.0728221908211708, + "learning_rate": 1.1504207793389177e-05, + "loss": 2.5739, + "step": 31440 + }, + { + "epoch": 0.9323310500252053, + "grad_norm": 0.06969975680112839, + "learning_rate": 1.1494175028201936e-05, + "loss": 2.5905, + "step": 31441 + }, + { + "epoch": 0.9323607033775169, + "grad_norm": 0.07498156279325485, + "learning_rate": 1.1484146588817523e-05, + "loss": 2.5608, + "step": 31442 + }, + { + "epoch": 0.9323903567298283, + "grad_norm": 0.07149748504161835, + "learning_rate": 1.1474122475324867e-05, + "loss": 2.5797, + "step": 31443 + }, + { + "epoch": 0.9324200100821398, + "grad_norm": 0.07593968510627747, + "learning_rate": 1.1464102687812728e-05, + "loss": 2.595, + "step": 31444 + }, + { + "epoch": 0.9324496634344512, + "grad_norm": 0.07065343111753464, + "learning_rate": 1.1454087226369869e-05, + "loss": 2.5651, + "step": 31445 + }, + { + "epoch": 0.9324793167867628, + "grad_norm": 0.07019659876823425, + "learning_rate": 1.1444076091084887e-05, + "loss": 2.5386, + "step": 31446 + }, + { + "epoch": 0.9325089701390742, + "grad_norm": 0.07310301065444946, + "learning_rate": 1.1434069282046433e-05, + "loss": 2.5649, + "step": 31447 + }, + { + "epoch": 0.9325386234913857, + "grad_norm": 0.06710086762905121, + "learning_rate": 1.1424066799343213e-05, + "loss": 2.5594, + "step": 31448 + }, + { + "epoch": 0.9325682768436971, + "grad_norm": 0.06949098408222198, + "learning_rate": 1.1414068643063713e-05, + "loss": 2.5443, + "step": 31449 + }, + { + "epoch": 0.9325979301960087, + "grad_norm": 0.07208991795778275, + "learning_rate": 1.1404074813296472e-05, + "loss": 2.5788, + "step": 31450 + }, + { + "epoch": 0.9326275835483201, + "grad_norm": 0.07490438967943192, + "learning_rate": 1.1394085310130087e-05, + "loss": 2.5656, + "step": 31451 + }, + { + "epoch": 0.9326572369006316, + "grad_norm": 0.06983622908592224, + "learning_rate": 1.138410013365293e-05, + "loss": 2.5624, + "step": 31452 + }, + { + "epoch": 0.932686890252943, + "grad_norm": 0.06815040856599808, + "learning_rate": 1.1374119283953432e-05, + "loss": 2.5571, + "step": 31453 + }, + { + "epoch": 0.9327165436052546, + "grad_norm": 0.07159668952226639, + "learning_rate": 1.1364142761119966e-05, + "loss": 2.5385, + "step": 31454 + }, + { + "epoch": 0.932746196957566, + "grad_norm": 0.074449323117733, + "learning_rate": 1.135417056524085e-05, + "loss": 2.5926, + "step": 31455 + }, + { + "epoch": 0.9327758503098775, + "grad_norm": 0.06999773532152176, + "learning_rate": 1.134420269640446e-05, + "loss": 2.5727, + "step": 31456 + }, + { + "epoch": 0.9328055036621891, + "grad_norm": 0.06863249093294144, + "learning_rate": 1.1334239154699e-05, + "loss": 2.5822, + "step": 31457 + }, + { + "epoch": 0.9328351570145005, + "grad_norm": 0.0677386075258255, + "learning_rate": 1.1324279940212789e-05, + "loss": 2.5765, + "step": 31458 + }, + { + "epoch": 0.932864810366812, + "grad_norm": 0.0684288963675499, + "learning_rate": 1.1314325053033925e-05, + "loss": 2.5647, + "step": 31459 + }, + { + "epoch": 0.9328944637191234, + "grad_norm": 0.07336781173944473, + "learning_rate": 1.1304374493250613e-05, + "loss": 2.5718, + "step": 31460 + }, + { + "epoch": 0.932924117071435, + "grad_norm": 0.06941413879394531, + "learning_rate": 1.129442826095095e-05, + "loss": 2.5412, + "step": 31461 + }, + { + "epoch": 0.9329537704237464, + "grad_norm": 0.07291670143604279, + "learning_rate": 1.1284486356223033e-05, + "loss": 2.5569, + "step": 31462 + }, + { + "epoch": 0.9329834237760579, + "grad_norm": 0.07199141383171082, + "learning_rate": 1.1274548779154847e-05, + "loss": 2.6012, + "step": 31463 + }, + { + "epoch": 0.9330130771283693, + "grad_norm": 0.07358603179454803, + "learning_rate": 1.1264615529834433e-05, + "loss": 2.5894, + "step": 31464 + }, + { + "epoch": 0.9330427304806809, + "grad_norm": 0.07039138674736023, + "learning_rate": 1.1254686608349718e-05, + "loss": 2.5767, + "step": 31465 + }, + { + "epoch": 0.9330723838329923, + "grad_norm": 0.07002244144678116, + "learning_rate": 1.1244762014788635e-05, + "loss": 2.5666, + "step": 31466 + }, + { + "epoch": 0.9331020371853038, + "grad_norm": 0.07201238721609116, + "learning_rate": 1.123484174923911e-05, + "loss": 2.5548, + "step": 31467 + }, + { + "epoch": 0.9331316905376152, + "grad_norm": 0.07282138615846634, + "learning_rate": 1.1224925811788855e-05, + "loss": 2.5705, + "step": 31468 + }, + { + "epoch": 0.9331613438899268, + "grad_norm": 0.0718030333518982, + "learning_rate": 1.1215014202525908e-05, + "loss": 2.5566, + "step": 31469 + }, + { + "epoch": 0.9331909972422382, + "grad_norm": 0.06920815259218216, + "learning_rate": 1.1205106921537867e-05, + "loss": 2.5613, + "step": 31470 + }, + { + "epoch": 0.9332206505945497, + "grad_norm": 0.06960286945104599, + "learning_rate": 1.1195203968912493e-05, + "loss": 2.5845, + "step": 31471 + }, + { + "epoch": 0.9332503039468611, + "grad_norm": 0.07084044814109802, + "learning_rate": 1.1185305344737495e-05, + "loss": 2.5636, + "step": 31472 + }, + { + "epoch": 0.9332799572991727, + "grad_norm": 0.07005088031291962, + "learning_rate": 1.117541104910058e-05, + "loss": 2.6227, + "step": 31473 + }, + { + "epoch": 0.9333096106514841, + "grad_norm": 0.06982453167438507, + "learning_rate": 1.1165521082089237e-05, + "loss": 2.5275, + "step": 31474 + }, + { + "epoch": 0.9333392640037956, + "grad_norm": 0.06843496114015579, + "learning_rate": 1.1155635443791113e-05, + "loss": 2.5834, + "step": 31475 + }, + { + "epoch": 0.933368917356107, + "grad_norm": 0.07095088809728622, + "learning_rate": 1.1145754134293806e-05, + "loss": 2.5374, + "step": 31476 + }, + { + "epoch": 0.9333985707084186, + "grad_norm": 0.07243355363607407, + "learning_rate": 1.1135877153684692e-05, + "loss": 2.5685, + "step": 31477 + }, + { + "epoch": 0.9334282240607301, + "grad_norm": 0.07452966272830963, + "learning_rate": 1.1126004502051313e-05, + "loss": 2.542, + "step": 31478 + }, + { + "epoch": 0.9334578774130415, + "grad_norm": 0.06742757558822632, + "learning_rate": 1.1116136179481096e-05, + "loss": 2.5509, + "step": 31479 + }, + { + "epoch": 0.9334875307653531, + "grad_norm": 0.07397245615720749, + "learning_rate": 1.1106272186061362e-05, + "loss": 2.5593, + "step": 31480 + }, + { + "epoch": 0.9335171841176645, + "grad_norm": 0.07672782242298126, + "learning_rate": 1.109641252187954e-05, + "loss": 2.5547, + "step": 31481 + }, + { + "epoch": 0.933546837469976, + "grad_norm": 0.07653231918811798, + "learning_rate": 1.1086557187022895e-05, + "loss": 2.5774, + "step": 31482 + }, + { + "epoch": 0.9335764908222874, + "grad_norm": 0.07049690186977386, + "learning_rate": 1.1076706181578744e-05, + "loss": 2.5496, + "step": 31483 + }, + { + "epoch": 0.933606144174599, + "grad_norm": 0.0740455687046051, + "learning_rate": 1.1066859505634241e-05, + "loss": 2.557, + "step": 31484 + }, + { + "epoch": 0.9336357975269104, + "grad_norm": 0.07172615826129913, + "learning_rate": 1.105701715927665e-05, + "loss": 2.5524, + "step": 31485 + }, + { + "epoch": 0.9336654508792219, + "grad_norm": 0.07741552591323853, + "learning_rate": 1.1047179142593066e-05, + "loss": 2.5705, + "step": 31486 + }, + { + "epoch": 0.9336951042315333, + "grad_norm": 0.0765821561217308, + "learning_rate": 1.10373454556707e-05, + "loss": 2.5688, + "step": 31487 + }, + { + "epoch": 0.9337247575838449, + "grad_norm": 0.0723324865102768, + "learning_rate": 1.102751609859648e-05, + "loss": 2.5702, + "step": 31488 + }, + { + "epoch": 0.9337544109361563, + "grad_norm": 0.0650726854801178, + "learning_rate": 1.1017691071457614e-05, + "loss": 2.5868, + "step": 31489 + }, + { + "epoch": 0.9337840642884678, + "grad_norm": 0.07307540625333786, + "learning_rate": 1.1007870374340978e-05, + "loss": 2.5697, + "step": 31490 + }, + { + "epoch": 0.9338137176407793, + "grad_norm": 0.07344209402799606, + "learning_rate": 1.0998054007333613e-05, + "loss": 2.574, + "step": 31491 + }, + { + "epoch": 0.9338433709930908, + "grad_norm": 0.07019216567277908, + "learning_rate": 1.0988241970522395e-05, + "loss": 2.591, + "step": 31492 + }, + { + "epoch": 0.9338730243454022, + "grad_norm": 0.07354128360748291, + "learning_rate": 1.0978434263994253e-05, + "loss": 2.5516, + "step": 31493 + }, + { + "epoch": 0.9339026776977137, + "grad_norm": 0.0727141797542572, + "learning_rate": 1.0968630887836006e-05, + "loss": 2.5915, + "step": 31494 + }, + { + "epoch": 0.9339323310500252, + "grad_norm": 0.0698491632938385, + "learning_rate": 1.095883184213442e-05, + "loss": 2.5863, + "step": 31495 + }, + { + "epoch": 0.9339619844023367, + "grad_norm": 0.07278049737215042, + "learning_rate": 1.0949037126976369e-05, + "loss": 2.5861, + "step": 31496 + }, + { + "epoch": 0.9339916377546482, + "grad_norm": 0.07599368691444397, + "learning_rate": 1.093924674244856e-05, + "loss": 2.5622, + "step": 31497 + }, + { + "epoch": 0.9340212911069596, + "grad_norm": 0.07225034385919571, + "learning_rate": 1.0929460688637649e-05, + "loss": 2.5503, + "step": 31498 + }, + { + "epoch": 0.9340509444592712, + "grad_norm": 0.06649631261825562, + "learning_rate": 1.0919678965630287e-05, + "loss": 2.5299, + "step": 31499 + }, + { + "epoch": 0.9340805978115826, + "grad_norm": 0.07200883328914642, + "learning_rate": 1.0909901573513181e-05, + "loss": 2.5557, + "step": 31500 + }, + { + "epoch": 0.9341102511638941, + "grad_norm": 0.07708600163459778, + "learning_rate": 1.0900128512372765e-05, + "loss": 2.572, + "step": 31501 + }, + { + "epoch": 0.9341399045162055, + "grad_norm": 0.07699592411518097, + "learning_rate": 1.0890359782295745e-05, + "loss": 2.589, + "step": 31502 + }, + { + "epoch": 0.9341695578685171, + "grad_norm": 0.07255885004997253, + "learning_rate": 1.0880595383368496e-05, + "loss": 2.5195, + "step": 31503 + }, + { + "epoch": 0.9341992112208285, + "grad_norm": 0.06868299096822739, + "learning_rate": 1.0870835315677507e-05, + "loss": 2.5513, + "step": 31504 + }, + { + "epoch": 0.93422886457314, + "grad_norm": 0.07214425504207611, + "learning_rate": 1.0861079579309263e-05, + "loss": 2.569, + "step": 31505 + }, + { + "epoch": 0.9342585179254514, + "grad_norm": 0.06951583176851273, + "learning_rate": 1.0851328174350139e-05, + "loss": 2.5601, + "step": 31506 + }, + { + "epoch": 0.934288171277763, + "grad_norm": 0.07119511067867279, + "learning_rate": 1.0841581100886455e-05, + "loss": 2.5845, + "step": 31507 + }, + { + "epoch": 0.9343178246300744, + "grad_norm": 0.07234468311071396, + "learning_rate": 1.0831838359004531e-05, + "loss": 2.5833, + "step": 31508 + }, + { + "epoch": 0.9343474779823859, + "grad_norm": 0.07151839882135391, + "learning_rate": 1.0822099948790632e-05, + "loss": 2.5563, + "step": 31509 + }, + { + "epoch": 0.9343771313346974, + "grad_norm": 0.0708848163485527, + "learning_rate": 1.0812365870330964e-05, + "loss": 2.5721, + "step": 31510 + }, + { + "epoch": 0.9344067846870089, + "grad_norm": 0.07127642631530762, + "learning_rate": 1.0802636123711851e-05, + "loss": 2.5544, + "step": 31511 + }, + { + "epoch": 0.9344364380393203, + "grad_norm": 0.07294771075248718, + "learning_rate": 1.0792910709019276e-05, + "loss": 2.5804, + "step": 31512 + }, + { + "epoch": 0.9344660913916318, + "grad_norm": 0.07005216926336288, + "learning_rate": 1.0783189626339508e-05, + "loss": 2.5623, + "step": 31513 + }, + { + "epoch": 0.9344957447439433, + "grad_norm": 0.07355882972478867, + "learning_rate": 1.0773472875758583e-05, + "loss": 2.5633, + "step": 31514 + }, + { + "epoch": 0.9345253980962548, + "grad_norm": 0.0727602168917656, + "learning_rate": 1.0763760457362493e-05, + "loss": 2.558, + "step": 31515 + }, + { + "epoch": 0.9345550514485662, + "grad_norm": 0.07013548165559769, + "learning_rate": 1.0754052371237278e-05, + "loss": 2.5563, + "step": 31516 + }, + { + "epoch": 0.9345847048008777, + "grad_norm": 0.07111912220716476, + "learning_rate": 1.074434861746898e-05, + "loss": 2.5816, + "step": 31517 + }, + { + "epoch": 0.9346143581531893, + "grad_norm": 0.06726787984371185, + "learning_rate": 1.0734649196143365e-05, + "loss": 2.5788, + "step": 31518 + }, + { + "epoch": 0.9346440115055007, + "grad_norm": 0.07848566770553589, + "learning_rate": 1.0724954107346419e-05, + "loss": 2.5892, + "step": 31519 + }, + { + "epoch": 0.9346736648578122, + "grad_norm": 0.07414916157722473, + "learning_rate": 1.0715263351163962e-05, + "loss": 2.599, + "step": 31520 + }, + { + "epoch": 0.9347033182101236, + "grad_norm": 0.07157978415489197, + "learning_rate": 1.070557692768187e-05, + "loss": 2.5302, + "step": 31521 + }, + { + "epoch": 0.9347329715624352, + "grad_norm": 0.06974359601736069, + "learning_rate": 1.0695894836985909e-05, + "loss": 2.561, + "step": 31522 + }, + { + "epoch": 0.9347626249147466, + "grad_norm": 0.06769868731498718, + "learning_rate": 1.0686217079161786e-05, + "loss": 2.5587, + "step": 31523 + }, + { + "epoch": 0.9347922782670581, + "grad_norm": 0.07054003328084946, + "learning_rate": 1.0676543654295157e-05, + "loss": 2.5471, + "step": 31524 + }, + { + "epoch": 0.9348219316193696, + "grad_norm": 0.07124928385019302, + "learning_rate": 1.0666874562471785e-05, + "loss": 2.5422, + "step": 31525 + }, + { + "epoch": 0.9348515849716811, + "grad_norm": 0.06918998062610626, + "learning_rate": 1.0657209803777156e-05, + "loss": 2.5334, + "step": 31526 + }, + { + "epoch": 0.9348812383239925, + "grad_norm": 0.07146216928958893, + "learning_rate": 1.0647549378296983e-05, + "loss": 2.553, + "step": 31527 + }, + { + "epoch": 0.934910891676304, + "grad_norm": 0.0734577476978302, + "learning_rate": 1.0637893286116806e-05, + "loss": 2.526, + "step": 31528 + }, + { + "epoch": 0.9349405450286155, + "grad_norm": 0.07399895042181015, + "learning_rate": 1.0628241527322003e-05, + "loss": 2.5372, + "step": 31529 + }, + { + "epoch": 0.934970198380927, + "grad_norm": 0.06884549558162689, + "learning_rate": 1.0618594101998114e-05, + "loss": 2.576, + "step": 31530 + }, + { + "epoch": 0.9349998517332384, + "grad_norm": 0.06694288551807404, + "learning_rate": 1.0608951010230627e-05, + "loss": 2.56, + "step": 31531 + }, + { + "epoch": 0.9350295050855499, + "grad_norm": 0.06854454427957535, + "learning_rate": 1.0599312252104864e-05, + "loss": 2.5561, + "step": 31532 + }, + { + "epoch": 0.9350591584378614, + "grad_norm": 0.07278972119092941, + "learning_rate": 1.0589677827706146e-05, + "loss": 2.5624, + "step": 31533 + }, + { + "epoch": 0.9350888117901729, + "grad_norm": 0.0755465179681778, + "learning_rate": 1.0580047737119847e-05, + "loss": 2.5602, + "step": 31534 + }, + { + "epoch": 0.9351184651424843, + "grad_norm": 0.07722866535186768, + "learning_rate": 1.0570421980431289e-05, + "loss": 2.5616, + "step": 31535 + }, + { + "epoch": 0.9351481184947958, + "grad_norm": 0.07075222581624985, + "learning_rate": 1.0560800557725624e-05, + "loss": 2.5608, + "step": 31536 + }, + { + "epoch": 0.9351777718471073, + "grad_norm": 0.06676354259252548, + "learning_rate": 1.0551183469088066e-05, + "loss": 2.5663, + "step": 31537 + }, + { + "epoch": 0.9352074251994188, + "grad_norm": 0.07379930466413498, + "learning_rate": 1.054157071460382e-05, + "loss": 2.5737, + "step": 31538 + }, + { + "epoch": 0.9352370785517303, + "grad_norm": 0.07156436145305634, + "learning_rate": 1.0531962294357989e-05, + "loss": 2.5657, + "step": 31539 + }, + { + "epoch": 0.9352667319040417, + "grad_norm": 0.07289088517427444, + "learning_rate": 1.0522358208435612e-05, + "loss": 2.5757, + "step": 31540 + }, + { + "epoch": 0.9352963852563533, + "grad_norm": 0.07276148349046707, + "learning_rate": 1.051275845692179e-05, + "loss": 2.5731, + "step": 31541 + }, + { + "epoch": 0.9353260386086647, + "grad_norm": 0.0678587406873703, + "learning_rate": 1.0503163039901508e-05, + "loss": 2.567, + "step": 31542 + }, + { + "epoch": 0.9353556919609762, + "grad_norm": 0.06918595731258392, + "learning_rate": 1.0493571957459758e-05, + "loss": 2.5563, + "step": 31543 + }, + { + "epoch": 0.9353853453132877, + "grad_norm": 0.07166007161140442, + "learning_rate": 1.0483985209681413e-05, + "loss": 2.55, + "step": 31544 + }, + { + "epoch": 0.9354149986655992, + "grad_norm": 0.0762675553560257, + "learning_rate": 1.0474402796651405e-05, + "loss": 2.5506, + "step": 31545 + }, + { + "epoch": 0.9354446520179106, + "grad_norm": 0.0713435634970665, + "learning_rate": 1.0464824718454558e-05, + "loss": 2.5594, + "step": 31546 + }, + { + "epoch": 0.9354743053702221, + "grad_norm": 0.07075725495815277, + "learning_rate": 1.0455250975175801e-05, + "loss": 2.5756, + "step": 31547 + }, + { + "epoch": 0.9355039587225336, + "grad_norm": 0.06965186446905136, + "learning_rate": 1.0445681566899789e-05, + "loss": 2.5912, + "step": 31548 + }, + { + "epoch": 0.9355336120748451, + "grad_norm": 0.07394258677959442, + "learning_rate": 1.043611649371129e-05, + "loss": 2.5693, + "step": 31549 + }, + { + "epoch": 0.9355632654271565, + "grad_norm": 0.07398927211761475, + "learning_rate": 1.0426555755695067e-05, + "loss": 2.5447, + "step": 31550 + }, + { + "epoch": 0.935592918779468, + "grad_norm": 0.07147065550088882, + "learning_rate": 1.0416999352935664e-05, + "loss": 2.6002, + "step": 31551 + }, + { + "epoch": 0.9356225721317795, + "grad_norm": 0.06804807484149933, + "learning_rate": 1.0407447285517791e-05, + "loss": 2.5527, + "step": 31552 + }, + { + "epoch": 0.935652225484091, + "grad_norm": 0.0671394094824791, + "learning_rate": 1.0397899553525992e-05, + "loss": 2.5747, + "step": 31553 + }, + { + "epoch": 0.9356818788364024, + "grad_norm": 0.07342791557312012, + "learning_rate": 1.038835615704481e-05, + "loss": 2.5745, + "step": 31554 + }, + { + "epoch": 0.935711532188714, + "grad_norm": 0.06974945217370987, + "learning_rate": 1.0378817096158788e-05, + "loss": 2.5332, + "step": 31555 + }, + { + "epoch": 0.9357411855410254, + "grad_norm": 0.06485442072153091, + "learning_rate": 1.0369282370952416e-05, + "loss": 2.5422, + "step": 31556 + }, + { + "epoch": 0.9357708388933369, + "grad_norm": 0.06980783492326736, + "learning_rate": 1.0359751981510068e-05, + "loss": 2.561, + "step": 31557 + }, + { + "epoch": 0.9358004922456483, + "grad_norm": 0.06958630681037903, + "learning_rate": 1.0350225927916179e-05, + "loss": 2.5545, + "step": 31558 + }, + { + "epoch": 0.9358301455979599, + "grad_norm": 0.07407177984714508, + "learning_rate": 1.034070421025507e-05, + "loss": 2.5558, + "step": 31559 + }, + { + "epoch": 0.9358597989502714, + "grad_norm": 0.07199461758136749, + "learning_rate": 1.0331186828611062e-05, + "loss": 2.5803, + "step": 31560 + }, + { + "epoch": 0.9358894523025828, + "grad_norm": 0.06815294176340103, + "learning_rate": 1.0321673783068474e-05, + "loss": 2.599, + "step": 31561 + }, + { + "epoch": 0.9359191056548943, + "grad_norm": 0.07004323601722717, + "learning_rate": 1.0312165073711521e-05, + "loss": 2.5787, + "step": 31562 + }, + { + "epoch": 0.9359487590072058, + "grad_norm": 0.06807681173086166, + "learning_rate": 1.0302660700624355e-05, + "loss": 2.5407, + "step": 31563 + }, + { + "epoch": 0.9359784123595173, + "grad_norm": 0.07121792435646057, + "learning_rate": 1.0293160663891243e-05, + "loss": 2.5588, + "step": 31564 + }, + { + "epoch": 0.9360080657118287, + "grad_norm": 0.07403721660375595, + "learning_rate": 1.0283664963596229e-05, + "loss": 2.5406, + "step": 31565 + }, + { + "epoch": 0.9360377190641402, + "grad_norm": 0.07198864966630936, + "learning_rate": 1.027417359982341e-05, + "loss": 2.564, + "step": 31566 + }, + { + "epoch": 0.9360673724164517, + "grad_norm": 0.06987100839614868, + "learning_rate": 1.0264686572656834e-05, + "loss": 2.5551, + "step": 31567 + }, + { + "epoch": 0.9360970257687632, + "grad_norm": 0.06802435219287872, + "learning_rate": 1.0255203882180542e-05, + "loss": 2.5451, + "step": 31568 + }, + { + "epoch": 0.9361266791210746, + "grad_norm": 0.07359197735786438, + "learning_rate": 1.0245725528478466e-05, + "loss": 2.5541, + "step": 31569 + }, + { + "epoch": 0.9361563324733861, + "grad_norm": 0.07218044251203537, + "learning_rate": 1.0236251511634543e-05, + "loss": 2.5575, + "step": 31570 + }, + { + "epoch": 0.9361859858256976, + "grad_norm": 0.07241024821996689, + "learning_rate": 1.0226781831732701e-05, + "loss": 2.6216, + "step": 31571 + }, + { + "epoch": 0.9362156391780091, + "grad_norm": 0.06770143657922745, + "learning_rate": 1.0217316488856765e-05, + "loss": 2.5495, + "step": 31572 + }, + { + "epoch": 0.9362452925303205, + "grad_norm": 0.06888556480407715, + "learning_rate": 1.0207855483090611e-05, + "loss": 2.5631, + "step": 31573 + }, + { + "epoch": 0.936274945882632, + "grad_norm": 0.07117712497711182, + "learning_rate": 1.0198398814517895e-05, + "loss": 2.5885, + "step": 31574 + }, + { + "epoch": 0.9363045992349435, + "grad_norm": 0.07209242880344391, + "learning_rate": 1.0188946483222494e-05, + "loss": 2.5827, + "step": 31575 + }, + { + "epoch": 0.936334252587255, + "grad_norm": 0.07163393497467041, + "learning_rate": 1.0179498489288009e-05, + "loss": 2.6009, + "step": 31576 + }, + { + "epoch": 0.9363639059395664, + "grad_norm": 0.06863372027873993, + "learning_rate": 1.0170054832798147e-05, + "loss": 2.5487, + "step": 31577 + }, + { + "epoch": 0.936393559291878, + "grad_norm": 0.07231078296899796, + "learning_rate": 1.0160615513836513e-05, + "loss": 2.5596, + "step": 31578 + }, + { + "epoch": 0.9364232126441894, + "grad_norm": 0.0720936730504036, + "learning_rate": 1.015118053248676e-05, + "loss": 2.5591, + "step": 31579 + }, + { + "epoch": 0.9364528659965009, + "grad_norm": 0.06911936402320862, + "learning_rate": 1.014174988883232e-05, + "loss": 2.5412, + "step": 31580 + }, + { + "epoch": 0.9364825193488124, + "grad_norm": 0.0703640952706337, + "learning_rate": 1.0132323582956793e-05, + "loss": 2.5445, + "step": 31581 + }, + { + "epoch": 0.9365121727011239, + "grad_norm": 0.06892184168100357, + "learning_rate": 1.0122901614943614e-05, + "loss": 2.5387, + "step": 31582 + }, + { + "epoch": 0.9365418260534354, + "grad_norm": 0.07255693525075912, + "learning_rate": 1.0113483984876325e-05, + "loss": 2.5399, + "step": 31583 + }, + { + "epoch": 0.9365714794057468, + "grad_norm": 0.06881636381149292, + "learning_rate": 1.0104070692838141e-05, + "loss": 2.5706, + "step": 31584 + }, + { + "epoch": 0.9366011327580583, + "grad_norm": 0.07278595864772797, + "learning_rate": 1.0094661738912436e-05, + "loss": 2.5814, + "step": 31585 + }, + { + "epoch": 0.9366307861103698, + "grad_norm": 0.06885889172554016, + "learning_rate": 1.0085257123182645e-05, + "loss": 2.5647, + "step": 31586 + }, + { + "epoch": 0.9366604394626813, + "grad_norm": 0.0681447684764862, + "learning_rate": 1.0075856845732034e-05, + "loss": 2.5378, + "step": 31587 + }, + { + "epoch": 0.9366900928149927, + "grad_norm": 0.06798696517944336, + "learning_rate": 1.006646090664376e-05, + "loss": 2.5375, + "step": 31588 + }, + { + "epoch": 0.9367197461673042, + "grad_norm": 0.07128647714853287, + "learning_rate": 1.0057069306001087e-05, + "loss": 2.5524, + "step": 31589 + }, + { + "epoch": 0.9367493995196157, + "grad_norm": 0.06989054381847382, + "learning_rate": 1.0047682043887174e-05, + "loss": 2.5407, + "step": 31590 + }, + { + "epoch": 0.9367790528719272, + "grad_norm": 0.06904365867376328, + "learning_rate": 1.0038299120385119e-05, + "loss": 2.6033, + "step": 31591 + }, + { + "epoch": 0.9368087062242386, + "grad_norm": 0.07029875367879868, + "learning_rate": 1.0028920535578078e-05, + "loss": 2.5681, + "step": 31592 + }, + { + "epoch": 0.9368383595765502, + "grad_norm": 0.07170584052801132, + "learning_rate": 1.0019546289549042e-05, + "loss": 2.5682, + "step": 31593 + }, + { + "epoch": 0.9368680129288616, + "grad_norm": 0.07161346822977066, + "learning_rate": 1.0010176382380942e-05, + "loss": 2.5761, + "step": 31594 + }, + { + "epoch": 0.9368976662811731, + "grad_norm": 0.0720396637916565, + "learning_rate": 1.0000810814156935e-05, + "loss": 2.5748, + "step": 31595 + }, + { + "epoch": 0.9369273196334845, + "grad_norm": 0.07183613628149033, + "learning_rate": 9.99144958495979e-06, + "loss": 2.5833, + "step": 31596 + }, + { + "epoch": 0.936956972985796, + "grad_norm": 0.06839276850223541, + "learning_rate": 9.982092694872491e-06, + "loss": 2.5643, + "step": 31597 + }, + { + "epoch": 0.9369866263381075, + "grad_norm": 0.07304070889949799, + "learning_rate": 9.97274014397781e-06, + "loss": 2.5623, + "step": 31598 + }, + { + "epoch": 0.937016279690419, + "grad_norm": 0.07166945934295654, + "learning_rate": 9.963391932358678e-06, + "loss": 2.5763, + "step": 31599 + }, + { + "epoch": 0.9370459330427304, + "grad_norm": 0.06932990998029709, + "learning_rate": 9.954048060097809e-06, + "loss": 2.5798, + "step": 31600 + }, + { + "epoch": 0.937075586395042, + "grad_norm": 0.07021649926900864, + "learning_rate": 9.944708527277967e-06, + "loss": 2.5402, + "step": 31601 + }, + { + "epoch": 0.9371052397473535, + "grad_norm": 0.07266482710838318, + "learning_rate": 9.935373333981868e-06, + "loss": 2.5822, + "step": 31602 + }, + { + "epoch": 0.9371348930996649, + "grad_norm": 0.0666050985455513, + "learning_rate": 9.926042480292163e-06, + "loss": 2.5677, + "step": 31603 + }, + { + "epoch": 0.9371645464519764, + "grad_norm": 0.06976763159036636, + "learning_rate": 9.916715966291512e-06, + "loss": 2.6016, + "step": 31604 + }, + { + "epoch": 0.9371941998042879, + "grad_norm": 0.07017020136117935, + "learning_rate": 9.907393792062403e-06, + "loss": 2.577, + "step": 31605 + }, + { + "epoch": 0.9372238531565994, + "grad_norm": 0.0680730789899826, + "learning_rate": 9.898075957687435e-06, + "loss": 2.5133, + "step": 31606 + }, + { + "epoch": 0.9372535065089108, + "grad_norm": 0.07029677927494049, + "learning_rate": 9.888762463249156e-06, + "loss": 2.5376, + "step": 31607 + }, + { + "epoch": 0.9372831598612223, + "grad_norm": 0.07186630368232727, + "learning_rate": 9.87945330883e-06, + "loss": 2.5503, + "step": 31608 + }, + { + "epoch": 0.9373128132135338, + "grad_norm": 0.0736299678683281, + "learning_rate": 9.870148494512399e-06, + "loss": 2.5483, + "step": 31609 + }, + { + "epoch": 0.9373424665658453, + "grad_norm": 0.06759559363126755, + "learning_rate": 9.860848020378732e-06, + "loss": 2.5625, + "step": 31610 + }, + { + "epoch": 0.9373721199181567, + "grad_norm": 0.07333431392908096, + "learning_rate": 9.851551886511433e-06, + "loss": 2.5421, + "step": 31611 + }, + { + "epoch": 0.9374017732704683, + "grad_norm": 0.06738316267728806, + "learning_rate": 9.842260092992772e-06, + "loss": 2.5763, + "step": 31612 + }, + { + "epoch": 0.9374314266227797, + "grad_norm": 0.06719190627336502, + "learning_rate": 9.832972639905013e-06, + "loss": 2.5881, + "step": 31613 + }, + { + "epoch": 0.9374610799750912, + "grad_norm": 0.06753469258546829, + "learning_rate": 9.823689527330482e-06, + "loss": 2.5671, + "step": 31614 + }, + { + "epoch": 0.9374907333274026, + "grad_norm": 0.07032257318496704, + "learning_rate": 9.814410755351278e-06, + "loss": 2.575, + "step": 31615 + }, + { + "epoch": 0.9375203866797142, + "grad_norm": 0.07191599905490875, + "learning_rate": 9.805136324049612e-06, + "loss": 2.5462, + "step": 31616 + }, + { + "epoch": 0.9375500400320256, + "grad_norm": 0.0676502138376236, + "learning_rate": 9.795866233507589e-06, + "loss": 2.5752, + "step": 31617 + }, + { + "epoch": 0.9375796933843371, + "grad_norm": 0.0728984996676445, + "learning_rate": 9.786600483807307e-06, + "loss": 2.578, + "step": 31618 + }, + { + "epoch": 0.9376093467366485, + "grad_norm": 0.06869986653327942, + "learning_rate": 9.77733907503081e-06, + "loss": 2.5422, + "step": 31619 + }, + { + "epoch": 0.9376390000889601, + "grad_norm": 0.06980361044406891, + "learning_rate": 9.768082007260148e-06, + "loss": 2.561, + "step": 31620 + }, + { + "epoch": 0.9376686534412715, + "grad_norm": 0.07126491516828537, + "learning_rate": 9.758829280577309e-06, + "loss": 2.5651, + "step": 31621 + }, + { + "epoch": 0.937698306793583, + "grad_norm": 0.06780707836151123, + "learning_rate": 9.749580895064114e-06, + "loss": 2.5667, + "step": 31622 + }, + { + "epoch": 0.9377279601458945, + "grad_norm": 0.0693395659327507, + "learning_rate": 9.7403368508025e-06, + "loss": 2.5776, + "step": 31623 + }, + { + "epoch": 0.937757613498206, + "grad_norm": 0.06724260747432709, + "learning_rate": 9.731097147874401e-06, + "loss": 2.5364, + "step": 31624 + }, + { + "epoch": 0.9377872668505175, + "grad_norm": 0.06813894957304001, + "learning_rate": 9.721861786361642e-06, + "loss": 2.5401, + "step": 31625 + }, + { + "epoch": 0.9378169202028289, + "grad_norm": 0.06822166591882706, + "learning_rate": 9.712630766345932e-06, + "loss": 2.5749, + "step": 31626 + }, + { + "epoch": 0.9378465735551405, + "grad_norm": 0.0687943696975708, + "learning_rate": 9.70340408790904e-06, + "loss": 2.5818, + "step": 31627 + }, + { + "epoch": 0.9378762269074519, + "grad_norm": 0.06921418756246567, + "learning_rate": 9.694181751132625e-06, + "loss": 2.5575, + "step": 31628 + }, + { + "epoch": 0.9379058802597634, + "grad_norm": 0.0686277374625206, + "learning_rate": 9.684963756098397e-06, + "loss": 2.5751, + "step": 31629 + }, + { + "epoch": 0.9379355336120748, + "grad_norm": 0.07201485335826874, + "learning_rate": 9.675750102887958e-06, + "loss": 2.5432, + "step": 31630 + }, + { + "epoch": 0.9379651869643864, + "grad_norm": 0.06866680085659027, + "learning_rate": 9.666540791582968e-06, + "loss": 2.547, + "step": 31631 + }, + { + "epoch": 0.9379948403166978, + "grad_norm": 0.07008548825979233, + "learning_rate": 9.657335822264857e-06, + "loss": 2.5784, + "step": 31632 + }, + { + "epoch": 0.9380244936690093, + "grad_norm": 0.06973256170749664, + "learning_rate": 9.648135195015229e-06, + "loss": 2.5453, + "step": 31633 + }, + { + "epoch": 0.9380541470213207, + "grad_norm": 0.07033620774745941, + "learning_rate": 9.638938909915573e-06, + "loss": 2.5605, + "step": 31634 + }, + { + "epoch": 0.9380838003736323, + "grad_norm": 0.0673622116446495, + "learning_rate": 9.629746967047215e-06, + "loss": 2.5749, + "step": 31635 + }, + { + "epoch": 0.9381134537259437, + "grad_norm": 0.0718531385064125, + "learning_rate": 9.620559366491698e-06, + "loss": 2.5507, + "step": 31636 + }, + { + "epoch": 0.9381431070782552, + "grad_norm": 0.07169094681739807, + "learning_rate": 9.611376108330239e-06, + "loss": 2.5758, + "step": 31637 + }, + { + "epoch": 0.9381727604305666, + "grad_norm": 0.07296968251466751, + "learning_rate": 9.602197192644213e-06, + "loss": 2.5859, + "step": 31638 + }, + { + "epoch": 0.9382024137828782, + "grad_norm": 0.07306825369596481, + "learning_rate": 9.593022619514946e-06, + "loss": 2.563, + "step": 31639 + }, + { + "epoch": 0.9382320671351896, + "grad_norm": 0.06983904540538788, + "learning_rate": 9.58385238902365e-06, + "loss": 2.578, + "step": 31640 + }, + { + "epoch": 0.9382617204875011, + "grad_norm": 0.07223643362522125, + "learning_rate": 9.574686501251484e-06, + "loss": 2.5898, + "step": 31641 + }, + { + "epoch": 0.9382913738398125, + "grad_norm": 0.06964240968227386, + "learning_rate": 9.56552495627966e-06, + "loss": 2.5398, + "step": 31642 + }, + { + "epoch": 0.9383210271921241, + "grad_norm": 0.06870896369218826, + "learning_rate": 9.556367754189277e-06, + "loss": 2.5375, + "step": 31643 + }, + { + "epoch": 0.9383506805444356, + "grad_norm": 0.07037022709846497, + "learning_rate": 9.547214895061496e-06, + "loss": 2.5588, + "step": 31644 + }, + { + "epoch": 0.938380333896747, + "grad_norm": 0.06764933466911316, + "learning_rate": 9.53806637897725e-06, + "loss": 2.5296, + "step": 31645 + }, + { + "epoch": 0.9384099872490586, + "grad_norm": 0.06676554679870605, + "learning_rate": 9.528922206017644e-06, + "loss": 2.545, + "step": 31646 + }, + { + "epoch": 0.93843964060137, + "grad_norm": 0.07308871299028397, + "learning_rate": 9.519782376263608e-06, + "loss": 2.5966, + "step": 31647 + }, + { + "epoch": 0.9384692939536815, + "grad_norm": 0.0687275379896164, + "learning_rate": 9.510646889796082e-06, + "loss": 2.5682, + "step": 31648 + }, + { + "epoch": 0.9384989473059929, + "grad_norm": 0.06865911185741425, + "learning_rate": 9.501515746695999e-06, + "loss": 2.5649, + "step": 31649 + }, + { + "epoch": 0.9385286006583045, + "grad_norm": 0.06991396844387054, + "learning_rate": 9.492388947044184e-06, + "loss": 2.5289, + "step": 31650 + }, + { + "epoch": 0.9385582540106159, + "grad_norm": 0.06821869313716888, + "learning_rate": 9.483266490921406e-06, + "loss": 2.536, + "step": 31651 + }, + { + "epoch": 0.9385879073629274, + "grad_norm": 0.07000686973333359, + "learning_rate": 9.474148378408543e-06, + "loss": 2.5645, + "step": 31652 + }, + { + "epoch": 0.9386175607152388, + "grad_norm": 0.07461323589086533, + "learning_rate": 9.46503460958631e-06, + "loss": 2.593, + "step": 31653 + }, + { + "epoch": 0.9386472140675504, + "grad_norm": 0.07113619148731232, + "learning_rate": 9.455925184535364e-06, + "loss": 2.5805, + "step": 31654 + }, + { + "epoch": 0.9386768674198618, + "grad_norm": 0.07197973132133484, + "learning_rate": 9.446820103336417e-06, + "loss": 2.5788, + "step": 31655 + }, + { + "epoch": 0.9387065207721733, + "grad_norm": 0.07076732069253922, + "learning_rate": 9.437719366070074e-06, + "loss": 2.5816, + "step": 31656 + }, + { + "epoch": 0.9387361741244847, + "grad_norm": 0.06850205361843109, + "learning_rate": 9.428622972816937e-06, + "loss": 2.5666, + "step": 31657 + }, + { + "epoch": 0.9387658274767963, + "grad_norm": 0.07252182811498642, + "learning_rate": 9.419530923657549e-06, + "loss": 2.5792, + "step": 31658 + }, + { + "epoch": 0.9387954808291077, + "grad_norm": 0.07574235647916794, + "learning_rate": 9.41044321867246e-06, + "loss": 2.5668, + "step": 31659 + }, + { + "epoch": 0.9388251341814192, + "grad_norm": 0.07564752548933029, + "learning_rate": 9.40135985794205e-06, + "loss": 2.5539, + "step": 31660 + }, + { + "epoch": 0.9388547875337306, + "grad_norm": 0.07267126441001892, + "learning_rate": 9.392280841546808e-06, + "loss": 2.5701, + "step": 31661 + }, + { + "epoch": 0.9388844408860422, + "grad_norm": 0.06966999173164368, + "learning_rate": 9.383206169567116e-06, + "loss": 2.5375, + "step": 31662 + }, + { + "epoch": 0.9389140942383536, + "grad_norm": 0.07504003494977951, + "learning_rate": 9.374135842083354e-06, + "loss": 2.5528, + "step": 31663 + }, + { + "epoch": 0.9389437475906651, + "grad_norm": 0.07012094557285309, + "learning_rate": 9.365069859175845e-06, + "loss": 2.5822, + "step": 31664 + }, + { + "epoch": 0.9389734009429767, + "grad_norm": 0.06908811628818512, + "learning_rate": 9.356008220924861e-06, + "loss": 2.5619, + "step": 31665 + }, + { + "epoch": 0.9390030542952881, + "grad_norm": 0.07202925533056259, + "learning_rate": 9.346950927410669e-06, + "loss": 2.5561, + "step": 31666 + }, + { + "epoch": 0.9390327076475996, + "grad_norm": 0.07116033881902695, + "learning_rate": 9.337897978713427e-06, + "loss": 2.5741, + "step": 31667 + }, + { + "epoch": 0.939062360999911, + "grad_norm": 0.07420936226844788, + "learning_rate": 9.328849374913295e-06, + "loss": 2.5494, + "step": 31668 + }, + { + "epoch": 0.9390920143522226, + "grad_norm": 0.06649375706911087, + "learning_rate": 9.31980511609054e-06, + "loss": 2.5789, + "step": 31669 + }, + { + "epoch": 0.939121667704534, + "grad_norm": 0.0677112564444542, + "learning_rate": 9.310765202324988e-06, + "loss": 2.5597, + "step": 31670 + }, + { + "epoch": 0.9391513210568455, + "grad_norm": 0.07295753061771393, + "learning_rate": 9.301729633696909e-06, + "loss": 2.6225, + "step": 31671 + }, + { + "epoch": 0.9391809744091569, + "grad_norm": 0.0724353939294815, + "learning_rate": 9.292698410286237e-06, + "loss": 2.5965, + "step": 31672 + }, + { + "epoch": 0.9392106277614685, + "grad_norm": 0.07156659662723541, + "learning_rate": 9.283671532172911e-06, + "loss": 2.5712, + "step": 31673 + }, + { + "epoch": 0.9392402811137799, + "grad_norm": 0.07269137352705002, + "learning_rate": 9.27464899943692e-06, + "loss": 2.5634, + "step": 31674 + }, + { + "epoch": 0.9392699344660914, + "grad_norm": 0.07320242375135422, + "learning_rate": 9.265630812158143e-06, + "loss": 2.5737, + "step": 31675 + }, + { + "epoch": 0.9392995878184028, + "grad_norm": 0.07101921737194061, + "learning_rate": 9.256616970416409e-06, + "loss": 2.5538, + "step": 31676 + }, + { + "epoch": 0.9393292411707144, + "grad_norm": 0.07374294102191925, + "learning_rate": 9.247607474291652e-06, + "loss": 2.5881, + "step": 31677 + }, + { + "epoch": 0.9393588945230258, + "grad_norm": 0.07256729155778885, + "learning_rate": 9.238602323863476e-06, + "loss": 2.5698, + "step": 31678 + }, + { + "epoch": 0.9393885478753373, + "grad_norm": 0.07139363884925842, + "learning_rate": 9.22960151921176e-06, + "loss": 2.5657, + "step": 31679 + }, + { + "epoch": 0.9394182012276487, + "grad_norm": 0.07072875648736954, + "learning_rate": 9.22060506041622e-06, + "loss": 2.5685, + "step": 31680 + }, + { + "epoch": 0.9394478545799603, + "grad_norm": 0.06985613703727722, + "learning_rate": 9.211612947556403e-06, + "loss": 2.5704, + "step": 31681 + }, + { + "epoch": 0.9394775079322717, + "grad_norm": 0.06732326745986938, + "learning_rate": 9.20262518071202e-06, + "loss": 2.531, + "step": 31682 + }, + { + "epoch": 0.9395071612845832, + "grad_norm": 0.06829889118671417, + "learning_rate": 9.193641759962568e-06, + "loss": 2.5604, + "step": 31683 + }, + { + "epoch": 0.9395368146368946, + "grad_norm": 0.06623604893684387, + "learning_rate": 9.184662685387758e-06, + "loss": 2.5621, + "step": 31684 + }, + { + "epoch": 0.9395664679892062, + "grad_norm": 0.06875088065862656, + "learning_rate": 9.175687957066913e-06, + "loss": 2.5566, + "step": 31685 + }, + { + "epoch": 0.9395961213415177, + "grad_norm": 0.07006153464317322, + "learning_rate": 9.166717575079641e-06, + "loss": 2.544, + "step": 31686 + }, + { + "epoch": 0.9396257746938291, + "grad_norm": 0.06912216544151306, + "learning_rate": 9.157751539505377e-06, + "loss": 2.5367, + "step": 31687 + }, + { + "epoch": 0.9396554280461407, + "grad_norm": 0.07242890447378159, + "learning_rate": 9.14878985042339e-06, + "loss": 2.5335, + "step": 31688 + }, + { + "epoch": 0.9396850813984521, + "grad_norm": 0.06717197597026825, + "learning_rate": 9.13983250791317e-06, + "loss": 2.5594, + "step": 31689 + }, + { + "epoch": 0.9397147347507636, + "grad_norm": 0.0702182799577713, + "learning_rate": 9.130879512053992e-06, + "loss": 2.576, + "step": 31690 + }, + { + "epoch": 0.939744388103075, + "grad_norm": 0.06902318447828293, + "learning_rate": 9.121930862925176e-06, + "loss": 2.5922, + "step": 31691 + }, + { + "epoch": 0.9397740414553866, + "grad_norm": 0.068602055311203, + "learning_rate": 9.11298656060583e-06, + "loss": 2.5955, + "step": 31692 + }, + { + "epoch": 0.939803694807698, + "grad_norm": 0.07052143663167953, + "learning_rate": 9.10404660517533e-06, + "loss": 2.5518, + "step": 31693 + }, + { + "epoch": 0.9398333481600095, + "grad_norm": 0.07134444266557693, + "learning_rate": 9.095110996712674e-06, + "loss": 2.5676, + "step": 31694 + }, + { + "epoch": 0.9398630015123209, + "grad_norm": 0.06865661591291428, + "learning_rate": 9.086179735297128e-06, + "loss": 2.5757, + "step": 31695 + }, + { + "epoch": 0.9398926548646325, + "grad_norm": 0.07007483392953873, + "learning_rate": 9.077252821007742e-06, + "loss": 2.5364, + "step": 31696 + }, + { + "epoch": 0.9399223082169439, + "grad_norm": 0.06991284340620041, + "learning_rate": 9.068330253923506e-06, + "loss": 2.5432, + "step": 31697 + }, + { + "epoch": 0.9399519615692554, + "grad_norm": 0.06824316829442978, + "learning_rate": 9.05941203412347e-06, + "loss": 2.5619, + "step": 31698 + }, + { + "epoch": 0.9399816149215668, + "grad_norm": 0.06904027611017227, + "learning_rate": 9.050498161686627e-06, + "loss": 2.5953, + "step": 31699 + }, + { + "epoch": 0.9400112682738784, + "grad_norm": 0.07224182039499283, + "learning_rate": 9.04158863669191e-06, + "loss": 2.571, + "step": 31700 + }, + { + "epoch": 0.9400409216261898, + "grad_norm": 0.06991442292928696, + "learning_rate": 9.032683459218205e-06, + "loss": 2.56, + "step": 31701 + }, + { + "epoch": 0.9400705749785013, + "grad_norm": 0.06861118227243423, + "learning_rate": 9.023782629344335e-06, + "loss": 2.5806, + "step": 31702 + }, + { + "epoch": 0.9401002283308127, + "grad_norm": 0.0669826865196228, + "learning_rate": 9.014886147149126e-06, + "loss": 2.5691, + "step": 31703 + }, + { + "epoch": 0.9401298816831243, + "grad_norm": 0.06799063831567764, + "learning_rate": 9.005994012711405e-06, + "loss": 2.5769, + "step": 31704 + }, + { + "epoch": 0.9401595350354357, + "grad_norm": 0.06866540759801865, + "learning_rate": 8.997106226109886e-06, + "loss": 2.5563, + "step": 31705 + }, + { + "epoch": 0.9401891883877472, + "grad_norm": 0.06892784684896469, + "learning_rate": 8.988222787423229e-06, + "loss": 2.567, + "step": 31706 + }, + { + "epoch": 0.9402188417400588, + "grad_norm": 0.068507619202137, + "learning_rate": 8.979343696730202e-06, + "loss": 2.5715, + "step": 31707 + }, + { + "epoch": 0.9402484950923702, + "grad_norm": 0.0728221982717514, + "learning_rate": 8.970468954109302e-06, + "loss": 2.5997, + "step": 31708 + }, + { + "epoch": 0.9402781484446817, + "grad_norm": 0.0690937414765358, + "learning_rate": 8.961598559639239e-06, + "loss": 2.5536, + "step": 31709 + }, + { + "epoch": 0.9403078017969931, + "grad_norm": 0.06491942703723907, + "learning_rate": 8.95273251339851e-06, + "loss": 2.5587, + "step": 31710 + }, + { + "epoch": 0.9403374551493047, + "grad_norm": 0.06705707311630249, + "learning_rate": 8.943870815465605e-06, + "loss": 2.5825, + "step": 31711 + }, + { + "epoch": 0.9403671085016161, + "grad_norm": 0.06870245188474655, + "learning_rate": 8.935013465918962e-06, + "loss": 2.6185, + "step": 31712 + }, + { + "epoch": 0.9403967618539276, + "grad_norm": 0.06861851364374161, + "learning_rate": 8.92616046483713e-06, + "loss": 2.5791, + "step": 31713 + }, + { + "epoch": 0.940426415206239, + "grad_norm": 0.0703553855419159, + "learning_rate": 8.917311812298434e-06, + "loss": 2.5627, + "step": 31714 + }, + { + "epoch": 0.9404560685585506, + "grad_norm": 0.07090682536363602, + "learning_rate": 8.908467508381202e-06, + "loss": 2.6049, + "step": 31715 + }, + { + "epoch": 0.940485721910862, + "grad_norm": 0.06723540276288986, + "learning_rate": 8.899627553163813e-06, + "loss": 2.5204, + "step": 31716 + }, + { + "epoch": 0.9405153752631735, + "grad_norm": 0.0702456682920456, + "learning_rate": 8.89079194672454e-06, + "loss": 2.5496, + "step": 31717 + }, + { + "epoch": 0.940545028615485, + "grad_norm": 0.0691007748246193, + "learning_rate": 8.881960689141543e-06, + "loss": 2.5737, + "step": 31718 + }, + { + "epoch": 0.9405746819677965, + "grad_norm": 0.06845080852508545, + "learning_rate": 8.873133780493092e-06, + "loss": 2.5943, + "step": 31719 + }, + { + "epoch": 0.9406043353201079, + "grad_norm": 0.06998748332262039, + "learning_rate": 8.864311220857401e-06, + "loss": 2.5498, + "step": 31720 + }, + { + "epoch": 0.9406339886724194, + "grad_norm": 0.06991562992334366, + "learning_rate": 8.855493010312465e-06, + "loss": 2.5533, + "step": 31721 + }, + { + "epoch": 0.9406636420247309, + "grad_norm": 0.06508755683898926, + "learning_rate": 8.846679148936499e-06, + "loss": 2.5806, + "step": 31722 + }, + { + "epoch": 0.9406932953770424, + "grad_norm": 0.06808873265981674, + "learning_rate": 8.837869636807494e-06, + "loss": 2.5379, + "step": 31723 + }, + { + "epoch": 0.9407229487293538, + "grad_norm": 0.0725436732172966, + "learning_rate": 8.82906447400339e-06, + "loss": 2.5687, + "step": 31724 + }, + { + "epoch": 0.9407526020816653, + "grad_norm": 0.06650378555059433, + "learning_rate": 8.820263660602345e-06, + "loss": 2.5394, + "step": 31725 + }, + { + "epoch": 0.9407822554339769, + "grad_norm": 0.06889145821332932, + "learning_rate": 8.811467196682077e-06, + "loss": 2.584, + "step": 31726 + }, + { + "epoch": 0.9408119087862883, + "grad_norm": 0.07021547853946686, + "learning_rate": 8.802675082320577e-06, + "loss": 2.5601, + "step": 31727 + }, + { + "epoch": 0.9408415621385998, + "grad_norm": 0.06755276769399643, + "learning_rate": 8.793887317595729e-06, + "loss": 2.5536, + "step": 31728 + }, + { + "epoch": 0.9408712154909112, + "grad_norm": 0.07011719048023224, + "learning_rate": 8.785103902585356e-06, + "loss": 2.5878, + "step": 31729 + }, + { + "epoch": 0.9409008688432228, + "grad_norm": 0.07043319195508957, + "learning_rate": 8.776324837367178e-06, + "loss": 2.5664, + "step": 31730 + }, + { + "epoch": 0.9409305221955342, + "grad_norm": 0.06946659088134766, + "learning_rate": 8.767550122018964e-06, + "loss": 2.5557, + "step": 31731 + }, + { + "epoch": 0.9409601755478457, + "grad_norm": 0.06634139269590378, + "learning_rate": 8.758779756618373e-06, + "loss": 2.5582, + "step": 31732 + }, + { + "epoch": 0.9409898289001571, + "grad_norm": 0.06865173578262329, + "learning_rate": 8.750013741243123e-06, + "loss": 2.5274, + "step": 31733 + }, + { + "epoch": 0.9410194822524687, + "grad_norm": 0.07214055955410004, + "learning_rate": 8.741252075970874e-06, + "loss": 2.5575, + "step": 31734 + }, + { + "epoch": 0.9410491356047801, + "grad_norm": 0.06803975999355316, + "learning_rate": 8.732494760879173e-06, + "loss": 2.5753, + "step": 31735 + }, + { + "epoch": 0.9410787889570916, + "grad_norm": 0.07312490791082382, + "learning_rate": 8.723741796045514e-06, + "loss": 2.5723, + "step": 31736 + }, + { + "epoch": 0.941108442309403, + "grad_norm": 0.070179782807827, + "learning_rate": 8.714993181547448e-06, + "loss": 2.5806, + "step": 31737 + }, + { + "epoch": 0.9411380956617146, + "grad_norm": 0.06909551471471786, + "learning_rate": 8.706248917462411e-06, + "loss": 2.5811, + "step": 31738 + }, + { + "epoch": 0.941167749014026, + "grad_norm": 0.0720774456858635, + "learning_rate": 8.697509003867899e-06, + "loss": 2.5687, + "step": 31739 + }, + { + "epoch": 0.9411974023663375, + "grad_norm": 0.06916700303554535, + "learning_rate": 8.688773440841235e-06, + "loss": 2.5494, + "step": 31740 + }, + { + "epoch": 0.941227055718649, + "grad_norm": 0.06984520703554153, + "learning_rate": 8.680042228459861e-06, + "loss": 2.579, + "step": 31741 + }, + { + "epoch": 0.9412567090709605, + "grad_norm": 0.06900624930858612, + "learning_rate": 8.671315366801047e-06, + "loss": 2.5893, + "step": 31742 + }, + { + "epoch": 0.9412863624232719, + "grad_norm": 0.07125674933195114, + "learning_rate": 8.662592855942064e-06, + "loss": 2.6178, + "step": 31743 + }, + { + "epoch": 0.9413160157755834, + "grad_norm": 0.0724138393998146, + "learning_rate": 8.653874695960185e-06, + "loss": 2.5312, + "step": 31744 + }, + { + "epoch": 0.9413456691278949, + "grad_norm": 0.07186321914196014, + "learning_rate": 8.645160886932568e-06, + "loss": 2.5978, + "step": 31745 + }, + { + "epoch": 0.9413753224802064, + "grad_norm": 0.06915950030088425, + "learning_rate": 8.636451428936431e-06, + "loss": 2.5699, + "step": 31746 + }, + { + "epoch": 0.9414049758325179, + "grad_norm": 0.09767267853021622, + "learning_rate": 8.627746322048768e-06, + "loss": 2.5732, + "step": 31747 + }, + { + "epoch": 0.9414346291848293, + "grad_norm": 0.07345059514045715, + "learning_rate": 8.619045566346794e-06, + "loss": 2.5504, + "step": 31748 + }, + { + "epoch": 0.9414642825371409, + "grad_norm": 0.07143693417310715, + "learning_rate": 8.61034916190756e-06, + "loss": 2.5949, + "step": 31749 + }, + { + "epoch": 0.9414939358894523, + "grad_norm": 0.07445251196622849, + "learning_rate": 8.601657108807948e-06, + "loss": 2.563, + "step": 31750 + }, + { + "epoch": 0.9415235892417638, + "grad_norm": 0.0695197582244873, + "learning_rate": 8.592969407125062e-06, + "loss": 2.5259, + "step": 31751 + }, + { + "epoch": 0.9415532425940752, + "grad_norm": 0.06909475475549698, + "learning_rate": 8.584286056935786e-06, + "loss": 2.556, + "step": 31752 + }, + { + "epoch": 0.9415828959463868, + "grad_norm": 0.07143176347017288, + "learning_rate": 8.575607058316948e-06, + "loss": 2.5751, + "step": 31753 + }, + { + "epoch": 0.9416125492986982, + "grad_norm": 0.07262398302555084, + "learning_rate": 8.566932411345484e-06, + "loss": 2.5412, + "step": 31754 + }, + { + "epoch": 0.9416422026510097, + "grad_norm": 0.07399117946624756, + "learning_rate": 8.558262116098225e-06, + "loss": 2.5778, + "step": 31755 + }, + { + "epoch": 0.9416718560033212, + "grad_norm": 0.06817765533924103, + "learning_rate": 8.549596172651941e-06, + "loss": 2.5951, + "step": 31756 + }, + { + "epoch": 0.9417015093556327, + "grad_norm": 0.06478676944971085, + "learning_rate": 8.540934581083294e-06, + "loss": 2.5331, + "step": 31757 + }, + { + "epoch": 0.9417311627079441, + "grad_norm": 0.06880082935094833, + "learning_rate": 8.532277341468997e-06, + "loss": 2.5488, + "step": 31758 + }, + { + "epoch": 0.9417608160602556, + "grad_norm": 0.07352356612682343, + "learning_rate": 8.523624453885769e-06, + "loss": 2.5658, + "step": 31759 + }, + { + "epoch": 0.941790469412567, + "grad_norm": 0.06691567599773407, + "learning_rate": 8.514975918410217e-06, + "loss": 2.5701, + "step": 31760 + }, + { + "epoch": 0.9418201227648786, + "grad_norm": 0.06883863359689713, + "learning_rate": 8.506331735118944e-06, + "loss": 2.5689, + "step": 31761 + }, + { + "epoch": 0.94184977611719, + "grad_norm": 0.07142634689807892, + "learning_rate": 8.497691904088389e-06, + "loss": 2.5419, + "step": 31762 + }, + { + "epoch": 0.9418794294695015, + "grad_norm": 0.07207901030778885, + "learning_rate": 8.489056425395214e-06, + "loss": 2.6019, + "step": 31763 + }, + { + "epoch": 0.941909082821813, + "grad_norm": 0.07398858666419983, + "learning_rate": 8.480425299115746e-06, + "loss": 2.5636, + "step": 31764 + }, + { + "epoch": 0.9419387361741245, + "grad_norm": 0.07055465131998062, + "learning_rate": 8.471798525326536e-06, + "loss": 2.5398, + "step": 31765 + }, + { + "epoch": 0.9419683895264359, + "grad_norm": 0.06904206424951553, + "learning_rate": 8.463176104103908e-06, + "loss": 2.5644, + "step": 31766 + }, + { + "epoch": 0.9419980428787474, + "grad_norm": 0.06882339715957642, + "learning_rate": 8.454558035524196e-06, + "loss": 2.5815, + "step": 31767 + }, + { + "epoch": 0.942027696231059, + "grad_norm": 0.07731138914823532, + "learning_rate": 8.445944319663778e-06, + "loss": 2.597, + "step": 31768 + }, + { + "epoch": 0.9420573495833704, + "grad_norm": 0.07197946310043335, + "learning_rate": 8.437334956598874e-06, + "loss": 2.5479, + "step": 31769 + }, + { + "epoch": 0.9420870029356819, + "grad_norm": 0.07145189493894577, + "learning_rate": 8.428729946405755e-06, + "loss": 2.5735, + "step": 31770 + }, + { + "epoch": 0.9421166562879933, + "grad_norm": 0.07080696523189545, + "learning_rate": 8.420129289160584e-06, + "loss": 2.5507, + "step": 31771 + }, + { + "epoch": 0.9421463096403049, + "grad_norm": 0.06981607526540756, + "learning_rate": 8.411532984939574e-06, + "loss": 2.5429, + "step": 31772 + }, + { + "epoch": 0.9421759629926163, + "grad_norm": 0.07143840193748474, + "learning_rate": 8.402941033818833e-06, + "loss": 2.5782, + "step": 31773 + }, + { + "epoch": 0.9422056163449278, + "grad_norm": 0.07107372581958771, + "learning_rate": 8.394353435874414e-06, + "loss": 2.5728, + "step": 31774 + }, + { + "epoch": 0.9422352696972393, + "grad_norm": 0.0704309344291687, + "learning_rate": 8.385770191182363e-06, + "loss": 2.5927, + "step": 31775 + }, + { + "epoch": 0.9422649230495508, + "grad_norm": 0.06697489321231842, + "learning_rate": 8.377191299818732e-06, + "loss": 2.5856, + "step": 31776 + }, + { + "epoch": 0.9422945764018622, + "grad_norm": 0.07029218971729279, + "learning_rate": 8.36861676185946e-06, + "loss": 2.5378, + "step": 31777 + }, + { + "epoch": 0.9423242297541737, + "grad_norm": 0.07073374092578888, + "learning_rate": 8.36004657738043e-06, + "loss": 2.5505, + "step": 31778 + }, + { + "epoch": 0.9423538831064852, + "grad_norm": 0.07209111750125885, + "learning_rate": 8.351480746457585e-06, + "loss": 2.5783, + "step": 31779 + }, + { + "epoch": 0.9423835364587967, + "grad_norm": 0.07116436213254929, + "learning_rate": 8.342919269166804e-06, + "loss": 2.5583, + "step": 31780 + }, + { + "epoch": 0.9424131898111081, + "grad_norm": 0.07183858007192612, + "learning_rate": 8.334362145583863e-06, + "loss": 2.529, + "step": 31781 + }, + { + "epoch": 0.9424428431634196, + "grad_norm": 0.07313346862792969, + "learning_rate": 8.325809375784533e-06, + "loss": 2.5514, + "step": 31782 + }, + { + "epoch": 0.9424724965157311, + "grad_norm": 0.07576370984315872, + "learning_rate": 8.31726095984453e-06, + "loss": 2.5779, + "step": 31783 + }, + { + "epoch": 0.9425021498680426, + "grad_norm": 0.07061784714460373, + "learning_rate": 8.308716897839575e-06, + "loss": 2.5851, + "step": 31784 + }, + { + "epoch": 0.942531803220354, + "grad_norm": 0.06905820220708847, + "learning_rate": 8.300177189845382e-06, + "loss": 2.5819, + "step": 31785 + }, + { + "epoch": 0.9425614565726655, + "grad_norm": 0.06709790974855423, + "learning_rate": 8.291641835937447e-06, + "loss": 2.5223, + "step": 31786 + }, + { + "epoch": 0.942591109924977, + "grad_norm": 0.06881946325302124, + "learning_rate": 8.283110836191487e-06, + "loss": 2.557, + "step": 31787 + }, + { + "epoch": 0.9426207632772885, + "grad_norm": 0.07147209346294403, + "learning_rate": 8.274584190682944e-06, + "loss": 2.5538, + "step": 31788 + }, + { + "epoch": 0.9426504166296, + "grad_norm": 0.06971509009599686, + "learning_rate": 8.266061899487365e-06, + "loss": 2.5978, + "step": 31789 + }, + { + "epoch": 0.9426800699819115, + "grad_norm": 0.0658818930387497, + "learning_rate": 8.257543962680247e-06, + "loss": 2.5612, + "step": 31790 + }, + { + "epoch": 0.942709723334223, + "grad_norm": 0.07454203069210052, + "learning_rate": 8.249030380336974e-06, + "loss": 2.6071, + "step": 31791 + }, + { + "epoch": 0.9427393766865344, + "grad_norm": 0.06920359283685684, + "learning_rate": 8.240521152532932e-06, + "loss": 2.5505, + "step": 31792 + }, + { + "epoch": 0.9427690300388459, + "grad_norm": 0.07190393656492233, + "learning_rate": 8.232016279343447e-06, + "loss": 2.5703, + "step": 31793 + }, + { + "epoch": 0.9427986833911574, + "grad_norm": 0.06781895458698273, + "learning_rate": 8.223515760843902e-06, + "loss": 2.5419, + "step": 31794 + }, + { + "epoch": 0.9428283367434689, + "grad_norm": 0.07202334702014923, + "learning_rate": 8.215019597109575e-06, + "loss": 2.5699, + "step": 31795 + }, + { + "epoch": 0.9428579900957803, + "grad_norm": 0.07188314199447632, + "learning_rate": 8.206527788215624e-06, + "loss": 2.5438, + "step": 31796 + }, + { + "epoch": 0.9428876434480918, + "grad_norm": 0.06848789751529694, + "learning_rate": 8.198040334237266e-06, + "loss": 2.574, + "step": 31797 + }, + { + "epoch": 0.9429172968004033, + "grad_norm": 0.0722862184047699, + "learning_rate": 8.189557235249723e-06, + "loss": 2.6126, + "step": 31798 + }, + { + "epoch": 0.9429469501527148, + "grad_norm": 0.07129177451133728, + "learning_rate": 8.181078491328043e-06, + "loss": 2.5648, + "step": 31799 + }, + { + "epoch": 0.9429766035050262, + "grad_norm": 0.06724784523248672, + "learning_rate": 8.172604102547333e-06, + "loss": 2.566, + "step": 31800 + }, + { + "epoch": 0.9430062568573377, + "grad_norm": 0.06865547597408295, + "learning_rate": 8.164134068982643e-06, + "loss": 2.5693, + "step": 31801 + }, + { + "epoch": 0.9430359102096492, + "grad_norm": 0.06694215536117554, + "learning_rate": 8.15566839070897e-06, + "loss": 2.5499, + "step": 31802 + }, + { + "epoch": 0.9430655635619607, + "grad_norm": 0.06876818090677261, + "learning_rate": 8.14720706780131e-06, + "loss": 2.5651, + "step": 31803 + }, + { + "epoch": 0.9430952169142721, + "grad_norm": 0.06740226596593857, + "learning_rate": 8.138750100334436e-06, + "loss": 2.5802, + "step": 31804 + }, + { + "epoch": 0.9431248702665836, + "grad_norm": 0.07039259374141693, + "learning_rate": 8.130297488383454e-06, + "loss": 2.5828, + "step": 31805 + }, + { + "epoch": 0.9431545236188951, + "grad_norm": 0.07191860675811768, + "learning_rate": 8.121849232023081e-06, + "loss": 2.5832, + "step": 31806 + }, + { + "epoch": 0.9431841769712066, + "grad_norm": 0.07066182047128677, + "learning_rate": 8.113405331328206e-06, + "loss": 2.5687, + "step": 31807 + }, + { + "epoch": 0.943213830323518, + "grad_norm": 0.06689833849668503, + "learning_rate": 8.104965786373542e-06, + "loss": 2.5532, + "step": 31808 + }, + { + "epoch": 0.9432434836758296, + "grad_norm": 0.07136137783527374, + "learning_rate": 8.096530597233808e-06, + "loss": 2.5372, + "step": 31809 + }, + { + "epoch": 0.9432731370281411, + "grad_norm": 0.06646095216274261, + "learning_rate": 8.088099763983781e-06, + "loss": 2.5687, + "step": 31810 + }, + { + "epoch": 0.9433027903804525, + "grad_norm": 0.06948462873697281, + "learning_rate": 8.079673286698063e-06, + "loss": 2.5344, + "step": 31811 + }, + { + "epoch": 0.943332443732764, + "grad_norm": 0.07133471220731735, + "learning_rate": 8.071251165451265e-06, + "loss": 2.5578, + "step": 31812 + }, + { + "epoch": 0.9433620970850755, + "grad_norm": 0.0689447820186615, + "learning_rate": 8.062833400317937e-06, + "loss": 2.5757, + "step": 31813 + }, + { + "epoch": 0.943391750437387, + "grad_norm": 0.07256707549095154, + "learning_rate": 8.054419991372685e-06, + "loss": 2.5662, + "step": 31814 + }, + { + "epoch": 0.9434214037896984, + "grad_norm": 0.06998661160469055, + "learning_rate": 8.046010938690008e-06, + "loss": 2.5847, + "step": 31815 + }, + { + "epoch": 0.9434510571420099, + "grad_norm": 0.06797847896814346, + "learning_rate": 8.037606242344287e-06, + "loss": 2.5816, + "step": 31816 + }, + { + "epoch": 0.9434807104943214, + "grad_norm": 0.0663333609700203, + "learning_rate": 8.02920590241002e-06, + "loss": 2.5271, + "step": 31817 + }, + { + "epoch": 0.9435103638466329, + "grad_norm": 0.06730663031339645, + "learning_rate": 8.020809918961592e-06, + "loss": 2.5852, + "step": 31818 + }, + { + "epoch": 0.9435400171989443, + "grad_norm": 0.06524279713630676, + "learning_rate": 8.012418292073387e-06, + "loss": 2.5051, + "step": 31819 + }, + { + "epoch": 0.9435696705512558, + "grad_norm": 0.06847906112670898, + "learning_rate": 8.004031021819624e-06, + "loss": 2.5886, + "step": 31820 + }, + { + "epoch": 0.9435993239035673, + "grad_norm": 0.07097486406564713, + "learning_rate": 7.995648108274578e-06, + "loss": 2.5943, + "step": 31821 + }, + { + "epoch": 0.9436289772558788, + "grad_norm": 0.06808178871870041, + "learning_rate": 7.987269551512633e-06, + "loss": 2.5766, + "step": 31822 + }, + { + "epoch": 0.9436586306081902, + "grad_norm": 0.06722937524318695, + "learning_rate": 7.978895351607785e-06, + "loss": 2.5372, + "step": 31823 + }, + { + "epoch": 0.9436882839605018, + "grad_norm": 0.07376114279031754, + "learning_rate": 7.970525508634307e-06, + "loss": 2.5761, + "step": 31824 + }, + { + "epoch": 0.9437179373128132, + "grad_norm": 0.06456305831670761, + "learning_rate": 7.962160022666253e-06, + "loss": 2.5611, + "step": 31825 + }, + { + "epoch": 0.9437475906651247, + "grad_norm": 0.06841574609279633, + "learning_rate": 7.95379889377773e-06, + "loss": 2.5809, + "step": 31826 + }, + { + "epoch": 0.9437772440174361, + "grad_norm": 0.06645561009645462, + "learning_rate": 7.945442122042789e-06, + "loss": 2.5256, + "step": 31827 + }, + { + "epoch": 0.9438068973697477, + "grad_norm": 0.06911376863718033, + "learning_rate": 7.937089707535427e-06, + "loss": 2.5521, + "step": 31828 + }, + { + "epoch": 0.9438365507220591, + "grad_norm": 0.07040334492921829, + "learning_rate": 7.928741650329586e-06, + "loss": 2.5496, + "step": 31829 + }, + { + "epoch": 0.9438662040743706, + "grad_norm": 0.06863518059253693, + "learning_rate": 7.920397950499148e-06, + "loss": 2.5608, + "step": 31830 + }, + { + "epoch": 0.9438958574266821, + "grad_norm": 0.07124967128038406, + "learning_rate": 7.912058608118111e-06, + "loss": 2.5791, + "step": 31831 + }, + { + "epoch": 0.9439255107789936, + "grad_norm": 0.06888245046138763, + "learning_rate": 7.903723623260251e-06, + "loss": 2.5216, + "step": 31832 + }, + { + "epoch": 0.9439551641313051, + "grad_norm": 0.06993936747312546, + "learning_rate": 7.895392995999395e-06, + "loss": 2.5641, + "step": 31833 + }, + { + "epoch": 0.9439848174836165, + "grad_norm": 0.07418069243431091, + "learning_rate": 7.88706672640932e-06, + "loss": 2.5472, + "step": 31834 + }, + { + "epoch": 0.944014470835928, + "grad_norm": 0.06897161155939102, + "learning_rate": 7.878744814563687e-06, + "loss": 2.5768, + "step": 31835 + }, + { + "epoch": 0.9440441241882395, + "grad_norm": 0.07395083457231522, + "learning_rate": 7.870427260536273e-06, + "loss": 2.5552, + "step": 31836 + }, + { + "epoch": 0.944073777540551, + "grad_norm": 0.07267634570598602, + "learning_rate": 7.862114064400683e-06, + "loss": 2.5702, + "step": 31837 + }, + { + "epoch": 0.9441034308928624, + "grad_norm": 0.07034880667924881, + "learning_rate": 7.853805226230582e-06, + "loss": 2.5895, + "step": 31838 + }, + { + "epoch": 0.944133084245174, + "grad_norm": 0.07016599178314209, + "learning_rate": 7.845500746099466e-06, + "loss": 2.5591, + "step": 31839 + }, + { + "epoch": 0.9441627375974854, + "grad_norm": 0.06774411350488663, + "learning_rate": 7.837200624080943e-06, + "loss": 2.5617, + "step": 31840 + }, + { + "epoch": 0.9441923909497969, + "grad_norm": 0.06918246299028397, + "learning_rate": 7.828904860248453e-06, + "loss": 2.5635, + "step": 31841 + }, + { + "epoch": 0.9442220443021083, + "grad_norm": 0.07343270629644394, + "learning_rate": 7.820613454675551e-06, + "loss": 2.5818, + "step": 31842 + }, + { + "epoch": 0.9442516976544199, + "grad_norm": 0.07180000096559525, + "learning_rate": 7.812326407435566e-06, + "loss": 2.5423, + "step": 31843 + }, + { + "epoch": 0.9442813510067313, + "grad_norm": 0.0713486522436142, + "learning_rate": 7.804043718601883e-06, + "loss": 2.5576, + "step": 31844 + }, + { + "epoch": 0.9443110043590428, + "grad_norm": 0.07098042964935303, + "learning_rate": 7.795765388247945e-06, + "loss": 2.5611, + "step": 31845 + }, + { + "epoch": 0.9443406577113542, + "grad_norm": 0.06785125285387039, + "learning_rate": 7.787491416446967e-06, + "loss": 2.5726, + "step": 31846 + }, + { + "epoch": 0.9443703110636658, + "grad_norm": 0.06976047158241272, + "learning_rate": 7.779221803272173e-06, + "loss": 2.5817, + "step": 31847 + }, + { + "epoch": 0.9443999644159772, + "grad_norm": 0.06931682676076889, + "learning_rate": 7.770956548796948e-06, + "loss": 2.5618, + "step": 31848 + }, + { + "epoch": 0.9444296177682887, + "grad_norm": 0.07053034752607346, + "learning_rate": 7.762695653094343e-06, + "loss": 2.5903, + "step": 31849 + }, + { + "epoch": 0.9444592711206001, + "grad_norm": 0.06966564804315567, + "learning_rate": 7.754439116237577e-06, + "loss": 2.5659, + "step": 31850 + }, + { + "epoch": 0.9444889244729117, + "grad_norm": 0.07334379851818085, + "learning_rate": 7.74618693829976e-06, + "loss": 2.574, + "step": 31851 + }, + { + "epoch": 0.9445185778252232, + "grad_norm": 0.06858234852552414, + "learning_rate": 7.737939119353887e-06, + "loss": 2.5342, + "step": 31852 + }, + { + "epoch": 0.9445482311775346, + "grad_norm": 0.07000650465488434, + "learning_rate": 7.729695659473123e-06, + "loss": 2.5928, + "step": 31853 + }, + { + "epoch": 0.9445778845298461, + "grad_norm": 0.06806781888008118, + "learning_rate": 7.721456558730412e-06, + "loss": 2.5415, + "step": 31854 + }, + { + "epoch": 0.9446075378821576, + "grad_norm": 0.07221035659313202, + "learning_rate": 7.713221817198634e-06, + "loss": 2.573, + "step": 31855 + }, + { + "epoch": 0.9446371912344691, + "grad_norm": 0.06906277686357498, + "learning_rate": 7.704991434950848e-06, + "loss": 2.5581, + "step": 31856 + }, + { + "epoch": 0.9446668445867805, + "grad_norm": 0.07028559595346451, + "learning_rate": 7.696765412059826e-06, + "loss": 2.5716, + "step": 31857 + }, + { + "epoch": 0.944696497939092, + "grad_norm": 0.0709923654794693, + "learning_rate": 7.688543748598453e-06, + "loss": 2.5682, + "step": 31858 + }, + { + "epoch": 0.9447261512914035, + "grad_norm": 0.06898590922355652, + "learning_rate": 7.680326444639508e-06, + "loss": 2.5773, + "step": 31859 + }, + { + "epoch": 0.944755804643715, + "grad_norm": 0.06857150793075562, + "learning_rate": 7.672113500255817e-06, + "loss": 2.5523, + "step": 31860 + }, + { + "epoch": 0.9447854579960264, + "grad_norm": 0.06697901338338852, + "learning_rate": 7.663904915520047e-06, + "loss": 2.5705, + "step": 31861 + }, + { + "epoch": 0.944815111348338, + "grad_norm": 0.06888695061206818, + "learning_rate": 7.655700690504918e-06, + "loss": 2.5752, + "step": 31862 + }, + { + "epoch": 0.9448447647006494, + "grad_norm": 0.0706087201833725, + "learning_rate": 7.647500825283038e-06, + "loss": 2.5542, + "step": 31863 + }, + { + "epoch": 0.9448744180529609, + "grad_norm": 0.0670662671327591, + "learning_rate": 7.63930531992707e-06, + "loss": 2.5566, + "step": 31864 + }, + { + "epoch": 0.9449040714052723, + "grad_norm": 0.07084422558546066, + "learning_rate": 7.631114174509569e-06, + "loss": 2.571, + "step": 31865 + }, + { + "epoch": 0.9449337247575839, + "grad_norm": 0.06912320852279663, + "learning_rate": 7.62292738910314e-06, + "loss": 2.5631, + "step": 31866 + }, + { + "epoch": 0.9449633781098953, + "grad_norm": 0.0690082460641861, + "learning_rate": 7.614744963780118e-06, + "loss": 2.5549, + "step": 31867 + }, + { + "epoch": 0.9449930314622068, + "grad_norm": 0.07460162043571472, + "learning_rate": 7.606566898613055e-06, + "loss": 2.6023, + "step": 31868 + }, + { + "epoch": 0.9450226848145182, + "grad_norm": 0.06946095079183578, + "learning_rate": 7.598393193674336e-06, + "loss": 2.5699, + "step": 31869 + }, + { + "epoch": 0.9450523381668298, + "grad_norm": 0.0695839673280716, + "learning_rate": 7.590223849036404e-06, + "loss": 2.5631, + "step": 31870 + }, + { + "epoch": 0.9450819915191412, + "grad_norm": 0.06850551813840866, + "learning_rate": 7.582058864771535e-06, + "loss": 2.575, + "step": 31871 + }, + { + "epoch": 0.9451116448714527, + "grad_norm": 0.06925050169229507, + "learning_rate": 7.57389824095206e-06, + "loss": 2.5419, + "step": 31872 + }, + { + "epoch": 0.9451412982237642, + "grad_norm": 0.06856316328048706, + "learning_rate": 7.565741977650253e-06, + "loss": 2.5874, + "step": 31873 + }, + { + "epoch": 0.9451709515760757, + "grad_norm": 0.0709819346666336, + "learning_rate": 7.557590074938337e-06, + "loss": 2.5506, + "step": 31874 + }, + { + "epoch": 0.9452006049283872, + "grad_norm": 0.06674996018409729, + "learning_rate": 7.549442532888473e-06, + "loss": 2.5636, + "step": 31875 + }, + { + "epoch": 0.9452302582806986, + "grad_norm": 0.06883018463850021, + "learning_rate": 7.541299351572828e-06, + "loss": 2.5467, + "step": 31876 + }, + { + "epoch": 0.9452599116330102, + "grad_norm": 0.06768685579299927, + "learning_rate": 7.533160531063565e-06, + "loss": 2.5591, + "step": 31877 + }, + { + "epoch": 0.9452895649853216, + "grad_norm": 0.06713744252920151, + "learning_rate": 7.525026071432628e-06, + "loss": 2.5512, + "step": 31878 + }, + { + "epoch": 0.9453192183376331, + "grad_norm": 0.06771859526634216, + "learning_rate": 7.516895972752125e-06, + "loss": 2.5417, + "step": 31879 + }, + { + "epoch": 0.9453488716899445, + "grad_norm": 0.06765451282262802, + "learning_rate": 7.508770235094053e-06, + "loss": 2.5617, + "step": 31880 + }, + { + "epoch": 0.9453785250422561, + "grad_norm": 0.06959515064954758, + "learning_rate": 7.500648858530357e-06, + "loss": 2.5514, + "step": 31881 + }, + { + "epoch": 0.9454081783945675, + "grad_norm": 0.07175812870264053, + "learning_rate": 7.492531843132866e-06, + "loss": 2.5727, + "step": 31882 + }, + { + "epoch": 0.945437831746879, + "grad_norm": 0.06758684664964676, + "learning_rate": 7.484419188973634e-06, + "loss": 2.5203, + "step": 31883 + }, + { + "epoch": 0.9454674850991904, + "grad_norm": 0.06997284293174744, + "learning_rate": 7.476310896124383e-06, + "loss": 2.598, + "step": 31884 + }, + { + "epoch": 0.945497138451502, + "grad_norm": 0.07202588766813278, + "learning_rate": 7.468206964656943e-06, + "loss": 2.5308, + "step": 31885 + }, + { + "epoch": 0.9455267918038134, + "grad_norm": 0.07200513780117035, + "learning_rate": 7.46010739464309e-06, + "loss": 2.5341, + "step": 31886 + }, + { + "epoch": 0.9455564451561249, + "grad_norm": 0.06968072056770325, + "learning_rate": 7.4520121861545445e-06, + "loss": 2.5481, + "step": 31887 + }, + { + "epoch": 0.9455860985084363, + "grad_norm": 0.06679429858922958, + "learning_rate": 7.443921339262971e-06, + "loss": 2.5755, + "step": 31888 + }, + { + "epoch": 0.9456157518607479, + "grad_norm": 0.06781258434057236, + "learning_rate": 7.43583485403998e-06, + "loss": 2.5773, + "step": 31889 + }, + { + "epoch": 0.9456454052130593, + "grad_norm": 0.07006967812776566, + "learning_rate": 7.4277527305572356e-06, + "loss": 2.5527, + "step": 31890 + }, + { + "epoch": 0.9456750585653708, + "grad_norm": 0.07130454480648041, + "learning_rate": 7.419674968886292e-06, + "loss": 2.6084, + "step": 31891 + }, + { + "epoch": 0.9457047119176822, + "grad_norm": 0.0737019032239914, + "learning_rate": 7.411601569098703e-06, + "loss": 2.553, + "step": 31892 + }, + { + "epoch": 0.9457343652699938, + "grad_norm": 0.06885462999343872, + "learning_rate": 7.403532531265911e-06, + "loss": 2.5685, + "step": 31893 + }, + { + "epoch": 0.9457640186223053, + "grad_norm": 0.06954833120107651, + "learning_rate": 7.395467855459359e-06, + "loss": 2.5997, + "step": 31894 + }, + { + "epoch": 0.9457936719746167, + "grad_norm": 0.06848777830600739, + "learning_rate": 7.387407541750491e-06, + "loss": 2.5528, + "step": 31895 + }, + { + "epoch": 0.9458233253269283, + "grad_norm": 0.07114346325397491, + "learning_rate": 7.379351590210748e-06, + "loss": 2.6075, + "step": 31896 + }, + { + "epoch": 0.9458529786792397, + "grad_norm": 0.06907812505960464, + "learning_rate": 7.371300000911352e-06, + "loss": 2.5855, + "step": 31897 + }, + { + "epoch": 0.9458826320315512, + "grad_norm": 0.07162990421056747, + "learning_rate": 7.36325277392369e-06, + "loss": 2.5612, + "step": 31898 + }, + { + "epoch": 0.9459122853838626, + "grad_norm": 0.06950164586305618, + "learning_rate": 7.355209909318983e-06, + "loss": 2.5787, + "step": 31899 + }, + { + "epoch": 0.9459419387361742, + "grad_norm": 0.06697630137205124, + "learning_rate": 7.347171407168452e-06, + "loss": 2.5467, + "step": 31900 + }, + { + "epoch": 0.9459715920884856, + "grad_norm": 0.07180296629667282, + "learning_rate": 7.339137267543261e-06, + "loss": 2.5703, + "step": 31901 + }, + { + "epoch": 0.9460012454407971, + "grad_norm": 0.06877020746469498, + "learning_rate": 7.331107490514577e-06, + "loss": 2.5488, + "step": 31902 + }, + { + "epoch": 0.9460308987931085, + "grad_norm": 0.07134611904621124, + "learning_rate": 7.323082076153509e-06, + "loss": 2.5711, + "step": 31903 + }, + { + "epoch": 0.9460605521454201, + "grad_norm": 0.07046670466661453, + "learning_rate": 7.315061024531111e-06, + "loss": 2.5507, + "step": 31904 + }, + { + "epoch": 0.9460902054977315, + "grad_norm": 0.06835277378559113, + "learning_rate": 7.307044335718438e-06, + "loss": 2.5736, + "step": 31905 + }, + { + "epoch": 0.946119858850043, + "grad_norm": 0.06880167126655579, + "learning_rate": 7.299032009786432e-06, + "loss": 2.5581, + "step": 31906 + }, + { + "epoch": 0.9461495122023544, + "grad_norm": 0.06935980170965195, + "learning_rate": 7.291024046806039e-06, + "loss": 2.5697, + "step": 31907 + }, + { + "epoch": 0.946179165554666, + "grad_norm": 0.06985984742641449, + "learning_rate": 7.283020446848254e-06, + "loss": 2.5705, + "step": 31908 + }, + { + "epoch": 0.9462088189069774, + "grad_norm": 0.06716670095920563, + "learning_rate": 7.275021209983857e-06, + "loss": 2.6227, + "step": 31909 + }, + { + "epoch": 0.9462384722592889, + "grad_norm": 0.06940596550703049, + "learning_rate": 7.267026336283788e-06, + "loss": 2.5649, + "step": 31910 + }, + { + "epoch": 0.9462681256116003, + "grad_norm": 0.07034378498792648, + "learning_rate": 7.259035825818716e-06, + "loss": 2.5994, + "step": 31911 + }, + { + "epoch": 0.9462977789639119, + "grad_norm": 0.0650821104645729, + "learning_rate": 7.251049678659472e-06, + "loss": 2.5894, + "step": 31912 + }, + { + "epoch": 0.9463274323162233, + "grad_norm": 0.06634888797998428, + "learning_rate": 7.243067894876776e-06, + "loss": 2.5289, + "step": 31913 + }, + { + "epoch": 0.9463570856685348, + "grad_norm": 0.07025335729122162, + "learning_rate": 7.235090474541295e-06, + "loss": 2.591, + "step": 31914 + }, + { + "epoch": 0.9463867390208464, + "grad_norm": 0.06938697397708893, + "learning_rate": 7.227117417723639e-06, + "loss": 2.5473, + "step": 31915 + }, + { + "epoch": 0.9464163923731578, + "grad_norm": 0.0671309381723404, + "learning_rate": 7.2191487244944735e-06, + "loss": 2.4825, + "step": 31916 + }, + { + "epoch": 0.9464460457254693, + "grad_norm": 0.06862994283437729, + "learning_rate": 7.211184394924297e-06, + "loss": 2.5433, + "step": 31917 + }, + { + "epoch": 0.9464756990777807, + "grad_norm": 0.06546122580766678, + "learning_rate": 7.203224429083721e-06, + "loss": 2.5584, + "step": 31918 + }, + { + "epoch": 0.9465053524300923, + "grad_norm": 0.06692513078451157, + "learning_rate": 7.1952688270431335e-06, + "loss": 2.5811, + "step": 31919 + }, + { + "epoch": 0.9465350057824037, + "grad_norm": 0.06818851828575134, + "learning_rate": 7.187317588873032e-06, + "loss": 2.5902, + "step": 31920 + }, + { + "epoch": 0.9465646591347152, + "grad_norm": 0.07092390954494476, + "learning_rate": 7.1793707146438625e-06, + "loss": 2.5792, + "step": 31921 + }, + { + "epoch": 0.9465943124870266, + "grad_norm": 0.0691104605793953, + "learning_rate": 7.171428204425901e-06, + "loss": 2.5552, + "step": 31922 + }, + { + "epoch": 0.9466239658393382, + "grad_norm": 0.07253704965114594, + "learning_rate": 7.16349005828959e-06, + "loss": 2.5684, + "step": 31923 + }, + { + "epoch": 0.9466536191916496, + "grad_norm": 0.06948195397853851, + "learning_rate": 7.155556276305153e-06, + "loss": 2.5724, + "step": 31924 + }, + { + "epoch": 0.9466832725439611, + "grad_norm": 0.07112117856740952, + "learning_rate": 7.147626858542811e-06, + "loss": 2.5779, + "step": 31925 + }, + { + "epoch": 0.9467129258962725, + "grad_norm": 0.07234183698892593, + "learning_rate": 7.139701805072896e-06, + "loss": 2.5715, + "step": 31926 + }, + { + "epoch": 0.9467425792485841, + "grad_norm": 0.07010690122842789, + "learning_rate": 7.131781115965519e-06, + "loss": 2.5935, + "step": 31927 + }, + { + "epoch": 0.9467722326008955, + "grad_norm": 0.07243328541517258, + "learning_rate": 7.12386479129079e-06, + "loss": 2.5585, + "step": 31928 + }, + { + "epoch": 0.946801885953207, + "grad_norm": 0.06852629780769348, + "learning_rate": 7.115952831118822e-06, + "loss": 2.5851, + "step": 31929 + }, + { + "epoch": 0.9468315393055184, + "grad_norm": 0.06959808617830276, + "learning_rate": 7.108045235519722e-06, + "loss": 2.5658, + "step": 31930 + }, + { + "epoch": 0.94686119265783, + "grad_norm": 0.07022686302661896, + "learning_rate": 7.100142004563492e-06, + "loss": 2.5441, + "step": 31931 + }, + { + "epoch": 0.9468908460101414, + "grad_norm": 0.06965108215808868, + "learning_rate": 7.092243138320131e-06, + "loss": 2.6093, + "step": 31932 + }, + { + "epoch": 0.9469204993624529, + "grad_norm": 0.0703132301568985, + "learning_rate": 7.0843486368595274e-06, + "loss": 2.5638, + "step": 31933 + }, + { + "epoch": 0.9469501527147645, + "grad_norm": 0.0694054663181305, + "learning_rate": 7.07645850025157e-06, + "loss": 2.5481, + "step": 31934 + }, + { + "epoch": 0.9469798060670759, + "grad_norm": 0.06728534400463104, + "learning_rate": 7.068572728566258e-06, + "loss": 2.5688, + "step": 31935 + }, + { + "epoch": 0.9470094594193874, + "grad_norm": 0.06908602267503738, + "learning_rate": 7.060691321873314e-06, + "loss": 2.5964, + "step": 31936 + }, + { + "epoch": 0.9470391127716988, + "grad_norm": 0.07143011689186096, + "learning_rate": 7.0528142802426256e-06, + "loss": 2.5418, + "step": 31937 + }, + { + "epoch": 0.9470687661240104, + "grad_norm": 0.06769049912691116, + "learning_rate": 7.044941603743804e-06, + "loss": 2.5908, + "step": 31938 + }, + { + "epoch": 0.9470984194763218, + "grad_norm": 0.07230447977781296, + "learning_rate": 7.037073292446683e-06, + "loss": 2.584, + "step": 31939 + }, + { + "epoch": 0.9471280728286333, + "grad_norm": 0.0726969912648201, + "learning_rate": 7.029209346420873e-06, + "loss": 2.5965, + "step": 31940 + }, + { + "epoch": 0.9471577261809447, + "grad_norm": 0.06776848435401917, + "learning_rate": 7.02134976573604e-06, + "loss": 2.5679, + "step": 31941 + }, + { + "epoch": 0.9471873795332563, + "grad_norm": 0.06950981914997101, + "learning_rate": 7.013494550461796e-06, + "loss": 2.5392, + "step": 31942 + }, + { + "epoch": 0.9472170328855677, + "grad_norm": 0.06836368143558502, + "learning_rate": 7.005643700667641e-06, + "loss": 2.5391, + "step": 31943 + }, + { + "epoch": 0.9472466862378792, + "grad_norm": 0.06891739368438721, + "learning_rate": 6.99779721642313e-06, + "loss": 2.5728, + "step": 31944 + }, + { + "epoch": 0.9472763395901906, + "grad_norm": 0.06882541626691818, + "learning_rate": 6.989955097797762e-06, + "loss": 2.5756, + "step": 31945 + }, + { + "epoch": 0.9473059929425022, + "grad_norm": 0.06787745654582977, + "learning_rate": 6.9821173448609834e-06, + "loss": 2.5691, + "step": 31946 + }, + { + "epoch": 0.9473356462948136, + "grad_norm": 0.07128187268972397, + "learning_rate": 6.97428395768207e-06, + "loss": 2.563, + "step": 31947 + }, + { + "epoch": 0.9473652996471251, + "grad_norm": 0.06930465251207352, + "learning_rate": 6.966454936330635e-06, + "loss": 2.5864, + "step": 31948 + }, + { + "epoch": 0.9473949529994365, + "grad_norm": 0.06754443049430847, + "learning_rate": 6.958630280875788e-06, + "loss": 2.5833, + "step": 31949 + }, + { + "epoch": 0.9474246063517481, + "grad_norm": 0.06852693855762482, + "learning_rate": 6.9508099913869195e-06, + "loss": 2.5682, + "step": 31950 + }, + { + "epoch": 0.9474542597040595, + "grad_norm": 0.07216703146696091, + "learning_rate": 6.942994067933306e-06, + "loss": 2.589, + "step": 31951 + }, + { + "epoch": 0.947483913056371, + "grad_norm": 0.07691235840320587, + "learning_rate": 6.935182510584059e-06, + "loss": 2.5391, + "step": 31952 + }, + { + "epoch": 0.9475135664086825, + "grad_norm": 0.06752612441778183, + "learning_rate": 6.927375319408458e-06, + "loss": 2.5488, + "step": 31953 + }, + { + "epoch": 0.947543219760994, + "grad_norm": 0.07073193043470383, + "learning_rate": 6.919572494475557e-06, + "loss": 2.5677, + "step": 31954 + }, + { + "epoch": 0.9475728731133055, + "grad_norm": 0.06977855414152145, + "learning_rate": 6.911774035854468e-06, + "loss": 2.5456, + "step": 31955 + }, + { + "epoch": 0.9476025264656169, + "grad_norm": 0.07085726410150528, + "learning_rate": 6.903979943614302e-06, + "loss": 2.5285, + "step": 31956 + }, + { + "epoch": 0.9476321798179285, + "grad_norm": 0.06800241768360138, + "learning_rate": 6.89619021782395e-06, + "loss": 2.5738, + "step": 31957 + }, + { + "epoch": 0.9476618331702399, + "grad_norm": 0.0691024586558342, + "learning_rate": 6.888404858552522e-06, + "loss": 2.5532, + "step": 31958 + }, + { + "epoch": 0.9476914865225514, + "grad_norm": 0.0675804540514946, + "learning_rate": 6.880623865868907e-06, + "loss": 2.5637, + "step": 31959 + }, + { + "epoch": 0.9477211398748628, + "grad_norm": 0.07008743286132812, + "learning_rate": 6.872847239841995e-06, + "loss": 2.5317, + "step": 31960 + }, + { + "epoch": 0.9477507932271744, + "grad_norm": 0.0696987509727478, + "learning_rate": 6.8650749805406755e-06, + "loss": 2.5863, + "step": 31961 + }, + { + "epoch": 0.9477804465794858, + "grad_norm": 0.07304014265537262, + "learning_rate": 6.857307088033726e-06, + "loss": 2.5688, + "step": 31962 + }, + { + "epoch": 0.9478100999317973, + "grad_norm": 0.06961070001125336, + "learning_rate": 6.849543562390037e-06, + "loss": 2.5728, + "step": 31963 + }, + { + "epoch": 0.9478397532841087, + "grad_norm": 0.06750329583883286, + "learning_rate": 6.841784403678275e-06, + "loss": 2.5518, + "step": 31964 + }, + { + "epoch": 0.9478694066364203, + "grad_norm": 0.06661127507686615, + "learning_rate": 6.834029611967163e-06, + "loss": 2.538, + "step": 31965 + }, + { + "epoch": 0.9478990599887317, + "grad_norm": 0.06820084154605865, + "learning_rate": 6.8262791873253125e-06, + "loss": 2.5659, + "step": 31966 + }, + { + "epoch": 0.9479287133410432, + "grad_norm": 0.06480039656162262, + "learning_rate": 6.818533129821503e-06, + "loss": 2.5632, + "step": 31967 + }, + { + "epoch": 0.9479583666933546, + "grad_norm": 0.0706654042005539, + "learning_rate": 6.810791439524178e-06, + "loss": 2.5894, + "step": 31968 + }, + { + "epoch": 0.9479880200456662, + "grad_norm": 0.06758610904216766, + "learning_rate": 6.80305411650195e-06, + "loss": 2.5632, + "step": 31969 + }, + { + "epoch": 0.9480176733979776, + "grad_norm": 0.07127363234758377, + "learning_rate": 6.79532116082332e-06, + "loss": 2.5795, + "step": 31970 + }, + { + "epoch": 0.9480473267502891, + "grad_norm": 0.06963659822940826, + "learning_rate": 6.787592572556789e-06, + "loss": 2.5662, + "step": 31971 + }, + { + "epoch": 0.9480769801026006, + "grad_norm": 0.06979616731405258, + "learning_rate": 6.779868351770746e-06, + "loss": 2.5767, + "step": 31972 + }, + { + "epoch": 0.9481066334549121, + "grad_norm": 0.0699855387210846, + "learning_rate": 6.772148498533692e-06, + "loss": 2.5873, + "step": 31973 + }, + { + "epoch": 0.9481362868072235, + "grad_norm": 0.06989080458879471, + "learning_rate": 6.764433012913962e-06, + "loss": 2.5575, + "step": 31974 + }, + { + "epoch": 0.948165940159535, + "grad_norm": 0.06830491125583649, + "learning_rate": 6.756721894979778e-06, + "loss": 2.5641, + "step": 31975 + }, + { + "epoch": 0.9481955935118466, + "grad_norm": 0.07036871463060379, + "learning_rate": 6.749015144799475e-06, + "loss": 2.5655, + "step": 31976 + }, + { + "epoch": 0.948225246864158, + "grad_norm": 0.07361859828233719, + "learning_rate": 6.741312762441332e-06, + "loss": 2.5913, + "step": 31977 + }, + { + "epoch": 0.9482549002164695, + "grad_norm": 0.07072453200817108, + "learning_rate": 6.733614747973571e-06, + "loss": 2.5491, + "step": 31978 + }, + { + "epoch": 0.9482845535687809, + "grad_norm": 0.0681389570236206, + "learning_rate": 6.72592110146425e-06, + "loss": 2.5639, + "step": 31979 + }, + { + "epoch": 0.9483142069210925, + "grad_norm": 0.072079136967659, + "learning_rate": 6.718231822981591e-06, + "loss": 2.582, + "step": 31980 + }, + { + "epoch": 0.9483438602734039, + "grad_norm": 0.07070494443178177, + "learning_rate": 6.710546912593651e-06, + "loss": 2.5743, + "step": 31981 + }, + { + "epoch": 0.9483735136257154, + "grad_norm": 0.06705278158187866, + "learning_rate": 6.702866370368488e-06, + "loss": 2.5697, + "step": 31982 + }, + { + "epoch": 0.9484031669780268, + "grad_norm": 0.06687477231025696, + "learning_rate": 6.695190196374157e-06, + "loss": 2.5885, + "step": 31983 + }, + { + "epoch": 0.9484328203303384, + "grad_norm": 0.07059146463871002, + "learning_rate": 6.687518390678549e-06, + "loss": 2.5714, + "step": 31984 + }, + { + "epoch": 0.9484624736826498, + "grad_norm": 0.07387574017047882, + "learning_rate": 6.679850953349664e-06, + "loss": 2.5621, + "step": 31985 + }, + { + "epoch": 0.9484921270349613, + "grad_norm": 0.0674963891506195, + "learning_rate": 6.672187884455394e-06, + "loss": 2.5747, + "step": 31986 + }, + { + "epoch": 0.9485217803872728, + "grad_norm": 0.06900960206985474, + "learning_rate": 6.664529184063517e-06, + "loss": 2.5624, + "step": 31987 + }, + { + "epoch": 0.9485514337395843, + "grad_norm": 0.07157687842845917, + "learning_rate": 6.656874852241978e-06, + "loss": 2.5636, + "step": 31988 + }, + { + "epoch": 0.9485810870918957, + "grad_norm": 0.06862186640501022, + "learning_rate": 6.649224889058448e-06, + "loss": 2.5905, + "step": 31989 + }, + { + "epoch": 0.9486107404442072, + "grad_norm": 0.06837372481822968, + "learning_rate": 6.641579294580758e-06, + "loss": 2.5748, + "step": 31990 + }, + { + "epoch": 0.9486403937965187, + "grad_norm": 0.0664890930056572, + "learning_rate": 6.633938068876521e-06, + "loss": 2.5784, + "step": 31991 + }, + { + "epoch": 0.9486700471488302, + "grad_norm": 0.06793245673179626, + "learning_rate": 6.626301212013464e-06, + "loss": 2.5404, + "step": 31992 + }, + { + "epoch": 0.9486997005011416, + "grad_norm": 0.07593655586242676, + "learning_rate": 6.618668724059196e-06, + "loss": 2.5522, + "step": 31993 + }, + { + "epoch": 0.9487293538534531, + "grad_norm": 0.07039421051740646, + "learning_rate": 6.611040605081331e-06, + "loss": 2.5614, + "step": 31994 + }, + { + "epoch": 0.9487590072057646, + "grad_norm": 0.07010155916213989, + "learning_rate": 6.60341685514737e-06, + "loss": 2.576, + "step": 31995 + }, + { + "epoch": 0.9487886605580761, + "grad_norm": 0.06736771762371063, + "learning_rate": 6.595797474324816e-06, + "loss": 2.5333, + "step": 31996 + }, + { + "epoch": 0.9488183139103876, + "grad_norm": 0.07203416526317596, + "learning_rate": 6.5881824626812245e-06, + "loss": 2.583, + "step": 31997 + }, + { + "epoch": 0.948847967262699, + "grad_norm": 0.06863722205162048, + "learning_rate": 6.580571820283931e-06, + "loss": 2.5518, + "step": 31998 + }, + { + "epoch": 0.9488776206150106, + "grad_norm": 0.06645839661359787, + "learning_rate": 6.572965547200383e-06, + "loss": 2.5907, + "step": 31999 + }, + { + "epoch": 0.948907273967322, + "grad_norm": 0.06767799705266953, + "learning_rate": 6.5653636434979124e-06, + "loss": 2.5326, + "step": 32000 + }, + { + "epoch": 0.9489369273196335, + "grad_norm": 0.06782843172550201, + "learning_rate": 6.557766109243801e-06, + "loss": 2.547, + "step": 32001 + }, + { + "epoch": 0.948966580671945, + "grad_norm": 0.06726108491420746, + "learning_rate": 6.5501729445054395e-06, + "loss": 2.5384, + "step": 32002 + }, + { + "epoch": 0.9489962340242565, + "grad_norm": 0.06726241856813431, + "learning_rate": 6.542584149349995e-06, + "loss": 2.5639, + "step": 32003 + }, + { + "epoch": 0.9490258873765679, + "grad_norm": 0.06850235164165497, + "learning_rate": 6.534999723844637e-06, + "loss": 2.5537, + "step": 32004 + }, + { + "epoch": 0.9490555407288794, + "grad_norm": 0.06752312928438187, + "learning_rate": 6.527419668056534e-06, + "loss": 2.5698, + "step": 32005 + }, + { + "epoch": 0.9490851940811909, + "grad_norm": 0.07170264422893524, + "learning_rate": 6.5198439820528535e-06, + "loss": 2.5615, + "step": 32006 + }, + { + "epoch": 0.9491148474335024, + "grad_norm": 0.06919026374816895, + "learning_rate": 6.512272665900709e-06, + "loss": 2.5458, + "step": 32007 + }, + { + "epoch": 0.9491445007858138, + "grad_norm": 0.06698570400476456, + "learning_rate": 6.5047057196671035e-06, + "loss": 2.5394, + "step": 32008 + }, + { + "epoch": 0.9491741541381253, + "grad_norm": 0.06544332206249237, + "learning_rate": 6.497143143418982e-06, + "loss": 2.5532, + "step": 32009 + }, + { + "epoch": 0.9492038074904368, + "grad_norm": 0.06735215336084366, + "learning_rate": 6.489584937223347e-06, + "loss": 2.5573, + "step": 32010 + }, + { + "epoch": 0.9492334608427483, + "grad_norm": 0.06654168665409088, + "learning_rate": 6.482031101147146e-06, + "loss": 2.5795, + "step": 32011 + }, + { + "epoch": 0.9492631141950597, + "grad_norm": 0.06712961196899414, + "learning_rate": 6.4744816352573235e-06, + "loss": 2.5694, + "step": 32012 + }, + { + "epoch": 0.9492927675473712, + "grad_norm": 0.06664718687534332, + "learning_rate": 6.4669365396206045e-06, + "loss": 2.56, + "step": 32013 + }, + { + "epoch": 0.9493224208996827, + "grad_norm": 0.07390717417001724, + "learning_rate": 6.459395814303936e-06, + "loss": 2.5772, + "step": 32014 + }, + { + "epoch": 0.9493520742519942, + "grad_norm": 0.06835749000310898, + "learning_rate": 6.451859459374043e-06, + "loss": 2.5573, + "step": 32015 + }, + { + "epoch": 0.9493817276043056, + "grad_norm": 0.06853465735912323, + "learning_rate": 6.444327474897649e-06, + "loss": 2.57, + "step": 32016 + }, + { + "epoch": 0.9494113809566171, + "grad_norm": 0.06786566227674484, + "learning_rate": 6.436799860941423e-06, + "loss": 2.5687, + "step": 32017 + }, + { + "epoch": 0.9494410343089287, + "grad_norm": 0.0669039785861969, + "learning_rate": 6.429276617572088e-06, + "loss": 2.5783, + "step": 32018 + }, + { + "epoch": 0.9494706876612401, + "grad_norm": 0.0672527328133583, + "learning_rate": 6.421757744856205e-06, + "loss": 2.5698, + "step": 32019 + }, + { + "epoch": 0.9495003410135516, + "grad_norm": 0.06921540200710297, + "learning_rate": 6.414243242860385e-06, + "loss": 2.5719, + "step": 32020 + }, + { + "epoch": 0.949529994365863, + "grad_norm": 0.07164205610752106, + "learning_rate": 6.406733111651187e-06, + "loss": 2.5795, + "step": 32021 + }, + { + "epoch": 0.9495596477181746, + "grad_norm": 0.06871537119150162, + "learning_rate": 6.399227351295056e-06, + "loss": 2.5633, + "step": 32022 + }, + { + "epoch": 0.949589301070486, + "grad_norm": 0.07182279229164124, + "learning_rate": 6.3917259618584965e-06, + "loss": 2.5158, + "step": 32023 + }, + { + "epoch": 0.9496189544227975, + "grad_norm": 0.06914962828159332, + "learning_rate": 6.384228943407899e-06, + "loss": 2.5008, + "step": 32024 + }, + { + "epoch": 0.949648607775109, + "grad_norm": 0.06954417377710342, + "learning_rate": 6.376736296009711e-06, + "loss": 2.5599, + "step": 32025 + }, + { + "epoch": 0.9496782611274205, + "grad_norm": 0.07086223363876343, + "learning_rate": 6.3692480197302675e-06, + "loss": 2.5825, + "step": 32026 + }, + { + "epoch": 0.9497079144797319, + "grad_norm": 0.06834673136472702, + "learning_rate": 6.361764114635849e-06, + "loss": 2.5436, + "step": 32027 + }, + { + "epoch": 0.9497375678320434, + "grad_norm": 0.07064058631658554, + "learning_rate": 6.3542845807927354e-06, + "loss": 2.5798, + "step": 32028 + }, + { + "epoch": 0.9497672211843549, + "grad_norm": 0.07021033763885498, + "learning_rate": 6.3468094182672076e-06, + "loss": 2.5614, + "step": 32029 + }, + { + "epoch": 0.9497968745366664, + "grad_norm": 0.06919360160827637, + "learning_rate": 6.3393386271253796e-06, + "loss": 2.6052, + "step": 32030 + }, + { + "epoch": 0.9498265278889778, + "grad_norm": 0.06926699727773666, + "learning_rate": 6.331872207433476e-06, + "loss": 2.576, + "step": 32031 + }, + { + "epoch": 0.9498561812412893, + "grad_norm": 0.06986773014068604, + "learning_rate": 6.324410159257554e-06, + "loss": 2.6001, + "step": 32032 + }, + { + "epoch": 0.9498858345936008, + "grad_norm": 0.06498746573925018, + "learning_rate": 6.316952482663674e-06, + "loss": 2.5206, + "step": 32033 + }, + { + "epoch": 0.9499154879459123, + "grad_norm": 0.07150726020336151, + "learning_rate": 6.309499177718003e-06, + "loss": 2.5421, + "step": 32034 + }, + { + "epoch": 0.9499451412982237, + "grad_norm": 0.06954695284366608, + "learning_rate": 6.3020502444863796e-06, + "loss": 2.5661, + "step": 32035 + }, + { + "epoch": 0.9499747946505352, + "grad_norm": 0.06968671828508377, + "learning_rate": 6.294605683034915e-06, + "loss": 2.5851, + "step": 32036 + }, + { + "epoch": 0.9500044480028467, + "grad_norm": 0.06693214923143387, + "learning_rate": 6.287165493429392e-06, + "loss": 2.5586, + "step": 32037 + }, + { + "epoch": 0.9500341013551582, + "grad_norm": 0.06996060907840729, + "learning_rate": 6.279729675735813e-06, + "loss": 2.5668, + "step": 32038 + }, + { + "epoch": 0.9500637547074697, + "grad_norm": 0.06990045309066772, + "learning_rate": 6.272298230019957e-06, + "loss": 2.5507, + "step": 32039 + }, + { + "epoch": 0.9500934080597812, + "grad_norm": 0.06834357976913452, + "learning_rate": 6.264871156347662e-06, + "loss": 2.555, + "step": 32040 + }, + { + "epoch": 0.9501230614120927, + "grad_norm": 0.06816317141056061, + "learning_rate": 6.2574484547846534e-06, + "loss": 2.5616, + "step": 32041 + }, + { + "epoch": 0.9501527147644041, + "grad_norm": 0.06860675662755966, + "learning_rate": 6.250030125396711e-06, + "loss": 2.5613, + "step": 32042 + }, + { + "epoch": 0.9501823681167156, + "grad_norm": 0.07037129253149033, + "learning_rate": 6.242616168249504e-06, + "loss": 2.5678, + "step": 32043 + }, + { + "epoch": 0.9502120214690271, + "grad_norm": 0.06621512025594711, + "learning_rate": 6.2352065834087034e-06, + "loss": 2.5572, + "step": 32044 + }, + { + "epoch": 0.9502416748213386, + "grad_norm": 0.0674510970711708, + "learning_rate": 6.227801370939867e-06, + "loss": 2.5916, + "step": 32045 + }, + { + "epoch": 0.95027132817365, + "grad_norm": 0.0651051476597786, + "learning_rate": 6.2204005309086095e-06, + "loss": 2.5693, + "step": 32046 + }, + { + "epoch": 0.9503009815259615, + "grad_norm": 0.0681123286485672, + "learning_rate": 6.213004063380434e-06, + "loss": 2.539, + "step": 32047 + }, + { + "epoch": 0.950330634878273, + "grad_norm": 0.0679926946759224, + "learning_rate": 6.205611968420899e-06, + "loss": 2.5781, + "step": 32048 + }, + { + "epoch": 0.9503602882305845, + "grad_norm": 0.06797638535499573, + "learning_rate": 6.198224246095452e-06, + "loss": 2.55, + "step": 32049 + }, + { + "epoch": 0.9503899415828959, + "grad_norm": 0.06924738734960556, + "learning_rate": 6.190840896469429e-06, + "loss": 2.5609, + "step": 32050 + }, + { + "epoch": 0.9504195949352074, + "grad_norm": 0.06718064099550247, + "learning_rate": 6.1834619196083355e-06, + "loss": 2.5559, + "step": 32051 + }, + { + "epoch": 0.9504492482875189, + "grad_norm": 0.0691150426864624, + "learning_rate": 6.176087315577394e-06, + "loss": 2.5776, + "step": 32052 + }, + { + "epoch": 0.9504789016398304, + "grad_norm": 0.06852638721466064, + "learning_rate": 6.168717084441999e-06, + "loss": 2.5638, + "step": 32053 + }, + { + "epoch": 0.9505085549921418, + "grad_norm": 0.07197745144367218, + "learning_rate": 6.161351226267375e-06, + "loss": 2.5865, + "step": 32054 + }, + { + "epoch": 0.9505382083444534, + "grad_norm": 0.06810370087623596, + "learning_rate": 6.153989741118749e-06, + "loss": 2.5536, + "step": 32055 + }, + { + "epoch": 0.9505678616967648, + "grad_norm": 0.0708344355225563, + "learning_rate": 6.146632629061288e-06, + "loss": 2.5886, + "step": 32056 + }, + { + "epoch": 0.9505975150490763, + "grad_norm": 0.06538943201303482, + "learning_rate": 6.139279890160221e-06, + "loss": 2.6199, + "step": 32057 + }, + { + "epoch": 0.9506271684013877, + "grad_norm": 0.0711197704076767, + "learning_rate": 6.13193152448055e-06, + "loss": 2.6097, + "step": 32058 + }, + { + "epoch": 0.9506568217536993, + "grad_norm": 0.07085546851158142, + "learning_rate": 6.124587532087389e-06, + "loss": 2.5641, + "step": 32059 + }, + { + "epoch": 0.9506864751060108, + "grad_norm": 0.07011453062295914, + "learning_rate": 6.117247913045798e-06, + "loss": 2.5914, + "step": 32060 + }, + { + "epoch": 0.9507161284583222, + "grad_norm": 0.07004887610673904, + "learning_rate": 6.109912667420781e-06, + "loss": 2.5727, + "step": 32061 + }, + { + "epoch": 0.9507457818106337, + "grad_norm": 0.06623559445142746, + "learning_rate": 6.102581795277229e-06, + "loss": 2.5599, + "step": 32062 + }, + { + "epoch": 0.9507754351629452, + "grad_norm": 0.06741660088300705, + "learning_rate": 6.095255296680091e-06, + "loss": 2.5788, + "step": 32063 + }, + { + "epoch": 0.9508050885152567, + "grad_norm": 0.07218652218580246, + "learning_rate": 6.0879331716942595e-06, + "loss": 2.5808, + "step": 32064 + }, + { + "epoch": 0.9508347418675681, + "grad_norm": 0.07069657742977142, + "learning_rate": 6.080615420384517e-06, + "loss": 2.5664, + "step": 32065 + }, + { + "epoch": 0.9508643952198796, + "grad_norm": 0.06734254211187363, + "learning_rate": 6.073302042815754e-06, + "loss": 2.5726, + "step": 32066 + }, + { + "epoch": 0.9508940485721911, + "grad_norm": 0.0732664167881012, + "learning_rate": 6.065993039052642e-06, + "loss": 2.5435, + "step": 32067 + }, + { + "epoch": 0.9509237019245026, + "grad_norm": 0.067893847823143, + "learning_rate": 6.058688409159963e-06, + "loss": 2.5554, + "step": 32068 + }, + { + "epoch": 0.950953355276814, + "grad_norm": 0.06662542372941971, + "learning_rate": 6.0513881532023866e-06, + "loss": 2.5865, + "step": 32069 + }, + { + "epoch": 0.9509830086291255, + "grad_norm": 0.06845906376838684, + "learning_rate": 6.044092271244583e-06, + "loss": 2.6073, + "step": 32070 + }, + { + "epoch": 0.951012661981437, + "grad_norm": 0.06913308799266815, + "learning_rate": 6.036800763351058e-06, + "loss": 2.559, + "step": 32071 + }, + { + "epoch": 0.9510423153337485, + "grad_norm": 0.06960000842809677, + "learning_rate": 6.029513629586536e-06, + "loss": 2.5715, + "step": 32072 + }, + { + "epoch": 0.9510719686860599, + "grad_norm": 0.07178528606891632, + "learning_rate": 6.02223087001541e-06, + "loss": 2.587, + "step": 32073 + }, + { + "epoch": 0.9511016220383715, + "grad_norm": 0.06742381304502487, + "learning_rate": 6.014952484702241e-06, + "loss": 2.5714, + "step": 32074 + }, + { + "epoch": 0.9511312753906829, + "grad_norm": 0.06920582801103592, + "learning_rate": 6.00767847371142e-06, + "loss": 2.5832, + "step": 32075 + }, + { + "epoch": 0.9511609287429944, + "grad_norm": 0.07060742378234863, + "learning_rate": 6.000408837107396e-06, + "loss": 2.5558, + "step": 32076 + }, + { + "epoch": 0.9511905820953058, + "grad_norm": 0.06838678568601608, + "learning_rate": 5.993143574954562e-06, + "loss": 2.5906, + "step": 32077 + }, + { + "epoch": 0.9512202354476174, + "grad_norm": 0.06720028072595596, + "learning_rate": 5.985882687317256e-06, + "loss": 2.5256, + "step": 32078 + }, + { + "epoch": 0.9512498887999288, + "grad_norm": 0.06853073835372925, + "learning_rate": 5.9786261742597605e-06, + "loss": 2.5739, + "step": 32079 + }, + { + "epoch": 0.9512795421522403, + "grad_norm": 0.06891339272260666, + "learning_rate": 5.9713740358462995e-06, + "loss": 2.5131, + "step": 32080 + }, + { + "epoch": 0.9513091955045518, + "grad_norm": 0.06856077909469604, + "learning_rate": 5.964126272141157e-06, + "loss": 2.5637, + "step": 32081 + }, + { + "epoch": 0.9513388488568633, + "grad_norm": 0.07265891879796982, + "learning_rate": 5.9568828832084475e-06, + "loss": 2.5256, + "step": 32082 + }, + { + "epoch": 0.9513685022091748, + "grad_norm": 0.06838271021842957, + "learning_rate": 5.949643869112342e-06, + "loss": 2.5427, + "step": 32083 + }, + { + "epoch": 0.9513981555614862, + "grad_norm": 0.06820245832204819, + "learning_rate": 5.942409229916956e-06, + "loss": 2.5645, + "step": 32084 + }, + { + "epoch": 0.9514278089137977, + "grad_norm": 0.06878594309091568, + "learning_rate": 5.93517896568635e-06, + "loss": 2.545, + "step": 32085 + }, + { + "epoch": 0.9514574622661092, + "grad_norm": 0.07445796579122543, + "learning_rate": 5.927953076484527e-06, + "loss": 2.5746, + "step": 32086 + }, + { + "epoch": 0.9514871156184207, + "grad_norm": 0.06612510979175568, + "learning_rate": 5.920731562375492e-06, + "loss": 2.5628, + "step": 32087 + }, + { + "epoch": 0.9515167689707321, + "grad_norm": 0.0669781044125557, + "learning_rate": 5.913514423423138e-06, + "loss": 2.5345, + "step": 32088 + }, + { + "epoch": 0.9515464223230437, + "grad_norm": 0.07052608579397202, + "learning_rate": 5.906301659691471e-06, + "loss": 2.5494, + "step": 32089 + }, + { + "epoch": 0.9515760756753551, + "grad_norm": 0.06717447936534882, + "learning_rate": 5.899093271244271e-06, + "loss": 2.5362, + "step": 32090 + }, + { + "epoch": 0.9516057290276666, + "grad_norm": 0.06794886291027069, + "learning_rate": 5.891889258145433e-06, + "loss": 2.5402, + "step": 32091 + }, + { + "epoch": 0.951635382379978, + "grad_norm": 0.06685492396354675, + "learning_rate": 5.8846896204587384e-06, + "loss": 2.5689, + "step": 32092 + }, + { + "epoch": 0.9516650357322896, + "grad_norm": 0.06721633672714233, + "learning_rate": 5.877494358247915e-06, + "loss": 2.5659, + "step": 32093 + }, + { + "epoch": 0.951694689084601, + "grad_norm": 0.0681806355714798, + "learning_rate": 5.870303471576743e-06, + "loss": 2.5633, + "step": 32094 + }, + { + "epoch": 0.9517243424369125, + "grad_norm": 0.06674598902463913, + "learning_rate": 5.863116960508841e-06, + "loss": 2.5415, + "step": 32095 + }, + { + "epoch": 0.9517539957892239, + "grad_norm": 0.06861601769924164, + "learning_rate": 5.855934825107823e-06, + "loss": 2.5611, + "step": 32096 + }, + { + "epoch": 0.9517836491415355, + "grad_norm": 0.06690914928913116, + "learning_rate": 5.84875706543736e-06, + "loss": 2.5681, + "step": 32097 + }, + { + "epoch": 0.9518133024938469, + "grad_norm": 0.06916436553001404, + "learning_rate": 5.841583681560902e-06, + "loss": 2.5652, + "step": 32098 + }, + { + "epoch": 0.9518429558461584, + "grad_norm": 0.06823979318141937, + "learning_rate": 5.834414673542121e-06, + "loss": 2.595, + "step": 32099 + }, + { + "epoch": 0.9518726091984698, + "grad_norm": 0.06746822595596313, + "learning_rate": 5.827250041444354e-06, + "loss": 2.5604, + "step": 32100 + }, + { + "epoch": 0.9519022625507814, + "grad_norm": 0.0664900541305542, + "learning_rate": 5.820089785331162e-06, + "loss": 2.5449, + "step": 32101 + }, + { + "epoch": 0.9519319159030929, + "grad_norm": 0.07533719390630722, + "learning_rate": 5.812933905265827e-06, + "loss": 2.5469, + "step": 32102 + }, + { + "epoch": 0.9519615692554043, + "grad_norm": 0.0690384954214096, + "learning_rate": 5.805782401311854e-06, + "loss": 2.5794, + "step": 32103 + }, + { + "epoch": 0.9519912226077158, + "grad_norm": 0.06669618934392929, + "learning_rate": 5.798635273532471e-06, + "loss": 2.5651, + "step": 32104 + }, + { + "epoch": 0.9520208759600273, + "grad_norm": 0.08173069357872009, + "learning_rate": 5.791492521991016e-06, + "loss": 2.5785, + "step": 32105 + }, + { + "epoch": 0.9520505293123388, + "grad_norm": 0.06740587204694748, + "learning_rate": 5.78435414675077e-06, + "loss": 2.5397, + "step": 32106 + }, + { + "epoch": 0.9520801826646502, + "grad_norm": 0.07254913449287415, + "learning_rate": 5.777220147874851e-06, + "loss": 2.5617, + "step": 32107 + }, + { + "epoch": 0.9521098360169618, + "grad_norm": 0.06845399737358093, + "learning_rate": 5.770090525426486e-06, + "loss": 2.5569, + "step": 32108 + }, + { + "epoch": 0.9521394893692732, + "grad_norm": 0.06742644309997559, + "learning_rate": 5.76296527946879e-06, + "loss": 2.5992, + "step": 32109 + }, + { + "epoch": 0.9521691427215847, + "grad_norm": 0.06860055774450302, + "learning_rate": 5.755844410064881e-06, + "loss": 2.5328, + "step": 32110 + }, + { + "epoch": 0.9521987960738961, + "grad_norm": 0.06898293644189835, + "learning_rate": 5.748727917277818e-06, + "loss": 2.5459, + "step": 32111 + }, + { + "epoch": 0.9522284494262077, + "grad_norm": 0.07133060693740845, + "learning_rate": 5.741615801170608e-06, + "loss": 2.551, + "step": 32112 + }, + { + "epoch": 0.9522581027785191, + "grad_norm": 0.0660640075802803, + "learning_rate": 5.7345080618061986e-06, + "loss": 2.6028, + "step": 32113 + }, + { + "epoch": 0.9522877561308306, + "grad_norm": 0.06625247001647949, + "learning_rate": 5.727404699247596e-06, + "loss": 2.5507, + "step": 32114 + }, + { + "epoch": 0.952317409483142, + "grad_norm": 0.06925860047340393, + "learning_rate": 5.720305713557639e-06, + "loss": 2.5554, + "step": 32115 + }, + { + "epoch": 0.9523470628354536, + "grad_norm": 0.0702475756406784, + "learning_rate": 5.713211104799221e-06, + "loss": 2.5604, + "step": 32116 + }, + { + "epoch": 0.952376716187765, + "grad_norm": 0.06921520084142685, + "learning_rate": 5.706120873035126e-06, + "loss": 2.5935, + "step": 32117 + }, + { + "epoch": 0.9524063695400765, + "grad_norm": 0.06667398661375046, + "learning_rate": 5.699035018328247e-06, + "loss": 2.5468, + "step": 32118 + }, + { + "epoch": 0.9524360228923879, + "grad_norm": 0.06748861819505692, + "learning_rate": 5.691953540741202e-06, + "loss": 2.5315, + "step": 32119 + }, + { + "epoch": 0.9524656762446995, + "grad_norm": 0.06736684590578079, + "learning_rate": 5.684876440336772e-06, + "loss": 2.5625, + "step": 32120 + }, + { + "epoch": 0.9524953295970109, + "grad_norm": 0.07038687914609909, + "learning_rate": 5.677803717177632e-06, + "loss": 2.5801, + "step": 32121 + }, + { + "epoch": 0.9525249829493224, + "grad_norm": 0.06654252856969833, + "learning_rate": 5.670735371326397e-06, + "loss": 2.5418, + "step": 32122 + }, + { + "epoch": 0.952554636301634, + "grad_norm": 0.06775429099798203, + "learning_rate": 5.663671402845627e-06, + "loss": 2.512, + "step": 32123 + }, + { + "epoch": 0.9525842896539454, + "grad_norm": 0.07186069339513779, + "learning_rate": 5.656611811797885e-06, + "loss": 2.579, + "step": 32124 + }, + { + "epoch": 0.9526139430062569, + "grad_norm": 0.06894013285636902, + "learning_rate": 5.649556598245731e-06, + "loss": 2.5794, + "step": 32125 + }, + { + "epoch": 0.9526435963585683, + "grad_norm": 0.06903015822172165, + "learning_rate": 5.64250576225156e-06, + "loss": 2.5737, + "step": 32126 + }, + { + "epoch": 0.9526732497108799, + "grad_norm": 0.06926868855953217, + "learning_rate": 5.635459303877877e-06, + "loss": 2.5995, + "step": 32127 + }, + { + "epoch": 0.9527029030631913, + "grad_norm": 0.06710438430309296, + "learning_rate": 5.628417223187077e-06, + "loss": 2.5557, + "step": 32128 + }, + { + "epoch": 0.9527325564155028, + "grad_norm": 0.06951330602169037, + "learning_rate": 5.621379520241499e-06, + "loss": 2.6177, + "step": 32129 + }, + { + "epoch": 0.9527622097678142, + "grad_norm": 0.06886547058820724, + "learning_rate": 5.614346195103482e-06, + "loss": 2.6007, + "step": 32130 + }, + { + "epoch": 0.9527918631201258, + "grad_norm": 0.06704071909189224, + "learning_rate": 5.607317247835253e-06, + "loss": 2.5504, + "step": 32131 + }, + { + "epoch": 0.9528215164724372, + "grad_norm": 0.06774141639471054, + "learning_rate": 5.600292678499097e-06, + "loss": 2.5543, + "step": 32132 + }, + { + "epoch": 0.9528511698247487, + "grad_norm": 0.06544415652751923, + "learning_rate": 5.593272487157186e-06, + "loss": 2.5392, + "step": 32133 + }, + { + "epoch": 0.9528808231770601, + "grad_norm": 0.06999244540929794, + "learning_rate": 5.586256673871748e-06, + "loss": 2.5484, + "step": 32134 + }, + { + "epoch": 0.9529104765293717, + "grad_norm": 0.0663803219795227, + "learning_rate": 5.5792452387049e-06, + "loss": 2.5618, + "step": 32135 + }, + { + "epoch": 0.9529401298816831, + "grad_norm": 0.07033147662878036, + "learning_rate": 5.572238181718647e-06, + "loss": 2.572, + "step": 32136 + }, + { + "epoch": 0.9529697832339946, + "grad_norm": 0.0684593990445137, + "learning_rate": 5.565235502975108e-06, + "loss": 2.5395, + "step": 32137 + }, + { + "epoch": 0.952999436586306, + "grad_norm": 0.06589961796998978, + "learning_rate": 5.558237202536287e-06, + "loss": 2.5237, + "step": 32138 + }, + { + "epoch": 0.9530290899386176, + "grad_norm": 0.06602880358695984, + "learning_rate": 5.551243280464191e-06, + "loss": 2.5637, + "step": 32139 + }, + { + "epoch": 0.953058743290929, + "grad_norm": 0.06843739002943039, + "learning_rate": 5.544253736820659e-06, + "loss": 2.5485, + "step": 32140 + }, + { + "epoch": 0.9530883966432405, + "grad_norm": 0.07398609071969986, + "learning_rate": 5.537268571667586e-06, + "loss": 2.5404, + "step": 32141 + }, + { + "epoch": 0.953118049995552, + "grad_norm": 0.06803460419178009, + "learning_rate": 5.5302877850669235e-06, + "loss": 2.5737, + "step": 32142 + }, + { + "epoch": 0.9531477033478635, + "grad_norm": 0.0658300369977951, + "learning_rate": 5.523311377080398e-06, + "loss": 2.5258, + "step": 32143 + }, + { + "epoch": 0.953177356700175, + "grad_norm": 0.06812579184770584, + "learning_rate": 5.51633934776985e-06, + "loss": 2.5797, + "step": 32144 + }, + { + "epoch": 0.9532070100524864, + "grad_norm": 0.06866530328989029, + "learning_rate": 5.5093716971970074e-06, + "loss": 2.5387, + "step": 32145 + }, + { + "epoch": 0.953236663404798, + "grad_norm": 0.06828590482473373, + "learning_rate": 5.502408425423544e-06, + "loss": 2.5569, + "step": 32146 + }, + { + "epoch": 0.9532663167571094, + "grad_norm": 0.06920000910758972, + "learning_rate": 5.495449532511187e-06, + "loss": 2.5427, + "step": 32147 + }, + { + "epoch": 0.9532959701094209, + "grad_norm": 0.07025012373924255, + "learning_rate": 5.488495018521444e-06, + "loss": 2.551, + "step": 32148 + }, + { + "epoch": 0.9533256234617323, + "grad_norm": 0.0701778382062912, + "learning_rate": 5.481544883515987e-06, + "loss": 2.6132, + "step": 32149 + }, + { + "epoch": 0.9533552768140439, + "grad_norm": 0.0672575905919075, + "learning_rate": 5.474599127556324e-06, + "loss": 2.5708, + "step": 32150 + }, + { + "epoch": 0.9533849301663553, + "grad_norm": 0.06930168718099594, + "learning_rate": 5.4676577507039585e-06, + "loss": 2.5512, + "step": 32151 + }, + { + "epoch": 0.9534145835186668, + "grad_norm": 0.06862711161375046, + "learning_rate": 5.4607207530203985e-06, + "loss": 2.5489, + "step": 32152 + }, + { + "epoch": 0.9534442368709782, + "grad_norm": 0.06598670035600662, + "learning_rate": 5.453788134567039e-06, + "loss": 2.5229, + "step": 32153 + }, + { + "epoch": 0.9534738902232898, + "grad_norm": 0.07008310407400131, + "learning_rate": 5.446859895405221e-06, + "loss": 2.5718, + "step": 32154 + }, + { + "epoch": 0.9535035435756012, + "grad_norm": 0.0662699043750763, + "learning_rate": 5.439936035596338e-06, + "loss": 2.6092, + "step": 32155 + }, + { + "epoch": 0.9535331969279127, + "grad_norm": 0.06929387897253036, + "learning_rate": 5.4330165552017865e-06, + "loss": 2.5398, + "step": 32156 + }, + { + "epoch": 0.9535628502802241, + "grad_norm": 0.06651636213064194, + "learning_rate": 5.426101454282739e-06, + "loss": 2.6127, + "step": 32157 + }, + { + "epoch": 0.9535925036325357, + "grad_norm": 0.06519710272550583, + "learning_rate": 5.419190732900425e-06, + "loss": 2.5724, + "step": 32158 + }, + { + "epoch": 0.9536221569848471, + "grad_norm": 0.06763089448213577, + "learning_rate": 5.412284391116129e-06, + "loss": 2.5822, + "step": 32159 + }, + { + "epoch": 0.9536518103371586, + "grad_norm": 0.06482930481433868, + "learning_rate": 5.405382428990913e-06, + "loss": 2.5573, + "step": 32160 + }, + { + "epoch": 0.95368146368947, + "grad_norm": 0.06980080902576447, + "learning_rate": 5.398484846585949e-06, + "loss": 2.549, + "step": 32161 + }, + { + "epoch": 0.9537111170417816, + "grad_norm": 0.06863460689783096, + "learning_rate": 5.391591643962301e-06, + "loss": 2.5894, + "step": 32162 + }, + { + "epoch": 0.9537407703940931, + "grad_norm": 0.06863725930452347, + "learning_rate": 5.384702821180976e-06, + "loss": 2.572, + "step": 32163 + }, + { + "epoch": 0.9537704237464045, + "grad_norm": 0.06547695398330688, + "learning_rate": 5.3778183783030345e-06, + "loss": 2.5244, + "step": 32164 + }, + { + "epoch": 0.9538000770987161, + "grad_norm": 0.06887561827898026, + "learning_rate": 5.370938315389373e-06, + "loss": 2.5216, + "step": 32165 + }, + { + "epoch": 0.9538297304510275, + "grad_norm": 0.06725479662418365, + "learning_rate": 5.364062632500944e-06, + "loss": 2.5583, + "step": 32166 + }, + { + "epoch": 0.953859383803339, + "grad_norm": 0.06731468439102173, + "learning_rate": 5.357191329698697e-06, + "loss": 2.553, + "step": 32167 + }, + { + "epoch": 0.9538890371556504, + "grad_norm": 0.06519993394613266, + "learning_rate": 5.350324407043417e-06, + "loss": 2.5703, + "step": 32168 + }, + { + "epoch": 0.953918690507962, + "grad_norm": 0.06871818006038666, + "learning_rate": 5.343461864595889e-06, + "loss": 2.5657, + "step": 32169 + }, + { + "epoch": 0.9539483438602734, + "grad_norm": 0.07034183293581009, + "learning_rate": 5.336603702417009e-06, + "loss": 2.5683, + "step": 32170 + }, + { + "epoch": 0.9539779972125849, + "grad_norm": 0.06893522292375565, + "learning_rate": 5.329749920567339e-06, + "loss": 2.5547, + "step": 32171 + }, + { + "epoch": 0.9540076505648963, + "grad_norm": 0.06735832989215851, + "learning_rate": 5.32290051910761e-06, + "loss": 2.5614, + "step": 32172 + }, + { + "epoch": 0.9540373039172079, + "grad_norm": 0.06782648712396622, + "learning_rate": 5.316055498098549e-06, + "loss": 2.5158, + "step": 32173 + }, + { + "epoch": 0.9540669572695193, + "grad_norm": 0.06793459504842758, + "learning_rate": 5.309214857600719e-06, + "loss": 2.5449, + "step": 32174 + }, + { + "epoch": 0.9540966106218308, + "grad_norm": 0.06878063827753067, + "learning_rate": 5.302378597674684e-06, + "loss": 2.5479, + "step": 32175 + }, + { + "epoch": 0.9541262639741422, + "grad_norm": 0.06971758604049683, + "learning_rate": 5.295546718381061e-06, + "loss": 2.5728, + "step": 32176 + }, + { + "epoch": 0.9541559173264538, + "grad_norm": 0.06703980267047882, + "learning_rate": 5.288719219780247e-06, + "loss": 2.5887, + "step": 32177 + }, + { + "epoch": 0.9541855706787652, + "grad_norm": 0.06830466538667679, + "learning_rate": 5.281896101932693e-06, + "loss": 2.5859, + "step": 32178 + }, + { + "epoch": 0.9542152240310767, + "grad_norm": 0.06937576085329056, + "learning_rate": 5.275077364898906e-06, + "loss": 2.5529, + "step": 32179 + }, + { + "epoch": 0.9542448773833881, + "grad_norm": 0.06768027693033218, + "learning_rate": 5.268263008739227e-06, + "loss": 2.5505, + "step": 32180 + }, + { + "epoch": 0.9542745307356997, + "grad_norm": 0.06826082617044449, + "learning_rate": 5.261453033514052e-06, + "loss": 2.5859, + "step": 32181 + }, + { + "epoch": 0.9543041840880111, + "grad_norm": 0.06810571998357773, + "learning_rate": 5.254647439283556e-06, + "loss": 2.5716, + "step": 32182 + }, + { + "epoch": 0.9543338374403226, + "grad_norm": 0.06836473941802979, + "learning_rate": 5.247846226108133e-06, + "loss": 2.575, + "step": 32183 + }, + { + "epoch": 0.9543634907926342, + "grad_norm": 0.06463199853897095, + "learning_rate": 5.241049394047903e-06, + "loss": 2.5545, + "step": 32184 + }, + { + "epoch": 0.9543931441449456, + "grad_norm": 0.06753318011760712, + "learning_rate": 5.2342569431631515e-06, + "loss": 2.5648, + "step": 32185 + }, + { + "epoch": 0.9544227974972571, + "grad_norm": 0.06736436486244202, + "learning_rate": 5.227468873513941e-06, + "loss": 2.581, + "step": 32186 + }, + { + "epoch": 0.9544524508495685, + "grad_norm": 0.0679381713271141, + "learning_rate": 5.220685185160446e-06, + "loss": 2.5626, + "step": 32187 + }, + { + "epoch": 0.9544821042018801, + "grad_norm": 0.06975828111171722, + "learning_rate": 5.213905878162728e-06, + "loss": 2.5289, + "step": 32188 + }, + { + "epoch": 0.9545117575541915, + "grad_norm": 0.06747046858072281, + "learning_rate": 5.207130952580741e-06, + "loss": 2.5732, + "step": 32189 + }, + { + "epoch": 0.954541410906503, + "grad_norm": 0.069438137114048, + "learning_rate": 5.2003604084746025e-06, + "loss": 2.5562, + "step": 32190 + }, + { + "epoch": 0.9545710642588144, + "grad_norm": 0.06839155405759811, + "learning_rate": 5.193594245904154e-06, + "loss": 2.5705, + "step": 32191 + }, + { + "epoch": 0.954600717611126, + "grad_norm": 0.06825338304042816, + "learning_rate": 5.186832464929403e-06, + "loss": 2.5576, + "step": 32192 + }, + { + "epoch": 0.9546303709634374, + "grad_norm": 0.06892756372690201, + "learning_rate": 5.180075065610135e-06, + "loss": 2.5689, + "step": 32193 + }, + { + "epoch": 0.9546600243157489, + "grad_norm": 0.06815093755722046, + "learning_rate": 5.1733220480063015e-06, + "loss": 2.5695, + "step": 32194 + }, + { + "epoch": 0.9546896776680603, + "grad_norm": 0.07271554321050644, + "learning_rate": 5.166573412177577e-06, + "loss": 2.5907, + "step": 32195 + }, + { + "epoch": 0.9547193310203719, + "grad_norm": 0.07132319360971451, + "learning_rate": 5.159829158183804e-06, + "loss": 2.5325, + "step": 32196 + }, + { + "epoch": 0.9547489843726833, + "grad_norm": 0.07046998292207718, + "learning_rate": 5.153089286084711e-06, + "loss": 2.5474, + "step": 32197 + }, + { + "epoch": 0.9547786377249948, + "grad_norm": 0.06705712527036667, + "learning_rate": 5.1463537959399175e-06, + "loss": 2.5838, + "step": 32198 + }, + { + "epoch": 0.9548082910773062, + "grad_norm": 0.06627389043569565, + "learning_rate": 5.139622687809098e-06, + "loss": 2.5486, + "step": 32199 + }, + { + "epoch": 0.9548379444296178, + "grad_norm": 0.06963701546192169, + "learning_rate": 5.132895961751871e-06, + "loss": 2.5504, + "step": 32200 + }, + { + "epoch": 0.9548675977819292, + "grad_norm": 0.06841927021741867, + "learning_rate": 5.126173617827801e-06, + "loss": 2.5424, + "step": 32201 + }, + { + "epoch": 0.9548972511342407, + "grad_norm": 0.07204949855804443, + "learning_rate": 5.119455656096395e-06, + "loss": 2.5829, + "step": 32202 + }, + { + "epoch": 0.9549269044865522, + "grad_norm": 0.06848805397748947, + "learning_rate": 5.112742076617216e-06, + "loss": 2.5817, + "step": 32203 + }, + { + "epoch": 0.9549565578388637, + "grad_norm": 0.06598516553640366, + "learning_rate": 5.106032879449551e-06, + "loss": 2.5345, + "step": 32204 + }, + { + "epoch": 0.9549862111911752, + "grad_norm": 0.07069091498851776, + "learning_rate": 5.099328064652964e-06, + "loss": 2.5708, + "step": 32205 + }, + { + "epoch": 0.9550158645434866, + "grad_norm": 0.07002108544111252, + "learning_rate": 5.092627632286795e-06, + "loss": 2.5437, + "step": 32206 + }, + { + "epoch": 0.9550455178957982, + "grad_norm": 0.06756240874528885, + "learning_rate": 5.08593158241033e-06, + "loss": 2.5694, + "step": 32207 + }, + { + "epoch": 0.9550751712481096, + "grad_norm": 0.06817490607500076, + "learning_rate": 5.0792399150829115e-06, + "loss": 2.5703, + "step": 32208 + }, + { + "epoch": 0.9551048246004211, + "grad_norm": 0.0654512420296669, + "learning_rate": 5.072552630363769e-06, + "loss": 2.5619, + "step": 32209 + }, + { + "epoch": 0.9551344779527325, + "grad_norm": 0.06746252626180649, + "learning_rate": 5.065869728312078e-06, + "loss": 2.5754, + "step": 32210 + }, + { + "epoch": 0.9551641313050441, + "grad_norm": 0.06716489791870117, + "learning_rate": 5.059191208987124e-06, + "loss": 2.5526, + "step": 32211 + }, + { + "epoch": 0.9551937846573555, + "grad_norm": 0.07007485628128052, + "learning_rate": 5.052517072447971e-06, + "loss": 2.6029, + "step": 32212 + }, + { + "epoch": 0.955223438009667, + "grad_norm": 0.06838708370923996, + "learning_rate": 5.045847318753738e-06, + "loss": 2.5726, + "step": 32213 + }, + { + "epoch": 0.9552530913619784, + "grad_norm": 0.06797066330909729, + "learning_rate": 5.03918194796349e-06, + "loss": 2.5609, + "step": 32214 + }, + { + "epoch": 0.95528274471429, + "grad_norm": 0.07056275010108948, + "learning_rate": 5.032520960136233e-06, + "loss": 2.5582, + "step": 32215 + }, + { + "epoch": 0.9553123980666014, + "grad_norm": 0.06469261646270752, + "learning_rate": 5.025864355330978e-06, + "loss": 2.5394, + "step": 32216 + }, + { + "epoch": 0.9553420514189129, + "grad_norm": 0.06870223581790924, + "learning_rate": 5.019212133606621e-06, + "loss": 2.5714, + "step": 32217 + }, + { + "epoch": 0.9553717047712244, + "grad_norm": 0.0680154487490654, + "learning_rate": 5.012564295022115e-06, + "loss": 2.5832, + "step": 32218 + }, + { + "epoch": 0.9554013581235359, + "grad_norm": 0.06639868766069412, + "learning_rate": 5.005920839636302e-06, + "loss": 2.5741, + "step": 32219 + }, + { + "epoch": 0.9554310114758473, + "grad_norm": 0.06783393770456314, + "learning_rate": 4.999281767508079e-06, + "loss": 2.557, + "step": 32220 + }, + { + "epoch": 0.9554606648281588, + "grad_norm": 0.06686486303806305, + "learning_rate": 4.992647078696122e-06, + "loss": 2.5626, + "step": 32221 + }, + { + "epoch": 0.9554903181804703, + "grad_norm": 0.06870109587907791, + "learning_rate": 4.986016773259272e-06, + "loss": 2.5794, + "step": 32222 + }, + { + "epoch": 0.9555199715327818, + "grad_norm": 0.06637211889028549, + "learning_rate": 4.979390851256205e-06, + "loss": 2.5887, + "step": 32223 + }, + { + "epoch": 0.9555496248850932, + "grad_norm": 0.06747249513864517, + "learning_rate": 4.9727693127456504e-06, + "loss": 2.5936, + "step": 32224 + }, + { + "epoch": 0.9555792782374047, + "grad_norm": 0.06984716653823853, + "learning_rate": 4.9661521577861744e-06, + "loss": 2.5796, + "step": 32225 + }, + { + "epoch": 0.9556089315897163, + "grad_norm": 0.06586068868637085, + "learning_rate": 4.959539386436341e-06, + "loss": 2.5599, + "step": 32226 + }, + { + "epoch": 0.9556385849420277, + "grad_norm": 0.07226823270320892, + "learning_rate": 4.952930998754768e-06, + "loss": 2.5853, + "step": 32227 + }, + { + "epoch": 0.9556682382943392, + "grad_norm": 0.06867259740829468, + "learning_rate": 4.946326994800021e-06, + "loss": 2.5974, + "step": 32228 + }, + { + "epoch": 0.9556978916466506, + "grad_norm": 0.06912493705749512, + "learning_rate": 4.939727374630443e-06, + "loss": 2.571, + "step": 32229 + }, + { + "epoch": 0.9557275449989622, + "grad_norm": 0.06862126290798187, + "learning_rate": 4.933132138304597e-06, + "loss": 2.5412, + "step": 32230 + }, + { + "epoch": 0.9557571983512736, + "grad_norm": 0.06553280353546143, + "learning_rate": 4.926541285880825e-06, + "loss": 2.5513, + "step": 32231 + }, + { + "epoch": 0.9557868517035851, + "grad_norm": 0.06565745919942856, + "learning_rate": 4.9199548174175265e-06, + "loss": 2.5838, + "step": 32232 + }, + { + "epoch": 0.9558165050558965, + "grad_norm": 0.06426339596509933, + "learning_rate": 4.913372732972987e-06, + "loss": 2.5565, + "step": 32233 + }, + { + "epoch": 0.9558461584082081, + "grad_norm": 0.0676717758178711, + "learning_rate": 4.906795032605549e-06, + "loss": 2.5824, + "step": 32234 + }, + { + "epoch": 0.9558758117605195, + "grad_norm": 0.06581097841262817, + "learning_rate": 4.900221716373388e-06, + "loss": 2.521, + "step": 32235 + }, + { + "epoch": 0.955905465112831, + "grad_norm": 0.06559717655181885, + "learning_rate": 4.893652784334846e-06, + "loss": 2.5655, + "step": 32236 + }, + { + "epoch": 0.9559351184651425, + "grad_norm": 0.06858070194721222, + "learning_rate": 4.8870882365478764e-06, + "loss": 2.5541, + "step": 32237 + }, + { + "epoch": 0.955964771817454, + "grad_norm": 0.0671045109629631, + "learning_rate": 4.880528073070767e-06, + "loss": 2.5851, + "step": 32238 + }, + { + "epoch": 0.9559944251697654, + "grad_norm": 0.06644944846630096, + "learning_rate": 4.873972293961582e-06, + "loss": 2.5729, + "step": 32239 + }, + { + "epoch": 0.9560240785220769, + "grad_norm": 0.06764460355043411, + "learning_rate": 4.86742089927833e-06, + "loss": 2.5793, + "step": 32240 + }, + { + "epoch": 0.9560537318743884, + "grad_norm": 0.06489317864179611, + "learning_rate": 4.860873889079076e-06, + "loss": 2.5803, + "step": 32241 + }, + { + "epoch": 0.9560833852266999, + "grad_norm": 0.0663624256849289, + "learning_rate": 4.854331263421718e-06, + "loss": 2.5438, + "step": 32242 + }, + { + "epoch": 0.9561130385790113, + "grad_norm": 0.06912224739789963, + "learning_rate": 4.8477930223643214e-06, + "loss": 2.6018, + "step": 32243 + }, + { + "epoch": 0.9561426919313228, + "grad_norm": 0.07258538156747818, + "learning_rate": 4.841259165964618e-06, + "loss": 2.5858, + "step": 32244 + }, + { + "epoch": 0.9561723452836343, + "grad_norm": 0.06796670705080032, + "learning_rate": 4.834729694280615e-06, + "loss": 2.5644, + "step": 32245 + }, + { + "epoch": 0.9562019986359458, + "grad_norm": 0.06729976087808609, + "learning_rate": 4.828204607370101e-06, + "loss": 2.5856, + "step": 32246 + }, + { + "epoch": 0.9562316519882573, + "grad_norm": 0.0636046975851059, + "learning_rate": 4.821683905290808e-06, + "loss": 2.5638, + "step": 32247 + }, + { + "epoch": 0.9562613053405687, + "grad_norm": 0.06828586012125015, + "learning_rate": 4.815167588100522e-06, + "loss": 2.5606, + "step": 32248 + }, + { + "epoch": 0.9562909586928803, + "grad_norm": 0.07080449163913727, + "learning_rate": 4.808655655856864e-06, + "loss": 2.5657, + "step": 32249 + }, + { + "epoch": 0.9563206120451917, + "grad_norm": 0.066270612180233, + "learning_rate": 4.8021481086176214e-06, + "loss": 2.5778, + "step": 32250 + }, + { + "epoch": 0.9563502653975032, + "grad_norm": 0.06555160880088806, + "learning_rate": 4.795644946440303e-06, + "loss": 2.5703, + "step": 32251 + }, + { + "epoch": 0.9563799187498147, + "grad_norm": 0.06567824631929398, + "learning_rate": 4.789146169382586e-06, + "loss": 2.5129, + "step": 32252 + }, + { + "epoch": 0.9564095721021262, + "grad_norm": 0.06777076423168182, + "learning_rate": 4.782651777501979e-06, + "loss": 2.6059, + "step": 32253 + }, + { + "epoch": 0.9564392254544376, + "grad_norm": 0.06949328631162643, + "learning_rate": 4.776161770855991e-06, + "loss": 2.5845, + "step": 32254 + }, + { + "epoch": 0.9564688788067491, + "grad_norm": 0.06978050619363785, + "learning_rate": 4.769676149502078e-06, + "loss": 2.5755, + "step": 32255 + }, + { + "epoch": 0.9564985321590606, + "grad_norm": 0.0658169612288475, + "learning_rate": 4.763194913497693e-06, + "loss": 2.5687, + "step": 32256 + }, + { + "epoch": 0.9565281855113721, + "grad_norm": 0.06471985578536987, + "learning_rate": 4.756718062900234e-06, + "loss": 2.565, + "step": 32257 + }, + { + "epoch": 0.9565578388636835, + "grad_norm": 0.06701932102441788, + "learning_rate": 4.750245597767044e-06, + "loss": 2.5554, + "step": 32258 + }, + { + "epoch": 0.956587492215995, + "grad_norm": 0.06720713526010513, + "learning_rate": 4.743777518155468e-06, + "loss": 2.5564, + "step": 32259 + }, + { + "epoch": 0.9566171455683065, + "grad_norm": 0.06529180705547333, + "learning_rate": 4.73731382412268e-06, + "loss": 2.5925, + "step": 32260 + }, + { + "epoch": 0.956646798920618, + "grad_norm": 0.06811973452568054, + "learning_rate": 4.730854515726024e-06, + "loss": 2.5766, + "step": 32261 + }, + { + "epoch": 0.9566764522729294, + "grad_norm": 0.0655292421579361, + "learning_rate": 4.7243995930226765e-06, + "loss": 2.5499, + "step": 32262 + }, + { + "epoch": 0.9567061056252409, + "grad_norm": 0.06570979207754135, + "learning_rate": 4.717949056069759e-06, + "loss": 2.5855, + "step": 32263 + }, + { + "epoch": 0.9567357589775524, + "grad_norm": 0.06566546857357025, + "learning_rate": 4.711502904924448e-06, + "loss": 2.5731, + "step": 32264 + }, + { + "epoch": 0.9567654123298639, + "grad_norm": 0.07080361992120743, + "learning_rate": 4.705061139643807e-06, + "loss": 2.5562, + "step": 32265 + }, + { + "epoch": 0.9567950656821753, + "grad_norm": 0.06574057042598724, + "learning_rate": 4.698623760284793e-06, + "loss": 2.5244, + "step": 32266 + }, + { + "epoch": 0.9568247190344868, + "grad_norm": 0.06722301244735718, + "learning_rate": 4.692190766904525e-06, + "loss": 2.5796, + "step": 32267 + }, + { + "epoch": 0.9568543723867984, + "grad_norm": 0.0654464140534401, + "learning_rate": 4.685762159559959e-06, + "loss": 2.5444, + "step": 32268 + }, + { + "epoch": 0.9568840257391098, + "grad_norm": 0.06646018475294113, + "learning_rate": 4.679337938307937e-06, + "loss": 2.5561, + "step": 32269 + }, + { + "epoch": 0.9569136790914213, + "grad_norm": 0.06821125745773315, + "learning_rate": 4.672918103205415e-06, + "loss": 2.5627, + "step": 32270 + }, + { + "epoch": 0.9569433324437328, + "grad_norm": 0.06825675070285797, + "learning_rate": 4.666502654309235e-06, + "loss": 2.5923, + "step": 32271 + }, + { + "epoch": 0.9569729857960443, + "grad_norm": 0.06890123337507248, + "learning_rate": 4.660091591676186e-06, + "loss": 2.5697, + "step": 32272 + }, + { + "epoch": 0.9570026391483557, + "grad_norm": 0.06674742698669434, + "learning_rate": 4.653684915363055e-06, + "loss": 2.5944, + "step": 32273 + }, + { + "epoch": 0.9570322925006672, + "grad_norm": 0.06845156103372574, + "learning_rate": 4.647282625426575e-06, + "loss": 2.5477, + "step": 32274 + }, + { + "epoch": 0.9570619458529787, + "grad_norm": 0.06754767149686813, + "learning_rate": 4.640884721923422e-06, + "loss": 2.5537, + "step": 32275 + }, + { + "epoch": 0.9570915992052902, + "grad_norm": 0.06960033625364304, + "learning_rate": 4.634491204910274e-06, + "loss": 2.5849, + "step": 32276 + }, + { + "epoch": 0.9571212525576016, + "grad_norm": 0.06855683773756027, + "learning_rate": 4.6281020744436965e-06, + "loss": 2.5576, + "step": 32277 + }, + { + "epoch": 0.9571509059099131, + "grad_norm": 0.06591172516345978, + "learning_rate": 4.621717330580366e-06, + "loss": 2.5744, + "step": 32278 + }, + { + "epoch": 0.9571805592622246, + "grad_norm": 0.06542833894491196, + "learning_rate": 4.615336973376682e-06, + "loss": 2.5829, + "step": 32279 + }, + { + "epoch": 0.9572102126145361, + "grad_norm": 0.06718096137046814, + "learning_rate": 4.608961002889267e-06, + "loss": 2.5322, + "step": 32280 + }, + { + "epoch": 0.9572398659668475, + "grad_norm": 0.06879987567663193, + "learning_rate": 4.602589419174574e-06, + "loss": 2.5375, + "step": 32281 + }, + { + "epoch": 0.957269519319159, + "grad_norm": 0.06579906493425369, + "learning_rate": 4.596222222288948e-06, + "loss": 2.5631, + "step": 32282 + }, + { + "epoch": 0.9572991726714705, + "grad_norm": 0.06816866993904114, + "learning_rate": 4.589859412288788e-06, + "loss": 2.5269, + "step": 32283 + }, + { + "epoch": 0.957328826023782, + "grad_norm": 0.06612759828567505, + "learning_rate": 4.583500989230493e-06, + "loss": 2.5782, + "step": 32284 + }, + { + "epoch": 0.9573584793760934, + "grad_norm": 0.06704343110322952, + "learning_rate": 4.577146953170297e-06, + "loss": 2.5751, + "step": 32285 + }, + { + "epoch": 0.957388132728405, + "grad_norm": 0.06720636785030365, + "learning_rate": 4.570797304164542e-06, + "loss": 2.5872, + "step": 32286 + }, + { + "epoch": 0.9574177860807164, + "grad_norm": 0.06770538538694382, + "learning_rate": 4.564452042269407e-06, + "loss": 2.5848, + "step": 32287 + }, + { + "epoch": 0.9574474394330279, + "grad_norm": 0.06566408276557922, + "learning_rate": 4.558111167541068e-06, + "loss": 2.5648, + "step": 32288 + }, + { + "epoch": 0.9574770927853394, + "grad_norm": 0.06615332514047623, + "learning_rate": 4.551774680035703e-06, + "loss": 2.5708, + "step": 32289 + }, + { + "epoch": 0.9575067461376509, + "grad_norm": 0.06591619551181793, + "learning_rate": 4.545442579809433e-06, + "loss": 2.6041, + "step": 32290 + }, + { + "epoch": 0.9575363994899624, + "grad_norm": 0.06758353859186172, + "learning_rate": 4.539114866918326e-06, + "loss": 2.5554, + "step": 32291 + }, + { + "epoch": 0.9575660528422738, + "grad_norm": 0.06945561617612839, + "learning_rate": 4.532791541418391e-06, + "loss": 2.5631, + "step": 32292 + }, + { + "epoch": 0.9575957061945853, + "grad_norm": 0.06708038598299026, + "learning_rate": 4.526472603365583e-06, + "loss": 2.5556, + "step": 32293 + }, + { + "epoch": 0.9576253595468968, + "grad_norm": 0.06981775164604187, + "learning_rate": 4.52015805281597e-06, + "loss": 2.6086, + "step": 32294 + }, + { + "epoch": 0.9576550128992083, + "grad_norm": 0.06863238662481308, + "learning_rate": 4.513847889825396e-06, + "loss": 2.5563, + "step": 32295 + }, + { + "epoch": 0.9576846662515197, + "grad_norm": 0.06892713904380798, + "learning_rate": 4.507542114449703e-06, + "loss": 2.5337, + "step": 32296 + }, + { + "epoch": 0.9577143196038312, + "grad_norm": 0.06516391038894653, + "learning_rate": 4.501240726744793e-06, + "loss": 2.5606, + "step": 32297 + }, + { + "epoch": 0.9577439729561427, + "grad_norm": 0.06954655796289444, + "learning_rate": 4.494943726766454e-06, + "loss": 2.5862, + "step": 32298 + }, + { + "epoch": 0.9577736263084542, + "grad_norm": 0.06932602822780609, + "learning_rate": 4.488651114570419e-06, + "loss": 2.5837, + "step": 32299 + }, + { + "epoch": 0.9578032796607656, + "grad_norm": 0.06592017412185669, + "learning_rate": 4.482362890212477e-06, + "loss": 2.5769, + "step": 32300 + }, + { + "epoch": 0.9578329330130771, + "grad_norm": 0.06757690012454987, + "learning_rate": 4.47607905374825e-06, + "loss": 2.5698, + "step": 32301 + }, + { + "epoch": 0.9578625863653886, + "grad_norm": 0.07100068032741547, + "learning_rate": 4.469799605233415e-06, + "loss": 2.5654, + "step": 32302 + }, + { + "epoch": 0.9578922397177001, + "grad_norm": 0.06516958773136139, + "learning_rate": 4.46352454472354e-06, + "loss": 2.5523, + "step": 32303 + }, + { + "epoch": 0.9579218930700115, + "grad_norm": 0.06748400628566742, + "learning_rate": 4.457253872274191e-06, + "loss": 2.5753, + "step": 32304 + }, + { + "epoch": 0.957951546422323, + "grad_norm": 0.06844785064458847, + "learning_rate": 4.450987587940991e-06, + "loss": 2.5506, + "step": 32305 + }, + { + "epoch": 0.9579811997746345, + "grad_norm": 0.06734771281480789, + "learning_rate": 4.444725691779283e-06, + "loss": 2.5548, + "step": 32306 + }, + { + "epoch": 0.958010853126946, + "grad_norm": 0.06666059046983719, + "learning_rate": 4.438468183844635e-06, + "loss": 2.5683, + "step": 32307 + }, + { + "epoch": 0.9580405064792574, + "grad_norm": 0.0672215074300766, + "learning_rate": 4.432215064192391e-06, + "loss": 2.5359, + "step": 32308 + }, + { + "epoch": 0.958070159831569, + "grad_norm": 0.06560508906841278, + "learning_rate": 4.425966332877895e-06, + "loss": 2.5209, + "step": 32309 + }, + { + "epoch": 0.9580998131838805, + "grad_norm": 0.07016913592815399, + "learning_rate": 4.419721989956549e-06, + "loss": 2.5512, + "step": 32310 + }, + { + "epoch": 0.9581294665361919, + "grad_norm": 0.06771824508905411, + "learning_rate": 4.413482035483696e-06, + "loss": 2.5455, + "step": 32311 + }, + { + "epoch": 0.9581591198885034, + "grad_norm": 0.07164975255727768, + "learning_rate": 4.407246469514514e-06, + "loss": 2.5876, + "step": 32312 + }, + { + "epoch": 0.9581887732408149, + "grad_norm": 0.06713826954364777, + "learning_rate": 4.401015292104238e-06, + "loss": 2.5711, + "step": 32313 + }, + { + "epoch": 0.9582184265931264, + "grad_norm": 0.06784199178218842, + "learning_rate": 4.394788503307989e-06, + "loss": 2.6086, + "step": 32314 + }, + { + "epoch": 0.9582480799454378, + "grad_norm": 0.06762385368347168, + "learning_rate": 4.388566103181002e-06, + "loss": 2.5506, + "step": 32315 + }, + { + "epoch": 0.9582777332977493, + "grad_norm": 0.06758691370487213, + "learning_rate": 4.382348091778287e-06, + "loss": 2.5796, + "step": 32316 + }, + { + "epoch": 0.9583073866500608, + "grad_norm": 0.0674334317445755, + "learning_rate": 4.376134469154969e-06, + "loss": 2.5482, + "step": 32317 + }, + { + "epoch": 0.9583370400023723, + "grad_norm": 0.06823123246431351, + "learning_rate": 4.369925235366057e-06, + "loss": 2.5607, + "step": 32318 + }, + { + "epoch": 0.9583666933546837, + "grad_norm": 0.06834414601325989, + "learning_rate": 4.363720390466563e-06, + "loss": 2.5707, + "step": 32319 + }, + { + "epoch": 0.9583963467069953, + "grad_norm": 0.06770163029432297, + "learning_rate": 4.357519934511333e-06, + "loss": 2.5863, + "step": 32320 + }, + { + "epoch": 0.9584260000593067, + "grad_norm": 0.07111015170812607, + "learning_rate": 4.351323867555379e-06, + "loss": 2.5726, + "step": 32321 + }, + { + "epoch": 0.9584556534116182, + "grad_norm": 0.06657983362674713, + "learning_rate": 4.345132189653544e-06, + "loss": 2.5561, + "step": 32322 + }, + { + "epoch": 0.9584853067639296, + "grad_norm": 0.07026500254869461, + "learning_rate": 4.338944900860619e-06, + "loss": 2.5608, + "step": 32323 + }, + { + "epoch": 0.9585149601162412, + "grad_norm": 0.07194554060697556, + "learning_rate": 4.332762001231449e-06, + "loss": 2.5796, + "step": 32324 + }, + { + "epoch": 0.9585446134685526, + "grad_norm": 0.0690077617764473, + "learning_rate": 4.3265834908207125e-06, + "loss": 2.5706, + "step": 32325 + }, + { + "epoch": 0.9585742668208641, + "grad_norm": 0.06555647403001785, + "learning_rate": 4.320409369683143e-06, + "loss": 2.5824, + "step": 32326 + }, + { + "epoch": 0.9586039201731755, + "grad_norm": 0.06750009208917618, + "learning_rate": 4.314239637873474e-06, + "loss": 2.5674, + "step": 32327 + }, + { + "epoch": 0.9586335735254871, + "grad_norm": 0.06705964356660843, + "learning_rate": 4.308074295446274e-06, + "loss": 2.5638, + "step": 32328 + }, + { + "epoch": 0.9586632268777985, + "grad_norm": 0.06667952239513397, + "learning_rate": 4.301913342456165e-06, + "loss": 2.5555, + "step": 32329 + }, + { + "epoch": 0.95869288023011, + "grad_norm": 0.06710325181484222, + "learning_rate": 4.295756778957716e-06, + "loss": 2.529, + "step": 32330 + }, + { + "epoch": 0.9587225335824215, + "grad_norm": 0.06815081834793091, + "learning_rate": 4.289604605005437e-06, + "loss": 2.5569, + "step": 32331 + }, + { + "epoch": 0.958752186934733, + "grad_norm": 0.06679647415876389, + "learning_rate": 4.283456820653731e-06, + "loss": 2.5518, + "step": 32332 + }, + { + "epoch": 0.9587818402870445, + "grad_norm": 0.0690552219748497, + "learning_rate": 4.277313425957164e-06, + "loss": 2.5866, + "step": 32333 + }, + { + "epoch": 0.9588114936393559, + "grad_norm": 0.06658732891082764, + "learning_rate": 4.271174420970081e-06, + "loss": 2.5797, + "step": 32334 + }, + { + "epoch": 0.9588411469916674, + "grad_norm": 0.06759381294250488, + "learning_rate": 4.265039805746773e-06, + "loss": 2.5412, + "step": 32335 + }, + { + "epoch": 0.9588708003439789, + "grad_norm": 0.0682634636759758, + "learning_rate": 4.258909580341697e-06, + "loss": 2.5466, + "step": 32336 + }, + { + "epoch": 0.9589004536962904, + "grad_norm": 0.06740044057369232, + "learning_rate": 4.2527837448090305e-06, + "loss": 2.5156, + "step": 32337 + }, + { + "epoch": 0.9589301070486018, + "grad_norm": 0.06729390472173691, + "learning_rate": 4.2466622992031186e-06, + "loss": 2.5455, + "step": 32338 + }, + { + "epoch": 0.9589597604009134, + "grad_norm": 0.06616269797086716, + "learning_rate": 4.240545243578032e-06, + "loss": 2.5804, + "step": 32339 + }, + { + "epoch": 0.9589894137532248, + "grad_norm": 0.0690910592675209, + "learning_rate": 4.234432577988057e-06, + "loss": 2.5286, + "step": 32340 + }, + { + "epoch": 0.9590190671055363, + "grad_norm": 0.06388495117425919, + "learning_rate": 4.228324302487263e-06, + "loss": 2.5886, + "step": 32341 + }, + { + "epoch": 0.9590487204578477, + "grad_norm": 0.06537749618291855, + "learning_rate": 4.222220417129774e-06, + "loss": 2.5411, + "step": 32342 + }, + { + "epoch": 0.9590783738101593, + "grad_norm": 0.06728959083557129, + "learning_rate": 4.216120921969602e-06, + "loss": 2.5364, + "step": 32343 + }, + { + "epoch": 0.9591080271624707, + "grad_norm": 0.06659632176160812, + "learning_rate": 4.210025817060759e-06, + "loss": 2.5602, + "step": 32344 + }, + { + "epoch": 0.9591376805147822, + "grad_norm": 0.06580059975385666, + "learning_rate": 4.203935102457257e-06, + "loss": 2.5608, + "step": 32345 + }, + { + "epoch": 0.9591673338670936, + "grad_norm": 0.0676056370139122, + "learning_rate": 4.197848778213054e-06, + "loss": 2.5572, + "step": 32346 + }, + { + "epoch": 0.9591969872194052, + "grad_norm": 0.0658319741487503, + "learning_rate": 4.191766844381939e-06, + "loss": 2.5425, + "step": 32347 + }, + { + "epoch": 0.9592266405717166, + "grad_norm": 0.06654161959886551, + "learning_rate": 4.1856893010178695e-06, + "loss": 2.536, + "step": 32348 + }, + { + "epoch": 0.9592562939240281, + "grad_norm": 0.07069797813892365, + "learning_rate": 4.179616148174581e-06, + "loss": 2.5572, + "step": 32349 + }, + { + "epoch": 0.9592859472763396, + "grad_norm": 0.06967636197805405, + "learning_rate": 4.173547385905974e-06, + "loss": 2.5398, + "step": 32350 + }, + { + "epoch": 0.9593156006286511, + "grad_norm": 0.06702457368373871, + "learning_rate": 4.167483014265672e-06, + "loss": 2.571, + "step": 32351 + }, + { + "epoch": 0.9593452539809626, + "grad_norm": 0.07026863843202591, + "learning_rate": 4.16142303330741e-06, + "loss": 2.5386, + "step": 32352 + }, + { + "epoch": 0.959374907333274, + "grad_norm": 0.06748897582292557, + "learning_rate": 4.155367443084867e-06, + "loss": 2.599, + "step": 32353 + }, + { + "epoch": 0.9594045606855856, + "grad_norm": 0.0689535140991211, + "learning_rate": 4.149316243651668e-06, + "loss": 2.5652, + "step": 32354 + }, + { + "epoch": 0.959434214037897, + "grad_norm": 0.06854772567749023, + "learning_rate": 4.143269435061325e-06, + "loss": 2.5439, + "step": 32355 + }, + { + "epoch": 0.9594638673902085, + "grad_norm": 0.06775052845478058, + "learning_rate": 4.1372270173675175e-06, + "loss": 2.5665, + "step": 32356 + }, + { + "epoch": 0.9594935207425199, + "grad_norm": 0.0660901591181755, + "learning_rate": 4.131188990623646e-06, + "loss": 2.5748, + "step": 32357 + }, + { + "epoch": 0.9595231740948315, + "grad_norm": 0.06856771558523178, + "learning_rate": 4.125155354883225e-06, + "loss": 2.5657, + "step": 32358 + }, + { + "epoch": 0.9595528274471429, + "grad_norm": 0.06623231619596481, + "learning_rate": 4.119126110199656e-06, + "loss": 2.5582, + "step": 32359 + }, + { + "epoch": 0.9595824807994544, + "grad_norm": 0.06714178621768951, + "learning_rate": 4.113101256626339e-06, + "loss": 2.571, + "step": 32360 + }, + { + "epoch": 0.9596121341517658, + "grad_norm": 0.07088743895292282, + "learning_rate": 4.107080794216622e-06, + "loss": 2.586, + "step": 32361 + }, + { + "epoch": 0.9596417875040774, + "grad_norm": 0.06897510588169098, + "learning_rate": 4.10106472302385e-06, + "loss": 2.5783, + "step": 32362 + }, + { + "epoch": 0.9596714408563888, + "grad_norm": 0.06818369776010513, + "learning_rate": 4.09505304310126e-06, + "loss": 2.5517, + "step": 32363 + }, + { + "epoch": 0.9597010942087003, + "grad_norm": 0.06678790599107742, + "learning_rate": 4.0890457545020855e-06, + "loss": 2.536, + "step": 32364 + }, + { + "epoch": 0.9597307475610117, + "grad_norm": 0.0685800090432167, + "learning_rate": 4.083042857279562e-06, + "loss": 2.5452, + "step": 32365 + }, + { + "epoch": 0.9597604009133233, + "grad_norm": 0.06730487942695618, + "learning_rate": 4.077044351486758e-06, + "loss": 2.5735, + "step": 32366 + }, + { + "epoch": 0.9597900542656347, + "grad_norm": 0.06621610373258591, + "learning_rate": 4.07105023717691e-06, + "loss": 2.5541, + "step": 32367 + }, + { + "epoch": 0.9598197076179462, + "grad_norm": 0.06970986723899841, + "learning_rate": 4.065060514403029e-06, + "loss": 2.5609, + "step": 32368 + }, + { + "epoch": 0.9598493609702576, + "grad_norm": 0.06761132180690765, + "learning_rate": 4.0590751832181306e-06, + "loss": 2.5581, + "step": 32369 + }, + { + "epoch": 0.9598790143225692, + "grad_norm": 0.06445641070604324, + "learning_rate": 4.0530942436752815e-06, + "loss": 2.5683, + "step": 32370 + }, + { + "epoch": 0.9599086676748807, + "grad_norm": 0.06517521291971207, + "learning_rate": 4.0471176958273844e-06, + "loss": 2.569, + "step": 32371 + }, + { + "epoch": 0.9599383210271921, + "grad_norm": 0.06900949776172638, + "learning_rate": 4.0411455397273974e-06, + "loss": 2.5604, + "step": 32372 + }, + { + "epoch": 0.9599679743795037, + "grad_norm": 0.06746194511651993, + "learning_rate": 4.035177775428223e-06, + "loss": 2.5627, + "step": 32373 + }, + { + "epoch": 0.9599976277318151, + "grad_norm": 0.06600437313318253, + "learning_rate": 4.02921440298265e-06, + "loss": 2.5654, + "step": 32374 + }, + { + "epoch": 0.9600272810841266, + "grad_norm": 0.07378602027893066, + "learning_rate": 4.023255422443528e-06, + "loss": 2.5693, + "step": 32375 + }, + { + "epoch": 0.960056934436438, + "grad_norm": 0.06426870077848434, + "learning_rate": 4.017300833863591e-06, + "loss": 2.5843, + "step": 32376 + }, + { + "epoch": 0.9600865877887496, + "grad_norm": 0.06828554719686508, + "learning_rate": 4.0113506372955745e-06, + "loss": 2.5507, + "step": 32377 + }, + { + "epoch": 0.960116241141061, + "grad_norm": 0.07343105971813202, + "learning_rate": 4.00540483279227e-06, + "loss": 2.5695, + "step": 32378 + }, + { + "epoch": 0.9601458944933725, + "grad_norm": 0.07103469967842102, + "learning_rate": 3.999463420406191e-06, + "loss": 2.5907, + "step": 32379 + }, + { + "epoch": 0.9601755478456839, + "grad_norm": 0.07098132371902466, + "learning_rate": 3.9935264001900175e-06, + "loss": 2.5288, + "step": 32380 + }, + { + "epoch": 0.9602052011979955, + "grad_norm": 0.06675469130277634, + "learning_rate": 3.987593772196263e-06, + "loss": 2.5628, + "step": 32381 + }, + { + "epoch": 0.9602348545503069, + "grad_norm": 0.06583601236343384, + "learning_rate": 3.981665536477552e-06, + "loss": 2.5482, + "step": 32382 + }, + { + "epoch": 0.9602645079026184, + "grad_norm": 0.06369073688983917, + "learning_rate": 3.9757416930862875e-06, + "loss": 2.5715, + "step": 32383 + }, + { + "epoch": 0.9602941612549298, + "grad_norm": 0.0683155506849289, + "learning_rate": 3.969822242074983e-06, + "loss": 2.5262, + "step": 32384 + }, + { + "epoch": 0.9603238146072414, + "grad_norm": 0.06681277602910995, + "learning_rate": 3.963907183496041e-06, + "loss": 2.5593, + "step": 32385 + }, + { + "epoch": 0.9603534679595528, + "grad_norm": 0.06306101381778717, + "learning_rate": 3.957996517401863e-06, + "loss": 2.5416, + "step": 32386 + }, + { + "epoch": 0.9603831213118643, + "grad_norm": 0.06922554224729538, + "learning_rate": 3.952090243844742e-06, + "loss": 2.5737, + "step": 32387 + }, + { + "epoch": 0.9604127746641757, + "grad_norm": 0.06869985908269882, + "learning_rate": 3.946188362877079e-06, + "loss": 2.5615, + "step": 32388 + }, + { + "epoch": 0.9604424280164873, + "grad_norm": 0.06504958868026733, + "learning_rate": 3.940290874551e-06, + "loss": 2.5647, + "step": 32389 + }, + { + "epoch": 0.9604720813687987, + "grad_norm": 0.06674264371395111, + "learning_rate": 3.934397778918797e-06, + "loss": 2.5398, + "step": 32390 + }, + { + "epoch": 0.9605017347211102, + "grad_norm": 0.06575552374124527, + "learning_rate": 3.928509076032705e-06, + "loss": 2.5725, + "step": 32391 + }, + { + "epoch": 0.9605313880734218, + "grad_norm": 0.06741049140691757, + "learning_rate": 3.922624765944738e-06, + "loss": 2.535, + "step": 32392 + }, + { + "epoch": 0.9605610414257332, + "grad_norm": 0.06393525004386902, + "learning_rate": 3.916744848707132e-06, + "loss": 2.5545, + "step": 32393 + }, + { + "epoch": 0.9605906947780447, + "grad_norm": 0.06811969727277756, + "learning_rate": 3.910869324371902e-06, + "loss": 2.5893, + "step": 32394 + }, + { + "epoch": 0.9606203481303561, + "grad_norm": 0.06468163430690765, + "learning_rate": 3.904998192991061e-06, + "loss": 2.5323, + "step": 32395 + }, + { + "epoch": 0.9606500014826677, + "grad_norm": 0.06614790856838226, + "learning_rate": 3.899131454616623e-06, + "loss": 2.5721, + "step": 32396 + }, + { + "epoch": 0.9606796548349791, + "grad_norm": 0.06802450120449066, + "learning_rate": 3.89326910930049e-06, + "loss": 2.5712, + "step": 32397 + }, + { + "epoch": 0.9607093081872906, + "grad_norm": 0.06523682922124863, + "learning_rate": 3.887411157094623e-06, + "loss": 2.5788, + "step": 32398 + }, + { + "epoch": 0.960738961539602, + "grad_norm": 0.06739749014377594, + "learning_rate": 3.881557598050922e-06, + "loss": 2.6013, + "step": 32399 + }, + { + "epoch": 0.9607686148919136, + "grad_norm": 0.06674223393201828, + "learning_rate": 3.875708432221181e-06, + "loss": 2.5604, + "step": 32400 + }, + { + "epoch": 0.960798268244225, + "grad_norm": 0.06874044984579086, + "learning_rate": 3.869863659657191e-06, + "loss": 2.5653, + "step": 32401 + }, + { + "epoch": 0.9608279215965365, + "grad_norm": 0.06570041924715042, + "learning_rate": 3.864023280410744e-06, + "loss": 2.5779, + "step": 32402 + }, + { + "epoch": 0.9608575749488479, + "grad_norm": 0.0654413029551506, + "learning_rate": 3.858187294533466e-06, + "loss": 2.5625, + "step": 32403 + }, + { + "epoch": 0.9608872283011595, + "grad_norm": 0.06681541353464127, + "learning_rate": 3.852355702077148e-06, + "loss": 2.5912, + "step": 32404 + }, + { + "epoch": 0.9609168816534709, + "grad_norm": 0.06631311029195786, + "learning_rate": 3.846528503093416e-06, + "loss": 2.568, + "step": 32405 + }, + { + "epoch": 0.9609465350057824, + "grad_norm": 0.06627140194177628, + "learning_rate": 3.840705697633784e-06, + "loss": 2.5701, + "step": 32406 + }, + { + "epoch": 0.9609761883580938, + "grad_norm": 0.06592020392417908, + "learning_rate": 3.834887285749877e-06, + "loss": 2.563, + "step": 32407 + }, + { + "epoch": 0.9610058417104054, + "grad_norm": 0.07209110260009766, + "learning_rate": 3.82907326749321e-06, + "loss": 2.5472, + "step": 32408 + }, + { + "epoch": 0.9610354950627168, + "grad_norm": 0.07116295397281647, + "learning_rate": 3.823263642915242e-06, + "loss": 2.5367, + "step": 32409 + }, + { + "epoch": 0.9610651484150283, + "grad_norm": 0.06842046976089478, + "learning_rate": 3.817458412067487e-06, + "loss": 2.5812, + "step": 32410 + }, + { + "epoch": 0.9610948017673397, + "grad_norm": 0.06671438366174698, + "learning_rate": 3.811657575001293e-06, + "loss": 2.5697, + "step": 32411 + }, + { + "epoch": 0.9611244551196513, + "grad_norm": 0.06720764935016632, + "learning_rate": 3.8058611317680624e-06, + "loss": 2.5458, + "step": 32412 + }, + { + "epoch": 0.9611541084719628, + "grad_norm": 0.06580422818660736, + "learning_rate": 3.8000690824190333e-06, + "loss": 2.5476, + "step": 32413 + }, + { + "epoch": 0.9611837618242742, + "grad_norm": 0.06490693241357803, + "learning_rate": 3.794281427005608e-06, + "loss": 2.5427, + "step": 32414 + }, + { + "epoch": 0.9612134151765858, + "grad_norm": 0.0669962614774704, + "learning_rate": 3.788498165579024e-06, + "loss": 2.5641, + "step": 32415 + }, + { + "epoch": 0.9612430685288972, + "grad_norm": 0.06452497839927673, + "learning_rate": 3.7827192981904068e-06, + "loss": 2.5539, + "step": 32416 + }, + { + "epoch": 0.9612727218812087, + "grad_norm": 0.06212449073791504, + "learning_rate": 3.7769448248910488e-06, + "loss": 2.5427, + "step": 32417 + }, + { + "epoch": 0.9613023752335201, + "grad_norm": 0.06831071525812149, + "learning_rate": 3.7711747457319645e-06, + "loss": 2.571, + "step": 32418 + }, + { + "epoch": 0.9613320285858317, + "grad_norm": 0.0661206766963005, + "learning_rate": 3.7654090607643352e-06, + "loss": 2.5449, + "step": 32419 + }, + { + "epoch": 0.9613616819381431, + "grad_norm": 0.06602942943572998, + "learning_rate": 3.7596477700391764e-06, + "loss": 2.592, + "step": 32420 + }, + { + "epoch": 0.9613913352904546, + "grad_norm": 0.06834098696708679, + "learning_rate": 3.7538908736075018e-06, + "loss": 2.5676, + "step": 32421 + }, + { + "epoch": 0.961420988642766, + "grad_norm": 0.06794492900371552, + "learning_rate": 3.7481383715203265e-06, + "loss": 2.553, + "step": 32422 + }, + { + "epoch": 0.9614506419950776, + "grad_norm": 0.06704209744930267, + "learning_rate": 3.7423902638285544e-06, + "loss": 2.5429, + "step": 32423 + }, + { + "epoch": 0.961480295347389, + "grad_norm": 0.0765165239572525, + "learning_rate": 3.7366465505830895e-06, + "loss": 2.5698, + "step": 32424 + }, + { + "epoch": 0.9615099486997005, + "grad_norm": 0.0699889212846756, + "learning_rate": 3.730907231834779e-06, + "loss": 2.5466, + "step": 32425 + }, + { + "epoch": 0.9615396020520119, + "grad_norm": 0.07021357864141464, + "learning_rate": 3.7251723076344725e-06, + "loss": 2.5323, + "step": 32426 + }, + { + "epoch": 0.9615692554043235, + "grad_norm": 0.07025892287492752, + "learning_rate": 3.719441778033017e-06, + "loss": 2.5485, + "step": 32427 + }, + { + "epoch": 0.9615989087566349, + "grad_norm": 0.06758759170770645, + "learning_rate": 3.7137156430810393e-06, + "loss": 2.556, + "step": 32428 + }, + { + "epoch": 0.9616285621089464, + "grad_norm": 0.06834476441144943, + "learning_rate": 3.707993902829276e-06, + "loss": 2.5619, + "step": 32429 + }, + { + "epoch": 0.9616582154612578, + "grad_norm": 0.06673204898834229, + "learning_rate": 3.7022765573284655e-06, + "loss": 2.5715, + "step": 32430 + }, + { + "epoch": 0.9616878688135694, + "grad_norm": 0.06493303179740906, + "learning_rate": 3.696563606629122e-06, + "loss": 2.5598, + "step": 32431 + }, + { + "epoch": 0.9617175221658808, + "grad_norm": 0.06596244871616364, + "learning_rate": 3.690855050781983e-06, + "loss": 2.5553, + "step": 32432 + }, + { + "epoch": 0.9617471755181923, + "grad_norm": 0.06665404140949249, + "learning_rate": 3.685150889837452e-06, + "loss": 2.5744, + "step": 32433 + }, + { + "epoch": 0.9617768288705039, + "grad_norm": 0.06830532103776932, + "learning_rate": 3.6794511238461558e-06, + "loss": 2.5076, + "step": 32434 + }, + { + "epoch": 0.9618064822228153, + "grad_norm": 0.06898915022611618, + "learning_rate": 3.673755752858443e-06, + "loss": 2.5789, + "step": 32435 + }, + { + "epoch": 0.9618361355751268, + "grad_norm": 0.06775045394897461, + "learning_rate": 3.668064776924829e-06, + "loss": 2.5915, + "step": 32436 + }, + { + "epoch": 0.9618657889274382, + "grad_norm": 0.06731782853603363, + "learning_rate": 3.662378196095717e-06, + "loss": 2.5384, + "step": 32437 + }, + { + "epoch": 0.9618954422797498, + "grad_norm": 0.06899487972259521, + "learning_rate": 3.656696010421401e-06, + "loss": 2.5605, + "step": 32438 + }, + { + "epoch": 0.9619250956320612, + "grad_norm": 0.0700942650437355, + "learning_rate": 3.6510182199522844e-06, + "loss": 2.6101, + "step": 32439 + }, + { + "epoch": 0.9619547489843727, + "grad_norm": 0.06491505354642868, + "learning_rate": 3.6453448247386056e-06, + "loss": 2.5204, + "step": 32440 + }, + { + "epoch": 0.9619844023366841, + "grad_norm": 0.06745853275060654, + "learning_rate": 3.6396758248305463e-06, + "loss": 2.5758, + "step": 32441 + }, + { + "epoch": 0.9620140556889957, + "grad_norm": 0.06664978712797165, + "learning_rate": 3.6340112202783993e-06, + "loss": 2.5601, + "step": 32442 + }, + { + "epoch": 0.9620437090413071, + "grad_norm": 0.07197678834199905, + "learning_rate": 3.6283510111322914e-06, + "loss": 2.5203, + "step": 32443 + }, + { + "epoch": 0.9620733623936186, + "grad_norm": 0.06661960482597351, + "learning_rate": 3.6226951974423493e-06, + "loss": 2.5756, + "step": 32444 + }, + { + "epoch": 0.96210301574593, + "grad_norm": 0.06741620600223541, + "learning_rate": 3.6170437792585885e-06, + "loss": 2.5384, + "step": 32445 + }, + { + "epoch": 0.9621326690982416, + "grad_norm": 0.06207757815718651, + "learning_rate": 3.6113967566311356e-06, + "loss": 2.5298, + "step": 32446 + }, + { + "epoch": 0.962162322450553, + "grad_norm": 0.06577765196561813, + "learning_rate": 3.6057541296099503e-06, + "loss": 2.5427, + "step": 32447 + }, + { + "epoch": 0.9621919758028645, + "grad_norm": 0.06770151108503342, + "learning_rate": 3.6001158982450487e-06, + "loss": 2.5537, + "step": 32448 + }, + { + "epoch": 0.962221629155176, + "grad_norm": 0.0671747699379921, + "learning_rate": 3.594482062586335e-06, + "loss": 2.5552, + "step": 32449 + }, + { + "epoch": 0.9622512825074875, + "grad_norm": 0.0683465451002121, + "learning_rate": 3.588852622683658e-06, + "loss": 2.533, + "step": 32450 + }, + { + "epoch": 0.9622809358597989, + "grad_norm": 0.07074175775051117, + "learning_rate": 3.5832275785868673e-06, + "loss": 2.551, + "step": 32451 + }, + { + "epoch": 0.9623105892121104, + "grad_norm": 0.06828005611896515, + "learning_rate": 3.5776069303458116e-06, + "loss": 2.538, + "step": 32452 + }, + { + "epoch": 0.9623402425644219, + "grad_norm": 0.06574969738721848, + "learning_rate": 3.571990678010284e-06, + "loss": 2.5355, + "step": 32453 + }, + { + "epoch": 0.9623698959167334, + "grad_norm": 0.06653497368097305, + "learning_rate": 3.566378821630023e-06, + "loss": 2.5704, + "step": 32454 + }, + { + "epoch": 0.9623995492690449, + "grad_norm": 0.0686696246266365, + "learning_rate": 3.5607713612546554e-06, + "loss": 2.5614, + "step": 32455 + }, + { + "epoch": 0.9624292026213563, + "grad_norm": 0.06782468408346176, + "learning_rate": 3.555168296933864e-06, + "loss": 2.5615, + "step": 32456 + }, + { + "epoch": 0.9624588559736679, + "grad_norm": 0.06928502768278122, + "learning_rate": 3.549569628717331e-06, + "loss": 2.588, + "step": 32457 + }, + { + "epoch": 0.9624885093259793, + "grad_norm": 0.06989717483520508, + "learning_rate": 3.543975356654516e-06, + "loss": 2.5231, + "step": 32458 + }, + { + "epoch": 0.9625181626782908, + "grad_norm": 0.06976451724767685, + "learning_rate": 3.5383854807949922e-06, + "loss": 2.5736, + "step": 32459 + }, + { + "epoch": 0.9625478160306022, + "grad_norm": 0.06750626862049103, + "learning_rate": 3.53280000118833e-06, + "loss": 2.5449, + "step": 32460 + }, + { + "epoch": 0.9625774693829138, + "grad_norm": 0.06961826235055923, + "learning_rate": 3.527218917883934e-06, + "loss": 2.5799, + "step": 32461 + }, + { + "epoch": 0.9626071227352252, + "grad_norm": 0.06729928404092789, + "learning_rate": 3.52164223093121e-06, + "loss": 2.5686, + "step": 32462 + }, + { + "epoch": 0.9626367760875367, + "grad_norm": 0.06564067304134369, + "learning_rate": 3.5160699403795626e-06, + "loss": 2.5448, + "step": 32463 + }, + { + "epoch": 0.9626664294398481, + "grad_norm": 0.0668550655245781, + "learning_rate": 3.510502046278341e-06, + "loss": 2.5852, + "step": 32464 + }, + { + "epoch": 0.9626960827921597, + "grad_norm": 0.06535879522562027, + "learning_rate": 3.5049385486768392e-06, + "loss": 2.5809, + "step": 32465 + }, + { + "epoch": 0.9627257361444711, + "grad_norm": 0.06590492278337479, + "learning_rate": 3.4993794476243514e-06, + "loss": 2.5818, + "step": 32466 + }, + { + "epoch": 0.9627553894967826, + "grad_norm": 0.0665617510676384, + "learning_rate": 3.4938247431700596e-06, + "loss": 2.5434, + "step": 32467 + }, + { + "epoch": 0.962785042849094, + "grad_norm": 0.06504157185554504, + "learning_rate": 3.4882744353632033e-06, + "loss": 2.5538, + "step": 32468 + }, + { + "epoch": 0.9628146962014056, + "grad_norm": 0.06642794609069824, + "learning_rate": 3.4827285242529095e-06, + "loss": 2.5746, + "step": 32469 + }, + { + "epoch": 0.962844349553717, + "grad_norm": 0.07007690519094467, + "learning_rate": 3.4771870098882497e-06, + "loss": 2.5696, + "step": 32470 + }, + { + "epoch": 0.9628740029060285, + "grad_norm": 0.06504204124212265, + "learning_rate": 3.471649892318296e-06, + "loss": 2.5623, + "step": 32471 + }, + { + "epoch": 0.96290365625834, + "grad_norm": 0.0645979717373848, + "learning_rate": 3.466117171592176e-06, + "loss": 2.5497, + "step": 32472 + }, + { + "epoch": 0.9629333096106515, + "grad_norm": 0.07005584985017776, + "learning_rate": 3.4605888477587944e-06, + "loss": 2.5433, + "step": 32473 + }, + { + "epoch": 0.9629629629629629, + "grad_norm": 0.06684159487485886, + "learning_rate": 3.4550649208671127e-06, + "loss": 2.5205, + "step": 32474 + }, + { + "epoch": 0.9629926163152744, + "grad_norm": 0.06618596613407135, + "learning_rate": 3.4495453909660913e-06, + "loss": 2.545, + "step": 32475 + }, + { + "epoch": 0.963022269667586, + "grad_norm": 0.06913904845714569, + "learning_rate": 3.4440302581045245e-06, + "loss": 2.5441, + "step": 32476 + }, + { + "epoch": 0.9630519230198974, + "grad_norm": 0.07245494425296783, + "learning_rate": 3.4385195223313738e-06, + "loss": 2.5636, + "step": 32477 + }, + { + "epoch": 0.9630815763722089, + "grad_norm": 0.06805351376533508, + "learning_rate": 3.4330131836953214e-06, + "loss": 2.57, + "step": 32478 + }, + { + "epoch": 0.9631112297245203, + "grad_norm": 0.06617393344640732, + "learning_rate": 3.427511242245163e-06, + "loss": 2.5792, + "step": 32479 + }, + { + "epoch": 0.9631408830768319, + "grad_norm": 0.06543458998203278, + "learning_rate": 3.4220136980296355e-06, + "loss": 2.5368, + "step": 32480 + }, + { + "epoch": 0.9631705364291433, + "grad_norm": 0.06722046434879303, + "learning_rate": 3.416520551097424e-06, + "loss": 2.5486, + "step": 32481 + }, + { + "epoch": 0.9632001897814548, + "grad_norm": 0.06691747903823853, + "learning_rate": 3.4110318014971555e-06, + "loss": 2.5696, + "step": 32482 + }, + { + "epoch": 0.9632298431337663, + "grad_norm": 0.06790782511234283, + "learning_rate": 3.4055474492774574e-06, + "loss": 2.563, + "step": 32483 + }, + { + "epoch": 0.9632594964860778, + "grad_norm": 0.06673029810190201, + "learning_rate": 3.400067494486847e-06, + "loss": 2.5711, + "step": 32484 + }, + { + "epoch": 0.9632891498383892, + "grad_norm": 0.06805316358804703, + "learning_rate": 3.3945919371738967e-06, + "loss": 2.5821, + "step": 32485 + }, + { + "epoch": 0.9633188031907007, + "grad_norm": 0.06728402525186539, + "learning_rate": 3.389120777387067e-06, + "loss": 2.5656, + "step": 32486 + }, + { + "epoch": 0.9633484565430122, + "grad_norm": 0.0674344152212143, + "learning_rate": 3.3836540151748196e-06, + "loss": 2.5959, + "step": 32487 + }, + { + "epoch": 0.9633781098953237, + "grad_norm": 0.06552531570196152, + "learning_rate": 3.3781916505855603e-06, + "loss": 2.5466, + "step": 32488 + }, + { + "epoch": 0.9634077632476351, + "grad_norm": 0.06551040709018707, + "learning_rate": 3.3727336836676947e-06, + "loss": 2.57, + "step": 32489 + }, + { + "epoch": 0.9634374165999466, + "grad_norm": 0.06695044785737991, + "learning_rate": 3.367280114469462e-06, + "loss": 2.5706, + "step": 32490 + }, + { + "epoch": 0.9634670699522581, + "grad_norm": 0.06704184412956238, + "learning_rate": 3.3618309430392124e-06, + "loss": 2.6054, + "step": 32491 + }, + { + "epoch": 0.9634967233045696, + "grad_norm": 0.07174494862556458, + "learning_rate": 3.356386169425185e-06, + "loss": 2.5567, + "step": 32492 + }, + { + "epoch": 0.963526376656881, + "grad_norm": 0.07027740776538849, + "learning_rate": 3.3509457936756193e-06, + "loss": 2.6278, + "step": 32493 + }, + { + "epoch": 0.9635560300091925, + "grad_norm": 0.06743466854095459, + "learning_rate": 3.3455098158386987e-06, + "loss": 2.5115, + "step": 32494 + }, + { + "epoch": 0.963585683361504, + "grad_norm": 0.06372183561325073, + "learning_rate": 3.3400782359625517e-06, + "loss": 2.534, + "step": 32495 + }, + { + "epoch": 0.9636153367138155, + "grad_norm": 0.06828748434782028, + "learning_rate": 3.3346510540952503e-06, + "loss": 2.5724, + "step": 32496 + }, + { + "epoch": 0.963644990066127, + "grad_norm": 0.06854864954948425, + "learning_rate": 3.3292282702848674e-06, + "loss": 2.5554, + "step": 32497 + }, + { + "epoch": 0.9636746434184384, + "grad_norm": 0.0681404173374176, + "learning_rate": 3.3238098845794206e-06, + "loss": 2.5887, + "step": 32498 + }, + { + "epoch": 0.96370429677075, + "grad_norm": 0.06990666687488556, + "learning_rate": 3.3183958970268714e-06, + "loss": 2.5866, + "step": 32499 + }, + { + "epoch": 0.9637339501230614, + "grad_norm": 0.06953001767396927, + "learning_rate": 3.312986307675181e-06, + "loss": 2.5639, + "step": 32500 + }, + { + "epoch": 0.9637636034753729, + "grad_norm": 0.06687844544649124, + "learning_rate": 3.307581116572256e-06, + "loss": 2.5413, + "step": 32501 + }, + { + "epoch": 0.9637932568276844, + "grad_norm": 0.06712467968463898, + "learning_rate": 3.3021803237659463e-06, + "loss": 2.5548, + "step": 32502 + }, + { + "epoch": 0.9638229101799959, + "grad_norm": 0.06479187309741974, + "learning_rate": 3.2967839293041034e-06, + "loss": 2.5529, + "step": 32503 + }, + { + "epoch": 0.9638525635323073, + "grad_norm": 0.07108748704195023, + "learning_rate": 3.2913919332344666e-06, + "loss": 2.5989, + "step": 32504 + }, + { + "epoch": 0.9638822168846188, + "grad_norm": 0.06423696130514145, + "learning_rate": 3.2860043356048308e-06, + "loss": 2.5345, + "step": 32505 + }, + { + "epoch": 0.9639118702369303, + "grad_norm": 0.06533032655715942, + "learning_rate": 3.2806211364629357e-06, + "loss": 2.5905, + "step": 32506 + }, + { + "epoch": 0.9639415235892418, + "grad_norm": 0.06636534631252289, + "learning_rate": 3.2752423358563545e-06, + "loss": 2.571, + "step": 32507 + }, + { + "epoch": 0.9639711769415532, + "grad_norm": 0.06692545115947723, + "learning_rate": 3.269867933832771e-06, + "loss": 2.6089, + "step": 32508 + }, + { + "epoch": 0.9640008302938647, + "grad_norm": 0.06642154604196548, + "learning_rate": 3.264497930439758e-06, + "loss": 2.5546, + "step": 32509 + }, + { + "epoch": 0.9640304836461762, + "grad_norm": 0.0692954882979393, + "learning_rate": 3.2591323257248896e-06, + "loss": 2.5394, + "step": 32510 + }, + { + "epoch": 0.9640601369984877, + "grad_norm": 0.06558391451835632, + "learning_rate": 3.2537711197356825e-06, + "loss": 2.5813, + "step": 32511 + }, + { + "epoch": 0.9640897903507991, + "grad_norm": 0.06657946109771729, + "learning_rate": 3.2484143125195987e-06, + "loss": 2.5614, + "step": 32512 + }, + { + "epoch": 0.9641194437031106, + "grad_norm": 0.062437426298856735, + "learning_rate": 3.243061904124045e-06, + "loss": 2.5366, + "step": 32513 + }, + { + "epoch": 0.9641490970554221, + "grad_norm": 0.06896065175533295, + "learning_rate": 3.2377138945964833e-06, + "loss": 2.5921, + "step": 32514 + }, + { + "epoch": 0.9641787504077336, + "grad_norm": 0.07153737545013428, + "learning_rate": 3.2323702839842096e-06, + "loss": 2.5635, + "step": 32515 + }, + { + "epoch": 0.964208403760045, + "grad_norm": 0.06794052571058273, + "learning_rate": 3.2270310723345185e-06, + "loss": 2.5926, + "step": 32516 + }, + { + "epoch": 0.9642380571123566, + "grad_norm": 0.06741289049386978, + "learning_rate": 3.2216962596948174e-06, + "loss": 2.5714, + "step": 32517 + }, + { + "epoch": 0.9642677104646681, + "grad_norm": 0.0696684867143631, + "learning_rate": 3.2163658461122346e-06, + "loss": 2.5236, + "step": 32518 + }, + { + "epoch": 0.9642973638169795, + "grad_norm": 0.06772246211767197, + "learning_rate": 3.2110398316340105e-06, + "loss": 2.5709, + "step": 32519 + }, + { + "epoch": 0.964327017169291, + "grad_norm": 0.06797090917825699, + "learning_rate": 3.205718216307274e-06, + "loss": 2.569, + "step": 32520 + }, + { + "epoch": 0.9643566705216025, + "grad_norm": 0.06465553492307663, + "learning_rate": 3.2004010001792094e-06, + "loss": 2.5553, + "step": 32521 + }, + { + "epoch": 0.964386323873914, + "grad_norm": 0.06614697724580765, + "learning_rate": 3.1950881832968348e-06, + "loss": 2.5419, + "step": 32522 + }, + { + "epoch": 0.9644159772262254, + "grad_norm": 0.06595263630151749, + "learning_rate": 3.189779765707279e-06, + "loss": 2.5642, + "step": 32523 + }, + { + "epoch": 0.9644456305785369, + "grad_norm": 0.06476226449012756, + "learning_rate": 3.1844757474574494e-06, + "loss": 2.5593, + "step": 32524 + }, + { + "epoch": 0.9644752839308484, + "grad_norm": 0.06646834313869476, + "learning_rate": 3.1791761285944187e-06, + "loss": 2.5519, + "step": 32525 + }, + { + "epoch": 0.9645049372831599, + "grad_norm": 0.0681617334485054, + "learning_rate": 3.1738809091649833e-06, + "loss": 2.58, + "step": 32526 + }, + { + "epoch": 0.9645345906354713, + "grad_norm": 0.06440330296754837, + "learning_rate": 3.168590089216161e-06, + "loss": 2.5767, + "step": 32527 + }, + { + "epoch": 0.9645642439877828, + "grad_norm": 0.07291669398546219, + "learning_rate": 3.163303668794748e-06, + "loss": 2.5548, + "step": 32528 + }, + { + "epoch": 0.9645938973400943, + "grad_norm": 0.06753338128328323, + "learning_rate": 3.1580216479475397e-06, + "loss": 2.5734, + "step": 32529 + }, + { + "epoch": 0.9646235506924058, + "grad_norm": 0.0647353082895279, + "learning_rate": 3.152744026721388e-06, + "loss": 2.5247, + "step": 32530 + }, + { + "epoch": 0.9646532040447172, + "grad_norm": 0.06873392313718796, + "learning_rate": 3.147470805162922e-06, + "loss": 2.5458, + "step": 32531 + }, + { + "epoch": 0.9646828573970287, + "grad_norm": 0.06933280825614929, + "learning_rate": 3.1422019833189374e-06, + "loss": 2.5187, + "step": 32532 + }, + { + "epoch": 0.9647125107493402, + "grad_norm": 0.06714311242103577, + "learning_rate": 3.1369375612360085e-06, + "loss": 2.5824, + "step": 32533 + }, + { + "epoch": 0.9647421641016517, + "grad_norm": 0.06781992316246033, + "learning_rate": 3.1316775389607645e-06, + "loss": 2.5982, + "step": 32534 + }, + { + "epoch": 0.9647718174539631, + "grad_norm": 0.06641021370887756, + "learning_rate": 3.1264219165398346e-06, + "loss": 2.5566, + "step": 32535 + }, + { + "epoch": 0.9648014708062747, + "grad_norm": 0.06453026831150055, + "learning_rate": 3.1211706940196817e-06, + "loss": 2.5625, + "step": 32536 + }, + { + "epoch": 0.9648311241585861, + "grad_norm": 0.06682684272527695, + "learning_rate": 3.1159238714468797e-06, + "loss": 2.5774, + "step": 32537 + }, + { + "epoch": 0.9648607775108976, + "grad_norm": 0.06681769341230392, + "learning_rate": 3.110681448867836e-06, + "loss": 2.5702, + "step": 32538 + }, + { + "epoch": 0.9648904308632091, + "grad_norm": 0.06735606491565704, + "learning_rate": 3.105443426329013e-06, + "loss": 2.5424, + "step": 32539 + }, + { + "epoch": 0.9649200842155206, + "grad_norm": 0.06624284386634827, + "learning_rate": 3.100209803876819e-06, + "loss": 2.5727, + "step": 32540 + }, + { + "epoch": 0.9649497375678321, + "grad_norm": 0.06902783364057541, + "learning_rate": 3.094980581557494e-06, + "loss": 2.5796, + "step": 32541 + }, + { + "epoch": 0.9649793909201435, + "grad_norm": 0.06718649715185165, + "learning_rate": 3.089755759417445e-06, + "loss": 2.5483, + "step": 32542 + }, + { + "epoch": 0.965009044272455, + "grad_norm": 0.06796897947788239, + "learning_rate": 3.084535337502914e-06, + "loss": 2.594, + "step": 32543 + }, + { + "epoch": 0.9650386976247665, + "grad_norm": 0.06811730563640594, + "learning_rate": 3.079319315860085e-06, + "loss": 2.5491, + "step": 32544 + }, + { + "epoch": 0.965068350977078, + "grad_norm": 0.0667804405093193, + "learning_rate": 3.0741076945352e-06, + "loss": 2.5886, + "step": 32545 + }, + { + "epoch": 0.9650980043293894, + "grad_norm": 0.06825628876686096, + "learning_rate": 3.0689004735743873e-06, + "loss": 2.6133, + "step": 32546 + }, + { + "epoch": 0.965127657681701, + "grad_norm": 0.06762413680553436, + "learning_rate": 3.0636976530237227e-06, + "loss": 2.5712, + "step": 32547 + }, + { + "epoch": 0.9651573110340124, + "grad_norm": 0.06651795655488968, + "learning_rate": 3.05849923292939e-06, + "loss": 2.5207, + "step": 32548 + }, + { + "epoch": 0.9651869643863239, + "grad_norm": 0.06492484360933304, + "learning_rate": 3.0533052133372985e-06, + "loss": 2.5602, + "step": 32549 + }, + { + "epoch": 0.9652166177386353, + "grad_norm": 0.06532657146453857, + "learning_rate": 3.0481155942934657e-06, + "loss": 2.548, + "step": 32550 + }, + { + "epoch": 0.9652462710909469, + "grad_norm": 0.07022079080343246, + "learning_rate": 3.0429303758439107e-06, + "loss": 2.5507, + "step": 32551 + }, + { + "epoch": 0.9652759244432583, + "grad_norm": 0.06717310100793839, + "learning_rate": 3.0377495580344862e-06, + "loss": 2.565, + "step": 32552 + }, + { + "epoch": 0.9653055777955698, + "grad_norm": 0.06759996712207794, + "learning_rate": 3.032573140911099e-06, + "loss": 2.5494, + "step": 32553 + }, + { + "epoch": 0.9653352311478812, + "grad_norm": 0.07080807536840439, + "learning_rate": 3.0274011245195466e-06, + "loss": 2.5674, + "step": 32554 + }, + { + "epoch": 0.9653648845001928, + "grad_norm": 0.06707639247179031, + "learning_rate": 3.0222335089056807e-06, + "loss": 2.5604, + "step": 32555 + }, + { + "epoch": 0.9653945378525042, + "grad_norm": 0.06660869717597961, + "learning_rate": 3.0170702941152985e-06, + "loss": 2.5677, + "step": 32556 + }, + { + "epoch": 0.9654241912048157, + "grad_norm": 0.06501682102680206, + "learning_rate": 3.011911480193974e-06, + "loss": 2.5508, + "step": 32557 + }, + { + "epoch": 0.9654538445571272, + "grad_norm": 0.06969843804836273, + "learning_rate": 3.0067570671875597e-06, + "loss": 2.6297, + "step": 32558 + }, + { + "epoch": 0.9654834979094387, + "grad_norm": 0.06659701466560364, + "learning_rate": 3.0016070551415753e-06, + "loss": 2.5492, + "step": 32559 + }, + { + "epoch": 0.9655131512617502, + "grad_norm": 0.06682876497507095, + "learning_rate": 2.9964614441017057e-06, + "loss": 2.5786, + "step": 32560 + }, + { + "epoch": 0.9655428046140616, + "grad_norm": 0.06737280637025833, + "learning_rate": 2.9913202341134147e-06, + "loss": 2.5381, + "step": 32561 + }, + { + "epoch": 0.9655724579663731, + "grad_norm": 0.0640345960855484, + "learning_rate": 2.986183425222333e-06, + "loss": 2.5596, + "step": 32562 + }, + { + "epoch": 0.9656021113186846, + "grad_norm": 0.0693523958325386, + "learning_rate": 2.9810510174739236e-06, + "loss": 2.5735, + "step": 32563 + }, + { + "epoch": 0.9656317646709961, + "grad_norm": 0.06788809597492218, + "learning_rate": 2.9759230109136504e-06, + "loss": 2.5852, + "step": 32564 + }, + { + "epoch": 0.9656614180233075, + "grad_norm": 0.06733063608407974, + "learning_rate": 2.9707994055868103e-06, + "loss": 2.5604, + "step": 32565 + }, + { + "epoch": 0.965691071375619, + "grad_norm": 0.0648566260933876, + "learning_rate": 2.965680201538923e-06, + "loss": 2.5304, + "step": 32566 + }, + { + "epoch": 0.9657207247279305, + "grad_norm": 0.06947647780179977, + "learning_rate": 2.9605653988151736e-06, + "loss": 2.5714, + "step": 32567 + }, + { + "epoch": 0.965750378080242, + "grad_norm": 0.06747002899646759, + "learning_rate": 2.955454997460971e-06, + "loss": 2.5427, + "step": 32568 + }, + { + "epoch": 0.9657800314325534, + "grad_norm": 0.06483256816864014, + "learning_rate": 2.950348997521557e-06, + "loss": 2.5776, + "step": 32569 + }, + { + "epoch": 0.965809684784865, + "grad_norm": 0.06587834656238556, + "learning_rate": 2.945247399042117e-06, + "loss": 2.5879, + "step": 32570 + }, + { + "epoch": 0.9658393381371764, + "grad_norm": 0.06573180109262466, + "learning_rate": 2.940150202067782e-06, + "loss": 2.5598, + "step": 32571 + }, + { + "epoch": 0.9658689914894879, + "grad_norm": 0.08457320928573608, + "learning_rate": 2.935057406643793e-06, + "loss": 2.5654, + "step": 32572 + }, + { + "epoch": 0.9658986448417993, + "grad_norm": 0.06569666415452957, + "learning_rate": 2.9299690128151703e-06, + "loss": 2.5625, + "step": 32573 + }, + { + "epoch": 0.9659282981941109, + "grad_norm": 0.06649164855480194, + "learning_rate": 2.924885020626988e-06, + "loss": 2.5853, + "step": 32574 + }, + { + "epoch": 0.9659579515464223, + "grad_norm": 0.06570102274417877, + "learning_rate": 2.9198054301242673e-06, + "loss": 2.5795, + "step": 32575 + }, + { + "epoch": 0.9659876048987338, + "grad_norm": 0.07238217443227768, + "learning_rate": 2.914730241352026e-06, + "loss": 2.5619, + "step": 32576 + }, + { + "epoch": 0.9660172582510452, + "grad_norm": 0.06795123964548111, + "learning_rate": 2.909659454355118e-06, + "loss": 2.5576, + "step": 32577 + }, + { + "epoch": 0.9660469116033568, + "grad_norm": 0.06524688750505447, + "learning_rate": 2.9045930691785074e-06, + "loss": 2.5483, + "step": 32578 + }, + { + "epoch": 0.9660765649556683, + "grad_norm": 0.0665140151977539, + "learning_rate": 2.899531085867102e-06, + "loss": 2.5596, + "step": 32579 + }, + { + "epoch": 0.9661062183079797, + "grad_norm": 0.06876876205205917, + "learning_rate": 2.894473504465589e-06, + "loss": 2.558, + "step": 32580 + }, + { + "epoch": 0.9661358716602912, + "grad_norm": 0.07170754671096802, + "learning_rate": 2.889420325018932e-06, + "loss": 2.5986, + "step": 32581 + }, + { + "epoch": 0.9661655250126027, + "grad_norm": 0.07029356062412262, + "learning_rate": 2.884371547571707e-06, + "loss": 2.5634, + "step": 32582 + }, + { + "epoch": 0.9661951783649142, + "grad_norm": 0.06817282736301422, + "learning_rate": 2.8793271721687662e-06, + "loss": 2.5328, + "step": 32583 + }, + { + "epoch": 0.9662248317172256, + "grad_norm": 0.06922352313995361, + "learning_rate": 2.874287198854686e-06, + "loss": 2.5458, + "step": 32584 + }, + { + "epoch": 0.9662544850695372, + "grad_norm": 0.06369398534297943, + "learning_rate": 2.869251627674152e-06, + "loss": 2.5304, + "step": 32585 + }, + { + "epoch": 0.9662841384218486, + "grad_norm": 0.0680355429649353, + "learning_rate": 2.8642204586716848e-06, + "loss": 2.5411, + "step": 32586 + }, + { + "epoch": 0.9663137917741601, + "grad_norm": 0.06785213202238083, + "learning_rate": 2.859193691891915e-06, + "loss": 2.5566, + "step": 32587 + }, + { + "epoch": 0.9663434451264715, + "grad_norm": 0.15842421352863312, + "learning_rate": 2.8541713273792514e-06, + "loss": 2.5459, + "step": 32588 + }, + { + "epoch": 0.966373098478783, + "grad_norm": 0.06913761049509048, + "learning_rate": 2.8491533651783253e-06, + "loss": 2.5971, + "step": 32589 + }, + { + "epoch": 0.9664027518310945, + "grad_norm": 0.06825517117977142, + "learning_rate": 2.8441398053334344e-06, + "loss": 2.5849, + "step": 32590 + }, + { + "epoch": 0.966432405183406, + "grad_norm": 0.06573361903429031, + "learning_rate": 2.8391306478889877e-06, + "loss": 2.591, + "step": 32591 + }, + { + "epoch": 0.9664620585357174, + "grad_norm": 0.06889137625694275, + "learning_rate": 2.8341258928893943e-06, + "loss": 2.5595, + "step": 32592 + }, + { + "epoch": 0.966491711888029, + "grad_norm": 0.06514187157154083, + "learning_rate": 2.8291255403789516e-06, + "loss": 2.5771, + "step": 32593 + }, + { + "epoch": 0.9665213652403404, + "grad_norm": 0.06828942149877548, + "learning_rate": 2.824129590401958e-06, + "loss": 2.6021, + "step": 32594 + }, + { + "epoch": 0.9665510185926519, + "grad_norm": 0.06715553253889084, + "learning_rate": 2.8191380430026557e-06, + "loss": 2.5242, + "step": 32595 + }, + { + "epoch": 0.9665806719449633, + "grad_norm": 0.06486538052558899, + "learning_rate": 2.8141508982252316e-06, + "loss": 2.5982, + "step": 32596 + }, + { + "epoch": 0.9666103252972749, + "grad_norm": 0.0675000250339508, + "learning_rate": 2.8091681561138173e-06, + "loss": 2.5995, + "step": 32597 + }, + { + "epoch": 0.9666399786495863, + "grad_norm": 0.06460107862949371, + "learning_rate": 2.8041898167125433e-06, + "loss": 2.5767, + "step": 32598 + }, + { + "epoch": 0.9666696320018978, + "grad_norm": 0.06740456819534302, + "learning_rate": 2.799215880065542e-06, + "loss": 2.5581, + "step": 32599 + }, + { + "epoch": 0.9666992853542093, + "grad_norm": 0.06910761445760727, + "learning_rate": 2.794246346216778e-06, + "loss": 2.588, + "step": 32600 + }, + { + "epoch": 0.9667289387065208, + "grad_norm": 0.06985277682542801, + "learning_rate": 2.789281215210382e-06, + "loss": 2.5543, + "step": 32601 + }, + { + "epoch": 0.9667585920588323, + "grad_norm": 0.06852828711271286, + "learning_rate": 2.7843204870901527e-06, + "loss": 2.5589, + "step": 32602 + }, + { + "epoch": 0.9667882454111437, + "grad_norm": 0.06545950472354889, + "learning_rate": 2.7793641619001664e-06, + "loss": 2.5182, + "step": 32603 + }, + { + "epoch": 0.9668178987634553, + "grad_norm": 0.06719788908958435, + "learning_rate": 2.7744122396842765e-06, + "loss": 2.5841, + "step": 32604 + }, + { + "epoch": 0.9668475521157667, + "grad_norm": 0.06664308905601501, + "learning_rate": 2.769464720486281e-06, + "loss": 2.5988, + "step": 32605 + }, + { + "epoch": 0.9668772054680782, + "grad_norm": 0.06791792809963226, + "learning_rate": 2.7645216043500342e-06, + "loss": 2.5612, + "step": 32606 + }, + { + "epoch": 0.9669068588203896, + "grad_norm": 0.06782367080450058, + "learning_rate": 2.7595828913193345e-06, + "loss": 2.5643, + "step": 32607 + }, + { + "epoch": 0.9669365121727012, + "grad_norm": 0.06836730986833572, + "learning_rate": 2.754648581437813e-06, + "loss": 2.5694, + "step": 32608 + }, + { + "epoch": 0.9669661655250126, + "grad_norm": 0.06534998118877411, + "learning_rate": 2.7497186747492687e-06, + "loss": 2.5584, + "step": 32609 + }, + { + "epoch": 0.9669958188773241, + "grad_norm": 0.06576735526323318, + "learning_rate": 2.7447931712972775e-06, + "loss": 2.5488, + "step": 32610 + }, + { + "epoch": 0.9670254722296355, + "grad_norm": 0.06524912267923355, + "learning_rate": 2.7398720711255263e-06, + "loss": 2.5611, + "step": 32611 + }, + { + "epoch": 0.9670551255819471, + "grad_norm": 0.06567668169736862, + "learning_rate": 2.7349553742775924e-06, + "loss": 2.5585, + "step": 32612 + }, + { + "epoch": 0.9670847789342585, + "grad_norm": 0.06736387312412262, + "learning_rate": 2.73004308079694e-06, + "loss": 2.547, + "step": 32613 + }, + { + "epoch": 0.96711443228657, + "grad_norm": 0.06797140836715698, + "learning_rate": 2.725135190727146e-06, + "loss": 2.5666, + "step": 32614 + }, + { + "epoch": 0.9671440856388814, + "grad_norm": 0.06692090630531311, + "learning_rate": 2.7202317041115644e-06, + "loss": 2.5644, + "step": 32615 + }, + { + "epoch": 0.967173738991193, + "grad_norm": 0.06583326309919357, + "learning_rate": 2.715332620993771e-06, + "loss": 2.5764, + "step": 32616 + }, + { + "epoch": 0.9672033923435044, + "grad_norm": 0.06989162415266037, + "learning_rate": 2.71043794141701e-06, + "loss": 2.6172, + "step": 32617 + }, + { + "epoch": 0.9672330456958159, + "grad_norm": 0.06823612749576569, + "learning_rate": 2.7055476654246902e-06, + "loss": 2.5752, + "step": 32618 + }, + { + "epoch": 0.9672626990481273, + "grad_norm": 0.06843829154968262, + "learning_rate": 2.7006617930601106e-06, + "loss": 2.5748, + "step": 32619 + }, + { + "epoch": 0.9672923524004389, + "grad_norm": 0.06533390283584595, + "learning_rate": 2.6957803243665703e-06, + "loss": 2.5434, + "step": 32620 + }, + { + "epoch": 0.9673220057527504, + "grad_norm": 0.06716334074735641, + "learning_rate": 2.6909032593872008e-06, + "loss": 2.532, + "step": 32621 + }, + { + "epoch": 0.9673516591050618, + "grad_norm": 0.06432560086250305, + "learning_rate": 2.686030598165301e-06, + "loss": 2.5545, + "step": 32622 + }, + { + "epoch": 0.9673813124573734, + "grad_norm": 0.06813164800405502, + "learning_rate": 2.6811623407438923e-06, + "loss": 2.5657, + "step": 32623 + }, + { + "epoch": 0.9674109658096848, + "grad_norm": 0.06656443327665329, + "learning_rate": 2.6762984871662177e-06, + "loss": 2.5678, + "step": 32624 + }, + { + "epoch": 0.9674406191619963, + "grad_norm": 0.06724991649389267, + "learning_rate": 2.6714390374752427e-06, + "loss": 2.5525, + "step": 32625 + }, + { + "epoch": 0.9674702725143077, + "grad_norm": 0.06756570935249329, + "learning_rate": 2.6665839917140446e-06, + "loss": 2.5835, + "step": 32626 + }, + { + "epoch": 0.9674999258666193, + "grad_norm": 0.06835004687309265, + "learning_rate": 2.6617333499256436e-06, + "loss": 2.58, + "step": 32627 + }, + { + "epoch": 0.9675295792189307, + "grad_norm": 0.06936647742986679, + "learning_rate": 2.65688711215295e-06, + "loss": 2.5806, + "step": 32628 + }, + { + "epoch": 0.9675592325712422, + "grad_norm": 0.07011836767196655, + "learning_rate": 2.6520452784388748e-06, + "loss": 2.5731, + "step": 32629 + }, + { + "epoch": 0.9675888859235536, + "grad_norm": 0.06625688821077347, + "learning_rate": 2.6472078488263276e-06, + "loss": 2.5519, + "step": 32630 + }, + { + "epoch": 0.9676185392758652, + "grad_norm": 0.07277453690767288, + "learning_rate": 2.642374823358107e-06, + "loss": 2.5367, + "step": 32631 + }, + { + "epoch": 0.9676481926281766, + "grad_norm": 0.06818877905607224, + "learning_rate": 2.6375462020770125e-06, + "loss": 2.5867, + "step": 32632 + }, + { + "epoch": 0.9676778459804881, + "grad_norm": 0.06603341549634933, + "learning_rate": 2.6327219850257877e-06, + "loss": 2.5345, + "step": 32633 + }, + { + "epoch": 0.9677074993327995, + "grad_norm": 0.06359691172838211, + "learning_rate": 2.6279021722472317e-06, + "loss": 2.5673, + "step": 32634 + }, + { + "epoch": 0.9677371526851111, + "grad_norm": 0.06470585614442825, + "learning_rate": 2.623086763783977e-06, + "loss": 2.5852, + "step": 32635 + }, + { + "epoch": 0.9677668060374225, + "grad_norm": 0.06797324866056442, + "learning_rate": 2.6182757596786566e-06, + "loss": 2.5606, + "step": 32636 + }, + { + "epoch": 0.967796459389734, + "grad_norm": 0.06812062114477158, + "learning_rate": 2.6134691599739025e-06, + "loss": 2.5778, + "step": 32637 + }, + { + "epoch": 0.9678261127420454, + "grad_norm": 0.06642486155033112, + "learning_rate": 2.6086669647122363e-06, + "loss": 2.551, + "step": 32638 + }, + { + "epoch": 0.967855766094357, + "grad_norm": 0.06678532063961029, + "learning_rate": 2.60386917393618e-06, + "loss": 2.5411, + "step": 32639 + }, + { + "epoch": 0.9678854194466684, + "grad_norm": 0.06781884282827377, + "learning_rate": 2.5990757876882543e-06, + "loss": 2.5869, + "step": 32640 + }, + { + "epoch": 0.9679150727989799, + "grad_norm": 0.06604745984077454, + "learning_rate": 2.594286806010926e-06, + "loss": 2.5615, + "step": 32641 + }, + { + "epoch": 0.9679447261512915, + "grad_norm": 0.06473911553621292, + "learning_rate": 2.589502228946494e-06, + "loss": 2.5601, + "step": 32642 + }, + { + "epoch": 0.9679743795036029, + "grad_norm": 0.06495686620473862, + "learning_rate": 2.584722056537425e-06, + "loss": 2.5916, + "step": 32643 + }, + { + "epoch": 0.9680040328559144, + "grad_norm": 0.06545022875070572, + "learning_rate": 2.579946288826074e-06, + "loss": 2.5348, + "step": 32644 + }, + { + "epoch": 0.9680336862082258, + "grad_norm": 0.06996162235736847, + "learning_rate": 2.575174925854573e-06, + "loss": 2.551, + "step": 32645 + }, + { + "epoch": 0.9680633395605374, + "grad_norm": 0.06713543832302094, + "learning_rate": 2.570407967665389e-06, + "loss": 2.5244, + "step": 32646 + }, + { + "epoch": 0.9680929929128488, + "grad_norm": 0.06744121760129929, + "learning_rate": 2.5656454143005436e-06, + "loss": 2.5363, + "step": 32647 + }, + { + "epoch": 0.9681226462651603, + "grad_norm": 0.06590387970209122, + "learning_rate": 2.5608872658023365e-06, + "loss": 2.5801, + "step": 32648 + }, + { + "epoch": 0.9681522996174717, + "grad_norm": 0.06549077481031418, + "learning_rate": 2.5561335222128444e-06, + "loss": 2.5681, + "step": 32649 + }, + { + "epoch": 0.9681819529697833, + "grad_norm": 0.06952480226755142, + "learning_rate": 2.5513841835742013e-06, + "loss": 2.6097, + "step": 32650 + }, + { + "epoch": 0.9682116063220947, + "grad_norm": 0.06783600896596909, + "learning_rate": 2.5466392499284284e-06, + "loss": 2.542, + "step": 32651 + }, + { + "epoch": 0.9682412596744062, + "grad_norm": 0.07057029753923416, + "learning_rate": 2.541898721317548e-06, + "loss": 2.5567, + "step": 32652 + }, + { + "epoch": 0.9682709130267176, + "grad_norm": 0.0669943243265152, + "learning_rate": 2.537162597783527e-06, + "loss": 2.5377, + "step": 32653 + }, + { + "epoch": 0.9683005663790292, + "grad_norm": 0.0679088681936264, + "learning_rate": 2.5324308793683306e-06, + "loss": 2.5539, + "step": 32654 + }, + { + "epoch": 0.9683302197313406, + "grad_norm": 0.0686412900686264, + "learning_rate": 2.527703566113815e-06, + "loss": 2.5734, + "step": 32655 + }, + { + "epoch": 0.9683598730836521, + "grad_norm": 0.07090193033218384, + "learning_rate": 2.5229806580618907e-06, + "loss": 2.5718, + "step": 32656 + }, + { + "epoch": 0.9683895264359635, + "grad_norm": 0.06671611964702606, + "learning_rate": 2.5182621552544137e-06, + "loss": 2.5664, + "step": 32657 + }, + { + "epoch": 0.9684191797882751, + "grad_norm": 0.07122623920440674, + "learning_rate": 2.513548057733017e-06, + "loss": 2.5587, + "step": 32658 + }, + { + "epoch": 0.9684488331405865, + "grad_norm": 0.07034706324338913, + "learning_rate": 2.508838365539612e-06, + "loss": 2.5537, + "step": 32659 + }, + { + "epoch": 0.968478486492898, + "grad_norm": 0.06793778389692307, + "learning_rate": 2.504133078715831e-06, + "loss": 2.5713, + "step": 32660 + }, + { + "epoch": 0.9685081398452094, + "grad_norm": 0.06895186752080917, + "learning_rate": 2.499432197303364e-06, + "loss": 2.6061, + "step": 32661 + }, + { + "epoch": 0.968537793197521, + "grad_norm": 0.06936080008745193, + "learning_rate": 2.4947357213438436e-06, + "loss": 2.6038, + "step": 32662 + }, + { + "epoch": 0.9685674465498325, + "grad_norm": 0.06577951461076736, + "learning_rate": 2.490043650878737e-06, + "loss": 2.5554, + "step": 32663 + }, + { + "epoch": 0.9685970999021439, + "grad_norm": 0.06572684645652771, + "learning_rate": 2.485355985949789e-06, + "loss": 2.5603, + "step": 32664 + }, + { + "epoch": 0.9686267532544555, + "grad_norm": 0.06486566364765167, + "learning_rate": 2.480672726598354e-06, + "loss": 2.558, + "step": 32665 + }, + { + "epoch": 0.9686564066067669, + "grad_norm": 0.06365402042865753, + "learning_rate": 2.475993872865956e-06, + "loss": 2.5569, + "step": 32666 + }, + { + "epoch": 0.9686860599590784, + "grad_norm": 0.06587696075439453, + "learning_rate": 2.471319424794061e-06, + "loss": 2.5592, + "step": 32667 + }, + { + "epoch": 0.9687157133113898, + "grad_norm": 0.06528239697217941, + "learning_rate": 2.4666493824240243e-06, + "loss": 2.5571, + "step": 32668 + }, + { + "epoch": 0.9687453666637014, + "grad_norm": 0.06529780477285385, + "learning_rate": 2.4619837457972025e-06, + "loss": 2.5446, + "step": 32669 + }, + { + "epoch": 0.9687750200160128, + "grad_norm": 0.07032965123653412, + "learning_rate": 2.457322514954896e-06, + "loss": 2.5681, + "step": 32670 + }, + { + "epoch": 0.9688046733683243, + "grad_norm": 0.06694488972425461, + "learning_rate": 2.4526656899384047e-06, + "loss": 2.5463, + "step": 32671 + }, + { + "epoch": 0.9688343267206357, + "grad_norm": 0.06801845133304596, + "learning_rate": 2.448013270789029e-06, + "loss": 2.5844, + "step": 32672 + }, + { + "epoch": 0.9688639800729473, + "grad_norm": 0.06722288578748703, + "learning_rate": 2.443365257547847e-06, + "loss": 2.5763, + "step": 32673 + }, + { + "epoch": 0.9688936334252587, + "grad_norm": 0.06601718813180923, + "learning_rate": 2.4387216502560483e-06, + "loss": 2.5605, + "step": 32674 + }, + { + "epoch": 0.9689232867775702, + "grad_norm": 0.06768527626991272, + "learning_rate": 2.4340824489548217e-06, + "loss": 2.5636, + "step": 32675 + }, + { + "epoch": 0.9689529401298816, + "grad_norm": 0.06720639020204544, + "learning_rate": 2.4294476536851352e-06, + "loss": 2.5671, + "step": 32676 + }, + { + "epoch": 0.9689825934821932, + "grad_norm": 0.06458240747451782, + "learning_rate": 2.4248172644881773e-06, + "loss": 2.5641, + "step": 32677 + }, + { + "epoch": 0.9690122468345046, + "grad_norm": 0.06823913007974625, + "learning_rate": 2.4201912814048044e-06, + "loss": 2.5724, + "step": 32678 + }, + { + "epoch": 0.9690419001868161, + "grad_norm": 0.06712529808282852, + "learning_rate": 2.4155697044760948e-06, + "loss": 2.5485, + "step": 32679 + }, + { + "epoch": 0.9690715535391276, + "grad_norm": 0.0653417706489563, + "learning_rate": 2.410952533742905e-06, + "loss": 2.5611, + "step": 32680 + }, + { + "epoch": 0.9691012068914391, + "grad_norm": 0.06468813121318817, + "learning_rate": 2.406339769246091e-06, + "loss": 2.555, + "step": 32681 + }, + { + "epoch": 0.9691308602437505, + "grad_norm": 0.06538067013025284, + "learning_rate": 2.4017314110266196e-06, + "loss": 2.5828, + "step": 32682 + }, + { + "epoch": 0.969160513596062, + "grad_norm": 0.06866706907749176, + "learning_rate": 2.3971274591251814e-06, + "loss": 2.5283, + "step": 32683 + }, + { + "epoch": 0.9691901669483736, + "grad_norm": 0.065652035176754, + "learning_rate": 2.3925279135826315e-06, + "loss": 2.5475, + "step": 32684 + }, + { + "epoch": 0.969219820300685, + "grad_norm": 0.06435389816761017, + "learning_rate": 2.3879327744396606e-06, + "loss": 2.5491, + "step": 32685 + }, + { + "epoch": 0.9692494736529965, + "grad_norm": 0.06697562336921692, + "learning_rate": 2.3833420417369577e-06, + "loss": 2.5235, + "step": 32686 + }, + { + "epoch": 0.9692791270053079, + "grad_norm": 0.06729315221309662, + "learning_rate": 2.3787557155151574e-06, + "loss": 2.5522, + "step": 32687 + }, + { + "epoch": 0.9693087803576195, + "grad_norm": 0.06325599551200867, + "learning_rate": 2.3741737958148935e-06, + "loss": 2.5496, + "step": 32688 + }, + { + "epoch": 0.9693384337099309, + "grad_norm": 0.06484192609786987, + "learning_rate": 2.3695962826767446e-06, + "loss": 2.5538, + "step": 32689 + }, + { + "epoch": 0.9693680870622424, + "grad_norm": 0.06650175899267197, + "learning_rate": 2.3650231761412343e-06, + "loss": 2.5838, + "step": 32690 + }, + { + "epoch": 0.9693977404145538, + "grad_norm": 0.07073505967855453, + "learning_rate": 2.36045447624883e-06, + "loss": 2.5664, + "step": 32691 + }, + { + "epoch": 0.9694273937668654, + "grad_norm": 0.06443227827548981, + "learning_rate": 2.3558901830400547e-06, + "loss": 2.5758, + "step": 32692 + }, + { + "epoch": 0.9694570471191768, + "grad_norm": 0.06550376117229462, + "learning_rate": 2.3513302965552653e-06, + "loss": 2.5615, + "step": 32693 + }, + { + "epoch": 0.9694867004714883, + "grad_norm": 0.06414072215557098, + "learning_rate": 2.3467748168348736e-06, + "loss": 2.5409, + "step": 32694 + }, + { + "epoch": 0.9695163538237997, + "grad_norm": 0.065629743039608, + "learning_rate": 2.3422237439192917e-06, + "loss": 2.5646, + "step": 32695 + }, + { + "epoch": 0.9695460071761113, + "grad_norm": 0.06807758659124374, + "learning_rate": 2.3376770778486544e-06, + "loss": 2.573, + "step": 32696 + }, + { + "epoch": 0.9695756605284227, + "grad_norm": 0.06665953248739243, + "learning_rate": 2.3331348186633183e-06, + "loss": 2.5734, + "step": 32697 + }, + { + "epoch": 0.9696053138807342, + "grad_norm": 0.06838753074407578, + "learning_rate": 2.3285969664034734e-06, + "loss": 2.594, + "step": 32698 + }, + { + "epoch": 0.9696349672330457, + "grad_norm": 0.06413452327251434, + "learning_rate": 2.324063521109365e-06, + "loss": 2.5729, + "step": 32699 + }, + { + "epoch": 0.9696646205853572, + "grad_norm": 0.0687796100974083, + "learning_rate": 2.3195344828211285e-06, + "loss": 2.5579, + "step": 32700 + }, + { + "epoch": 0.9696942739376686, + "grad_norm": 0.06785260885953903, + "learning_rate": 2.3150098515787866e-06, + "loss": 2.5361, + "step": 32701 + }, + { + "epoch": 0.9697239272899801, + "grad_norm": 0.06573861092329025, + "learning_rate": 2.310489627422474e-06, + "loss": 2.5349, + "step": 32702 + }, + { + "epoch": 0.9697535806422916, + "grad_norm": 0.06498317420482635, + "learning_rate": 2.3059738103922145e-06, + "loss": 2.579, + "step": 32703 + }, + { + "epoch": 0.9697832339946031, + "grad_norm": 0.06616765260696411, + "learning_rate": 2.3014624005279763e-06, + "loss": 2.5457, + "step": 32704 + }, + { + "epoch": 0.9698128873469146, + "grad_norm": 0.06594262272119522, + "learning_rate": 2.2969553978697267e-06, + "loss": 2.5752, + "step": 32705 + }, + { + "epoch": 0.969842540699226, + "grad_norm": 0.06472762674093246, + "learning_rate": 2.292452802457379e-06, + "loss": 2.5482, + "step": 32706 + }, + { + "epoch": 0.9698721940515376, + "grad_norm": 0.07952757924795151, + "learning_rate": 2.287954614330734e-06, + "loss": 2.5605, + "step": 32707 + }, + { + "epoch": 0.969901847403849, + "grad_norm": 0.06700113415718079, + "learning_rate": 2.28346083352976e-06, + "loss": 2.6015, + "step": 32708 + }, + { + "epoch": 0.9699315007561605, + "grad_norm": 0.06456847488880157, + "learning_rate": 2.2789714600940926e-06, + "loss": 2.5788, + "step": 32709 + }, + { + "epoch": 0.969961154108472, + "grad_norm": 0.06653380393981934, + "learning_rate": 2.274486494063588e-06, + "loss": 2.5919, + "step": 32710 + }, + { + "epoch": 0.9699908074607835, + "grad_norm": 0.06999735534191132, + "learning_rate": 2.2700059354779925e-06, + "loss": 2.5792, + "step": 32711 + }, + { + "epoch": 0.9700204608130949, + "grad_norm": 0.06929487735033035, + "learning_rate": 2.265529784376885e-06, + "loss": 2.5627, + "step": 32712 + }, + { + "epoch": 0.9700501141654064, + "grad_norm": 0.07082397490739822, + "learning_rate": 2.261058040799957e-06, + "loss": 2.5827, + "step": 32713 + }, + { + "epoch": 0.9700797675177179, + "grad_norm": 0.0640600323677063, + "learning_rate": 2.256590704786787e-06, + "loss": 2.5704, + "step": 32714 + }, + { + "epoch": 0.9701094208700294, + "grad_norm": 0.0668085515499115, + "learning_rate": 2.2521277763769555e-06, + "loss": 2.5816, + "step": 32715 + }, + { + "epoch": 0.9701390742223408, + "grad_norm": 0.06507065892219543, + "learning_rate": 2.2476692556099853e-06, + "loss": 2.5731, + "step": 32716 + }, + { + "epoch": 0.9701687275746523, + "grad_norm": 0.06871549040079117, + "learning_rate": 2.24321514252529e-06, + "loss": 2.5829, + "step": 32717 + }, + { + "epoch": 0.9701983809269638, + "grad_norm": 0.06577843427658081, + "learning_rate": 2.2387654371623935e-06, + "loss": 2.5628, + "step": 32718 + }, + { + "epoch": 0.9702280342792753, + "grad_norm": 0.06307609379291534, + "learning_rate": 2.2343201395606526e-06, + "loss": 2.5355, + "step": 32719 + }, + { + "epoch": 0.9702576876315867, + "grad_norm": 0.06801871955394745, + "learning_rate": 2.229879249759481e-06, + "loss": 2.5569, + "step": 32720 + }, + { + "epoch": 0.9702873409838982, + "grad_norm": 0.0640966072678566, + "learning_rate": 2.225442767798125e-06, + "loss": 2.5728, + "step": 32721 + }, + { + "epoch": 0.9703169943362097, + "grad_norm": 0.06756691634654999, + "learning_rate": 2.221010693715941e-06, + "loss": 2.5693, + "step": 32722 + }, + { + "epoch": 0.9703466476885212, + "grad_norm": 0.06560827046632767, + "learning_rate": 2.2165830275521215e-06, + "loss": 2.5752, + "step": 32723 + }, + { + "epoch": 0.9703763010408326, + "grad_norm": 0.06507053226232529, + "learning_rate": 2.212159769345967e-06, + "loss": 2.5668, + "step": 32724 + }, + { + "epoch": 0.9704059543931441, + "grad_norm": 0.06378795206546783, + "learning_rate": 2.207740919136558e-06, + "loss": 2.554, + "step": 32725 + }, + { + "epoch": 0.9704356077454557, + "grad_norm": 0.06739110499620438, + "learning_rate": 2.20332647696303e-06, + "loss": 2.576, + "step": 32726 + }, + { + "epoch": 0.9704652610977671, + "grad_norm": 0.06676680594682693, + "learning_rate": 2.198916442864518e-06, + "loss": 2.5616, + "step": 32727 + }, + { + "epoch": 0.9704949144500786, + "grad_norm": 0.06487169861793518, + "learning_rate": 2.194510816879991e-06, + "loss": 2.5391, + "step": 32728 + }, + { + "epoch": 0.97052456780239, + "grad_norm": 0.06362570822238922, + "learning_rate": 2.1901095990485843e-06, + "loss": 2.5796, + "step": 32729 + }, + { + "epoch": 0.9705542211547016, + "grad_norm": 0.06486573070287704, + "learning_rate": 2.1857127894091557e-06, + "loss": 2.5333, + "step": 32730 + }, + { + "epoch": 0.970583874507013, + "grad_norm": 0.06677741557359695, + "learning_rate": 2.1813203880007293e-06, + "loss": 2.5478, + "step": 32731 + }, + { + "epoch": 0.9706135278593245, + "grad_norm": 0.06448928266763687, + "learning_rate": 2.176932394862108e-06, + "loss": 2.5495, + "step": 32732 + }, + { + "epoch": 0.970643181211636, + "grad_norm": 0.06464281678199768, + "learning_rate": 2.17254881003226e-06, + "loss": 2.5515, + "step": 32733 + }, + { + "epoch": 0.9706728345639475, + "grad_norm": 0.06454217433929443, + "learning_rate": 2.168169633549877e-06, + "loss": 2.5522, + "step": 32734 + }, + { + "epoch": 0.9707024879162589, + "grad_norm": 0.07135950028896332, + "learning_rate": 2.163794865453872e-06, + "loss": 2.5573, + "step": 32735 + }, + { + "epoch": 0.9707321412685704, + "grad_norm": 0.0666436105966568, + "learning_rate": 2.159424505782881e-06, + "loss": 2.5671, + "step": 32736 + }, + { + "epoch": 0.9707617946208819, + "grad_norm": 0.06520699709653854, + "learning_rate": 2.1550585545756505e-06, + "loss": 2.5411, + "step": 32737 + }, + { + "epoch": 0.9707914479731934, + "grad_norm": 0.06842444837093353, + "learning_rate": 2.150697011870817e-06, + "loss": 2.5674, + "step": 32738 + }, + { + "epoch": 0.9708211013255048, + "grad_norm": 0.06605354696512222, + "learning_rate": 2.1463398777070152e-06, + "loss": 2.5665, + "step": 32739 + }, + { + "epoch": 0.9708507546778163, + "grad_norm": 0.0639980211853981, + "learning_rate": 2.141987152122826e-06, + "loss": 2.5569, + "step": 32740 + }, + { + "epoch": 0.9708804080301278, + "grad_norm": 0.06614002585411072, + "learning_rate": 2.13763883515683e-06, + "loss": 2.5853, + "step": 32741 + }, + { + "epoch": 0.9709100613824393, + "grad_norm": 0.06417939066886902, + "learning_rate": 2.1332949268474954e-06, + "loss": 2.5512, + "step": 32742 + }, + { + "epoch": 0.9709397147347507, + "grad_norm": 0.0708906427025795, + "learning_rate": 2.128955427233237e-06, + "loss": 2.5593, + "step": 32743 + }, + { + "epoch": 0.9709693680870622, + "grad_norm": 0.06363637000322342, + "learning_rate": 2.1246203363525783e-06, + "loss": 2.5421, + "step": 32744 + }, + { + "epoch": 0.9709990214393737, + "grad_norm": 0.06787046790122986, + "learning_rate": 2.120289654243879e-06, + "loss": 2.5487, + "step": 32745 + }, + { + "epoch": 0.9710286747916852, + "grad_norm": 0.06587643921375275, + "learning_rate": 2.1159633809454403e-06, + "loss": 2.5606, + "step": 32746 + }, + { + "epoch": 0.9710583281439967, + "grad_norm": 0.06672508269548416, + "learning_rate": 2.1116415164956215e-06, + "loss": 2.5655, + "step": 32747 + }, + { + "epoch": 0.9710879814963082, + "grad_norm": 0.06557765603065491, + "learning_rate": 2.1073240609326694e-06, + "loss": 2.5596, + "step": 32748 + }, + { + "epoch": 0.9711176348486197, + "grad_norm": 0.06766953319311142, + "learning_rate": 2.103011014294831e-06, + "loss": 2.5476, + "step": 32749 + }, + { + "epoch": 0.9711472882009311, + "grad_norm": 0.06446334719657898, + "learning_rate": 2.0987023766202984e-06, + "loss": 2.5952, + "step": 32750 + }, + { + "epoch": 0.9711769415532426, + "grad_norm": 0.06897898018360138, + "learning_rate": 2.0943981479472627e-06, + "loss": 2.5698, + "step": 32751 + }, + { + "epoch": 0.971206594905554, + "grad_norm": 0.06730829179286957, + "learning_rate": 2.09009832831375e-06, + "loss": 2.5663, + "step": 32752 + }, + { + "epoch": 0.9712362482578656, + "grad_norm": 0.06507200747728348, + "learning_rate": 2.08580291775784e-06, + "loss": 2.584, + "step": 32753 + }, + { + "epoch": 0.971265901610177, + "grad_norm": 0.06526508927345276, + "learning_rate": 2.0815119163176686e-06, + "loss": 2.5761, + "step": 32754 + }, + { + "epoch": 0.9712955549624885, + "grad_norm": 0.06826671212911606, + "learning_rate": 2.077225324031151e-06, + "loss": 2.558, + "step": 32755 + }, + { + "epoch": 0.9713252083148, + "grad_norm": 0.06604872643947601, + "learning_rate": 2.072943140936312e-06, + "loss": 2.5294, + "step": 32756 + }, + { + "epoch": 0.9713548616671115, + "grad_norm": 0.06813149154186249, + "learning_rate": 2.0686653670709543e-06, + "loss": 2.5646, + "step": 32757 + }, + { + "epoch": 0.9713845150194229, + "grad_norm": 0.06594524532556534, + "learning_rate": 2.064392002473103e-06, + "loss": 2.573, + "step": 32758 + }, + { + "epoch": 0.9714141683717344, + "grad_norm": 0.06843748688697815, + "learning_rate": 2.0601230471805068e-06, + "loss": 2.5633, + "step": 32759 + }, + { + "epoch": 0.9714438217240459, + "grad_norm": 0.0648118108510971, + "learning_rate": 2.055858501230967e-06, + "loss": 2.5403, + "step": 32760 + }, + { + "epoch": 0.9714734750763574, + "grad_norm": 0.0666106566786766, + "learning_rate": 2.0515983646622883e-06, + "loss": 2.5708, + "step": 32761 + }, + { + "epoch": 0.9715031284286688, + "grad_norm": 0.06730642914772034, + "learning_rate": 2.047342637512217e-06, + "loss": 2.5512, + "step": 32762 + }, + { + "epoch": 0.9715327817809803, + "grad_norm": 0.06458039581775665, + "learning_rate": 2.0430913198183354e-06, + "loss": 2.5186, + "step": 32763 + }, + { + "epoch": 0.9715624351332918, + "grad_norm": 0.06714877486228943, + "learning_rate": 2.03884441161839e-06, + "loss": 2.5358, + "step": 32764 + }, + { + "epoch": 0.9715920884856033, + "grad_norm": 0.07139035314321518, + "learning_rate": 2.034601912949963e-06, + "loss": 2.5636, + "step": 32765 + }, + { + "epoch": 0.9716217418379147, + "grad_norm": 0.06655417382717133, + "learning_rate": 2.0303638238505784e-06, + "loss": 2.5439, + "step": 32766 + }, + { + "epoch": 0.9716513951902263, + "grad_norm": 0.06503806263208389, + "learning_rate": 2.0261301443578185e-06, + "loss": 2.55, + "step": 32767 + }, + { + "epoch": 0.9716810485425378, + "grad_norm": 0.06584875285625458, + "learning_rate": 2.021900874509153e-06, + "loss": 2.5481, + "step": 32768 + }, + { + "epoch": 0.9717107018948492, + "grad_norm": 0.06433114409446716, + "learning_rate": 2.0176760143419957e-06, + "loss": 2.591, + "step": 32769 + }, + { + "epoch": 0.9717403552471607, + "grad_norm": 0.0670517086982727, + "learning_rate": 2.013455563893818e-06, + "loss": 2.5514, + "step": 32770 + }, + { + "epoch": 0.9717700085994722, + "grad_norm": 0.06538292020559311, + "learning_rate": 2.009239523202033e-06, + "loss": 2.5763, + "step": 32771 + }, + { + "epoch": 0.9717996619517837, + "grad_norm": 0.06624976545572281, + "learning_rate": 2.0050278923038343e-06, + "loss": 2.617, + "step": 32772 + }, + { + "epoch": 0.9718293153040951, + "grad_norm": 0.0674128532409668, + "learning_rate": 2.000820671236636e-06, + "loss": 2.5748, + "step": 32773 + }, + { + "epoch": 0.9718589686564066, + "grad_norm": 0.0666782483458519, + "learning_rate": 1.996617860037575e-06, + "loss": 2.5659, + "step": 32774 + }, + { + "epoch": 0.9718886220087181, + "grad_norm": 0.06839918345212936, + "learning_rate": 1.9924194587440103e-06, + "loss": 2.5553, + "step": 32775 + }, + { + "epoch": 0.9719182753610296, + "grad_norm": 0.06685389578342438, + "learning_rate": 1.9882254673930233e-06, + "loss": 2.5794, + "step": 32776 + }, + { + "epoch": 0.971947928713341, + "grad_norm": 0.06754348427057266, + "learning_rate": 1.9840358860218068e-06, + "loss": 2.5608, + "step": 32777 + }, + { + "epoch": 0.9719775820656525, + "grad_norm": 0.06455378234386444, + "learning_rate": 1.9798507146674417e-06, + "loss": 2.5514, + "step": 32778 + }, + { + "epoch": 0.972007235417964, + "grad_norm": 0.0657280832529068, + "learning_rate": 1.9756699533669543e-06, + "loss": 2.6018, + "step": 32779 + }, + { + "epoch": 0.9720368887702755, + "grad_norm": 0.06697070598602295, + "learning_rate": 1.9714936021574257e-06, + "loss": 2.5683, + "step": 32780 + }, + { + "epoch": 0.9720665421225869, + "grad_norm": 0.06470805406570435, + "learning_rate": 1.9673216610757717e-06, + "loss": 2.5562, + "step": 32781 + }, + { + "epoch": 0.9720961954748985, + "grad_norm": 0.06676264107227325, + "learning_rate": 1.9631541301590173e-06, + "loss": 2.5286, + "step": 32782 + }, + { + "epoch": 0.9721258488272099, + "grad_norm": 0.06404571980237961, + "learning_rate": 1.9589910094439665e-06, + "loss": 2.5302, + "step": 32783 + }, + { + "epoch": 0.9721555021795214, + "grad_norm": 0.06464246660470963, + "learning_rate": 1.9548322989675903e-06, + "loss": 2.5811, + "step": 32784 + }, + { + "epoch": 0.9721851555318328, + "grad_norm": 0.06618348509073257, + "learning_rate": 1.9506779987666366e-06, + "loss": 2.5915, + "step": 32785 + }, + { + "epoch": 0.9722148088841444, + "grad_norm": 0.06529252976179123, + "learning_rate": 1.9465281088779096e-06, + "loss": 2.582, + "step": 32786 + }, + { + "epoch": 0.9722444622364559, + "grad_norm": 0.06332650780677795, + "learning_rate": 1.9423826293381574e-06, + "loss": 2.5923, + "step": 32787 + }, + { + "epoch": 0.9722741155887673, + "grad_norm": 0.06621909886598587, + "learning_rate": 1.938241560184073e-06, + "loss": 2.5486, + "step": 32788 + }, + { + "epoch": 0.9723037689410788, + "grad_norm": 0.06865314394235611, + "learning_rate": 1.9341049014524047e-06, + "loss": 2.5437, + "step": 32789 + }, + { + "epoch": 0.9723334222933903, + "grad_norm": 0.06657009571790695, + "learning_rate": 1.9299726531797344e-06, + "loss": 2.5442, + "step": 32790 + }, + { + "epoch": 0.9723630756457018, + "grad_norm": 0.06893143057823181, + "learning_rate": 1.925844815402644e-06, + "loss": 2.5647, + "step": 32791 + }, + { + "epoch": 0.9723927289980132, + "grad_norm": 0.06521884351968765, + "learning_rate": 1.92172138815766e-06, + "loss": 2.6161, + "step": 32792 + }, + { + "epoch": 0.9724223823503247, + "grad_norm": 0.07173211127519608, + "learning_rate": 1.917602371481364e-06, + "loss": 2.5493, + "step": 32793 + }, + { + "epoch": 0.9724520357026362, + "grad_norm": 0.06955885142087936, + "learning_rate": 1.9134877654101714e-06, + "loss": 2.5906, + "step": 32794 + }, + { + "epoch": 0.9724816890549477, + "grad_norm": 0.06768526136875153, + "learning_rate": 1.9093775699805526e-06, + "loss": 2.5824, + "step": 32795 + }, + { + "epoch": 0.9725113424072591, + "grad_norm": 0.06637366861104965, + "learning_rate": 1.9052717852288681e-06, + "loss": 2.5793, + "step": 32796 + }, + { + "epoch": 0.9725409957595706, + "grad_norm": 0.06701067835092545, + "learning_rate": 1.9011704111914773e-06, + "loss": 2.5344, + "step": 32797 + }, + { + "epoch": 0.9725706491118821, + "grad_norm": 0.06563831865787506, + "learning_rate": 1.8970734479047957e-06, + "loss": 2.569, + "step": 32798 + }, + { + "epoch": 0.9726003024641936, + "grad_norm": 0.06573738157749176, + "learning_rate": 1.892980895404961e-06, + "loss": 2.5371, + "step": 32799 + }, + { + "epoch": 0.972629955816505, + "grad_norm": 0.06710922718048096, + "learning_rate": 1.8888927537282773e-06, + "loss": 2.5355, + "step": 32800 + }, + { + "epoch": 0.9726596091688166, + "grad_norm": 0.06487397849559784, + "learning_rate": 1.8848090229109937e-06, + "loss": 2.5864, + "step": 32801 + }, + { + "epoch": 0.972689262521128, + "grad_norm": 0.06618102639913559, + "learning_rate": 1.8807297029891924e-06, + "loss": 2.5801, + "step": 32802 + }, + { + "epoch": 0.9727189158734395, + "grad_norm": 0.06345120072364807, + "learning_rate": 1.8766547939990664e-06, + "loss": 2.5673, + "step": 32803 + }, + { + "epoch": 0.9727485692257509, + "grad_norm": 0.06594228744506836, + "learning_rate": 1.8725842959766425e-06, + "loss": 2.5527, + "step": 32804 + }, + { + "epoch": 0.9727782225780625, + "grad_norm": 0.06595253199338913, + "learning_rate": 1.8685182089580033e-06, + "loss": 2.5485, + "step": 32805 + }, + { + "epoch": 0.9728078759303739, + "grad_norm": 0.06715189665555954, + "learning_rate": 1.8644565329791197e-06, + "loss": 2.575, + "step": 32806 + }, + { + "epoch": 0.9728375292826854, + "grad_norm": 0.06637842208147049, + "learning_rate": 1.8603992680759629e-06, + "loss": 2.5494, + "step": 32807 + }, + { + "epoch": 0.9728671826349969, + "grad_norm": 0.06759199500083923, + "learning_rate": 1.8563464142845043e-06, + "loss": 2.5847, + "step": 32808 + }, + { + "epoch": 0.9728968359873084, + "grad_norm": 0.06649632751941681, + "learning_rate": 1.852297971640604e-06, + "loss": 2.5521, + "step": 32809 + }, + { + "epoch": 0.9729264893396199, + "grad_norm": 0.06699720770120621, + "learning_rate": 1.8482539401800669e-06, + "loss": 2.5632, + "step": 32810 + }, + { + "epoch": 0.9729561426919313, + "grad_norm": 0.0673515722155571, + "learning_rate": 1.8442143199388084e-06, + "loss": 2.5728, + "step": 32811 + }, + { + "epoch": 0.9729857960442428, + "grad_norm": 0.0673520416021347, + "learning_rate": 1.8401791109525222e-06, + "loss": 2.5809, + "step": 32812 + }, + { + "epoch": 0.9730154493965543, + "grad_norm": 0.06728852540254593, + "learning_rate": 1.8361483132569022e-06, + "loss": 2.5749, + "step": 32813 + }, + { + "epoch": 0.9730451027488658, + "grad_norm": 0.06815551221370697, + "learning_rate": 1.8321219268877531e-06, + "loss": 2.5898, + "step": 32814 + }, + { + "epoch": 0.9730747561011772, + "grad_norm": 0.06554027646780014, + "learning_rate": 1.8280999518806575e-06, + "loss": 2.5834, + "step": 32815 + }, + { + "epoch": 0.9731044094534888, + "grad_norm": 0.06577852368354797, + "learning_rate": 1.824082388271253e-06, + "loss": 2.5531, + "step": 32816 + }, + { + "epoch": 0.9731340628058002, + "grad_norm": 0.06465759873390198, + "learning_rate": 1.8200692360951232e-06, + "loss": 2.5724, + "step": 32817 + }, + { + "epoch": 0.9731637161581117, + "grad_norm": 0.0678185373544693, + "learning_rate": 1.8160604953877946e-06, + "loss": 2.5591, + "step": 32818 + }, + { + "epoch": 0.9731933695104231, + "grad_norm": 0.06680838018655777, + "learning_rate": 1.812056166184739e-06, + "loss": 2.5623, + "step": 32819 + }, + { + "epoch": 0.9732230228627347, + "grad_norm": 0.06667236238718033, + "learning_rate": 1.808056248521428e-06, + "loss": 2.5141, + "step": 32820 + }, + { + "epoch": 0.9732526762150461, + "grad_norm": 0.06705282628536224, + "learning_rate": 1.8040607424333333e-06, + "loss": 2.5575, + "step": 32821 + }, + { + "epoch": 0.9732823295673576, + "grad_norm": 0.06888965517282486, + "learning_rate": 1.8000696479557598e-06, + "loss": 2.5677, + "step": 32822 + }, + { + "epoch": 0.973311982919669, + "grad_norm": 0.06596770882606506, + "learning_rate": 1.7960829651240685e-06, + "loss": 2.5837, + "step": 32823 + }, + { + "epoch": 0.9733416362719806, + "grad_norm": 0.06503233313560486, + "learning_rate": 1.7921006939736195e-06, + "loss": 2.586, + "step": 32824 + }, + { + "epoch": 0.973371289624292, + "grad_norm": 0.06985732167959213, + "learning_rate": 1.7881228345396073e-06, + "loss": 2.5683, + "step": 32825 + }, + { + "epoch": 0.9734009429766035, + "grad_norm": 0.06641361117362976, + "learning_rate": 1.7841493868573367e-06, + "loss": 2.5637, + "step": 32826 + }, + { + "epoch": 0.9734305963289149, + "grad_norm": 0.06338141113519669, + "learning_rate": 1.7801803509618908e-06, + "loss": 2.5851, + "step": 32827 + }, + { + "epoch": 0.9734602496812265, + "grad_norm": 0.06537526845932007, + "learning_rate": 1.7762157268884638e-06, + "loss": 2.532, + "step": 32828 + }, + { + "epoch": 0.973489903033538, + "grad_norm": 0.06453035771846771, + "learning_rate": 1.772255514672194e-06, + "loss": 2.5842, + "step": 32829 + }, + { + "epoch": 0.9735195563858494, + "grad_norm": 0.06321773678064346, + "learning_rate": 1.7682997143481095e-06, + "loss": 2.5665, + "step": 32830 + }, + { + "epoch": 0.973549209738161, + "grad_norm": 0.06508282572031021, + "learning_rate": 1.7643483259512371e-06, + "loss": 2.5726, + "step": 32831 + }, + { + "epoch": 0.9735788630904724, + "grad_norm": 0.06317690759897232, + "learning_rate": 1.760401349516605e-06, + "loss": 2.5863, + "step": 32832 + }, + { + "epoch": 0.9736085164427839, + "grad_norm": 0.06344820559024811, + "learning_rate": 1.7564587850791291e-06, + "loss": 2.55, + "step": 32833 + }, + { + "epoch": 0.9736381697950953, + "grad_norm": 0.06652986258268356, + "learning_rate": 1.7525206326737264e-06, + "loss": 2.5591, + "step": 32834 + }, + { + "epoch": 0.9736678231474069, + "grad_norm": 0.06398218125104904, + "learning_rate": 1.7485868923352577e-06, + "loss": 2.5376, + "step": 32835 + }, + { + "epoch": 0.9736974764997183, + "grad_norm": 0.06301862746477127, + "learning_rate": 1.7446575640986395e-06, + "loss": 2.555, + "step": 32836 + }, + { + "epoch": 0.9737271298520298, + "grad_norm": 0.06649851053953171, + "learning_rate": 1.7407326479985664e-06, + "loss": 2.5575, + "step": 32837 + }, + { + "epoch": 0.9737567832043412, + "grad_norm": 0.06688159704208374, + "learning_rate": 1.7368121440698436e-06, + "loss": 2.5257, + "step": 32838 + }, + { + "epoch": 0.9737864365566528, + "grad_norm": 0.0690101757645607, + "learning_rate": 1.7328960523471105e-06, + "loss": 2.6056, + "step": 32839 + }, + { + "epoch": 0.9738160899089642, + "grad_norm": 0.066341832280159, + "learning_rate": 1.728984372865172e-06, + "loss": 2.5622, + "step": 32840 + }, + { + "epoch": 0.9738457432612757, + "grad_norm": 0.06881941109895706, + "learning_rate": 1.7250771056586122e-06, + "loss": 2.5494, + "step": 32841 + }, + { + "epoch": 0.9738753966135871, + "grad_norm": 0.06375616788864136, + "learning_rate": 1.7211742507619588e-06, + "loss": 2.5687, + "step": 32842 + }, + { + "epoch": 0.9739050499658987, + "grad_norm": 0.06741264462471008, + "learning_rate": 1.717275808209906e-06, + "loss": 2.5623, + "step": 32843 + }, + { + "epoch": 0.9739347033182101, + "grad_norm": 0.06542441993951797, + "learning_rate": 1.713381778036871e-06, + "loss": 2.573, + "step": 32844 + }, + { + "epoch": 0.9739643566705216, + "grad_norm": 0.07099318504333496, + "learning_rate": 1.7094921602773817e-06, + "loss": 2.5639, + "step": 32845 + }, + { + "epoch": 0.973994010022833, + "grad_norm": 0.06877633929252625, + "learning_rate": 1.7056069549658548e-06, + "loss": 2.5595, + "step": 32846 + }, + { + "epoch": 0.9740236633751446, + "grad_norm": 0.0659651830792427, + "learning_rate": 1.701726162136763e-06, + "loss": 2.5601, + "step": 32847 + }, + { + "epoch": 0.974053316727456, + "grad_norm": 0.0655214861035347, + "learning_rate": 1.6978497818243565e-06, + "loss": 2.5542, + "step": 32848 + }, + { + "epoch": 0.9740829700797675, + "grad_norm": 0.06527744233608246, + "learning_rate": 1.6939778140630525e-06, + "loss": 2.5683, + "step": 32849 + }, + { + "epoch": 0.974112623432079, + "grad_norm": 0.06756823509931564, + "learning_rate": 1.6901102588871009e-06, + "loss": 2.5413, + "step": 32850 + }, + { + "epoch": 0.9741422767843905, + "grad_norm": 0.0663314014673233, + "learning_rate": 1.6862471163307525e-06, + "loss": 2.5493, + "step": 32851 + }, + { + "epoch": 0.974171930136702, + "grad_norm": 0.06603579223155975, + "learning_rate": 1.6823883864282018e-06, + "loss": 2.5828, + "step": 32852 + }, + { + "epoch": 0.9742015834890134, + "grad_norm": 0.0688677579164505, + "learning_rate": 1.678534069213644e-06, + "loss": 2.5355, + "step": 32853 + }, + { + "epoch": 0.974231236841325, + "grad_norm": 0.06526076793670654, + "learning_rate": 1.6746841647212185e-06, + "loss": 2.5996, + "step": 32854 + }, + { + "epoch": 0.9742608901936364, + "grad_norm": 0.06692831963300705, + "learning_rate": 1.6708386729849535e-06, + "loss": 2.5526, + "step": 32855 + }, + { + "epoch": 0.9742905435459479, + "grad_norm": 0.06438789516687393, + "learning_rate": 1.6669975940390436e-06, + "loss": 2.5511, + "step": 32856 + }, + { + "epoch": 0.9743201968982593, + "grad_norm": 0.0667467713356018, + "learning_rate": 1.663160927917351e-06, + "loss": 2.6133, + "step": 32857 + }, + { + "epoch": 0.9743498502505709, + "grad_norm": 0.06235224008560181, + "learning_rate": 1.659328674653904e-06, + "loss": 2.5156, + "step": 32858 + }, + { + "epoch": 0.9743795036028823, + "grad_norm": 0.066738560795784, + "learning_rate": 1.655500834282675e-06, + "loss": 2.5953, + "step": 32859 + }, + { + "epoch": 0.9744091569551938, + "grad_norm": 0.06517232209444046, + "learning_rate": 1.6516774068374708e-06, + "loss": 2.5279, + "step": 32860 + }, + { + "epoch": 0.9744388103075052, + "grad_norm": 0.06393733620643616, + "learning_rate": 1.6478583923522638e-06, + "loss": 2.5741, + "step": 32861 + }, + { + "epoch": 0.9744684636598168, + "grad_norm": 0.06517404317855835, + "learning_rate": 1.6440437908607497e-06, + "loss": 2.5599, + "step": 32862 + }, + { + "epoch": 0.9744981170121282, + "grad_norm": 0.0646413266658783, + "learning_rate": 1.6402336023967902e-06, + "loss": 2.5666, + "step": 32863 + }, + { + "epoch": 0.9745277703644397, + "grad_norm": 0.0625796839594841, + "learning_rate": 1.6364278269941357e-06, + "loss": 2.5602, + "step": 32864 + }, + { + "epoch": 0.9745574237167511, + "grad_norm": 0.0649544820189476, + "learning_rate": 1.6326264646864263e-06, + "loss": 2.5524, + "step": 32865 + }, + { + "epoch": 0.9745870770690627, + "grad_norm": 0.06553564220666885, + "learning_rate": 1.6288295155073573e-06, + "loss": 2.5689, + "step": 32866 + }, + { + "epoch": 0.9746167304213741, + "grad_norm": 0.0677337571978569, + "learning_rate": 1.6250369794905129e-06, + "loss": 2.5891, + "step": 32867 + }, + { + "epoch": 0.9746463837736856, + "grad_norm": 0.06308843940496445, + "learning_rate": 1.6212488566695882e-06, + "loss": 2.6014, + "step": 32868 + }, + { + "epoch": 0.974676037125997, + "grad_norm": 0.06686891615390778, + "learning_rate": 1.617465147078001e-06, + "loss": 2.5835, + "step": 32869 + }, + { + "epoch": 0.9747056904783086, + "grad_norm": 0.06357251852750778, + "learning_rate": 1.6136858507493358e-06, + "loss": 2.5399, + "step": 32870 + }, + { + "epoch": 0.9747353438306201, + "grad_norm": 0.06431610882282257, + "learning_rate": 1.60991096771701e-06, + "loss": 2.5355, + "step": 32871 + }, + { + "epoch": 0.9747649971829315, + "grad_norm": 0.0678616464138031, + "learning_rate": 1.6061404980144412e-06, + "loss": 2.5505, + "step": 32872 + }, + { + "epoch": 0.9747946505352431, + "grad_norm": 0.06574209034442902, + "learning_rate": 1.6023744416751029e-06, + "loss": 2.5632, + "step": 32873 + }, + { + "epoch": 0.9748243038875545, + "grad_norm": 0.06619521975517273, + "learning_rate": 1.5986127987322463e-06, + "loss": 2.5864, + "step": 32874 + }, + { + "epoch": 0.974853957239866, + "grad_norm": 0.10200434178113937, + "learning_rate": 1.5948555692192336e-06, + "loss": 2.5785, + "step": 32875 + }, + { + "epoch": 0.9748836105921774, + "grad_norm": 0.064136803150177, + "learning_rate": 1.5911027531693156e-06, + "loss": 2.5481, + "step": 32876 + }, + { + "epoch": 0.974913263944489, + "grad_norm": 0.06758324056863785, + "learning_rate": 1.5873543506157439e-06, + "loss": 2.5744, + "step": 32877 + }, + { + "epoch": 0.9749429172968004, + "grad_norm": 0.06760481745004654, + "learning_rate": 1.583610361591714e-06, + "loss": 2.5777, + "step": 32878 + }, + { + "epoch": 0.9749725706491119, + "grad_norm": 0.06501220911741257, + "learning_rate": 1.5798707861303662e-06, + "loss": 2.5587, + "step": 32879 + }, + { + "epoch": 0.9750022240014233, + "grad_norm": 0.06551988422870636, + "learning_rate": 1.5761356242647851e-06, + "loss": 2.5844, + "step": 32880 + }, + { + "epoch": 0.9750318773537349, + "grad_norm": 0.06564924120903015, + "learning_rate": 1.5724048760281106e-06, + "loss": 2.5638, + "step": 32881 + }, + { + "epoch": 0.9750615307060463, + "grad_norm": 0.06384389102458954, + "learning_rate": 1.5686785414533167e-06, + "loss": 2.5717, + "step": 32882 + }, + { + "epoch": 0.9750911840583578, + "grad_norm": 0.06633290648460388, + "learning_rate": 1.5649566205734877e-06, + "loss": 2.5791, + "step": 32883 + }, + { + "epoch": 0.9751208374106692, + "grad_norm": 0.06811858713626862, + "learning_rate": 1.561239113421431e-06, + "loss": 2.5717, + "step": 32884 + }, + { + "epoch": 0.9751504907629808, + "grad_norm": 0.06597746908664703, + "learning_rate": 1.557526020030231e-06, + "loss": 2.5306, + "step": 32885 + }, + { + "epoch": 0.9751801441152922, + "grad_norm": 0.06651575118303299, + "learning_rate": 1.5538173404326395e-06, + "loss": 2.5605, + "step": 32886 + }, + { + "epoch": 0.9752097974676037, + "grad_norm": 0.06710577756166458, + "learning_rate": 1.55011307466163e-06, + "loss": 2.5546, + "step": 32887 + }, + { + "epoch": 0.9752394508199151, + "grad_norm": 0.06674075871706009, + "learning_rate": 1.546413222749843e-06, + "loss": 2.5656, + "step": 32888 + }, + { + "epoch": 0.9752691041722267, + "grad_norm": 0.06709407269954681, + "learning_rate": 1.542717784730141e-06, + "loss": 2.5978, + "step": 32889 + }, + { + "epoch": 0.9752987575245381, + "grad_norm": 0.06720326840877533, + "learning_rate": 1.5390267606352759e-06, + "loss": 2.5849, + "step": 32890 + }, + { + "epoch": 0.9753284108768496, + "grad_norm": 0.06899365037679672, + "learning_rate": 1.5353401504978882e-06, + "loss": 2.548, + "step": 32891 + }, + { + "epoch": 0.9753580642291612, + "grad_norm": 0.06808553636074066, + "learning_rate": 1.5316579543505626e-06, + "loss": 2.5712, + "step": 32892 + }, + { + "epoch": 0.9753877175814726, + "grad_norm": 0.06670770049095154, + "learning_rate": 1.5279801722259957e-06, + "loss": 2.555, + "step": 32893 + }, + { + "epoch": 0.9754173709337841, + "grad_norm": 0.06628862768411636, + "learning_rate": 1.5243068041567721e-06, + "loss": 2.5282, + "step": 32894 + }, + { + "epoch": 0.9754470242860955, + "grad_norm": 0.06827712059020996, + "learning_rate": 1.5206378501753104e-06, + "loss": 2.539, + "step": 32895 + }, + { + "epoch": 0.9754766776384071, + "grad_norm": 0.06566794216632843, + "learning_rate": 1.5169733103141959e-06, + "loss": 2.5549, + "step": 32896 + }, + { + "epoch": 0.9755063309907185, + "grad_norm": 0.06451178342103958, + "learning_rate": 1.5133131846058468e-06, + "loss": 2.615, + "step": 32897 + }, + { + "epoch": 0.97553598434303, + "grad_norm": 0.06629906594753265, + "learning_rate": 1.5096574730826818e-06, + "loss": 2.5268, + "step": 32898 + }, + { + "epoch": 0.9755656376953414, + "grad_norm": 0.06504135578870773, + "learning_rate": 1.5060061757770082e-06, + "loss": 2.5679, + "step": 32899 + }, + { + "epoch": 0.975595291047653, + "grad_norm": 0.06484083831310272, + "learning_rate": 1.5023592927213004e-06, + "loss": 2.6099, + "step": 32900 + }, + { + "epoch": 0.9756249443999644, + "grad_norm": 0.06889403611421585, + "learning_rate": 1.4987168239476989e-06, + "loss": 2.6129, + "step": 32901 + }, + { + "epoch": 0.9756545977522759, + "grad_norm": 0.06495437026023865, + "learning_rate": 1.495078769488567e-06, + "loss": 2.5769, + "step": 32902 + }, + { + "epoch": 0.9756842511045873, + "grad_norm": 0.0681530088186264, + "learning_rate": 1.4914451293760455e-06, + "loss": 2.581, + "step": 32903 + }, + { + "epoch": 0.9757139044568989, + "grad_norm": 0.06535040587186813, + "learning_rate": 1.4878159036423866e-06, + "loss": 2.6082, + "step": 32904 + }, + { + "epoch": 0.9757435578092103, + "grad_norm": 0.06436018645763397, + "learning_rate": 1.48419109231962e-06, + "loss": 2.5719, + "step": 32905 + }, + { + "epoch": 0.9757732111615218, + "grad_norm": 0.06562358886003494, + "learning_rate": 1.4805706954399978e-06, + "loss": 2.5476, + "step": 32906 + }, + { + "epoch": 0.9758028645138332, + "grad_norm": 0.06470037996768951, + "learning_rate": 1.4769547130354387e-06, + "loss": 2.5434, + "step": 32907 + }, + { + "epoch": 0.9758325178661448, + "grad_norm": 0.06768927723169327, + "learning_rate": 1.4733431451380285e-06, + "loss": 2.5877, + "step": 32908 + }, + { + "epoch": 0.9758621712184562, + "grad_norm": 0.06490480154752731, + "learning_rate": 1.4697359917797416e-06, + "loss": 2.5659, + "step": 32909 + }, + { + "epoch": 0.9758918245707677, + "grad_norm": 0.064644955098629, + "learning_rate": 1.4661332529924964e-06, + "loss": 2.581, + "step": 32910 + }, + { + "epoch": 0.9759214779230792, + "grad_norm": 0.06636414676904678, + "learning_rate": 1.4625349288082124e-06, + "loss": 2.5578, + "step": 32911 + }, + { + "epoch": 0.9759511312753907, + "grad_norm": 0.0668349489569664, + "learning_rate": 1.4589410192587526e-06, + "loss": 2.5801, + "step": 32912 + }, + { + "epoch": 0.9759807846277022, + "grad_norm": 0.0654558464884758, + "learning_rate": 1.4553515243759808e-06, + "loss": 2.5629, + "step": 32913 + }, + { + "epoch": 0.9760104379800136, + "grad_norm": 0.06777022778987885, + "learning_rate": 1.4517664441915934e-06, + "loss": 2.5815, + "step": 32914 + }, + { + "epoch": 0.9760400913323252, + "grad_norm": 0.06264875084161758, + "learning_rate": 1.4481857787374542e-06, + "loss": 2.539, + "step": 32915 + }, + { + "epoch": 0.9760697446846366, + "grad_norm": 0.07129481434822083, + "learning_rate": 1.4446095280451487e-06, + "loss": 2.5713, + "step": 32916 + }, + { + "epoch": 0.9760993980369481, + "grad_norm": 0.06712870299816132, + "learning_rate": 1.441037692146374e-06, + "loss": 2.5331, + "step": 32917 + }, + { + "epoch": 0.9761290513892595, + "grad_norm": 0.06426510214805603, + "learning_rate": 1.4374702710728826e-06, + "loss": 2.5776, + "step": 32918 + }, + { + "epoch": 0.9761587047415711, + "grad_norm": 0.065715491771698, + "learning_rate": 1.4339072648560935e-06, + "loss": 2.5639, + "step": 32919 + }, + { + "epoch": 0.9761883580938825, + "grad_norm": 0.06573368608951569, + "learning_rate": 1.4303486735276483e-06, + "loss": 2.5565, + "step": 32920 + }, + { + "epoch": 0.976218011446194, + "grad_norm": 0.06563443690538406, + "learning_rate": 1.4267944971190773e-06, + "loss": 2.6207, + "step": 32921 + }, + { + "epoch": 0.9762476647985054, + "grad_norm": 0.06417366862297058, + "learning_rate": 1.4232447356617995e-06, + "loss": 2.6081, + "step": 32922 + }, + { + "epoch": 0.976277318150817, + "grad_norm": 0.06537650525569916, + "learning_rate": 1.41969938918729e-06, + "loss": 2.5566, + "step": 32923 + }, + { + "epoch": 0.9763069715031284, + "grad_norm": 0.06735067069530487, + "learning_rate": 1.416158457726857e-06, + "loss": 2.5598, + "step": 32924 + }, + { + "epoch": 0.9763366248554399, + "grad_norm": 0.0656621977686882, + "learning_rate": 1.4126219413119757e-06, + "loss": 2.5346, + "step": 32925 + }, + { + "epoch": 0.9763662782077513, + "grad_norm": 0.06515751779079437, + "learning_rate": 1.4090898399738982e-06, + "loss": 2.5915, + "step": 32926 + }, + { + "epoch": 0.9763959315600629, + "grad_norm": 0.0626564472913742, + "learning_rate": 1.4055621537439333e-06, + "loss": 2.5835, + "step": 32927 + }, + { + "epoch": 0.9764255849123743, + "grad_norm": 0.06612804532051086, + "learning_rate": 1.402038882653278e-06, + "loss": 2.5483, + "step": 32928 + }, + { + "epoch": 0.9764552382646858, + "grad_norm": 0.06605318188667297, + "learning_rate": 1.39852002673313e-06, + "loss": 2.5979, + "step": 32929 + }, + { + "epoch": 0.9764848916169973, + "grad_norm": 0.068308524787426, + "learning_rate": 1.395005586014686e-06, + "loss": 2.5511, + "step": 32930 + }, + { + "epoch": 0.9765145449693088, + "grad_norm": 0.06513367593288422, + "learning_rate": 1.3914955605290326e-06, + "loss": 2.5785, + "step": 32931 + }, + { + "epoch": 0.9765441983216202, + "grad_norm": 0.0658014789223671, + "learning_rate": 1.3879899503073112e-06, + "loss": 2.5692, + "step": 32932 + }, + { + "epoch": 0.9765738516739317, + "grad_norm": 0.06577739119529724, + "learning_rate": 1.3844887553804974e-06, + "loss": 2.5588, + "step": 32933 + }, + { + "epoch": 0.9766035050262433, + "grad_norm": 0.06632503122091293, + "learning_rate": 1.3809919757796218e-06, + "loss": 2.5714, + "step": 32934 + }, + { + "epoch": 0.9766331583785547, + "grad_norm": 0.06494086235761642, + "learning_rate": 1.3774996115356597e-06, + "loss": 2.5861, + "step": 32935 + }, + { + "epoch": 0.9766628117308662, + "grad_norm": 0.0648130550980568, + "learning_rate": 1.3740116626795308e-06, + "loss": 2.5463, + "step": 32936 + }, + { + "epoch": 0.9766924650831776, + "grad_norm": 0.06688259541988373, + "learning_rate": 1.3705281292421546e-06, + "loss": 2.5421, + "step": 32937 + }, + { + "epoch": 0.9767221184354892, + "grad_norm": 0.06715700775384903, + "learning_rate": 1.3670490112542844e-06, + "loss": 2.5519, + "step": 32938 + }, + { + "epoch": 0.9767517717878006, + "grad_norm": 0.06424687802791595, + "learning_rate": 1.3635743087467844e-06, + "loss": 2.5594, + "step": 32939 + }, + { + "epoch": 0.9767814251401121, + "grad_norm": 0.06861142069101334, + "learning_rate": 1.3601040217504633e-06, + "loss": 2.5581, + "step": 32940 + }, + { + "epoch": 0.9768110784924235, + "grad_norm": 0.06524945050477982, + "learning_rate": 1.3566381502959634e-06, + "loss": 2.5773, + "step": 32941 + }, + { + "epoch": 0.9768407318447351, + "grad_norm": 0.06973113864660263, + "learning_rate": 1.3531766944140377e-06, + "loss": 2.5589, + "step": 32942 + }, + { + "epoch": 0.9768703851970465, + "grad_norm": 0.06595605611801147, + "learning_rate": 1.3497196541353284e-06, + "loss": 2.5706, + "step": 32943 + }, + { + "epoch": 0.976900038549358, + "grad_norm": 0.0668349415063858, + "learning_rate": 1.3462670294904777e-06, + "loss": 2.5804, + "step": 32944 + }, + { + "epoch": 0.9769296919016695, + "grad_norm": 0.06515438109636307, + "learning_rate": 1.3428188205099612e-06, + "loss": 2.5765, + "step": 32945 + }, + { + "epoch": 0.976959345253981, + "grad_norm": 0.06645681709051132, + "learning_rate": 1.339375027224421e-06, + "loss": 2.5871, + "step": 32946 + }, + { + "epoch": 0.9769889986062924, + "grad_norm": 0.06502003967761993, + "learning_rate": 1.3359356496642771e-06, + "loss": 2.5458, + "step": 32947 + }, + { + "epoch": 0.9770186519586039, + "grad_norm": 0.06578866392374039, + "learning_rate": 1.332500687860061e-06, + "loss": 2.5494, + "step": 32948 + }, + { + "epoch": 0.9770483053109154, + "grad_norm": 0.06465080380439758, + "learning_rate": 1.329070141842137e-06, + "loss": 2.5544, + "step": 32949 + }, + { + "epoch": 0.9770779586632269, + "grad_norm": 0.06394084542989731, + "learning_rate": 1.3256440116408697e-06, + "loss": 2.5808, + "step": 32950 + }, + { + "epoch": 0.9771076120155383, + "grad_norm": 0.06545088440179825, + "learning_rate": 1.3222222972866238e-06, + "loss": 2.5441, + "step": 32951 + }, + { + "epoch": 0.9771372653678498, + "grad_norm": 0.0629899874329567, + "learning_rate": 1.3188049988097083e-06, + "loss": 2.5452, + "step": 32952 + }, + { + "epoch": 0.9771669187201613, + "grad_norm": 0.06895525753498077, + "learning_rate": 1.315392116240377e-06, + "loss": 2.6206, + "step": 32953 + }, + { + "epoch": 0.9771965720724728, + "grad_norm": 0.06862680613994598, + "learning_rate": 1.3119836496088278e-06, + "loss": 2.567, + "step": 32954 + }, + { + "epoch": 0.9772262254247843, + "grad_norm": 0.06513593345880508, + "learning_rate": 1.3085795989452587e-06, + "loss": 2.5643, + "step": 32955 + }, + { + "epoch": 0.9772558787770957, + "grad_norm": 0.06587593257427216, + "learning_rate": 1.3051799642798124e-06, + "loss": 2.574, + "step": 32956 + }, + { + "epoch": 0.9772855321294073, + "grad_norm": 0.06842945516109467, + "learning_rate": 1.3017847456426312e-06, + "loss": 2.5489, + "step": 32957 + }, + { + "epoch": 0.9773151854817187, + "grad_norm": 0.06504625827074051, + "learning_rate": 1.298393943063747e-06, + "loss": 2.5715, + "step": 32958 + }, + { + "epoch": 0.9773448388340302, + "grad_norm": 0.06739939004182816, + "learning_rate": 1.2950075565731357e-06, + "loss": 2.553, + "step": 32959 + }, + { + "epoch": 0.9773744921863416, + "grad_norm": 0.06590668857097626, + "learning_rate": 1.2916255862008841e-06, + "loss": 2.5927, + "step": 32960 + }, + { + "epoch": 0.9774041455386532, + "grad_norm": 0.0674716979265213, + "learning_rate": 1.2882480319768575e-06, + "loss": 2.588, + "step": 32961 + }, + { + "epoch": 0.9774337988909646, + "grad_norm": 0.06513083726167679, + "learning_rate": 1.2848748939310318e-06, + "loss": 2.5616, + "step": 32962 + }, + { + "epoch": 0.9774634522432761, + "grad_norm": 0.06697595119476318, + "learning_rate": 1.2815061720932165e-06, + "loss": 2.5655, + "step": 32963 + }, + { + "epoch": 0.9774931055955876, + "grad_norm": 0.06512903422117233, + "learning_rate": 1.2781418664932765e-06, + "loss": 2.5791, + "step": 32964 + }, + { + "epoch": 0.9775227589478991, + "grad_norm": 0.0679435282945633, + "learning_rate": 1.2747819771610214e-06, + "loss": 2.5419, + "step": 32965 + }, + { + "epoch": 0.9775524123002105, + "grad_norm": 0.06640499085187912, + "learning_rate": 1.2714265041260942e-06, + "loss": 2.564, + "step": 32966 + }, + { + "epoch": 0.977582065652522, + "grad_norm": 0.06403260678052902, + "learning_rate": 1.2680754474183597e-06, + "loss": 2.5556, + "step": 32967 + }, + { + "epoch": 0.9776117190048335, + "grad_norm": 0.06632182002067566, + "learning_rate": 1.2647288070674058e-06, + "loss": 2.5598, + "step": 32968 + }, + { + "epoch": 0.977641372357145, + "grad_norm": 0.0653466135263443, + "learning_rate": 1.2613865831028747e-06, + "loss": 2.571, + "step": 32969 + }, + { + "epoch": 0.9776710257094564, + "grad_norm": 0.06642672419548035, + "learning_rate": 1.25804877555441e-06, + "loss": 2.5789, + "step": 32970 + }, + { + "epoch": 0.9777006790617679, + "grad_norm": 0.06701095402240753, + "learning_rate": 1.254715384451488e-06, + "loss": 2.572, + "step": 32971 + }, + { + "epoch": 0.9777303324140794, + "grad_norm": 0.0681261196732521, + "learning_rate": 1.251386409823696e-06, + "loss": 2.5586, + "step": 32972 + }, + { + "epoch": 0.9777599857663909, + "grad_norm": 0.06813526153564453, + "learning_rate": 1.2480618517005104e-06, + "loss": 2.5637, + "step": 32973 + }, + { + "epoch": 0.9777896391187023, + "grad_norm": 0.06755006313323975, + "learning_rate": 1.2447417101112968e-06, + "loss": 2.5805, + "step": 32974 + }, + { + "epoch": 0.9778192924710138, + "grad_norm": 0.06534404307603836, + "learning_rate": 1.2414259850855315e-06, + "loss": 2.5672, + "step": 32975 + }, + { + "epoch": 0.9778489458233254, + "grad_norm": 0.06358876079320908, + "learning_rate": 1.2381146766525243e-06, + "loss": 2.5812, + "step": 32976 + }, + { + "epoch": 0.9778785991756368, + "grad_norm": 0.06671200692653656, + "learning_rate": 1.2348077848416406e-06, + "loss": 2.5803, + "step": 32977 + }, + { + "epoch": 0.9779082525279483, + "grad_norm": 0.06553932279348373, + "learning_rate": 1.2315053096821905e-06, + "loss": 2.5319, + "step": 32978 + }, + { + "epoch": 0.9779379058802598, + "grad_norm": 0.0663212388753891, + "learning_rate": 1.228207251203317e-06, + "loss": 2.5301, + "step": 32979 + }, + { + "epoch": 0.9779675592325713, + "grad_norm": 0.06511322408914566, + "learning_rate": 1.224913609434275e-06, + "loss": 2.5732, + "step": 32980 + }, + { + "epoch": 0.9779972125848827, + "grad_norm": 0.068025141954422, + "learning_rate": 1.221624384404263e-06, + "loss": 2.5705, + "step": 32981 + }, + { + "epoch": 0.9780268659371942, + "grad_norm": 0.0631437748670578, + "learning_rate": 1.2183395761423687e-06, + "loss": 2.5833, + "step": 32982 + }, + { + "epoch": 0.9780565192895057, + "grad_norm": 0.06458550691604614, + "learning_rate": 1.2150591846776803e-06, + "loss": 2.5875, + "step": 32983 + }, + { + "epoch": 0.9780861726418172, + "grad_norm": 0.06745290756225586, + "learning_rate": 1.2117832100392857e-06, + "loss": 2.5854, + "step": 32984 + }, + { + "epoch": 0.9781158259941286, + "grad_norm": 0.06775261461734772, + "learning_rate": 1.2085116522561613e-06, + "loss": 2.5522, + "step": 32985 + }, + { + "epoch": 0.9781454793464401, + "grad_norm": 0.06798026710748672, + "learning_rate": 1.2052445113572841e-06, + "loss": 2.5592, + "step": 32986 + }, + { + "epoch": 0.9781751326987516, + "grad_norm": 0.06765559315681458, + "learning_rate": 1.2019817873715756e-06, + "loss": 2.5563, + "step": 32987 + }, + { + "epoch": 0.9782047860510631, + "grad_norm": 0.06387779116630554, + "learning_rate": 1.1987234803279567e-06, + "loss": 2.5464, + "step": 32988 + }, + { + "epoch": 0.9782344394033745, + "grad_norm": 0.0655176192522049, + "learning_rate": 1.1954695902552382e-06, + "loss": 2.5356, + "step": 32989 + }, + { + "epoch": 0.978264092755686, + "grad_norm": 0.06884028017520905, + "learning_rate": 1.19222011718223e-06, + "loss": 2.5862, + "step": 32990 + }, + { + "epoch": 0.9782937461079975, + "grad_norm": 0.06706012040376663, + "learning_rate": 1.188975061137798e-06, + "loss": 2.579, + "step": 32991 + }, + { + "epoch": 0.978323399460309, + "grad_norm": 0.06486520171165466, + "learning_rate": 1.1857344221505305e-06, + "loss": 2.5346, + "step": 32992 + }, + { + "epoch": 0.9783530528126204, + "grad_norm": 0.06758996844291687, + "learning_rate": 1.1824982002492934e-06, + "loss": 2.5754, + "step": 32993 + }, + { + "epoch": 0.978382706164932, + "grad_norm": 0.0663626492023468, + "learning_rate": 1.1792663954625637e-06, + "loss": 2.5519, + "step": 32994 + }, + { + "epoch": 0.9784123595172435, + "grad_norm": 0.06601449102163315, + "learning_rate": 1.1760390078190963e-06, + "loss": 2.5366, + "step": 32995 + }, + { + "epoch": 0.9784420128695549, + "grad_norm": 0.06498986482620239, + "learning_rate": 1.172816037347424e-06, + "loss": 2.5837, + "step": 32996 + }, + { + "epoch": 0.9784716662218664, + "grad_norm": 0.06606219708919525, + "learning_rate": 1.1695974840760792e-06, + "loss": 2.5806, + "step": 32997 + }, + { + "epoch": 0.9785013195741779, + "grad_norm": 0.0663689523935318, + "learning_rate": 1.166383348033595e-06, + "loss": 2.5408, + "step": 32998 + }, + { + "epoch": 0.9785309729264894, + "grad_norm": 0.06584449857473373, + "learning_rate": 1.1631736292484484e-06, + "loss": 2.5352, + "step": 32999 + }, + { + "epoch": 0.9785606262788008, + "grad_norm": 0.06564918160438538, + "learning_rate": 1.1599683277489502e-06, + "loss": 2.5757, + "step": 33000 + }, + { + "epoch": 0.9785902796311123, + "grad_norm": 0.06608521193265915, + "learning_rate": 1.1567674435635779e-06, + "loss": 2.5731, + "step": 33001 + }, + { + "epoch": 0.9786199329834238, + "grad_norm": 0.06363163888454437, + "learning_rate": 1.1535709767206415e-06, + "loss": 2.5577, + "step": 33002 + }, + { + "epoch": 0.9786495863357353, + "grad_norm": 0.06713895499706268, + "learning_rate": 1.1503789272485077e-06, + "loss": 2.5754, + "step": 33003 + }, + { + "epoch": 0.9786792396880467, + "grad_norm": 0.06598188728094101, + "learning_rate": 1.147191295175376e-06, + "loss": 2.5784, + "step": 33004 + }, + { + "epoch": 0.9787088930403582, + "grad_norm": 0.06341167539358139, + "learning_rate": 1.1440080805294463e-06, + "loss": 2.582, + "step": 33005 + }, + { + "epoch": 0.9787385463926697, + "grad_norm": 0.0649445503950119, + "learning_rate": 1.1408292833389732e-06, + "loss": 2.5463, + "step": 33006 + }, + { + "epoch": 0.9787681997449812, + "grad_norm": 0.06501712650060654, + "learning_rate": 1.1376549036321016e-06, + "loss": 2.5477, + "step": 33007 + }, + { + "epoch": 0.9787978530972926, + "grad_norm": 0.0650823786854744, + "learning_rate": 1.1344849414369195e-06, + "loss": 2.5534, + "step": 33008 + }, + { + "epoch": 0.9788275064496041, + "grad_norm": 0.06570973247289658, + "learning_rate": 1.1313193967815161e-06, + "loss": 2.5752, + "step": 33009 + }, + { + "epoch": 0.9788571598019156, + "grad_norm": 0.06695343554019928, + "learning_rate": 1.128158269693924e-06, + "loss": 2.5913, + "step": 33010 + }, + { + "epoch": 0.9788868131542271, + "grad_norm": 0.06616955995559692, + "learning_rate": 1.1250015602020658e-06, + "loss": 2.5754, + "step": 33011 + }, + { + "epoch": 0.9789164665065385, + "grad_norm": 0.06708650290966034, + "learning_rate": 1.1218492683339743e-06, + "loss": 2.5653, + "step": 33012 + }, + { + "epoch": 0.97894611985885, + "grad_norm": 0.06513667851686478, + "learning_rate": 1.1187013941175717e-06, + "loss": 2.5481, + "step": 33013 + }, + { + "epoch": 0.9789757732111615, + "grad_norm": 0.06556485593318939, + "learning_rate": 1.115557937580669e-06, + "loss": 2.5771, + "step": 33014 + }, + { + "epoch": 0.979005426563473, + "grad_norm": 0.06760363280773163, + "learning_rate": 1.112418898751133e-06, + "loss": 2.5533, + "step": 33015 + }, + { + "epoch": 0.9790350799157845, + "grad_norm": 0.06553855538368225, + "learning_rate": 1.1092842776567747e-06, + "loss": 2.5473, + "step": 33016 + }, + { + "epoch": 0.979064733268096, + "grad_norm": 0.06764544546604156, + "learning_rate": 1.106154074325294e-06, + "loss": 2.5757, + "step": 33017 + }, + { + "epoch": 0.9790943866204075, + "grad_norm": 0.06486243009567261, + "learning_rate": 1.1030282887845022e-06, + "loss": 2.5232, + "step": 33018 + }, + { + "epoch": 0.9791240399727189, + "grad_norm": 0.06328756362199783, + "learning_rate": 1.0999069210619883e-06, + "loss": 2.5458, + "step": 33019 + }, + { + "epoch": 0.9791536933250304, + "grad_norm": 0.06513170897960663, + "learning_rate": 1.0967899711854523e-06, + "loss": 2.5854, + "step": 33020 + }, + { + "epoch": 0.9791833466773419, + "grad_norm": 0.06800252199172974, + "learning_rate": 1.0936774391824833e-06, + "loss": 2.5966, + "step": 33021 + }, + { + "epoch": 0.9792130000296534, + "grad_norm": 0.06488428264856339, + "learning_rate": 1.0905693250806703e-06, + "loss": 2.5509, + "step": 33022 + }, + { + "epoch": 0.9792426533819648, + "grad_norm": 0.06596768647432327, + "learning_rate": 1.087465628907436e-06, + "loss": 2.5394, + "step": 33023 + }, + { + "epoch": 0.9792723067342763, + "grad_norm": 0.06376770883798599, + "learning_rate": 1.084366350690369e-06, + "loss": 2.5702, + "step": 33024 + }, + { + "epoch": 0.9793019600865878, + "grad_norm": 0.07001122832298279, + "learning_rate": 1.0812714904568922e-06, + "loss": 2.5666, + "step": 33025 + }, + { + "epoch": 0.9793316134388993, + "grad_norm": 0.0645519495010376, + "learning_rate": 1.0781810482343723e-06, + "loss": 2.5806, + "step": 33026 + }, + { + "epoch": 0.9793612667912107, + "grad_norm": 0.06691298633813858, + "learning_rate": 1.0750950240501766e-06, + "loss": 2.5944, + "step": 33027 + }, + { + "epoch": 0.9793909201435222, + "grad_norm": 0.06531642377376556, + "learning_rate": 1.072013417931672e-06, + "loss": 2.5505, + "step": 33028 + }, + { + "epoch": 0.9794205734958337, + "grad_norm": 0.062806136906147, + "learning_rate": 1.0689362299061144e-06, + "loss": 2.5218, + "step": 33029 + }, + { + "epoch": 0.9794502268481452, + "grad_norm": 0.0623953677713871, + "learning_rate": 1.0658634600008155e-06, + "loss": 2.5545, + "step": 33030 + }, + { + "epoch": 0.9794798802004566, + "grad_norm": 0.0641431212425232, + "learning_rate": 1.0627951082428643e-06, + "loss": 2.5622, + "step": 33031 + }, + { + "epoch": 0.9795095335527682, + "grad_norm": 0.06523184478282928, + "learning_rate": 1.059731174659573e-06, + "loss": 2.5434, + "step": 33032 + }, + { + "epoch": 0.9795391869050796, + "grad_norm": 0.07076912373304367, + "learning_rate": 1.0566716592779746e-06, + "loss": 2.5369, + "step": 33033 + }, + { + "epoch": 0.9795688402573911, + "grad_norm": 0.06591065227985382, + "learning_rate": 1.0536165621251592e-06, + "loss": 2.5219, + "step": 33034 + }, + { + "epoch": 0.9795984936097025, + "grad_norm": 0.06550372391939163, + "learning_rate": 1.0505658832282715e-06, + "loss": 2.5712, + "step": 33035 + }, + { + "epoch": 0.9796281469620141, + "grad_norm": 0.06505817174911499, + "learning_rate": 1.0475196226142348e-06, + "loss": 2.5438, + "step": 33036 + }, + { + "epoch": 0.9796578003143256, + "grad_norm": 0.06864786893129349, + "learning_rate": 1.044477780310027e-06, + "loss": 2.5318, + "step": 33037 + }, + { + "epoch": 0.979687453666637, + "grad_norm": 0.06683243811130524, + "learning_rate": 1.0414403563426822e-06, + "loss": 2.5956, + "step": 33038 + }, + { + "epoch": 0.9797171070189485, + "grad_norm": 0.06664832681417465, + "learning_rate": 1.038407350738957e-06, + "loss": 2.5536, + "step": 33039 + }, + { + "epoch": 0.97974676037126, + "grad_norm": 0.0661354586482048, + "learning_rate": 1.0353787635258294e-06, + "loss": 2.5115, + "step": 33040 + }, + { + "epoch": 0.9797764137235715, + "grad_norm": 0.06475352495908737, + "learning_rate": 1.032354594730056e-06, + "loss": 2.5851, + "step": 33041 + }, + { + "epoch": 0.9798060670758829, + "grad_norm": 0.06992625445127487, + "learning_rate": 1.0293348443784488e-06, + "loss": 2.5592, + "step": 33042 + }, + { + "epoch": 0.9798357204281944, + "grad_norm": 0.06724417954683304, + "learning_rate": 1.0263195124977088e-06, + "loss": 2.6132, + "step": 33043 + }, + { + "epoch": 0.9798653737805059, + "grad_norm": 0.06607294827699661, + "learning_rate": 1.0233085991145364e-06, + "loss": 2.5825, + "step": 33044 + }, + { + "epoch": 0.9798950271328174, + "grad_norm": 0.06808653473854065, + "learning_rate": 1.0203021042556326e-06, + "loss": 2.5549, + "step": 33045 + }, + { + "epoch": 0.9799246804851288, + "grad_norm": 0.0657832995057106, + "learning_rate": 1.017300027947643e-06, + "loss": 2.5626, + "step": 33046 + }, + { + "epoch": 0.9799543338374404, + "grad_norm": 0.06557288765907288, + "learning_rate": 1.0143023702170461e-06, + "loss": 2.5906, + "step": 33047 + }, + { + "epoch": 0.9799839871897518, + "grad_norm": 0.06909073889255524, + "learning_rate": 1.0113091310904876e-06, + "loss": 2.5043, + "step": 33048 + }, + { + "epoch": 0.9800136405420633, + "grad_norm": 0.06470591574907303, + "learning_rate": 1.0083203105943905e-06, + "loss": 2.5466, + "step": 33049 + }, + { + "epoch": 0.9800432938943747, + "grad_norm": 0.06550704687833786, + "learning_rate": 1.0053359087553448e-06, + "loss": 2.5779, + "step": 33050 + }, + { + "epoch": 0.9800729472466863, + "grad_norm": 0.06576711684465408, + "learning_rate": 1.0023559255996627e-06, + "loss": 2.5601, + "step": 33051 + }, + { + "epoch": 0.9801026005989977, + "grad_norm": 0.0663793534040451, + "learning_rate": 9.993803611537677e-07, + "loss": 2.5434, + "step": 33052 + }, + { + "epoch": 0.9801322539513092, + "grad_norm": 0.0649631917476654, + "learning_rate": 9.964092154439719e-07, + "loss": 2.5554, + "step": 33053 + }, + { + "epoch": 0.9801619073036206, + "grad_norm": 0.06542617827653885, + "learning_rate": 9.934424884966987e-07, + "loss": 2.5706, + "step": 33054 + }, + { + "epoch": 0.9801915606559322, + "grad_norm": 0.0663430318236351, + "learning_rate": 9.90480180338149e-07, + "loss": 2.6003, + "step": 33055 + }, + { + "epoch": 0.9802212140082436, + "grad_norm": 0.06511035561561584, + "learning_rate": 9.875222909944692e-07, + "loss": 2.5499, + "step": 33056 + }, + { + "epoch": 0.9802508673605551, + "grad_norm": 0.06546235084533691, + "learning_rate": 9.845688204920267e-07, + "loss": 2.5829, + "step": 33057 + }, + { + "epoch": 0.9802805207128666, + "grad_norm": 0.06668931245803833, + "learning_rate": 9.816197688568007e-07, + "loss": 2.5446, + "step": 33058 + }, + { + "epoch": 0.9803101740651781, + "grad_norm": 0.0659899115562439, + "learning_rate": 9.786751361149926e-07, + "loss": 2.5579, + "step": 33059 + }, + { + "epoch": 0.9803398274174896, + "grad_norm": 0.06454774737358093, + "learning_rate": 9.75734922292748e-07, + "loss": 2.5782, + "step": 33060 + }, + { + "epoch": 0.980369480769801, + "grad_norm": 0.06620538234710693, + "learning_rate": 9.727991274159352e-07, + "loss": 2.5651, + "step": 33061 + }, + { + "epoch": 0.9803991341221125, + "grad_norm": 0.0670568197965622, + "learning_rate": 9.698677515107001e-07, + "loss": 2.543, + "step": 33062 + }, + { + "epoch": 0.980428787474424, + "grad_norm": 0.06746687740087509, + "learning_rate": 9.669407946029663e-07, + "loss": 2.5893, + "step": 33063 + }, + { + "epoch": 0.9804584408267355, + "grad_norm": 0.06831790506839752, + "learning_rate": 9.640182567185463e-07, + "loss": 2.5737, + "step": 33064 + }, + { + "epoch": 0.9804880941790469, + "grad_norm": 0.0649193599820137, + "learning_rate": 9.61100137883475e-07, + "loss": 2.5581, + "step": 33065 + }, + { + "epoch": 0.9805177475313585, + "grad_norm": 0.06466976553201675, + "learning_rate": 9.581864381235094e-07, + "loss": 2.5474, + "step": 33066 + }, + { + "epoch": 0.9805474008836699, + "grad_norm": 0.06551449000835419, + "learning_rate": 9.552771574644625e-07, + "loss": 2.5619, + "step": 33067 + }, + { + "epoch": 0.9805770542359814, + "grad_norm": 0.06598608940839767, + "learning_rate": 9.523722959320913e-07, + "loss": 2.5547, + "step": 33068 + }, + { + "epoch": 0.9806067075882928, + "grad_norm": 0.06605035811662674, + "learning_rate": 9.494718535520974e-07, + "loss": 2.5961, + "step": 33069 + }, + { + "epoch": 0.9806363609406044, + "grad_norm": 0.06590313464403152, + "learning_rate": 9.465758303502381e-07, + "loss": 2.5842, + "step": 33070 + }, + { + "epoch": 0.9806660142929158, + "grad_norm": 0.0657312348484993, + "learning_rate": 9.436842263520484e-07, + "loss": 2.5303, + "step": 33071 + }, + { + "epoch": 0.9806956676452273, + "grad_norm": 0.06764242053031921, + "learning_rate": 9.407970415832301e-07, + "loss": 2.5348, + "step": 33072 + }, + { + "epoch": 0.9807253209975387, + "grad_norm": 0.0651930570602417, + "learning_rate": 9.379142760693183e-07, + "loss": 2.5622, + "step": 33073 + }, + { + "epoch": 0.9807549743498503, + "grad_norm": 0.06903044134378433, + "learning_rate": 9.350359298358479e-07, + "loss": 2.5814, + "step": 33074 + }, + { + "epoch": 0.9807846277021617, + "grad_norm": 0.06779985874891281, + "learning_rate": 9.321620029082989e-07, + "loss": 2.5604, + "step": 33075 + }, + { + "epoch": 0.9808142810544732, + "grad_norm": 0.06442410498857498, + "learning_rate": 9.292924953120951e-07, + "loss": 2.5494, + "step": 33076 + }, + { + "epoch": 0.9808439344067846, + "grad_norm": 0.06549546867609024, + "learning_rate": 9.264274070727163e-07, + "loss": 2.5847, + "step": 33077 + }, + { + "epoch": 0.9808735877590962, + "grad_norm": 0.06335054337978363, + "learning_rate": 9.2356673821542e-07, + "loss": 2.5741, + "step": 33078 + }, + { + "epoch": 0.9809032411114077, + "grad_norm": 0.06442944705486298, + "learning_rate": 9.207104887656859e-07, + "loss": 2.5229, + "step": 33079 + }, + { + "epoch": 0.9809328944637191, + "grad_norm": 0.06733804941177368, + "learning_rate": 9.178586587486603e-07, + "loss": 2.5586, + "step": 33080 + }, + { + "epoch": 0.9809625478160306, + "grad_norm": 0.06666523963212967, + "learning_rate": 9.150112481896567e-07, + "loss": 2.5454, + "step": 33081 + }, + { + "epoch": 0.9809922011683421, + "grad_norm": 0.0655086487531662, + "learning_rate": 9.121682571139323e-07, + "loss": 2.5514, + "step": 33082 + }, + { + "epoch": 0.9810218545206536, + "grad_norm": 0.06720800697803497, + "learning_rate": 9.093296855466338e-07, + "loss": 2.5579, + "step": 33083 + }, + { + "epoch": 0.981051507872965, + "grad_norm": 0.06751366704702377, + "learning_rate": 9.064955335128522e-07, + "loss": 2.5644, + "step": 33084 + }, + { + "epoch": 0.9810811612252766, + "grad_norm": 0.06766928732395172, + "learning_rate": 9.036658010377341e-07, + "loss": 2.558, + "step": 33085 + }, + { + "epoch": 0.981110814577588, + "grad_norm": 0.0646996721625328, + "learning_rate": 9.008404881463705e-07, + "loss": 2.5373, + "step": 33086 + }, + { + "epoch": 0.9811404679298995, + "grad_norm": 0.0674237608909607, + "learning_rate": 8.98019594863686e-07, + "loss": 2.5879, + "step": 33087 + }, + { + "epoch": 0.9811701212822109, + "grad_norm": 0.06581146270036697, + "learning_rate": 8.95203121214716e-07, + "loss": 2.5587, + "step": 33088 + }, + { + "epoch": 0.9811997746345225, + "grad_norm": 0.06952833384275436, + "learning_rate": 8.923910672243851e-07, + "loss": 2.5742, + "step": 33089 + }, + { + "epoch": 0.9812294279868339, + "grad_norm": 0.06467508524656296, + "learning_rate": 8.895834329176177e-07, + "loss": 2.5486, + "step": 33090 + }, + { + "epoch": 0.9812590813391454, + "grad_norm": 0.06609570235013962, + "learning_rate": 8.867802183192275e-07, + "loss": 2.5563, + "step": 33091 + }, + { + "epoch": 0.9812887346914568, + "grad_norm": 0.06691630929708481, + "learning_rate": 8.839814234540833e-07, + "loss": 2.5571, + "step": 33092 + }, + { + "epoch": 0.9813183880437684, + "grad_norm": 0.06447136402130127, + "learning_rate": 8.811870483469985e-07, + "loss": 2.5692, + "step": 33093 + }, + { + "epoch": 0.9813480413960798, + "grad_norm": 0.06469734758138657, + "learning_rate": 8.783970930226204e-07, + "loss": 2.5714, + "step": 33094 + }, + { + "epoch": 0.9813776947483913, + "grad_norm": 0.06925541162490845, + "learning_rate": 8.756115575057066e-07, + "loss": 2.5548, + "step": 33095 + }, + { + "epoch": 0.9814073481007027, + "grad_norm": 0.06796523183584213, + "learning_rate": 8.728304418209598e-07, + "loss": 2.5747, + "step": 33096 + }, + { + "epoch": 0.9814370014530143, + "grad_norm": 0.0647883340716362, + "learning_rate": 8.700537459929714e-07, + "loss": 2.5495, + "step": 33097 + }, + { + "epoch": 0.9814666548053257, + "grad_norm": 0.06398558616638184, + "learning_rate": 8.672814700463328e-07, + "loss": 2.5413, + "step": 33098 + }, + { + "epoch": 0.9814963081576372, + "grad_norm": 0.06696565449237823, + "learning_rate": 8.645136140055798e-07, + "loss": 2.5847, + "step": 33099 + }, + { + "epoch": 0.9815259615099488, + "grad_norm": 0.06590499728918076, + "learning_rate": 8.617501778952486e-07, + "loss": 2.5757, + "step": 33100 + }, + { + "epoch": 0.9815556148622602, + "grad_norm": 0.06794009357690811, + "learning_rate": 8.589911617397639e-07, + "loss": 2.5439, + "step": 33101 + }, + { + "epoch": 0.9815852682145717, + "grad_norm": 0.06664353609085083, + "learning_rate": 8.562365655636062e-07, + "loss": 2.5369, + "step": 33102 + }, + { + "epoch": 0.9816149215668831, + "grad_norm": 0.06810267269611359, + "learning_rate": 8.534863893911449e-07, + "loss": 2.5423, + "step": 33103 + }, + { + "epoch": 0.9816445749191947, + "grad_norm": 0.06714078783988953, + "learning_rate": 8.507406332467494e-07, + "loss": 2.5802, + "step": 33104 + }, + { + "epoch": 0.9816742282715061, + "grad_norm": 0.06674428284168243, + "learning_rate": 8.479992971547334e-07, + "loss": 2.5762, + "step": 33105 + }, + { + "epoch": 0.9817038816238176, + "grad_norm": 0.06469104439020157, + "learning_rate": 8.452623811393557e-07, + "loss": 2.5542, + "step": 33106 + }, + { + "epoch": 0.981733534976129, + "grad_norm": 0.06348352879285812, + "learning_rate": 8.425298852248742e-07, + "loss": 2.5803, + "step": 33107 + }, + { + "epoch": 0.9817631883284406, + "grad_norm": 0.0637778714299202, + "learning_rate": 8.398018094354365e-07, + "loss": 2.5697, + "step": 33108 + }, + { + "epoch": 0.981792841680752, + "grad_norm": 0.06696341186761856, + "learning_rate": 8.370781537952454e-07, + "loss": 2.5414, + "step": 33109 + }, + { + "epoch": 0.9818224950330635, + "grad_norm": 0.07300003618001938, + "learning_rate": 8.343589183283928e-07, + "loss": 2.5648, + "step": 33110 + }, + { + "epoch": 0.9818521483853749, + "grad_norm": 0.06452463567256927, + "learning_rate": 8.31644103059026e-07, + "loss": 2.5671, + "step": 33111 + }, + { + "epoch": 0.9818818017376865, + "grad_norm": 0.0651349425315857, + "learning_rate": 8.289337080110704e-07, + "loss": 2.5831, + "step": 33112 + }, + { + "epoch": 0.9819114550899979, + "grad_norm": 0.06403730809688568, + "learning_rate": 8.262277332086177e-07, + "loss": 2.5404, + "step": 33113 + }, + { + "epoch": 0.9819411084423094, + "grad_norm": 0.0659773126244545, + "learning_rate": 8.235261786755932e-07, + "loss": 2.5526, + "step": 33114 + }, + { + "epoch": 0.9819707617946208, + "grad_norm": 0.06453496217727661, + "learning_rate": 8.208290444359223e-07, + "loss": 2.5442, + "step": 33115 + }, + { + "epoch": 0.9820004151469324, + "grad_norm": 0.06620766967535019, + "learning_rate": 8.181363305134748e-07, + "loss": 2.5529, + "step": 33116 + }, + { + "epoch": 0.9820300684992438, + "grad_norm": 0.0652180016040802, + "learning_rate": 8.15448036932176e-07, + "loss": 2.6034, + "step": 33117 + }, + { + "epoch": 0.9820597218515553, + "grad_norm": 0.0667010024189949, + "learning_rate": 8.127641637157291e-07, + "loss": 2.5652, + "step": 33118 + }, + { + "epoch": 0.9820893752038667, + "grad_norm": 0.06865815073251724, + "learning_rate": 8.100847108879483e-07, + "loss": 2.5745, + "step": 33119 + }, + { + "epoch": 0.9821190285561783, + "grad_norm": 0.06379524618387222, + "learning_rate": 8.074096784725371e-07, + "loss": 2.572, + "step": 33120 + }, + { + "epoch": 0.9821486819084898, + "grad_norm": 0.06646841764450073, + "learning_rate": 8.047390664931986e-07, + "loss": 2.55, + "step": 33121 + }, + { + "epoch": 0.9821783352608012, + "grad_norm": 0.06643673777580261, + "learning_rate": 8.020728749735806e-07, + "loss": 2.5763, + "step": 33122 + }, + { + "epoch": 0.9822079886131128, + "grad_norm": 0.06604956090450287, + "learning_rate": 7.994111039373309e-07, + "loss": 2.5655, + "step": 33123 + }, + { + "epoch": 0.9822376419654242, + "grad_norm": 0.06690956652164459, + "learning_rate": 7.967537534079305e-07, + "loss": 2.5711, + "step": 33124 + }, + { + "epoch": 0.9822672953177357, + "grad_norm": 0.06832510232925415, + "learning_rate": 7.941008234089719e-07, + "loss": 2.5698, + "step": 33125 + }, + { + "epoch": 0.9822969486700471, + "grad_norm": 0.06711482256650925, + "learning_rate": 7.914523139639918e-07, + "loss": 2.6104, + "step": 33126 + }, + { + "epoch": 0.9823266020223587, + "grad_norm": 0.0677337720990181, + "learning_rate": 7.888082250963046e-07, + "loss": 2.5444, + "step": 33127 + }, + { + "epoch": 0.9823562553746701, + "grad_norm": 0.06850205361843109, + "learning_rate": 7.861685568294474e-07, + "loss": 2.5912, + "step": 33128 + }, + { + "epoch": 0.9823859087269816, + "grad_norm": 0.06463252007961273, + "learning_rate": 7.835333091867903e-07, + "loss": 2.5679, + "step": 33129 + }, + { + "epoch": 0.982415562079293, + "grad_norm": 0.06515607237815857, + "learning_rate": 7.809024821916477e-07, + "loss": 2.552, + "step": 33130 + }, + { + "epoch": 0.9824452154316046, + "grad_norm": 0.07086541503667831, + "learning_rate": 7.782760758672236e-07, + "loss": 2.5753, + "step": 33131 + }, + { + "epoch": 0.982474868783916, + "grad_norm": 0.06694390624761581, + "learning_rate": 7.756540902368881e-07, + "loss": 2.5607, + "step": 33132 + }, + { + "epoch": 0.9825045221362275, + "grad_norm": 0.0685129165649414, + "learning_rate": 7.730365253238447e-07, + "loss": 2.5736, + "step": 33133 + }, + { + "epoch": 0.9825341754885389, + "grad_norm": 0.06505315005779266, + "learning_rate": 7.704233811512417e-07, + "loss": 2.5658, + "step": 33134 + }, + { + "epoch": 0.9825638288408505, + "grad_norm": 0.0673728659749031, + "learning_rate": 7.678146577422273e-07, + "loss": 2.5659, + "step": 33135 + }, + { + "epoch": 0.9825934821931619, + "grad_norm": 0.0703076645731926, + "learning_rate": 7.652103551198941e-07, + "loss": 2.583, + "step": 33136 + }, + { + "epoch": 0.9826231355454734, + "grad_norm": 0.0665511041879654, + "learning_rate": 7.626104733073347e-07, + "loss": 2.581, + "step": 33137 + }, + { + "epoch": 0.9826527888977848, + "grad_norm": 0.06708616763353348, + "learning_rate": 7.600150123275861e-07, + "loss": 2.5442, + "step": 33138 + }, + { + "epoch": 0.9826824422500964, + "grad_norm": 0.06599855422973633, + "learning_rate": 7.574239722035747e-07, + "loss": 2.5895, + "step": 33139 + }, + { + "epoch": 0.9827120956024078, + "grad_norm": 0.061958156526088715, + "learning_rate": 7.548373529582264e-07, + "loss": 2.5644, + "step": 33140 + }, + { + "epoch": 0.9827417489547193, + "grad_norm": 0.06543821096420288, + "learning_rate": 7.522551546145229e-07, + "loss": 2.5617, + "step": 33141 + }, + { + "epoch": 0.9827714023070309, + "grad_norm": 0.06777945905923843, + "learning_rate": 7.496773771952792e-07, + "loss": 2.5741, + "step": 33142 + }, + { + "epoch": 0.9828010556593423, + "grad_norm": 0.06436678767204285, + "learning_rate": 7.471040207233659e-07, + "loss": 2.5693, + "step": 33143 + }, + { + "epoch": 0.9828307090116538, + "grad_norm": 0.06506085395812988, + "learning_rate": 7.445350852215427e-07, + "loss": 2.5562, + "step": 33144 + }, + { + "epoch": 0.9828603623639652, + "grad_norm": 0.06615466624498367, + "learning_rate": 7.419705707125135e-07, + "loss": 2.5522, + "step": 33145 + }, + { + "epoch": 0.9828900157162768, + "grad_norm": 0.06353722512722015, + "learning_rate": 7.394104772190935e-07, + "loss": 2.5894, + "step": 33146 + }, + { + "epoch": 0.9829196690685882, + "grad_norm": 0.06452871114015579, + "learning_rate": 7.368548047638202e-07, + "loss": 2.5578, + "step": 33147 + }, + { + "epoch": 0.9829493224208997, + "grad_norm": 0.06781649589538574, + "learning_rate": 7.343035533694531e-07, + "loss": 2.5262, + "step": 33148 + }, + { + "epoch": 0.9829789757732111, + "grad_norm": 0.06551818549633026, + "learning_rate": 7.317567230584743e-07, + "loss": 2.5587, + "step": 33149 + }, + { + "epoch": 0.9830086291255227, + "grad_norm": 0.06510227918624878, + "learning_rate": 7.292143138535323e-07, + "loss": 2.5377, + "step": 33150 + }, + { + "epoch": 0.9830382824778341, + "grad_norm": 0.06814635545015335, + "learning_rate": 7.266763257770537e-07, + "loss": 2.5723, + "step": 33151 + }, + { + "epoch": 0.9830679358301456, + "grad_norm": 0.06673876941204071, + "learning_rate": 7.241427588516314e-07, + "loss": 2.5976, + "step": 33152 + }, + { + "epoch": 0.983097589182457, + "grad_norm": 0.06530553102493286, + "learning_rate": 7.216136130995255e-07, + "loss": 2.53, + "step": 33153 + }, + { + "epoch": 0.9831272425347686, + "grad_norm": 0.066569484770298, + "learning_rate": 7.190888885433289e-07, + "loss": 2.5458, + "step": 33154 + }, + { + "epoch": 0.98315689588708, + "grad_norm": 0.06530934572219849, + "learning_rate": 7.165685852052462e-07, + "loss": 2.5514, + "step": 33155 + }, + { + "epoch": 0.9831865492393915, + "grad_norm": 0.06593272089958191, + "learning_rate": 7.140527031076483e-07, + "loss": 2.5589, + "step": 33156 + }, + { + "epoch": 0.983216202591703, + "grad_norm": 0.06842296570539474, + "learning_rate": 7.115412422727952e-07, + "loss": 2.5946, + "step": 33157 + }, + { + "epoch": 0.9832458559440145, + "grad_norm": 0.06329410523176193, + "learning_rate": 7.090342027230023e-07, + "loss": 2.5639, + "step": 33158 + }, + { + "epoch": 0.9832755092963259, + "grad_norm": 0.06463099271059036, + "learning_rate": 7.065315844803632e-07, + "loss": 2.5673, + "step": 33159 + }, + { + "epoch": 0.9833051626486374, + "grad_norm": 0.06588765233755112, + "learning_rate": 7.040333875671378e-07, + "loss": 2.5306, + "step": 33160 + }, + { + "epoch": 0.9833348160009489, + "grad_norm": 0.06326666474342346, + "learning_rate": 7.015396120053641e-07, + "loss": 2.5773, + "step": 33161 + }, + { + "epoch": 0.9833644693532604, + "grad_norm": 0.06681711971759796, + "learning_rate": 6.990502578171354e-07, + "loss": 2.5704, + "step": 33162 + }, + { + "epoch": 0.9833941227055719, + "grad_norm": 0.06404627114534378, + "learning_rate": 6.965653250246007e-07, + "loss": 2.5681, + "step": 33163 + }, + { + "epoch": 0.9834237760578833, + "grad_norm": 0.06761246919631958, + "learning_rate": 6.940848136496314e-07, + "loss": 2.5731, + "step": 33164 + }, + { + "epoch": 0.9834534294101949, + "grad_norm": 0.06733770668506622, + "learning_rate": 6.916087237142099e-07, + "loss": 2.5726, + "step": 33165 + }, + { + "epoch": 0.9834830827625063, + "grad_norm": 0.06740141659975052, + "learning_rate": 6.891370552403741e-07, + "loss": 2.5531, + "step": 33166 + }, + { + "epoch": 0.9835127361148178, + "grad_norm": 0.06519781798124313, + "learning_rate": 6.866698082498846e-07, + "loss": 2.545, + "step": 33167 + }, + { + "epoch": 0.9835423894671292, + "grad_norm": 0.06398187577724457, + "learning_rate": 6.84206982764668e-07, + "loss": 2.5517, + "step": 33168 + }, + { + "epoch": 0.9835720428194408, + "grad_norm": 0.06425006687641144, + "learning_rate": 6.817485788064847e-07, + "loss": 2.575, + "step": 33169 + }, + { + "epoch": 0.9836016961717522, + "grad_norm": 0.06425358355045319, + "learning_rate": 6.792945963971509e-07, + "loss": 2.5734, + "step": 33170 + }, + { + "epoch": 0.9836313495240637, + "grad_norm": 0.06670694798231125, + "learning_rate": 6.768450355583155e-07, + "loss": 2.5749, + "step": 33171 + }, + { + "epoch": 0.9836610028763751, + "grad_norm": 0.06713972985744476, + "learning_rate": 6.743998963117947e-07, + "loss": 2.5671, + "step": 33172 + }, + { + "epoch": 0.9836906562286867, + "grad_norm": 0.06423826515674591, + "learning_rate": 6.719591786791268e-07, + "loss": 2.5443, + "step": 33173 + }, + { + "epoch": 0.9837203095809981, + "grad_norm": 0.06606751680374146, + "learning_rate": 6.69522882681961e-07, + "loss": 2.5592, + "step": 33174 + }, + { + "epoch": 0.9837499629333096, + "grad_norm": 0.06558600068092346, + "learning_rate": 6.670910083419468e-07, + "loss": 2.5584, + "step": 33175 + }, + { + "epoch": 0.983779616285621, + "grad_norm": 0.06549277156591415, + "learning_rate": 6.646635556804559e-07, + "loss": 2.578, + "step": 33176 + }, + { + "epoch": 0.9838092696379326, + "grad_norm": 0.0652414932847023, + "learning_rate": 6.622405247191377e-07, + "loss": 2.5142, + "step": 33177 + }, + { + "epoch": 0.983838922990244, + "grad_norm": 0.06532788276672363, + "learning_rate": 6.598219154794194e-07, + "loss": 2.6024, + "step": 33178 + }, + { + "epoch": 0.9838685763425555, + "grad_norm": 0.06314613670110703, + "learning_rate": 6.574077279826174e-07, + "loss": 2.5543, + "step": 33179 + }, + { + "epoch": 0.983898229694867, + "grad_norm": 0.06802885979413986, + "learning_rate": 6.549979622502145e-07, + "loss": 2.5495, + "step": 33180 + }, + { + "epoch": 0.9839278830471785, + "grad_norm": 0.06741432845592499, + "learning_rate": 6.525926183035269e-07, + "loss": 2.5775, + "step": 33181 + }, + { + "epoch": 0.9839575363994899, + "grad_norm": 0.0690002292394638, + "learning_rate": 6.501916961638154e-07, + "loss": 2.5149, + "step": 33182 + }, + { + "epoch": 0.9839871897518014, + "grad_norm": 0.07112529873847961, + "learning_rate": 6.477951958523965e-07, + "loss": 2.5885, + "step": 33183 + }, + { + "epoch": 0.984016843104113, + "grad_norm": 0.06493819504976273, + "learning_rate": 6.454031173904196e-07, + "loss": 2.5322, + "step": 33184 + }, + { + "epoch": 0.9840464964564244, + "grad_norm": 0.0652163103222847, + "learning_rate": 6.430154607991456e-07, + "loss": 2.5489, + "step": 33185 + }, + { + "epoch": 0.9840761498087359, + "grad_norm": 0.06401354819536209, + "learning_rate": 6.406322260997244e-07, + "loss": 2.5489, + "step": 33186 + }, + { + "epoch": 0.9841058031610473, + "grad_norm": 0.0694633349776268, + "learning_rate": 6.382534133131391e-07, + "loss": 2.573, + "step": 33187 + }, + { + "epoch": 0.9841354565133589, + "grad_norm": 0.0663948804140091, + "learning_rate": 6.358790224605949e-07, + "loss": 2.5187, + "step": 33188 + }, + { + "epoch": 0.9841651098656703, + "grad_norm": 0.06225726380944252, + "learning_rate": 6.335090535630195e-07, + "loss": 2.5611, + "step": 33189 + }, + { + "epoch": 0.9841947632179818, + "grad_norm": 0.0656120553612709, + "learning_rate": 6.311435066414517e-07, + "loss": 2.5547, + "step": 33190 + }, + { + "epoch": 0.9842244165702932, + "grad_norm": 0.06724321842193604, + "learning_rate": 6.287823817168193e-07, + "loss": 2.5652, + "step": 33191 + }, + { + "epoch": 0.9842540699226048, + "grad_norm": 0.06773164123296738, + "learning_rate": 6.264256788100497e-07, + "loss": 2.6249, + "step": 33192 + }, + { + "epoch": 0.9842837232749162, + "grad_norm": 0.06581810116767883, + "learning_rate": 6.2407339794196e-07, + "loss": 2.5341, + "step": 33193 + }, + { + "epoch": 0.9843133766272277, + "grad_norm": 0.06642629206180573, + "learning_rate": 6.217255391334775e-07, + "loss": 2.5344, + "step": 33194 + }, + { + "epoch": 0.9843430299795392, + "grad_norm": 0.06517162919044495, + "learning_rate": 6.193821024052526e-07, + "loss": 2.5633, + "step": 33195 + }, + { + "epoch": 0.9843726833318507, + "grad_norm": 0.06842698901891708, + "learning_rate": 6.170430877782129e-07, + "loss": 2.5543, + "step": 33196 + }, + { + "epoch": 0.9844023366841621, + "grad_norm": 0.06658079475164413, + "learning_rate": 6.147084952728976e-07, + "loss": 2.5914, + "step": 33197 + }, + { + "epoch": 0.9844319900364736, + "grad_norm": 0.06463947147130966, + "learning_rate": 6.123783249100679e-07, + "loss": 2.556, + "step": 33198 + }, + { + "epoch": 0.9844616433887851, + "grad_norm": 0.06967474520206451, + "learning_rate": 6.100525767103737e-07, + "loss": 2.521, + "step": 33199 + }, + { + "epoch": 0.9844912967410966, + "grad_norm": 0.06496639549732208, + "learning_rate": 6.077312506943544e-07, + "loss": 2.5297, + "step": 33200 + }, + { + "epoch": 0.984520950093408, + "grad_norm": 0.06937618553638458, + "learning_rate": 6.054143468826045e-07, + "loss": 2.5697, + "step": 33201 + }, + { + "epoch": 0.9845506034457195, + "grad_norm": 0.06309420615434647, + "learning_rate": 6.031018652956633e-07, + "loss": 2.5842, + "step": 33202 + }, + { + "epoch": 0.9845802567980311, + "grad_norm": 0.06522374600172043, + "learning_rate": 6.007938059539031e-07, + "loss": 2.5934, + "step": 33203 + }, + { + "epoch": 0.9846099101503425, + "grad_norm": 0.0663895457983017, + "learning_rate": 5.984901688779187e-07, + "loss": 2.582, + "step": 33204 + }, + { + "epoch": 0.984639563502654, + "grad_norm": 0.0645793005824089, + "learning_rate": 5.961909540879718e-07, + "loss": 2.5447, + "step": 33205 + }, + { + "epoch": 0.9846692168549654, + "grad_norm": 0.06507924944162369, + "learning_rate": 5.938961616044902e-07, + "loss": 2.547, + "step": 33206 + }, + { + "epoch": 0.984698870207277, + "grad_norm": 0.06619546562433243, + "learning_rate": 5.916057914477913e-07, + "loss": 2.565, + "step": 33207 + }, + { + "epoch": 0.9847285235595884, + "grad_norm": 0.06601756811141968, + "learning_rate": 5.893198436381364e-07, + "loss": 2.5656, + "step": 33208 + }, + { + "epoch": 0.9847581769118999, + "grad_norm": 0.0659460723400116, + "learning_rate": 5.870383181957873e-07, + "loss": 2.5531, + "step": 33209 + }, + { + "epoch": 0.9847878302642114, + "grad_norm": 0.06742296367883682, + "learning_rate": 5.847612151408943e-07, + "loss": 2.5822, + "step": 33210 + }, + { + "epoch": 0.9848174836165229, + "grad_norm": 0.06449612975120544, + "learning_rate": 5.824885344937191e-07, + "loss": 2.5759, + "step": 33211 + }, + { + "epoch": 0.9848471369688343, + "grad_norm": 0.06618598848581314, + "learning_rate": 5.802202762743014e-07, + "loss": 2.5746, + "step": 33212 + }, + { + "epoch": 0.9848767903211458, + "grad_norm": 0.06442888081073761, + "learning_rate": 5.779564405027359e-07, + "loss": 2.5896, + "step": 33213 + }, + { + "epoch": 0.9849064436734573, + "grad_norm": 0.06655722856521606, + "learning_rate": 5.75697027199118e-07, + "loss": 2.5308, + "step": 33214 + }, + { + "epoch": 0.9849360970257688, + "grad_norm": 0.06816589832305908, + "learning_rate": 5.734420363834314e-07, + "loss": 2.5685, + "step": 33215 + }, + { + "epoch": 0.9849657503780802, + "grad_norm": 0.07292278856039047, + "learning_rate": 5.711914680756048e-07, + "loss": 2.5344, + "step": 33216 + }, + { + "epoch": 0.9849954037303917, + "grad_norm": 0.06634678691625595, + "learning_rate": 5.689453222956775e-07, + "loss": 2.579, + "step": 33217 + }, + { + "epoch": 0.9850250570827032, + "grad_norm": 0.0632065087556839, + "learning_rate": 5.667035990634117e-07, + "loss": 2.5524, + "step": 33218 + }, + { + "epoch": 0.9850547104350147, + "grad_norm": 0.06750466674566269, + "learning_rate": 5.644662983987359e-07, + "loss": 2.535, + "step": 33219 + }, + { + "epoch": 0.9850843637873261, + "grad_norm": 0.06328821182250977, + "learning_rate": 5.622334203214119e-07, + "loss": 2.5578, + "step": 33220 + }, + { + "epoch": 0.9851140171396376, + "grad_norm": 0.06816038489341736, + "learning_rate": 5.600049648512573e-07, + "loss": 2.5822, + "step": 33221 + }, + { + "epoch": 0.9851436704919491, + "grad_norm": 0.06584654003381729, + "learning_rate": 5.577809320079786e-07, + "loss": 2.5497, + "step": 33222 + }, + { + "epoch": 0.9851733238442606, + "grad_norm": 0.06392575055360794, + "learning_rate": 5.55561321811282e-07, + "loss": 2.5435, + "step": 33223 + }, + { + "epoch": 0.9852029771965721, + "grad_norm": 0.06720893830060959, + "learning_rate": 5.533461342808189e-07, + "loss": 2.5869, + "step": 33224 + }, + { + "epoch": 0.9852326305488835, + "grad_norm": 0.06765848398208618, + "learning_rate": 5.511353694361843e-07, + "loss": 2.5519, + "step": 33225 + }, + { + "epoch": 0.9852622839011951, + "grad_norm": 0.06535279005765915, + "learning_rate": 5.489290272970294e-07, + "loss": 2.5483, + "step": 33226 + }, + { + "epoch": 0.9852919372535065, + "grad_norm": 0.06515994668006897, + "learning_rate": 5.467271078827829e-07, + "loss": 2.5508, + "step": 33227 + }, + { + "epoch": 0.985321590605818, + "grad_norm": 0.06523876637220383, + "learning_rate": 5.445296112130404e-07, + "loss": 2.5613, + "step": 33228 + }, + { + "epoch": 0.9853512439581295, + "grad_norm": 0.06602455675601959, + "learning_rate": 5.423365373071754e-07, + "loss": 2.584, + "step": 33229 + }, + { + "epoch": 0.985380897310441, + "grad_norm": 0.06734994798898697, + "learning_rate": 5.40147886184672e-07, + "loss": 2.5197, + "step": 33230 + }, + { + "epoch": 0.9854105506627524, + "grad_norm": 0.06789731979370117, + "learning_rate": 5.379636578649038e-07, + "loss": 2.5814, + "step": 33231 + }, + { + "epoch": 0.9854402040150639, + "grad_norm": 0.0660717636346817, + "learning_rate": 5.357838523671888e-07, + "loss": 2.5354, + "step": 33232 + }, + { + "epoch": 0.9854698573673754, + "grad_norm": 0.06288904696702957, + "learning_rate": 5.33608469710789e-07, + "loss": 2.5731, + "step": 33233 + }, + { + "epoch": 0.9854995107196869, + "grad_norm": 0.06725183129310608, + "learning_rate": 5.314375099150781e-07, + "loss": 2.5835, + "step": 33234 + }, + { + "epoch": 0.9855291640719983, + "grad_norm": 0.06345884501934052, + "learning_rate": 5.292709729992073e-07, + "loss": 2.5318, + "step": 33235 + }, + { + "epoch": 0.9855588174243098, + "grad_norm": 0.06236747279763222, + "learning_rate": 5.271088589823836e-07, + "loss": 2.574, + "step": 33236 + }, + { + "epoch": 0.9855884707766213, + "grad_norm": 0.066808320581913, + "learning_rate": 5.249511678837582e-07, + "loss": 2.5685, + "step": 33237 + }, + { + "epoch": 0.9856181241289328, + "grad_norm": 0.06524588167667389, + "learning_rate": 5.227978997223715e-07, + "loss": 2.5831, + "step": 33238 + }, + { + "epoch": 0.9856477774812442, + "grad_norm": 0.06706662476062775, + "learning_rate": 5.206490545173747e-07, + "loss": 2.576, + "step": 33239 + }, + { + "epoch": 0.9856774308335557, + "grad_norm": 0.07077925652265549, + "learning_rate": 5.185046322877529e-07, + "loss": 2.5799, + "step": 33240 + }, + { + "epoch": 0.9857070841858672, + "grad_norm": 0.06598515808582306, + "learning_rate": 5.163646330525462e-07, + "loss": 2.5776, + "step": 33241 + }, + { + "epoch": 0.9857367375381787, + "grad_norm": 0.06712329387664795, + "learning_rate": 5.142290568306285e-07, + "loss": 2.529, + "step": 33242 + }, + { + "epoch": 0.9857663908904901, + "grad_norm": 0.06639286130666733, + "learning_rate": 5.120979036409845e-07, + "loss": 2.5603, + "step": 33243 + }, + { + "epoch": 0.9857960442428017, + "grad_norm": 0.06488385796546936, + "learning_rate": 5.099711735024327e-07, + "loss": 2.5313, + "step": 33244 + }, + { + "epoch": 0.9858256975951132, + "grad_norm": 0.06843404471874237, + "learning_rate": 5.07848866433791e-07, + "loss": 2.5894, + "step": 33245 + }, + { + "epoch": 0.9858553509474246, + "grad_norm": 0.06577549874782562, + "learning_rate": 5.057309824538781e-07, + "loss": 2.563, + "step": 33246 + }, + { + "epoch": 0.9858850042997361, + "grad_norm": 0.06653562933206558, + "learning_rate": 5.03617521581512e-07, + "loss": 2.5686, + "step": 33247 + }, + { + "epoch": 0.9859146576520476, + "grad_norm": 0.06741207838058472, + "learning_rate": 5.01508483835289e-07, + "loss": 2.5617, + "step": 33248 + }, + { + "epoch": 0.9859443110043591, + "grad_norm": 0.06674634665250778, + "learning_rate": 4.994038692340275e-07, + "loss": 2.5543, + "step": 33249 + }, + { + "epoch": 0.9859739643566705, + "grad_norm": 0.06647904217243195, + "learning_rate": 4.973036777962125e-07, + "loss": 2.5863, + "step": 33250 + }, + { + "epoch": 0.986003617708982, + "grad_norm": 0.06792353838682175, + "learning_rate": 4.952079095405515e-07, + "loss": 2.5581, + "step": 33251 + }, + { + "epoch": 0.9860332710612935, + "grad_norm": 0.06689032167196274, + "learning_rate": 4.931165644855296e-07, + "loss": 2.5575, + "step": 33252 + }, + { + "epoch": 0.986062924413605, + "grad_norm": 0.06595465540885925, + "learning_rate": 4.910296426496874e-07, + "loss": 2.5586, + "step": 33253 + }, + { + "epoch": 0.9860925777659164, + "grad_norm": 0.06608890742063522, + "learning_rate": 4.889471440515658e-07, + "loss": 2.554, + "step": 33254 + }, + { + "epoch": 0.9861222311182279, + "grad_norm": 0.06247430667281151, + "learning_rate": 4.868690687095389e-07, + "loss": 2.5814, + "step": 33255 + }, + { + "epoch": 0.9861518844705394, + "grad_norm": 0.06808621436357498, + "learning_rate": 4.847954166420366e-07, + "loss": 2.5521, + "step": 33256 + }, + { + "epoch": 0.9861815378228509, + "grad_norm": 0.06388755887746811, + "learning_rate": 4.827261878673772e-07, + "loss": 2.5496, + "step": 33257 + }, + { + "epoch": 0.9862111911751623, + "grad_norm": 0.06550273299217224, + "learning_rate": 4.806613824039352e-07, + "loss": 2.5627, + "step": 33258 + }, + { + "epoch": 0.9862408445274738, + "grad_norm": 0.06588665395975113, + "learning_rate": 4.786010002699736e-07, + "loss": 2.5678, + "step": 33259 + }, + { + "epoch": 0.9862704978797853, + "grad_norm": 0.06618687510490417, + "learning_rate": 4.765450414837558e-07, + "loss": 2.5272, + "step": 33260 + }, + { + "epoch": 0.9863001512320968, + "grad_norm": 0.06615959107875824, + "learning_rate": 4.7449350606348916e-07, + "loss": 2.5819, + "step": 33261 + }, + { + "epoch": 0.9863298045844082, + "grad_norm": 0.06892482936382294, + "learning_rate": 4.7244639402732604e-07, + "loss": 2.5707, + "step": 33262 + }, + { + "epoch": 0.9863594579367198, + "grad_norm": 0.06498191505670547, + "learning_rate": 4.7040370539336297e-07, + "loss": 2.5536, + "step": 33263 + }, + { + "epoch": 0.9863891112890312, + "grad_norm": 0.06645887345075607, + "learning_rate": 4.6836544017969664e-07, + "loss": 2.5643, + "step": 33264 + }, + { + "epoch": 0.9864187646413427, + "grad_norm": 0.06682795286178589, + "learning_rate": 4.663315984044236e-07, + "loss": 2.5917, + "step": 33265 + }, + { + "epoch": 0.9864484179936542, + "grad_norm": 0.06810781359672546, + "learning_rate": 4.643021800855296e-07, + "loss": 2.5696, + "step": 33266 + }, + { + "epoch": 0.9864780713459657, + "grad_norm": 0.06629659235477448, + "learning_rate": 4.6227718524100013e-07, + "loss": 2.6105, + "step": 33267 + }, + { + "epoch": 0.9865077246982772, + "grad_norm": 0.06487411260604858, + "learning_rate": 4.602566138887654e-07, + "loss": 2.5484, + "step": 33268 + }, + { + "epoch": 0.9865373780505886, + "grad_norm": 0.06583461910486221, + "learning_rate": 4.5824046604664436e-07, + "loss": 2.5487, + "step": 33269 + }, + { + "epoch": 0.9865670314029001, + "grad_norm": 0.06481311470270157, + "learning_rate": 4.5622874173262277e-07, + "loss": 2.6034, + "step": 33270 + }, + { + "epoch": 0.9865966847552116, + "grad_norm": 0.06588272750377655, + "learning_rate": 4.5422144096435303e-07, + "loss": 2.5662, + "step": 33271 + }, + { + "epoch": 0.9866263381075231, + "grad_norm": 0.06441278010606766, + "learning_rate": 4.5221856375976535e-07, + "loss": 2.5844, + "step": 33272 + }, + { + "epoch": 0.9866559914598345, + "grad_norm": 0.06506888568401337, + "learning_rate": 4.502201101365122e-07, + "loss": 2.5284, + "step": 33273 + }, + { + "epoch": 0.986685644812146, + "grad_norm": 0.06416317075490952, + "learning_rate": 4.482260801123017e-07, + "loss": 2.5665, + "step": 33274 + }, + { + "epoch": 0.9867152981644575, + "grad_norm": 0.06603940576314926, + "learning_rate": 4.462364737047864e-07, + "loss": 2.5777, + "step": 33275 + }, + { + "epoch": 0.986744951516769, + "grad_norm": 0.06387802958488464, + "learning_rate": 4.4425129093161876e-07, + "loss": 2.5852, + "step": 33276 + }, + { + "epoch": 0.9867746048690804, + "grad_norm": 0.06435488164424896, + "learning_rate": 4.422705318103404e-07, + "loss": 2.5702, + "step": 33277 + }, + { + "epoch": 0.986804258221392, + "grad_norm": 0.06747853010892868, + "learning_rate": 4.402941963584928e-07, + "loss": 2.5523, + "step": 33278 + }, + { + "epoch": 0.9868339115737034, + "grad_norm": 0.06514063477516174, + "learning_rate": 4.3832228459361747e-07, + "loss": 2.5797, + "step": 33279 + }, + { + "epoch": 0.9868635649260149, + "grad_norm": 0.06798706203699112, + "learning_rate": 4.363547965330894e-07, + "loss": 2.5447, + "step": 33280 + }, + { + "epoch": 0.9868932182783263, + "grad_norm": 0.06302487850189209, + "learning_rate": 4.343917321944502e-07, + "loss": 2.5256, + "step": 33281 + }, + { + "epoch": 0.9869228716306379, + "grad_norm": 0.06529796123504639, + "learning_rate": 4.324330915950192e-07, + "loss": 2.5638, + "step": 33282 + }, + { + "epoch": 0.9869525249829493, + "grad_norm": 0.06534890085458755, + "learning_rate": 4.30478874752116e-07, + "loss": 2.5253, + "step": 33283 + }, + { + "epoch": 0.9869821783352608, + "grad_norm": 0.06538907438516617, + "learning_rate": 4.2852908168306006e-07, + "loss": 2.5303, + "step": 33284 + }, + { + "epoch": 0.9870118316875722, + "grad_norm": 0.06515399366617203, + "learning_rate": 4.265837124051708e-07, + "loss": 2.5733, + "step": 33285 + }, + { + "epoch": 0.9870414850398838, + "grad_norm": 0.06938094645738602, + "learning_rate": 4.246427669356012e-07, + "loss": 2.5786, + "step": 33286 + }, + { + "epoch": 0.9870711383921953, + "grad_norm": 0.0649036094546318, + "learning_rate": 4.2270624529155974e-07, + "loss": 2.5853, + "step": 33287 + }, + { + "epoch": 0.9871007917445067, + "grad_norm": 0.06510946899652481, + "learning_rate": 4.2077414749025487e-07, + "loss": 2.5998, + "step": 33288 + }, + { + "epoch": 0.9871304450968182, + "grad_norm": 0.06627102941274643, + "learning_rate": 4.188464735487285e-07, + "loss": 2.5539, + "step": 33289 + }, + { + "epoch": 0.9871600984491297, + "grad_norm": 0.06562501937150955, + "learning_rate": 4.169232234840226e-07, + "loss": 2.5558, + "step": 33290 + }, + { + "epoch": 0.9871897518014412, + "grad_norm": 0.06608466804027557, + "learning_rate": 4.150043973132345e-07, + "loss": 2.5123, + "step": 33291 + }, + { + "epoch": 0.9872194051537526, + "grad_norm": 0.06556146591901779, + "learning_rate": 4.1308999505335067e-07, + "loss": 2.5735, + "step": 33292 + }, + { + "epoch": 0.9872490585060641, + "grad_norm": 0.06665725260972977, + "learning_rate": 4.111800167213575e-07, + "loss": 2.5652, + "step": 33293 + }, + { + "epoch": 0.9872787118583756, + "grad_norm": 0.06874740868806839, + "learning_rate": 4.0927446233407493e-07, + "loss": 2.5593, + "step": 33294 + }, + { + "epoch": 0.9873083652106871, + "grad_norm": 0.06549905240535736, + "learning_rate": 4.0737333190837834e-07, + "loss": 2.5181, + "step": 33295 + }, + { + "epoch": 0.9873380185629985, + "grad_norm": 0.06840723752975464, + "learning_rate": 4.054766254611986e-07, + "loss": 2.5707, + "step": 33296 + }, + { + "epoch": 0.98736767191531, + "grad_norm": 0.06419772654771805, + "learning_rate": 4.035843430092445e-07, + "loss": 2.5739, + "step": 33297 + }, + { + "epoch": 0.9873973252676215, + "grad_norm": 0.06438415497541428, + "learning_rate": 4.0169648456933606e-07, + "loss": 2.5511, + "step": 33298 + }, + { + "epoch": 0.987426978619933, + "grad_norm": 0.06808672845363617, + "learning_rate": 3.998130501581265e-07, + "loss": 2.5355, + "step": 33299 + }, + { + "epoch": 0.9874566319722444, + "grad_norm": 0.06540767848491669, + "learning_rate": 3.979340397923803e-07, + "loss": 2.5481, + "step": 33300 + }, + { + "epoch": 0.987486285324556, + "grad_norm": 0.06639779359102249, + "learning_rate": 3.960594534886397e-07, + "loss": 2.5687, + "step": 33301 + }, + { + "epoch": 0.9875159386768674, + "grad_norm": 0.06577976047992706, + "learning_rate": 3.941892912635581e-07, + "loss": 2.5576, + "step": 33302 + }, + { + "epoch": 0.9875455920291789, + "grad_norm": 0.06439239531755447, + "learning_rate": 3.923235531336777e-07, + "loss": 2.5735, + "step": 33303 + }, + { + "epoch": 0.9875752453814903, + "grad_norm": 0.06617120653390884, + "learning_rate": 3.90462239115541e-07, + "loss": 2.5531, + "step": 33304 + }, + { + "epoch": 0.9876048987338019, + "grad_norm": 0.06707601249217987, + "learning_rate": 3.8860534922563475e-07, + "loss": 2.5819, + "step": 33305 + }, + { + "epoch": 0.9876345520861133, + "grad_norm": 0.06630104035139084, + "learning_rate": 3.8675288348033467e-07, + "loss": 2.5643, + "step": 33306 + }, + { + "epoch": 0.9876642054384248, + "grad_norm": 0.06256548315286636, + "learning_rate": 3.849048418961276e-07, + "loss": 2.5328, + "step": 33307 + }, + { + "epoch": 0.9876938587907363, + "grad_norm": 0.06472201645374298, + "learning_rate": 3.830612244893339e-07, + "loss": 2.5525, + "step": 33308 + }, + { + "epoch": 0.9877235121430478, + "grad_norm": 0.06747865676879883, + "learning_rate": 3.812220312763293e-07, + "loss": 2.5548, + "step": 33309 + }, + { + "epoch": 0.9877531654953593, + "grad_norm": 0.06540325284004211, + "learning_rate": 3.79387262273323e-07, + "loss": 2.5283, + "step": 33310 + }, + { + "epoch": 0.9877828188476707, + "grad_norm": 0.06694384664297104, + "learning_rate": 3.7755691749663536e-07, + "loss": 2.5505, + "step": 33311 + }, + { + "epoch": 0.9878124721999822, + "grad_norm": 0.06267455220222473, + "learning_rate": 3.7573099696242007e-07, + "loss": 2.5689, + "step": 33312 + }, + { + "epoch": 0.9878421255522937, + "grad_norm": 0.06546978652477264, + "learning_rate": 3.7390950068683094e-07, + "loss": 2.5531, + "step": 33313 + }, + { + "epoch": 0.9878717789046052, + "grad_norm": 0.06451548635959625, + "learning_rate": 3.7209242868607716e-07, + "loss": 2.5465, + "step": 33314 + }, + { + "epoch": 0.9879014322569166, + "grad_norm": 0.06677104532718658, + "learning_rate": 3.702797809762015e-07, + "loss": 2.561, + "step": 33315 + }, + { + "epoch": 0.9879310856092282, + "grad_norm": 0.06466484814882278, + "learning_rate": 3.684715575732467e-07, + "loss": 2.562, + "step": 33316 + }, + { + "epoch": 0.9879607389615396, + "grad_norm": 0.0644049346446991, + "learning_rate": 3.666677584932554e-07, + "loss": 2.5596, + "step": 33317 + }, + { + "epoch": 0.9879903923138511, + "grad_norm": 0.06547433137893677, + "learning_rate": 3.6486838375215935e-07, + "loss": 2.5714, + "step": 33318 + }, + { + "epoch": 0.9880200456661625, + "grad_norm": 0.06618871539831161, + "learning_rate": 3.630734333658903e-07, + "loss": 2.5782, + "step": 33319 + }, + { + "epoch": 0.9880496990184741, + "grad_norm": 0.06759154796600342, + "learning_rate": 3.6128290735043534e-07, + "loss": 2.6063, + "step": 33320 + }, + { + "epoch": 0.9880793523707855, + "grad_norm": 0.06562257558107376, + "learning_rate": 3.5949680572155975e-07, + "loss": 2.5185, + "step": 33321 + }, + { + "epoch": 0.988109005723097, + "grad_norm": 0.0666382685303688, + "learning_rate": 3.5771512849508415e-07, + "loss": 2.5624, + "step": 33322 + }, + { + "epoch": 0.9881386590754084, + "grad_norm": 0.06501834839582443, + "learning_rate": 3.559378756867737e-07, + "loss": 2.5693, + "step": 33323 + }, + { + "epoch": 0.98816831242772, + "grad_norm": 0.06713703274726868, + "learning_rate": 3.5416504731244915e-07, + "loss": 2.5474, + "step": 33324 + }, + { + "epoch": 0.9881979657800314, + "grad_norm": 0.06533840298652649, + "learning_rate": 3.5239664338776454e-07, + "loss": 2.5586, + "step": 33325 + }, + { + "epoch": 0.9882276191323429, + "grad_norm": 0.06548862904310226, + "learning_rate": 3.506326639283186e-07, + "loss": 2.5594, + "step": 33326 + }, + { + "epoch": 0.9882572724846543, + "grad_norm": 0.0646398663520813, + "learning_rate": 3.4887310894982095e-07, + "loss": 2.5477, + "step": 33327 + }, + { + "epoch": 0.9882869258369659, + "grad_norm": 0.06264717876911163, + "learning_rate": 3.471179784678147e-07, + "loss": 2.5766, + "step": 33328 + }, + { + "epoch": 0.9883165791892774, + "grad_norm": 0.06514905393123627, + "learning_rate": 3.4536727249784296e-07, + "loss": 2.587, + "step": 33329 + }, + { + "epoch": 0.9883462325415888, + "grad_norm": 0.06590119004249573, + "learning_rate": 3.4362099105539336e-07, + "loss": 2.5574, + "step": 33330 + }, + { + "epoch": 0.9883758858939004, + "grad_norm": 0.06618475168943405, + "learning_rate": 3.418791341559535e-07, + "loss": 2.581, + "step": 33331 + }, + { + "epoch": 0.9884055392462118, + "grad_norm": 0.06659873574972153, + "learning_rate": 3.401417018149e-07, + "loss": 2.5527, + "step": 33332 + }, + { + "epoch": 0.9884351925985233, + "grad_norm": 0.0633992999792099, + "learning_rate": 3.3840869404772046e-07, + "loss": 2.5442, + "step": 33333 + }, + { + "epoch": 0.9884648459508347, + "grad_norm": 0.06476064771413803, + "learning_rate": 3.3668011086968043e-07, + "loss": 2.5681, + "step": 33334 + }, + { + "epoch": 0.9884944993031463, + "grad_norm": 0.0653340294957161, + "learning_rate": 3.3495595229610097e-07, + "loss": 2.5778, + "step": 33335 + }, + { + "epoch": 0.9885241526554577, + "grad_norm": 0.068148672580719, + "learning_rate": 3.332362183422477e-07, + "loss": 2.5746, + "step": 33336 + }, + { + "epoch": 0.9885538060077692, + "grad_norm": 0.07181842625141144, + "learning_rate": 3.3152090902333063e-07, + "loss": 2.5567, + "step": 33337 + }, + { + "epoch": 0.9885834593600806, + "grad_norm": 0.06597936898469925, + "learning_rate": 3.2981002435461537e-07, + "loss": 2.5527, + "step": 33338 + }, + { + "epoch": 0.9886131127123922, + "grad_norm": 0.06463977694511414, + "learning_rate": 3.281035643511454e-07, + "loss": 2.5832, + "step": 33339 + }, + { + "epoch": 0.9886427660647036, + "grad_norm": 0.06482377648353577, + "learning_rate": 3.264015290281308e-07, + "loss": 2.5434, + "step": 33340 + }, + { + "epoch": 0.9886724194170151, + "grad_norm": 0.06555252522230148, + "learning_rate": 3.2470391840055957e-07, + "loss": 2.5354, + "step": 33341 + }, + { + "epoch": 0.9887020727693265, + "grad_norm": 0.06827542930841446, + "learning_rate": 3.2301073248353076e-07, + "loss": 2.5746, + "step": 33342 + }, + { + "epoch": 0.9887317261216381, + "grad_norm": 0.06739675998687744, + "learning_rate": 3.2132197129197683e-07, + "loss": 2.5813, + "step": 33343 + }, + { + "epoch": 0.9887613794739495, + "grad_norm": 0.0647457093000412, + "learning_rate": 3.196376348409413e-07, + "loss": 2.5649, + "step": 33344 + }, + { + "epoch": 0.988791032826261, + "grad_norm": 0.06666459888219833, + "learning_rate": 3.1795772314524574e-07, + "loss": 2.5706, + "step": 33345 + }, + { + "epoch": 0.9888206861785724, + "grad_norm": 0.06607215106487274, + "learning_rate": 3.1628223621982255e-07, + "loss": 2.5789, + "step": 33346 + }, + { + "epoch": 0.988850339530884, + "grad_norm": 0.06857999414205551, + "learning_rate": 3.146111740794377e-07, + "loss": 2.5588, + "step": 33347 + }, + { + "epoch": 0.9888799928831954, + "grad_norm": 0.07110406458377838, + "learning_rate": 3.129445367390238e-07, + "loss": 2.5951, + "step": 33348 + }, + { + "epoch": 0.9889096462355069, + "grad_norm": 0.06250540912151337, + "learning_rate": 3.1128232421318017e-07, + "loss": 2.5554, + "step": 33349 + }, + { + "epoch": 0.9889392995878185, + "grad_norm": 0.06717817485332489, + "learning_rate": 3.096245365167283e-07, + "loss": 2.5733, + "step": 33350 + }, + { + "epoch": 0.9889689529401299, + "grad_norm": 0.06395146995782852, + "learning_rate": 3.0797117366437865e-07, + "loss": 2.5764, + "step": 33351 + }, + { + "epoch": 0.9889986062924414, + "grad_norm": 0.06424742192029953, + "learning_rate": 3.0632223567061966e-07, + "loss": 2.5663, + "step": 33352 + }, + { + "epoch": 0.9890282596447528, + "grad_norm": 0.06654351949691772, + "learning_rate": 3.046777225502173e-07, + "loss": 2.5775, + "step": 33353 + }, + { + "epoch": 0.9890579129970644, + "grad_norm": 0.06566081196069717, + "learning_rate": 3.0303763431765995e-07, + "loss": 2.5605, + "step": 33354 + }, + { + "epoch": 0.9890875663493758, + "grad_norm": 0.06361722201108932, + "learning_rate": 3.0140197098743605e-07, + "loss": 2.5708, + "step": 33355 + }, + { + "epoch": 0.9891172197016873, + "grad_norm": 0.06398274004459381, + "learning_rate": 2.997707325740895e-07, + "loss": 2.5651, + "step": 33356 + }, + { + "epoch": 0.9891468730539987, + "grad_norm": 0.06609820574522018, + "learning_rate": 2.981439190920532e-07, + "loss": 2.5507, + "step": 33357 + }, + { + "epoch": 0.9891765264063103, + "grad_norm": 0.0655813217163086, + "learning_rate": 2.9652153055570454e-07, + "loss": 2.606, + "step": 33358 + }, + { + "epoch": 0.9892061797586217, + "grad_norm": 0.06537286937236786, + "learning_rate": 2.949035669794764e-07, + "loss": 2.5621, + "step": 33359 + }, + { + "epoch": 0.9892358331109332, + "grad_norm": 0.06542575359344482, + "learning_rate": 2.932900283776352e-07, + "loss": 2.5391, + "step": 33360 + }, + { + "epoch": 0.9892654864632446, + "grad_norm": 0.06486079096794128, + "learning_rate": 2.9168091476444724e-07, + "loss": 2.5631, + "step": 33361 + }, + { + "epoch": 0.9892951398155562, + "grad_norm": 0.06549645215272903, + "learning_rate": 2.9007622615423446e-07, + "loss": 2.5559, + "step": 33362 + }, + { + "epoch": 0.9893247931678676, + "grad_norm": 0.06780495494604111, + "learning_rate": 2.8847596256115214e-07, + "loss": 2.5724, + "step": 33363 + }, + { + "epoch": 0.9893544465201791, + "grad_norm": 0.06456942111253738, + "learning_rate": 2.868801239994112e-07, + "loss": 2.5901, + "step": 33364 + }, + { + "epoch": 0.9893840998724905, + "grad_norm": 0.06637421250343323, + "learning_rate": 2.852887104830559e-07, + "loss": 2.5576, + "step": 33365 + }, + { + "epoch": 0.9894137532248021, + "grad_norm": 0.06698421388864517, + "learning_rate": 2.837017220262972e-07, + "loss": 2.5811, + "step": 33366 + }, + { + "epoch": 0.9894434065771135, + "grad_norm": 0.06559417396783829, + "learning_rate": 2.821191586431793e-07, + "loss": 2.5926, + "step": 33367 + }, + { + "epoch": 0.989473059929425, + "grad_norm": 0.06461917608976364, + "learning_rate": 2.8054102034758e-07, + "loss": 2.5946, + "step": 33368 + }, + { + "epoch": 0.9895027132817364, + "grad_norm": 0.06387277692556381, + "learning_rate": 2.789673071535992e-07, + "loss": 2.5721, + "step": 33369 + }, + { + "epoch": 0.989532366634048, + "grad_norm": 0.06613507866859436, + "learning_rate": 2.7739801907517015e-07, + "loss": 2.5579, + "step": 33370 + }, + { + "epoch": 0.9895620199863595, + "grad_norm": 0.06783691048622131, + "learning_rate": 2.758331561261151e-07, + "loss": 2.5745, + "step": 33371 + }, + { + "epoch": 0.9895916733386709, + "grad_norm": 0.06482794880867004, + "learning_rate": 2.742727183203675e-07, + "loss": 2.5535, + "step": 33372 + }, + { + "epoch": 0.9896213266909825, + "grad_norm": 0.0651765763759613, + "learning_rate": 2.7271670567169393e-07, + "loss": 2.5468, + "step": 33373 + }, + { + "epoch": 0.9896509800432939, + "grad_norm": 0.06860116869211197, + "learning_rate": 2.7116511819391674e-07, + "loss": 2.5776, + "step": 33374 + }, + { + "epoch": 0.9896806333956054, + "grad_norm": 0.06528452783823013, + "learning_rate": 2.696179559007472e-07, + "loss": 2.5509, + "step": 33375 + }, + { + "epoch": 0.9897102867479168, + "grad_norm": 0.06795603036880493, + "learning_rate": 2.6807521880584107e-07, + "loss": 2.5741, + "step": 33376 + }, + { + "epoch": 0.9897399401002284, + "grad_norm": 0.06794123351573944, + "learning_rate": 2.6653690692296506e-07, + "loss": 2.5329, + "step": 33377 + }, + { + "epoch": 0.9897695934525398, + "grad_norm": 0.0706624761223793, + "learning_rate": 2.6500302026566394e-07, + "loss": 2.6014, + "step": 33378 + }, + { + "epoch": 0.9897992468048513, + "grad_norm": 0.06619290262460709, + "learning_rate": 2.6347355884748236e-07, + "loss": 2.5382, + "step": 33379 + }, + { + "epoch": 0.9898289001571627, + "grad_norm": 0.06420034915208817, + "learning_rate": 2.6194852268207616e-07, + "loss": 2.5673, + "step": 33380 + }, + { + "epoch": 0.9898585535094743, + "grad_norm": 0.06279066205024719, + "learning_rate": 2.6042791178287894e-07, + "loss": 2.5643, + "step": 33381 + }, + { + "epoch": 0.9898882068617857, + "grad_norm": 0.0657828077673912, + "learning_rate": 2.5891172616338e-07, + "loss": 2.571, + "step": 33382 + }, + { + "epoch": 0.9899178602140972, + "grad_norm": 0.0637635886669159, + "learning_rate": 2.5739996583701297e-07, + "loss": 2.5622, + "step": 33383 + }, + { + "epoch": 0.9899475135664086, + "grad_norm": 0.06428159773349762, + "learning_rate": 2.558926308171561e-07, + "loss": 2.5581, + "step": 33384 + }, + { + "epoch": 0.9899771669187202, + "grad_norm": 0.06488605588674545, + "learning_rate": 2.54389721117132e-07, + "loss": 2.5415, + "step": 33385 + }, + { + "epoch": 0.9900068202710316, + "grad_norm": 0.06413275748491287, + "learning_rate": 2.5289123675026337e-07, + "loss": 2.5893, + "step": 33386 + }, + { + "epoch": 0.9900364736233431, + "grad_norm": 0.07650201767683029, + "learning_rate": 2.513971777298174e-07, + "loss": 2.5751, + "step": 33387 + }, + { + "epoch": 0.9900661269756545, + "grad_norm": 0.06486555933952332, + "learning_rate": 2.4990754406900574e-07, + "loss": 2.56, + "step": 33388 + }, + { + "epoch": 0.9900957803279661, + "grad_norm": 0.06765174120664597, + "learning_rate": 2.484223357810955e-07, + "loss": 2.5475, + "step": 33389 + }, + { + "epoch": 0.9901254336802775, + "grad_norm": 0.06436797231435776, + "learning_rate": 2.469415528791319e-07, + "loss": 2.5332, + "step": 33390 + }, + { + "epoch": 0.990155087032589, + "grad_norm": 0.0662858858704567, + "learning_rate": 2.454651953763265e-07, + "loss": 2.5901, + "step": 33391 + }, + { + "epoch": 0.9901847403849006, + "grad_norm": 0.06536667793989182, + "learning_rate": 2.4399326328572447e-07, + "loss": 2.5665, + "step": 33392 + }, + { + "epoch": 0.990214393737212, + "grad_norm": 0.06584952026605606, + "learning_rate": 2.425257566203154e-07, + "loss": 2.5587, + "step": 33393 + }, + { + "epoch": 0.9902440470895235, + "grad_norm": 0.0641426369547844, + "learning_rate": 2.410626753931444e-07, + "loss": 2.5769, + "step": 33394 + }, + { + "epoch": 0.9902737004418349, + "grad_norm": 0.06389731168746948, + "learning_rate": 2.396040196170901e-07, + "loss": 2.5783, + "step": 33395 + }, + { + "epoch": 0.9903033537941465, + "grad_norm": 0.06453295797109604, + "learning_rate": 2.3814978930514208e-07, + "loss": 2.5611, + "step": 33396 + }, + { + "epoch": 0.9903330071464579, + "grad_norm": 0.06483777612447739, + "learning_rate": 2.3669998447017893e-07, + "loss": 2.5982, + "step": 33397 + }, + { + "epoch": 0.9903626604987694, + "grad_norm": 0.06436816602945328, + "learning_rate": 2.3525460512502373e-07, + "loss": 2.53, + "step": 33398 + }, + { + "epoch": 0.9903923138510808, + "grad_norm": 0.06553851813077927, + "learning_rate": 2.3381365128249955e-07, + "loss": 2.5961, + "step": 33399 + }, + { + "epoch": 0.9904219672033924, + "grad_norm": 0.06654245406389236, + "learning_rate": 2.3237712295531844e-07, + "loss": 2.5514, + "step": 33400 + }, + { + "epoch": 0.9904516205557038, + "grad_norm": 0.06330078840255737, + "learning_rate": 2.3094502015619245e-07, + "loss": 2.5597, + "step": 33401 + }, + { + "epoch": 0.9904812739080153, + "grad_norm": 0.06307873129844666, + "learning_rate": 2.2951734289783367e-07, + "loss": 2.5945, + "step": 33402 + }, + { + "epoch": 0.9905109272603267, + "grad_norm": 0.06580770760774612, + "learning_rate": 2.280940911929541e-07, + "loss": 2.523, + "step": 33403 + }, + { + "epoch": 0.9905405806126383, + "grad_norm": 0.06609589606523514, + "learning_rate": 2.2667526505398828e-07, + "loss": 2.5564, + "step": 33404 + }, + { + "epoch": 0.9905702339649497, + "grad_norm": 0.06493397802114487, + "learning_rate": 2.2526086449364824e-07, + "loss": 2.5436, + "step": 33405 + }, + { + "epoch": 0.9905998873172612, + "grad_norm": 0.06681320816278458, + "learning_rate": 2.2385088952442402e-07, + "loss": 2.5678, + "step": 33406 + }, + { + "epoch": 0.9906295406695727, + "grad_norm": 0.06662409007549286, + "learning_rate": 2.2244534015875006e-07, + "loss": 2.5579, + "step": 33407 + }, + { + "epoch": 0.9906591940218842, + "grad_norm": 0.06782279908657074, + "learning_rate": 2.2104421640911643e-07, + "loss": 2.5818, + "step": 33408 + }, + { + "epoch": 0.9906888473741956, + "grad_norm": 0.06498776376247406, + "learning_rate": 2.196475182879576e-07, + "loss": 2.5898, + "step": 33409 + }, + { + "epoch": 0.9907185007265071, + "grad_norm": 0.06441966444253922, + "learning_rate": 2.1825524580754152e-07, + "loss": 2.5645, + "step": 33410 + }, + { + "epoch": 0.9907481540788187, + "grad_norm": 0.0800524577498436, + "learning_rate": 2.1686739898030272e-07, + "loss": 2.546, + "step": 33411 + }, + { + "epoch": 0.9907778074311301, + "grad_norm": 0.0649724006652832, + "learning_rate": 2.1548397781850913e-07, + "loss": 2.5605, + "step": 33412 + }, + { + "epoch": 0.9908074607834416, + "grad_norm": 0.06553814560174942, + "learning_rate": 2.1410498233437326e-07, + "loss": 2.5805, + "step": 33413 + }, + { + "epoch": 0.990837114135753, + "grad_norm": 0.06533936411142349, + "learning_rate": 2.127304125401075e-07, + "loss": 2.5443, + "step": 33414 + }, + { + "epoch": 0.9908667674880646, + "grad_norm": 0.06600870937108994, + "learning_rate": 2.1136026844792434e-07, + "loss": 2.574, + "step": 33415 + }, + { + "epoch": 0.990896420840376, + "grad_norm": 0.06635831296443939, + "learning_rate": 2.099945500699252e-07, + "loss": 2.5327, + "step": 33416 + }, + { + "epoch": 0.9909260741926875, + "grad_norm": 0.06847264617681503, + "learning_rate": 2.086332574182115e-07, + "loss": 2.576, + "step": 33417 + }, + { + "epoch": 0.9909557275449989, + "grad_norm": 0.06453332304954529, + "learning_rate": 2.0727639050482917e-07, + "loss": 2.5294, + "step": 33418 + }, + { + "epoch": 0.9909853808973105, + "grad_norm": 0.06841866672039032, + "learning_rate": 2.0592394934182413e-07, + "loss": 2.5883, + "step": 33419 + }, + { + "epoch": 0.9910150342496219, + "grad_norm": 0.06655404716730118, + "learning_rate": 2.0457593394113128e-07, + "loss": 2.5617, + "step": 33420 + }, + { + "epoch": 0.9910446876019334, + "grad_norm": 0.06682801991701126, + "learning_rate": 2.032323443146855e-07, + "loss": 2.5398, + "step": 33421 + }, + { + "epoch": 0.9910743409542448, + "grad_norm": 0.06461948156356812, + "learning_rate": 2.0189318047447724e-07, + "loss": 2.5749, + "step": 33422 + }, + { + "epoch": 0.9911039943065564, + "grad_norm": 0.06382617354393005, + "learning_rate": 2.0055844243221932e-07, + "loss": 2.5808, + "step": 33423 + }, + { + "epoch": 0.9911336476588678, + "grad_norm": 0.06685445457696915, + "learning_rate": 1.9922813019984663e-07, + "loss": 2.5783, + "step": 33424 + }, + { + "epoch": 0.9911633010111793, + "grad_norm": 0.06660112738609314, + "learning_rate": 1.9790224378907207e-07, + "loss": 2.5663, + "step": 33425 + }, + { + "epoch": 0.9911929543634908, + "grad_norm": 0.06707150489091873, + "learning_rate": 1.9658078321171947e-07, + "loss": 2.5616, + "step": 33426 + }, + { + "epoch": 0.9912226077158023, + "grad_norm": 0.06522560119628906, + "learning_rate": 1.9526374847939066e-07, + "loss": 2.57, + "step": 33427 + }, + { + "epoch": 0.9912522610681137, + "grad_norm": 0.06475730240345001, + "learning_rate": 1.939511396037985e-07, + "loss": 2.5921, + "step": 33428 + }, + { + "epoch": 0.9912819144204252, + "grad_norm": 0.06755971908569336, + "learning_rate": 1.9264295659654486e-07, + "loss": 2.5634, + "step": 33429 + }, + { + "epoch": 0.9913115677727367, + "grad_norm": 0.0643046423792839, + "learning_rate": 1.913391994692315e-07, + "loss": 2.5815, + "step": 33430 + }, + { + "epoch": 0.9913412211250482, + "grad_norm": 0.06712762266397476, + "learning_rate": 1.900398682334048e-07, + "loss": 2.548, + "step": 33431 + }, + { + "epoch": 0.9913708744773597, + "grad_norm": 0.06523488461971283, + "learning_rate": 1.8874496290055554e-07, + "loss": 2.5664, + "step": 33432 + }, + { + "epoch": 0.9914005278296711, + "grad_norm": 0.06681668758392334, + "learning_rate": 1.8745448348223006e-07, + "loss": 2.56, + "step": 33433 + }, + { + "epoch": 0.9914301811819827, + "grad_norm": 0.06465959548950195, + "learning_rate": 1.8616842998969706e-07, + "loss": 2.5437, + "step": 33434 + }, + { + "epoch": 0.9914598345342941, + "grad_norm": 0.0632840245962143, + "learning_rate": 1.8488680243450294e-07, + "loss": 2.5587, + "step": 33435 + }, + { + "epoch": 0.9914894878866056, + "grad_norm": 0.06658215820789337, + "learning_rate": 1.8360960082786093e-07, + "loss": 2.5524, + "step": 33436 + }, + { + "epoch": 0.991519141238917, + "grad_norm": 0.06410744041204453, + "learning_rate": 1.8233682518120631e-07, + "loss": 2.5497, + "step": 33437 + }, + { + "epoch": 0.9915487945912286, + "grad_norm": 0.07137799263000488, + "learning_rate": 1.8106847550569682e-07, + "loss": 2.5592, + "step": 33438 + }, + { + "epoch": 0.99157844794354, + "grad_norm": 0.06613463908433914, + "learning_rate": 1.7980455181265675e-07, + "loss": 2.5659, + "step": 33439 + }, + { + "epoch": 0.9916081012958515, + "grad_norm": 0.06459162384271622, + "learning_rate": 1.7854505411324383e-07, + "loss": 2.5794, + "step": 33440 + }, + { + "epoch": 0.991637754648163, + "grad_norm": 0.06771937012672424, + "learning_rate": 1.7728998241861581e-07, + "loss": 2.5387, + "step": 33441 + }, + { + "epoch": 0.9916674080004745, + "grad_norm": 0.061998751014471054, + "learning_rate": 1.7603933673987492e-07, + "loss": 2.5518, + "step": 33442 + }, + { + "epoch": 0.9916970613527859, + "grad_norm": 0.06634803861379623, + "learning_rate": 1.747931170880679e-07, + "loss": 2.6106, + "step": 33443 + }, + { + "epoch": 0.9917267147050974, + "grad_norm": 0.06585268676280975, + "learning_rate": 1.7355132347429692e-07, + "loss": 2.5907, + "step": 33444 + }, + { + "epoch": 0.9917563680574089, + "grad_norm": 0.06603386253118515, + "learning_rate": 1.7231395590949772e-07, + "loss": 2.5165, + "step": 33445 + }, + { + "epoch": 0.9917860214097204, + "grad_norm": 0.06365980207920074, + "learning_rate": 1.7108101440466151e-07, + "loss": 2.5437, + "step": 33446 + }, + { + "epoch": 0.9918156747620318, + "grad_norm": 0.0658339336514473, + "learning_rate": 1.6985249897066845e-07, + "loss": 2.5399, + "step": 33447 + }, + { + "epoch": 0.9918453281143433, + "grad_norm": 0.0656120777130127, + "learning_rate": 1.6862840961845427e-07, + "loss": 2.5683, + "step": 33448 + }, + { + "epoch": 0.9918749814666548, + "grad_norm": 0.06617894768714905, + "learning_rate": 1.6740874635884362e-07, + "loss": 2.5622, + "step": 33449 + }, + { + "epoch": 0.9919046348189663, + "grad_norm": 0.06603480875492096, + "learning_rate": 1.6619350920260568e-07, + "loss": 2.5688, + "step": 33450 + }, + { + "epoch": 0.9919342881712777, + "grad_norm": 0.06560897082090378, + "learning_rate": 1.649826981605651e-07, + "loss": 2.5573, + "step": 33451 + }, + { + "epoch": 0.9919639415235892, + "grad_norm": 0.06320856511592865, + "learning_rate": 1.6377631324332453e-07, + "loss": 2.5878, + "step": 33452 + }, + { + "epoch": 0.9919935948759008, + "grad_norm": 0.06477242708206177, + "learning_rate": 1.6257435446170866e-07, + "loss": 2.5831, + "step": 33453 + }, + { + "epoch": 0.9920232482282122, + "grad_norm": 0.06642397493124008, + "learning_rate": 1.6137682182620905e-07, + "loss": 2.5788, + "step": 33454 + }, + { + "epoch": 0.9920529015805237, + "grad_norm": 0.06379114091396332, + "learning_rate": 1.601837153475949e-07, + "loss": 2.5659, + "step": 33455 + }, + { + "epoch": 0.9920825549328351, + "grad_norm": 0.06379342079162598, + "learning_rate": 1.589950350363023e-07, + "loss": 2.5255, + "step": 33456 + }, + { + "epoch": 0.9921122082851467, + "grad_norm": 0.06296157836914062, + "learning_rate": 1.5781078090293387e-07, + "loss": 2.5333, + "step": 33457 + }, + { + "epoch": 0.9921418616374581, + "grad_norm": 0.064627505838871, + "learning_rate": 1.5663095295792574e-07, + "loss": 2.5943, + "step": 33458 + }, + { + "epoch": 0.9921715149897696, + "grad_norm": 0.06634379178285599, + "learning_rate": 1.5545555121176946e-07, + "loss": 2.5487, + "step": 33459 + }, + { + "epoch": 0.992201168342081, + "grad_norm": 0.06548836082220078, + "learning_rate": 1.5428457567484566e-07, + "loss": 2.5956, + "step": 33460 + }, + { + "epoch": 0.9922308216943926, + "grad_norm": 0.06499408930540085, + "learning_rate": 1.531180263575349e-07, + "loss": 2.5724, + "step": 33461 + }, + { + "epoch": 0.992260475046704, + "grad_norm": 0.06748166680335999, + "learning_rate": 1.5195590327010677e-07, + "loss": 2.5202, + "step": 33462 + }, + { + "epoch": 0.9922901283990155, + "grad_norm": 0.06559582054615021, + "learning_rate": 1.5079820642299735e-07, + "loss": 2.5415, + "step": 33463 + }, + { + "epoch": 0.992319781751327, + "grad_norm": 0.06382886320352554, + "learning_rate": 1.4964493582630967e-07, + "loss": 2.5839, + "step": 33464 + }, + { + "epoch": 0.9923494351036385, + "grad_norm": 0.06468497216701508, + "learning_rate": 1.484960914903133e-07, + "loss": 2.5422, + "step": 33465 + }, + { + "epoch": 0.9923790884559499, + "grad_norm": 0.06200967729091644, + "learning_rate": 1.4735167342516675e-07, + "loss": 2.5459, + "step": 33466 + }, + { + "epoch": 0.9924087418082614, + "grad_norm": 0.06512442231178284, + "learning_rate": 1.462116816410841e-07, + "loss": 2.5816, + "step": 33467 + }, + { + "epoch": 0.9924383951605729, + "grad_norm": 0.06804270297288895, + "learning_rate": 1.4507611614805738e-07, + "loss": 2.5658, + "step": 33468 + }, + { + "epoch": 0.9924680485128844, + "grad_norm": 0.0638837143778801, + "learning_rate": 1.4394497695618958e-07, + "loss": 2.5717, + "step": 33469 + }, + { + "epoch": 0.9924977018651958, + "grad_norm": 0.06416475027799606, + "learning_rate": 1.428182640754727e-07, + "loss": 2.5569, + "step": 33470 + }, + { + "epoch": 0.9925273552175073, + "grad_norm": 0.06764408946037292, + "learning_rate": 1.416959775158988e-07, + "loss": 2.5733, + "step": 33471 + }, + { + "epoch": 0.9925570085698188, + "grad_norm": 0.0642334446310997, + "learning_rate": 1.4057811728740432e-07, + "loss": 2.56, + "step": 33472 + }, + { + "epoch": 0.9925866619221303, + "grad_norm": 0.06570033729076385, + "learning_rate": 1.3946468339992579e-07, + "loss": 2.5883, + "step": 33473 + }, + { + "epoch": 0.9926163152744418, + "grad_norm": 0.06443873047828674, + "learning_rate": 1.3835567586323318e-07, + "loss": 2.5314, + "step": 33474 + }, + { + "epoch": 0.9926459686267533, + "grad_norm": 0.06730800867080688, + "learning_rate": 1.3725109468726293e-07, + "loss": 2.554, + "step": 33475 + }, + { + "epoch": 0.9926756219790648, + "grad_norm": 0.06368064880371094, + "learning_rate": 1.3615093988167403e-07, + "loss": 2.5523, + "step": 33476 + }, + { + "epoch": 0.9927052753313762, + "grad_norm": 0.06587207317352295, + "learning_rate": 1.3505521145629195e-07, + "loss": 2.5241, + "step": 33477 + }, + { + "epoch": 0.9927349286836877, + "grad_norm": 0.06713461130857468, + "learning_rate": 1.3396390942083115e-07, + "loss": 2.5741, + "step": 33478 + }, + { + "epoch": 0.9927645820359992, + "grad_norm": 0.0662446990609169, + "learning_rate": 1.3287703378489503e-07, + "loss": 2.592, + "step": 33479 + }, + { + "epoch": 0.9927942353883107, + "grad_norm": 0.06700079143047333, + "learning_rate": 1.3179458455814254e-07, + "loss": 2.5372, + "step": 33480 + }, + { + "epoch": 0.9928238887406221, + "grad_norm": 0.06502309441566467, + "learning_rate": 1.307165617501216e-07, + "loss": 2.5719, + "step": 33481 + }, + { + "epoch": 0.9928535420929336, + "grad_norm": 0.06841455399990082, + "learning_rate": 1.2964296537043562e-07, + "loss": 2.5724, + "step": 33482 + }, + { + "epoch": 0.9928831954452451, + "grad_norm": 0.06393688917160034, + "learning_rate": 1.2857379542852153e-07, + "loss": 2.5923, + "step": 33483 + }, + { + "epoch": 0.9929128487975566, + "grad_norm": 0.07723353058099747, + "learning_rate": 1.2750905193392726e-07, + "loss": 2.584, + "step": 33484 + }, + { + "epoch": 0.992942502149868, + "grad_norm": 0.06574325263500214, + "learning_rate": 1.2644873489603416e-07, + "loss": 2.589, + "step": 33485 + }, + { + "epoch": 0.9929721555021795, + "grad_norm": 0.0632685050368309, + "learning_rate": 1.253928443242236e-07, + "loss": 2.5977, + "step": 33486 + }, + { + "epoch": 0.993001808854491, + "grad_norm": 0.06460732221603394, + "learning_rate": 1.24341380227877e-07, + "loss": 2.5473, + "step": 33487 + }, + { + "epoch": 0.9930314622068025, + "grad_norm": 0.06637896597385406, + "learning_rate": 1.2329434261632022e-07, + "loss": 2.5708, + "step": 33488 + }, + { + "epoch": 0.9930611155591139, + "grad_norm": 0.0653354823589325, + "learning_rate": 1.222517314987681e-07, + "loss": 2.566, + "step": 33489 + }, + { + "epoch": 0.9930907689114254, + "grad_norm": 0.06428033858537674, + "learning_rate": 1.2121354688443552e-07, + "loss": 2.5603, + "step": 33490 + }, + { + "epoch": 0.9931204222637369, + "grad_norm": 0.06793984770774841, + "learning_rate": 1.2017978878264834e-07, + "loss": 2.5846, + "step": 33491 + }, + { + "epoch": 0.9931500756160484, + "grad_norm": 0.06565868109464645, + "learning_rate": 1.1915045720239936e-07, + "loss": 2.5852, + "step": 33492 + }, + { + "epoch": 0.9931797289683598, + "grad_norm": 0.06494218856096268, + "learning_rate": 1.1812555215290343e-07, + "loss": 2.5918, + "step": 33493 + }, + { + "epoch": 0.9932093823206714, + "grad_norm": 0.06595593690872192, + "learning_rate": 1.1710507364320888e-07, + "loss": 2.5369, + "step": 33494 + }, + { + "epoch": 0.9932390356729829, + "grad_norm": 0.06607118248939514, + "learning_rate": 1.1608902168236401e-07, + "loss": 2.55, + "step": 33495 + }, + { + "epoch": 0.9932686890252943, + "grad_norm": 0.067750483751297, + "learning_rate": 1.1507739627930614e-07, + "loss": 2.5683, + "step": 33496 + }, + { + "epoch": 0.9932983423776058, + "grad_norm": 0.0637931153178215, + "learning_rate": 1.1407019744308356e-07, + "loss": 2.5744, + "step": 33497 + }, + { + "epoch": 0.9933279957299173, + "grad_norm": 0.06345602869987488, + "learning_rate": 1.1306742518257807e-07, + "loss": 2.5884, + "step": 33498 + }, + { + "epoch": 0.9933576490822288, + "grad_norm": 0.06522684544324875, + "learning_rate": 1.1206907950667145e-07, + "loss": 2.5414, + "step": 33499 + }, + { + "epoch": 0.9933873024345402, + "grad_norm": 0.06641559302806854, + "learning_rate": 1.1107516042418998e-07, + "loss": 2.5824, + "step": 33500 + }, + { + "epoch": 0.9934169557868517, + "grad_norm": 0.06569914519786835, + "learning_rate": 1.1008566794390439e-07, + "loss": 2.5645, + "step": 33501 + }, + { + "epoch": 0.9934466091391632, + "grad_norm": 0.06730027496814728, + "learning_rate": 1.0910060207464101e-07, + "loss": 2.6025, + "step": 33502 + }, + { + "epoch": 0.9934762624914747, + "grad_norm": 0.06773342192173004, + "learning_rate": 1.0811996282511504e-07, + "loss": 2.5579, + "step": 33503 + }, + { + "epoch": 0.9935059158437861, + "grad_norm": 0.0657903254032135, + "learning_rate": 1.0714375020398626e-07, + "loss": 2.5772, + "step": 33504 + }, + { + "epoch": 0.9935355691960976, + "grad_norm": 0.06593237072229385, + "learning_rate": 1.0617196421991438e-07, + "loss": 2.5894, + "step": 33505 + }, + { + "epoch": 0.9935652225484091, + "grad_norm": 0.0647692009806633, + "learning_rate": 1.0520460488144812e-07, + "loss": 2.5769, + "step": 33506 + }, + { + "epoch": 0.9935948759007206, + "grad_norm": 0.06707559525966644, + "learning_rate": 1.0424167219724723e-07, + "loss": 2.5733, + "step": 33507 + }, + { + "epoch": 0.993624529253032, + "grad_norm": 0.06497973948717117, + "learning_rate": 1.032831661757494e-07, + "loss": 2.541, + "step": 33508 + }, + { + "epoch": 0.9936541826053435, + "grad_norm": 0.06587866693735123, + "learning_rate": 1.0232908682550335e-07, + "loss": 2.5588, + "step": 33509 + }, + { + "epoch": 0.993683835957655, + "grad_norm": 0.06483829021453857, + "learning_rate": 1.0137943415494677e-07, + "loss": 2.5674, + "step": 33510 + }, + { + "epoch": 0.9937134893099665, + "grad_norm": 0.06548325717449188, + "learning_rate": 1.0043420817251736e-07, + "loss": 2.5692, + "step": 33511 + }, + { + "epoch": 0.9937431426622779, + "grad_norm": 0.06520868837833405, + "learning_rate": 9.949340888648629e-08, + "loss": 2.5794, + "step": 33512 + }, + { + "epoch": 0.9937727960145895, + "grad_norm": 0.06443954259157181, + "learning_rate": 9.855703630529123e-08, + "loss": 2.5844, + "step": 33513 + }, + { + "epoch": 0.9938024493669009, + "grad_norm": 0.06802321970462799, + "learning_rate": 9.762509043714784e-08, + "loss": 2.5765, + "step": 33514 + }, + { + "epoch": 0.9938321027192124, + "grad_norm": 0.0655113235116005, + "learning_rate": 9.669757129038282e-08, + "loss": 2.5714, + "step": 33515 + }, + { + "epoch": 0.9938617560715239, + "grad_norm": 0.06454827636480331, + "learning_rate": 9.577447887315627e-08, + "loss": 2.5689, + "step": 33516 + }, + { + "epoch": 0.9938914094238354, + "grad_norm": 0.06523717194795609, + "learning_rate": 9.485581319362835e-08, + "loss": 2.5591, + "step": 33517 + }, + { + "epoch": 0.9939210627761469, + "grad_norm": 0.0639549046754837, + "learning_rate": 9.39415742599592e-08, + "loss": 2.5881, + "step": 33518 + }, + { + "epoch": 0.9939507161284583, + "grad_norm": 0.06480588018894196, + "learning_rate": 9.303176208025344e-08, + "loss": 2.5882, + "step": 33519 + }, + { + "epoch": 0.9939803694807698, + "grad_norm": 0.06459157913923264, + "learning_rate": 9.212637666261569e-08, + "loss": 2.5464, + "step": 33520 + }, + { + "epoch": 0.9940100228330813, + "grad_norm": 0.06582602113485336, + "learning_rate": 9.122541801492856e-08, + "loss": 2.5641, + "step": 33521 + }, + { + "epoch": 0.9940396761853928, + "grad_norm": 0.0639057606458664, + "learning_rate": 9.032888614529667e-08, + "loss": 2.5639, + "step": 33522 + }, + { + "epoch": 0.9940693295377042, + "grad_norm": 0.06391505897045135, + "learning_rate": 8.943678106154706e-08, + "loss": 2.527, + "step": 33523 + }, + { + "epoch": 0.9940989828900157, + "grad_norm": 0.06420746445655823, + "learning_rate": 8.854910277172889e-08, + "loss": 2.5532, + "step": 33524 + }, + { + "epoch": 0.9941286362423272, + "grad_norm": 0.06775595992803574, + "learning_rate": 8.766585128355819e-08, + "loss": 2.5418, + "step": 33525 + }, + { + "epoch": 0.9941582895946387, + "grad_norm": 0.06530475616455078, + "learning_rate": 8.678702660491755e-08, + "loss": 2.6059, + "step": 33526 + }, + { + "epoch": 0.9941879429469501, + "grad_norm": 0.06909339874982834, + "learning_rate": 8.591262874363403e-08, + "loss": 2.5861, + "step": 33527 + }, + { + "epoch": 0.9942175962992617, + "grad_norm": 0.06510824710130692, + "learning_rate": 8.504265770736819e-08, + "loss": 2.5311, + "step": 33528 + }, + { + "epoch": 0.9942472496515731, + "grad_norm": 0.06767326593399048, + "learning_rate": 8.417711350383605e-08, + "loss": 2.5578, + "step": 33529 + }, + { + "epoch": 0.9942769030038846, + "grad_norm": 0.0678657740354538, + "learning_rate": 8.331599614075369e-08, + "loss": 2.5547, + "step": 33530 + }, + { + "epoch": 0.994306556356196, + "grad_norm": 0.06495003402233124, + "learning_rate": 8.245930562572613e-08, + "loss": 2.5391, + "step": 33531 + }, + { + "epoch": 0.9943362097085076, + "grad_norm": 0.06451502442359924, + "learning_rate": 8.160704196630286e-08, + "loss": 2.5726, + "step": 33532 + }, + { + "epoch": 0.994365863060819, + "grad_norm": 0.06318158656358719, + "learning_rate": 8.075920517008895e-08, + "loss": 2.5955, + "step": 33533 + }, + { + "epoch": 0.9943955164131305, + "grad_norm": 0.06803079694509506, + "learning_rate": 7.991579524457836e-08, + "loss": 2.5738, + "step": 33534 + }, + { + "epoch": 0.9944251697654419, + "grad_norm": 0.06346142292022705, + "learning_rate": 7.907681219715413e-08, + "loss": 2.5598, + "step": 33535 + }, + { + "epoch": 0.9944548231177535, + "grad_norm": 0.06687434762716293, + "learning_rate": 7.824225603536573e-08, + "loss": 2.5523, + "step": 33536 + }, + { + "epoch": 0.994484476470065, + "grad_norm": 0.06535611301660538, + "learning_rate": 7.741212676654064e-08, + "loss": 2.5873, + "step": 33537 + }, + { + "epoch": 0.9945141298223764, + "grad_norm": 0.06985478848218918, + "learning_rate": 7.658642439806185e-08, + "loss": 2.5763, + "step": 33538 + }, + { + "epoch": 0.994543783174688, + "grad_norm": 0.06689450144767761, + "learning_rate": 7.576514893720132e-08, + "loss": 2.5699, + "step": 33539 + }, + { + "epoch": 0.9945734365269994, + "grad_norm": 0.06425966322422028, + "learning_rate": 7.494830039123101e-08, + "loss": 2.5484, + "step": 33540 + }, + { + "epoch": 0.9946030898793109, + "grad_norm": 0.06734805554151535, + "learning_rate": 7.413587876742289e-08, + "loss": 2.6107, + "step": 33541 + }, + { + "epoch": 0.9946327432316223, + "grad_norm": 0.06596115231513977, + "learning_rate": 7.33278840729934e-08, + "loss": 2.5675, + "step": 33542 + }, + { + "epoch": 0.9946623965839338, + "grad_norm": 0.06360548734664917, + "learning_rate": 7.252431631499246e-08, + "loss": 2.5417, + "step": 33543 + }, + { + "epoch": 0.9946920499362453, + "grad_norm": 0.0659642219543457, + "learning_rate": 7.172517550063651e-08, + "loss": 2.5673, + "step": 33544 + }, + { + "epoch": 0.9947217032885568, + "grad_norm": 0.06558408588171005, + "learning_rate": 7.093046163697548e-08, + "loss": 2.5698, + "step": 33545 + }, + { + "epoch": 0.9947513566408682, + "grad_norm": 0.06397926807403564, + "learning_rate": 7.014017473100375e-08, + "loss": 2.5752, + "step": 33546 + }, + { + "epoch": 0.9947810099931798, + "grad_norm": 0.06597589701414108, + "learning_rate": 6.935431478977128e-08, + "loss": 2.5759, + "step": 33547 + }, + { + "epoch": 0.9948106633454912, + "grad_norm": 0.06474237889051437, + "learning_rate": 6.857288182021692e-08, + "loss": 2.5924, + "step": 33548 + }, + { + "epoch": 0.9948403166978027, + "grad_norm": 0.06583216786384583, + "learning_rate": 6.779587582927959e-08, + "loss": 2.5366, + "step": 33549 + }, + { + "epoch": 0.9948699700501141, + "grad_norm": 0.06605995446443558, + "learning_rate": 6.702329682378717e-08, + "loss": 2.5367, + "step": 33550 + }, + { + "epoch": 0.9948996234024257, + "grad_norm": 0.06789389252662659, + "learning_rate": 6.6255144810623e-08, + "loss": 2.5918, + "step": 33551 + }, + { + "epoch": 0.9949292767547371, + "grad_norm": 0.06579034775495529, + "learning_rate": 6.549141979661499e-08, + "loss": 2.5383, + "step": 33552 + }, + { + "epoch": 0.9949589301070486, + "grad_norm": 0.06442640721797943, + "learning_rate": 6.473212178842446e-08, + "loss": 2.5826, + "step": 33553 + }, + { + "epoch": 0.99498858345936, + "grad_norm": 0.06704283505678177, + "learning_rate": 6.397725079287931e-08, + "loss": 2.5709, + "step": 33554 + }, + { + "epoch": 0.9950182368116716, + "grad_norm": 0.06627288460731506, + "learning_rate": 6.322680681664083e-08, + "loss": 2.5614, + "step": 33555 + }, + { + "epoch": 0.995047890163983, + "grad_norm": 0.06331980973482132, + "learning_rate": 6.24807898663704e-08, + "loss": 2.5526, + "step": 33556 + }, + { + "epoch": 0.9950775435162945, + "grad_norm": 0.06636689603328705, + "learning_rate": 6.173919994861833e-08, + "loss": 2.542, + "step": 33557 + }, + { + "epoch": 0.995107196868606, + "grad_norm": 0.06525122374296188, + "learning_rate": 6.10020370699349e-08, + "loss": 2.5467, + "step": 33558 + }, + { + "epoch": 0.9951368502209175, + "grad_norm": 0.0650331974029541, + "learning_rate": 6.026930123692598e-08, + "loss": 2.5229, + "step": 33559 + }, + { + "epoch": 0.995166503573229, + "grad_norm": 0.06555724143981934, + "learning_rate": 5.9540992456086354e-08, + "loss": 2.561, + "step": 33560 + }, + { + "epoch": 0.9951961569255404, + "grad_norm": 0.065681092441082, + "learning_rate": 5.881711073379981e-08, + "loss": 2.551, + "step": 33561 + }, + { + "epoch": 0.995225810277852, + "grad_norm": 0.0638367161154747, + "learning_rate": 5.809765607645012e-08, + "loss": 2.5738, + "step": 33562 + }, + { + "epoch": 0.9952554636301634, + "grad_norm": 0.06501586735248566, + "learning_rate": 5.7382628490532105e-08, + "loss": 2.5776, + "step": 33563 + }, + { + "epoch": 0.9952851169824749, + "grad_norm": 0.06673186272382736, + "learning_rate": 5.6672027982263006e-08, + "loss": 2.5442, + "step": 33564 + }, + { + "epoch": 0.9953147703347863, + "grad_norm": 0.06155013665556908, + "learning_rate": 5.5965854557971095e-08, + "loss": 2.5385, + "step": 33565 + }, + { + "epoch": 0.9953444236870979, + "grad_norm": 0.0650225505232811, + "learning_rate": 5.526410822392913e-08, + "loss": 2.5802, + "step": 33566 + }, + { + "epoch": 0.9953740770394093, + "grad_norm": 0.06515329331159592, + "learning_rate": 5.456678898635436e-08, + "loss": 2.5455, + "step": 33567 + }, + { + "epoch": 0.9954037303917208, + "grad_norm": 0.0662330836057663, + "learning_rate": 5.387389685140853e-08, + "loss": 2.5704, + "step": 33568 + }, + { + "epoch": 0.9954333837440322, + "grad_norm": 0.06487753987312317, + "learning_rate": 5.318543182519786e-08, + "loss": 2.5727, + "step": 33569 + }, + { + "epoch": 0.9954630370963438, + "grad_norm": 0.06851249188184738, + "learning_rate": 5.250139391382858e-08, + "loss": 2.563, + "step": 33570 + }, + { + "epoch": 0.9954926904486552, + "grad_norm": 0.06716292351484299, + "learning_rate": 5.182178312340691e-08, + "loss": 2.5993, + "step": 33571 + }, + { + "epoch": 0.9955223438009667, + "grad_norm": 0.0673932209610939, + "learning_rate": 5.1146599459928054e-08, + "loss": 2.5368, + "step": 33572 + }, + { + "epoch": 0.9955519971532781, + "grad_norm": 0.06832768768072128, + "learning_rate": 5.047584292933172e-08, + "loss": 2.5361, + "step": 33573 + }, + { + "epoch": 0.9955816505055897, + "grad_norm": 0.06448537856340408, + "learning_rate": 4.9809513537613096e-08, + "loss": 2.537, + "step": 33574 + }, + { + "epoch": 0.9956113038579011, + "grad_norm": 0.06393151730298996, + "learning_rate": 4.914761129060086e-08, + "loss": 2.532, + "step": 33575 + }, + { + "epoch": 0.9956409572102126, + "grad_norm": 0.0632992535829544, + "learning_rate": 4.84901361942347e-08, + "loss": 2.5632, + "step": 33576 + }, + { + "epoch": 0.995670610562524, + "grad_norm": 0.07349705696105957, + "learning_rate": 4.7837088254343296e-08, + "loss": 2.5639, + "step": 33577 + }, + { + "epoch": 0.9957002639148356, + "grad_norm": 0.06530850380659103, + "learning_rate": 4.7188467476588784e-08, + "loss": 2.5826, + "step": 33578 + }, + { + "epoch": 0.9957299172671471, + "grad_norm": 0.06503099948167801, + "learning_rate": 4.654427386685534e-08, + "loss": 2.5964, + "step": 33579 + }, + { + "epoch": 0.9957595706194585, + "grad_norm": 0.06990589201450348, + "learning_rate": 4.5904507430749584e-08, + "loss": 2.575, + "step": 33580 + }, + { + "epoch": 0.99578922397177, + "grad_norm": 0.06685379147529602, + "learning_rate": 4.52691681740447e-08, + "loss": 2.5541, + "step": 33581 + }, + { + "epoch": 0.9958188773240815, + "grad_norm": 0.06660442799329758, + "learning_rate": 4.463825610223626e-08, + "loss": 2.5553, + "step": 33582 + }, + { + "epoch": 0.995848530676393, + "grad_norm": 0.06779745221138, + "learning_rate": 4.401177122098643e-08, + "loss": 2.5739, + "step": 33583 + }, + { + "epoch": 0.9958781840287044, + "grad_norm": 0.06725352257490158, + "learning_rate": 4.33897135357908e-08, + "loss": 2.5314, + "step": 33584 + }, + { + "epoch": 0.995907837381016, + "grad_norm": 0.06258350610733032, + "learning_rate": 4.2772083052255994e-08, + "loss": 2.5411, + "step": 33585 + }, + { + "epoch": 0.9959374907333274, + "grad_norm": 0.06468796730041504, + "learning_rate": 4.21588797757666e-08, + "loss": 2.5374, + "step": 33586 + }, + { + "epoch": 0.9959671440856389, + "grad_norm": 0.06362693756818771, + "learning_rate": 4.155010371176271e-08, + "loss": 2.5751, + "step": 33587 + }, + { + "epoch": 0.9959967974379503, + "grad_norm": 0.06430462002754211, + "learning_rate": 4.094575486568442e-08, + "loss": 2.5669, + "step": 33588 + }, + { + "epoch": 0.9960264507902619, + "grad_norm": 0.06480565667152405, + "learning_rate": 4.0345833242805276e-08, + "loss": 2.5639, + "step": 33589 + }, + { + "epoch": 0.9960561041425733, + "grad_norm": 0.06401887536048889, + "learning_rate": 3.975033884850987e-08, + "loss": 2.5915, + "step": 33590 + }, + { + "epoch": 0.9960857574948848, + "grad_norm": 0.06625223159790039, + "learning_rate": 3.915927168801625e-08, + "loss": 2.5542, + "step": 33591 + }, + { + "epoch": 0.9961154108471962, + "grad_norm": 0.06623136252164841, + "learning_rate": 3.8572631766653487e-08, + "loss": 2.5544, + "step": 33592 + }, + { + "epoch": 0.9961450641995078, + "grad_norm": 0.06550756841897964, + "learning_rate": 3.799041908947309e-08, + "loss": 2.5601, + "step": 33593 + }, + { + "epoch": 0.9961747175518192, + "grad_norm": 0.06672792136669159, + "learning_rate": 3.741263366174863e-08, + "loss": 2.5974, + "step": 33594 + }, + { + "epoch": 0.9962043709041307, + "grad_norm": 0.06234763190150261, + "learning_rate": 3.683927548853161e-08, + "loss": 2.589, + "step": 33595 + }, + { + "epoch": 0.9962340242564421, + "grad_norm": 0.06339732557535172, + "learning_rate": 3.627034457492906e-08, + "loss": 2.5691, + "step": 33596 + }, + { + "epoch": 0.9962636776087537, + "grad_norm": 0.0640668123960495, + "learning_rate": 3.570584092593698e-08, + "loss": 2.5345, + "step": 33597 + }, + { + "epoch": 0.9962933309610651, + "grad_norm": 0.06499364227056503, + "learning_rate": 3.51457645466069e-08, + "loss": 2.5807, + "step": 33598 + }, + { + "epoch": 0.9963229843133766, + "grad_norm": 0.06547845900058746, + "learning_rate": 3.459011544187929e-08, + "loss": 2.6227, + "step": 33599 + }, + { + "epoch": 0.9963526376656882, + "grad_norm": 0.06535229831933975, + "learning_rate": 3.403889361669465e-08, + "loss": 2.5704, + "step": 33600 + }, + { + "epoch": 0.9963822910179996, + "grad_norm": 0.07140438258647919, + "learning_rate": 3.349209907588247e-08, + "loss": 2.5898, + "step": 33601 + }, + { + "epoch": 0.9964119443703111, + "grad_norm": 0.06604969501495361, + "learning_rate": 3.294973182438321e-08, + "loss": 2.5593, + "step": 33602 + }, + { + "epoch": 0.9964415977226225, + "grad_norm": 0.06563783437013626, + "learning_rate": 3.241179186685983e-08, + "loss": 2.5914, + "step": 33603 + }, + { + "epoch": 0.9964712510749341, + "grad_norm": 0.06330104917287827, + "learning_rate": 3.187827920814179e-08, + "loss": 2.5764, + "step": 33604 + }, + { + "epoch": 0.9965009044272455, + "grad_norm": 0.06407997012138367, + "learning_rate": 3.134919385300306e-08, + "loss": 2.5443, + "step": 33605 + }, + { + "epoch": 0.996530557779557, + "grad_norm": 0.07162216305732727, + "learning_rate": 3.082453580610656e-08, + "loss": 2.5491, + "step": 33606 + }, + { + "epoch": 0.9965602111318684, + "grad_norm": 0.06825640052556992, + "learning_rate": 3.030430507200421e-08, + "loss": 2.5583, + "step": 33607 + }, + { + "epoch": 0.99658986448418, + "grad_norm": 0.06580028682947159, + "learning_rate": 2.978850165541447e-08, + "loss": 2.5444, + "step": 33608 + }, + { + "epoch": 0.9966195178364914, + "grad_norm": 0.06380478292703629, + "learning_rate": 2.9277125560889238e-08, + "loss": 2.5545, + "step": 33609 + }, + { + "epoch": 0.9966491711888029, + "grad_norm": 0.06565167754888535, + "learning_rate": 2.8770176792924928e-08, + "loss": 2.5551, + "step": 33610 + }, + { + "epoch": 0.9966788245411143, + "grad_norm": 0.06472856551408768, + "learning_rate": 2.826765535596243e-08, + "loss": 2.5532, + "step": 33611 + }, + { + "epoch": 0.9967084778934259, + "grad_norm": 0.06281960755586624, + "learning_rate": 2.7769561254553656e-08, + "loss": 2.5738, + "step": 33612 + }, + { + "epoch": 0.9967381312457373, + "grad_norm": 0.06845670938491821, + "learning_rate": 2.7275894493083986e-08, + "loss": 2.5865, + "step": 33613 + }, + { + "epoch": 0.9967677845980488, + "grad_norm": 0.06385979056358337, + "learning_rate": 2.6786655075883292e-08, + "loss": 2.5933, + "step": 33614 + }, + { + "epoch": 0.9967974379503602, + "grad_norm": 0.06748829036951065, + "learning_rate": 2.6301843007281445e-08, + "loss": 2.5465, + "step": 33615 + }, + { + "epoch": 0.9968270913026718, + "grad_norm": 0.06498958170413971, + "learning_rate": 2.5821458291663825e-08, + "loss": 2.5538, + "step": 33616 + }, + { + "epoch": 0.9968567446549832, + "grad_norm": 0.0656876415014267, + "learning_rate": 2.5345500933138254e-08, + "loss": 2.5461, + "step": 33617 + }, + { + "epoch": 0.9968863980072947, + "grad_norm": 0.06364694982767105, + "learning_rate": 2.4873970936034605e-08, + "loss": 2.5836, + "step": 33618 + }, + { + "epoch": 0.9969160513596063, + "grad_norm": 0.0654037743806839, + "learning_rate": 2.4406868304516215e-08, + "loss": 2.5352, + "step": 33619 + }, + { + "epoch": 0.9969457047119177, + "grad_norm": 0.06484024226665497, + "learning_rate": 2.394419304269091e-08, + "loss": 2.5416, + "step": 33620 + }, + { + "epoch": 0.9969753580642292, + "grad_norm": 0.06742911785840988, + "learning_rate": 2.3485945154611e-08, + "loss": 2.5654, + "step": 33621 + }, + { + "epoch": 0.9970050114165406, + "grad_norm": 0.06090177223086357, + "learning_rate": 2.3032124644439823e-08, + "loss": 2.5475, + "step": 33622 + }, + { + "epoch": 0.9970346647688522, + "grad_norm": 0.06460104882717133, + "learning_rate": 2.258273151606316e-08, + "loss": 2.6082, + "step": 33623 + }, + { + "epoch": 0.9970643181211636, + "grad_norm": 0.06371895223855972, + "learning_rate": 2.2137765773588835e-08, + "loss": 2.5922, + "step": 33624 + }, + { + "epoch": 0.9970939714734751, + "grad_norm": 0.06511005759239197, + "learning_rate": 2.169722742090263e-08, + "loss": 2.5578, + "step": 33625 + }, + { + "epoch": 0.9971236248257865, + "grad_norm": 0.06591013818979263, + "learning_rate": 2.126111646189033e-08, + "loss": 2.5471, + "step": 33626 + }, + { + "epoch": 0.9971532781780981, + "grad_norm": 0.06560184061527252, + "learning_rate": 2.0829432900493216e-08, + "loss": 2.5674, + "step": 33627 + }, + { + "epoch": 0.9971829315304095, + "grad_norm": 0.06465036422014236, + "learning_rate": 2.0402176740375034e-08, + "loss": 2.5615, + "step": 33628 + }, + { + "epoch": 0.997212584882721, + "grad_norm": 0.06364239007234573, + "learning_rate": 1.997934798547707e-08, + "loss": 2.5753, + "step": 33629 + }, + { + "epoch": 0.9972422382350324, + "grad_norm": 0.06834492832422256, + "learning_rate": 1.956094663946306e-08, + "loss": 2.5614, + "step": 33630 + }, + { + "epoch": 0.997271891587344, + "grad_norm": 0.06511641293764114, + "learning_rate": 1.9146972706107768e-08, + "loss": 2.5494, + "step": 33631 + }, + { + "epoch": 0.9973015449396554, + "grad_norm": 0.06773653626441956, + "learning_rate": 1.8737426188963903e-08, + "loss": 2.5768, + "step": 33632 + }, + { + "epoch": 0.9973311982919669, + "grad_norm": 0.06522074341773987, + "learning_rate": 1.833230709175071e-08, + "loss": 2.5168, + "step": 33633 + }, + { + "epoch": 0.9973608516442783, + "grad_norm": 0.06501169502735138, + "learning_rate": 1.793161541802091e-08, + "loss": 2.5487, + "step": 33634 + }, + { + "epoch": 0.9973905049965899, + "grad_norm": 0.06599073112010956, + "learning_rate": 1.7535351171271697e-08, + "loss": 2.5781, + "step": 33635 + }, + { + "epoch": 0.9974201583489013, + "grad_norm": 0.0665142610669136, + "learning_rate": 1.7143514355166813e-08, + "loss": 2.5823, + "step": 33636 + }, + { + "epoch": 0.9974498117012128, + "grad_norm": 0.06532701849937439, + "learning_rate": 1.675610497298141e-08, + "loss": 2.5502, + "step": 33637 + }, + { + "epoch": 0.9974794650535243, + "grad_norm": 0.06450517475605011, + "learning_rate": 1.6373123028323723e-08, + "loss": 2.5173, + "step": 33638 + }, + { + "epoch": 0.9975091184058358, + "grad_norm": 0.06955007463693619, + "learning_rate": 1.59945685244689e-08, + "loss": 2.5536, + "step": 33639 + }, + { + "epoch": 0.9975387717581473, + "grad_norm": 0.06508027017116547, + "learning_rate": 1.5620441464803124e-08, + "loss": 2.5577, + "step": 33640 + }, + { + "epoch": 0.9975684251104587, + "grad_norm": 0.06581191718578339, + "learning_rate": 1.5250741852601556e-08, + "loss": 2.5928, + "step": 33641 + }, + { + "epoch": 0.9975980784627703, + "grad_norm": 0.06702511757612228, + "learning_rate": 1.4885469691250374e-08, + "loss": 2.603, + "step": 33642 + }, + { + "epoch": 0.9976277318150817, + "grad_norm": 0.06624499708414078, + "learning_rate": 1.4524624983858203e-08, + "loss": 2.5765, + "step": 33643 + }, + { + "epoch": 0.9976573851673932, + "grad_norm": 0.0659359022974968, + "learning_rate": 1.41682077337002e-08, + "loss": 2.5496, + "step": 33644 + }, + { + "epoch": 0.9976870385197046, + "grad_norm": 0.06651744991540909, + "learning_rate": 1.381621794388499e-08, + "loss": 2.5888, + "step": 33645 + }, + { + "epoch": 0.9977166918720162, + "grad_norm": 0.06622853130102158, + "learning_rate": 1.3468655617521197e-08, + "loss": 2.5643, + "step": 33646 + }, + { + "epoch": 0.9977463452243276, + "grad_norm": 0.06552214920520782, + "learning_rate": 1.3125520757772958e-08, + "loss": 2.5505, + "step": 33647 + }, + { + "epoch": 0.9977759985766391, + "grad_norm": 0.06552308797836304, + "learning_rate": 1.2786813367582362e-08, + "loss": 2.5873, + "step": 33648 + }, + { + "epoch": 0.9978056519289505, + "grad_norm": 0.06563222408294678, + "learning_rate": 1.2452533450002523e-08, + "loss": 2.5457, + "step": 33649 + }, + { + "epoch": 0.9978353052812621, + "grad_norm": 0.06682946532964706, + "learning_rate": 1.2122681007975533e-08, + "loss": 2.6033, + "step": 33650 + }, + { + "epoch": 0.9978649586335735, + "grad_norm": 0.06673219799995422, + "learning_rate": 1.1797256044387971e-08, + "loss": 2.5842, + "step": 33651 + }, + { + "epoch": 0.997894611985885, + "grad_norm": 0.0652557760477066, + "learning_rate": 1.1476258562181929e-08, + "loss": 2.5529, + "step": 33652 + }, + { + "epoch": 0.9979242653381964, + "grad_norm": 0.06308048963546753, + "learning_rate": 1.1159688564188475e-08, + "loss": 2.5289, + "step": 33653 + }, + { + "epoch": 0.997953918690508, + "grad_norm": 0.06252738833427429, + "learning_rate": 1.0847546053183165e-08, + "loss": 2.5712, + "step": 33654 + }, + { + "epoch": 0.9979835720428194, + "grad_norm": 0.06324788182973862, + "learning_rate": 1.0539831031997071e-08, + "loss": 2.5475, + "step": 33655 + }, + { + "epoch": 0.9980132253951309, + "grad_norm": 0.0691889226436615, + "learning_rate": 1.0236543503239215e-08, + "loss": 2.5858, + "step": 33656 + }, + { + "epoch": 0.9980428787474424, + "grad_norm": 0.0678345188498497, + "learning_rate": 9.937683469685155e-09, + "loss": 2.6035, + "step": 33657 + }, + { + "epoch": 0.9980725320997539, + "grad_norm": 0.06425690650939941, + "learning_rate": 9.643250933943915e-09, + "loss": 2.5612, + "step": 33658 + }, + { + "epoch": 0.9981021854520653, + "grad_norm": 0.06782712042331696, + "learning_rate": 9.353245898624518e-09, + "loss": 2.5734, + "step": 33659 + }, + { + "epoch": 0.9981318388043768, + "grad_norm": 0.06901752948760986, + "learning_rate": 9.06766836633599e-09, + "loss": 2.5458, + "step": 33660 + }, + { + "epoch": 0.9981614921566884, + "grad_norm": 0.0664505735039711, + "learning_rate": 8.786518339576332e-09, + "loss": 2.5409, + "step": 33661 + }, + { + "epoch": 0.9981911455089998, + "grad_norm": 0.06352637708187103, + "learning_rate": 8.509795820788036e-09, + "loss": 2.5482, + "step": 33662 + }, + { + "epoch": 0.9982207988613113, + "grad_norm": 0.06591980904340744, + "learning_rate": 8.237500812524612e-09, + "loss": 2.5842, + "step": 33663 + }, + { + "epoch": 0.9982504522136227, + "grad_norm": 0.06600088626146317, + "learning_rate": 7.96963331711753e-09, + "loss": 2.5801, + "step": 33664 + }, + { + "epoch": 0.9982801055659343, + "grad_norm": 0.06309519708156586, + "learning_rate": 7.706193336953771e-09, + "loss": 2.566, + "step": 33665 + }, + { + "epoch": 0.9983097589182457, + "grad_norm": 0.0655239000916481, + "learning_rate": 7.4471808743648e-09, + "loss": 2.6012, + "step": 33666 + }, + { + "epoch": 0.9983394122705572, + "grad_norm": 0.06562425196170807, + "learning_rate": 7.192595931682089e-09, + "loss": 2.5836, + "step": 33667 + }, + { + "epoch": 0.9983690656228686, + "grad_norm": 0.064283087849617, + "learning_rate": 6.94243851107057e-09, + "loss": 2.5658, + "step": 33668 + }, + { + "epoch": 0.9983987189751802, + "grad_norm": 0.06586740911006927, + "learning_rate": 6.696708614861713e-09, + "loss": 2.5691, + "step": 33669 + }, + { + "epoch": 0.9984283723274916, + "grad_norm": 0.0665392130613327, + "learning_rate": 6.455406245164941e-09, + "loss": 2.5914, + "step": 33670 + }, + { + "epoch": 0.9984580256798031, + "grad_norm": 0.06362605094909668, + "learning_rate": 6.218531404145189e-09, + "loss": 2.5456, + "step": 33671 + }, + { + "epoch": 0.9984876790321146, + "grad_norm": 0.063368059694767, + "learning_rate": 5.98608409385637e-09, + "loss": 2.5287, + "step": 33672 + }, + { + "epoch": 0.9985173323844261, + "grad_norm": 0.06760725378990173, + "learning_rate": 5.7580643164079075e-09, + "loss": 2.5957, + "step": 33673 + }, + { + "epoch": 0.9985469857367375, + "grad_norm": 0.0693790391087532, + "learning_rate": 5.534472073798202e-09, + "loss": 2.5817, + "step": 33674 + }, + { + "epoch": 0.998576639089049, + "grad_norm": 0.06534714251756668, + "learning_rate": 5.315307367970146e-09, + "loss": 2.59, + "step": 33675 + }, + { + "epoch": 0.9986062924413605, + "grad_norm": 0.06397242844104767, + "learning_rate": 5.100570200922139e-09, + "loss": 2.5713, + "step": 33676 + }, + { + "epoch": 0.998635945793672, + "grad_norm": 0.06403239071369171, + "learning_rate": 4.890260574541561e-09, + "loss": 2.5774, + "step": 33677 + }, + { + "epoch": 0.9986655991459834, + "grad_norm": 0.06738237291574478, + "learning_rate": 4.6843784906602796e-09, + "loss": 2.522, + "step": 33678 + }, + { + "epoch": 0.9986952524982949, + "grad_norm": 0.06362387537956238, + "learning_rate": 4.4829239511101625e-09, + "loss": 2.5466, + "step": 33679 + }, + { + "epoch": 0.9987249058506064, + "grad_norm": 0.0659869983792305, + "learning_rate": 4.285896957723079e-09, + "loss": 2.5691, + "step": 33680 + }, + { + "epoch": 0.9987545592029179, + "grad_norm": 0.06478781998157501, + "learning_rate": 4.093297512164362e-09, + "loss": 2.5704, + "step": 33681 + }, + { + "epoch": 0.9987842125552294, + "grad_norm": 0.06550644338130951, + "learning_rate": 3.90512561621037e-09, + "loss": 2.5694, + "step": 33682 + }, + { + "epoch": 0.9988138659075408, + "grad_norm": 0.06437807530164719, + "learning_rate": 3.721381271470925e-09, + "loss": 2.5541, + "step": 33683 + }, + { + "epoch": 0.9988435192598524, + "grad_norm": 0.06350070983171463, + "learning_rate": 3.5420644796668734e-09, + "loss": 2.5651, + "step": 33684 + }, + { + "epoch": 0.9988731726121638, + "grad_norm": 0.06303587555885315, + "learning_rate": 3.3671752422415046e-09, + "loss": 2.5239, + "step": 33685 + }, + { + "epoch": 0.9989028259644753, + "grad_norm": 0.06620875746011734, + "learning_rate": 3.196713560860154e-09, + "loss": 2.5383, + "step": 33686 + }, + { + "epoch": 0.9989324793167867, + "grad_norm": 0.06178933009505272, + "learning_rate": 3.0306794370216218e-09, + "loss": 2.5395, + "step": 33687 + }, + { + "epoch": 0.9989621326690983, + "grad_norm": 0.06454978883266449, + "learning_rate": 2.8690728721136872e-09, + "loss": 2.5832, + "step": 33688 + }, + { + "epoch": 0.9989917860214097, + "grad_norm": 0.06577923893928528, + "learning_rate": 2.711893867690662e-09, + "loss": 2.5777, + "step": 33689 + }, + { + "epoch": 0.9990214393737212, + "grad_norm": 0.06472418457269669, + "learning_rate": 2.559142425029304e-09, + "loss": 2.5634, + "step": 33690 + }, + { + "epoch": 0.9990510927260327, + "grad_norm": 0.06506563723087311, + "learning_rate": 2.4108185455173905e-09, + "loss": 2.597, + "step": 33691 + }, + { + "epoch": 0.9990807460783442, + "grad_norm": 0.06556876748800278, + "learning_rate": 2.2669222305427005e-09, + "loss": 2.6329, + "step": 33692 + }, + { + "epoch": 0.9991103994306556, + "grad_norm": 0.06354034692049026, + "learning_rate": 2.127453481270969e-09, + "loss": 2.5376, + "step": 33693 + }, + { + "epoch": 0.9991400527829671, + "grad_norm": 0.0663309097290039, + "learning_rate": 1.992412298978952e-09, + "loss": 2.5904, + "step": 33694 + }, + { + "epoch": 0.9991697061352786, + "grad_norm": 0.0633561983704567, + "learning_rate": 1.8617986848878942e-09, + "loss": 2.5679, + "step": 33695 + }, + { + "epoch": 0.9991993594875901, + "grad_norm": 0.06658925116062164, + "learning_rate": 1.7356126401080196e-09, + "loss": 2.5526, + "step": 33696 + }, + { + "epoch": 0.9992290128399015, + "grad_norm": 0.06124851852655411, + "learning_rate": 1.613854165805062e-09, + "loss": 2.5608, + "step": 33697 + }, + { + "epoch": 0.999258666192213, + "grad_norm": 0.06253410130739212, + "learning_rate": 1.4965232630337332e-09, + "loss": 2.5416, + "step": 33698 + }, + { + "epoch": 0.9992883195445245, + "grad_norm": 0.06229862943291664, + "learning_rate": 1.3836199328487452e-09, + "loss": 2.5841, + "step": 33699 + }, + { + "epoch": 0.999317972896836, + "grad_norm": 0.06849925220012665, + "learning_rate": 1.2751441762492988e-09, + "loss": 2.5552, + "step": 33700 + }, + { + "epoch": 0.9993476262491474, + "grad_norm": 0.06870552152395248, + "learning_rate": 1.1710959941235722e-09, + "loss": 2.5475, + "step": 33701 + }, + { + "epoch": 0.999377279601459, + "grad_norm": 0.0663042962551117, + "learning_rate": 1.071475387470766e-09, + "loss": 2.5458, + "step": 33702 + }, + { + "epoch": 0.9994069329537705, + "grad_norm": 0.06376367062330246, + "learning_rate": 9.762823571790592e-10, + "loss": 2.5426, + "step": 33703 + }, + { + "epoch": 0.9994365863060819, + "grad_norm": 0.06717630475759506, + "learning_rate": 8.855169040256072e-10, + "loss": 2.5695, + "step": 33704 + }, + { + "epoch": 0.9994662396583934, + "grad_norm": 0.06495529413223267, + "learning_rate": 7.99179028898589e-10, + "loss": 2.5615, + "step": 33705 + }, + { + "epoch": 0.9994958930107049, + "grad_norm": 0.06639502197504044, + "learning_rate": 7.172687325196492e-10, + "loss": 2.5801, + "step": 33706 + }, + { + "epoch": 0.9995255463630164, + "grad_norm": 0.0674513652920723, + "learning_rate": 6.397860155549218e-10, + "loss": 2.5636, + "step": 33707 + }, + { + "epoch": 0.9995551997153278, + "grad_norm": 0.06823224574327469, + "learning_rate": 5.66730878837074e-10, + "loss": 2.602, + "step": 33708 + }, + { + "epoch": 0.9995848530676393, + "grad_norm": 0.06825883686542511, + "learning_rate": 4.981033228657061e-10, + "loss": 2.5983, + "step": 33709 + }, + { + "epoch": 0.9996145064199508, + "grad_norm": 0.06676657497882843, + "learning_rate": 4.339033483069521e-10, + "loss": 2.5562, + "step": 33710 + }, + { + "epoch": 0.9996441597722623, + "grad_norm": 0.06410720199346542, + "learning_rate": 3.7413095571592337e-10, + "loss": 2.557, + "step": 33711 + }, + { + "epoch": 0.9996738131245737, + "grad_norm": 0.06293581426143646, + "learning_rate": 3.187861457032426e-10, + "loss": 2.5931, + "step": 33712 + }, + { + "epoch": 0.9997034664768852, + "grad_norm": 0.06413663178682327, + "learning_rate": 2.678689186019767e-10, + "loss": 2.5666, + "step": 33713 + }, + { + "epoch": 0.9997331198291967, + "grad_norm": 0.06498309224843979, + "learning_rate": 2.2137927502274836e-10, + "loss": 2.5578, + "step": 33714 + }, + { + "epoch": 0.9997627731815082, + "grad_norm": 0.06504468619823456, + "learning_rate": 1.7931721529862444e-10, + "loss": 2.5867, + "step": 33715 + }, + { + "epoch": 0.9997924265338196, + "grad_norm": 0.06879068166017532, + "learning_rate": 1.4168273976267188e-10, + "loss": 2.5826, + "step": 33716 + }, + { + "epoch": 0.9998220798861311, + "grad_norm": 0.06325571984052658, + "learning_rate": 1.0847584885897987e-10, + "loss": 2.5688, + "step": 33717 + }, + { + "epoch": 0.9998517332384426, + "grad_norm": 0.0655665248632431, + "learning_rate": 7.969654275408189e-11, + "loss": 2.6082, + "step": 33718 + }, + { + "epoch": 0.9998813865907541, + "grad_norm": 0.06529773026704788, + "learning_rate": 5.534482183655598e-11, + "loss": 2.5648, + "step": 33719 + }, + { + "epoch": 0.9999110399430655, + "grad_norm": 0.06637618690729141, + "learning_rate": 3.5420686217424444e-11, + "loss": 2.6027, + "step": 33720 + }, + { + "epoch": 0.999940693295377, + "grad_norm": 0.06570054590702057, + "learning_rate": 1.9924136063220743e-11, + "loss": 2.5739, + "step": 33721 + }, + { + "epoch": 0.9999703466476885, + "grad_norm": 0.06647421419620514, + "learning_rate": 8.855171651500627e-12, + "loss": 2.5671, + "step": 33722 + }, + { + "epoch": 1.0, + "grad_norm": 0.06456394493579865, + "learning_rate": 2.2137929267529444e-12, + "loss": 2.584, + "step": 33723 + } + ], + "logging_steps": 1.0, + "max_steps": 33723, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 6745, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.05392005996189e+20, + "train_batch_size": 24, + "trial_name": null, + "trial_params": null +}