{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.987241531016278, "eval_steps": 500, "global_step": 567, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005279366476022877, "grad_norm": 9.06258192697587, "learning_rate": 0.0, "loss": 1.678, "step": 1 }, { "epoch": 0.010558732952045754, "grad_norm": 9.487470045686269, "learning_rate": 1.7543859649122808e-07, "loss": 1.8487, "step": 2 }, { "epoch": 0.01583809942806863, "grad_norm": 8.792837550815417, "learning_rate": 3.5087719298245616e-07, "loss": 1.6855, "step": 3 }, { "epoch": 0.02111746590409151, "grad_norm": 9.323941973098307, "learning_rate": 5.263157894736843e-07, "loss": 1.79, "step": 4 }, { "epoch": 0.026396832380114386, "grad_norm": 8.5661762778806, "learning_rate": 7.017543859649123e-07, "loss": 1.6796, "step": 5 }, { "epoch": 0.03167619885613726, "grad_norm": 8.780182073885886, "learning_rate": 8.771929824561404e-07, "loss": 1.8326, "step": 6 }, { "epoch": 0.03695556533216014, "grad_norm": 8.658784025795836, "learning_rate": 1.0526315789473685e-06, "loss": 1.7229, "step": 7 }, { "epoch": 0.04223493180818302, "grad_norm": 8.678284010844528, "learning_rate": 1.2280701754385965e-06, "loss": 1.782, "step": 8 }, { "epoch": 0.0475142982842059, "grad_norm": 7.4972200445720425, "learning_rate": 1.4035087719298246e-06, "loss": 1.6596, "step": 9 }, { "epoch": 0.05279366476022877, "grad_norm": 8.204566912821951, "learning_rate": 1.5789473684210526e-06, "loss": 1.6707, "step": 10 }, { "epoch": 0.05807303123625165, "grad_norm": 7.461093116720939, "learning_rate": 1.7543859649122807e-06, "loss": 1.718, "step": 11 }, { "epoch": 0.06335239771227452, "grad_norm": 6.113172095963636, "learning_rate": 1.929824561403509e-06, "loss": 1.5939, "step": 12 }, { "epoch": 0.0686317641882974, "grad_norm": 5.4039227506301835, "learning_rate": 2.105263157894737e-06, "loss": 1.497, "step": 13 }, { "epoch": 0.07391113066432028, "grad_norm": 5.072244053671971, "learning_rate": 2.280701754385965e-06, "loss": 1.4584, "step": 14 }, { "epoch": 0.07919049714034315, "grad_norm": 4.290896240507921, "learning_rate": 2.456140350877193e-06, "loss": 1.4996, "step": 15 }, { "epoch": 0.08446986361636603, "grad_norm": 3.563114688671191, "learning_rate": 2.631578947368421e-06, "loss": 1.4042, "step": 16 }, { "epoch": 0.08974923009238892, "grad_norm": 3.598254571007457, "learning_rate": 2.8070175438596493e-06, "loss": 1.4125, "step": 17 }, { "epoch": 0.0950285965684118, "grad_norm": 3.6215074072359865, "learning_rate": 2.9824561403508774e-06, "loss": 1.4723, "step": 18 }, { "epoch": 0.10030796304443466, "grad_norm": 2.915485138222026, "learning_rate": 3.157894736842105e-06, "loss": 1.2987, "step": 19 }, { "epoch": 0.10558732952045755, "grad_norm": 2.4634215162994595, "learning_rate": 3.3333333333333333e-06, "loss": 1.2651, "step": 20 }, { "epoch": 0.11086669599648043, "grad_norm": 2.3107374113003103, "learning_rate": 3.5087719298245615e-06, "loss": 1.2512, "step": 21 }, { "epoch": 0.1161460624725033, "grad_norm": 2.7621133107185707, "learning_rate": 3.6842105263157896e-06, "loss": 1.3191, "step": 22 }, { "epoch": 0.12142542894852618, "grad_norm": 2.877480148788483, "learning_rate": 3.859649122807018e-06, "loss": 1.2354, "step": 23 }, { "epoch": 0.12670479542454904, "grad_norm": 2.329645780694447, "learning_rate": 4.035087719298246e-06, "loss": 1.174, "step": 24 }, { "epoch": 0.13198416190057194, "grad_norm": 2.692865294147214, "learning_rate": 4.210526315789474e-06, "loss": 1.2824, "step": 25 }, { "epoch": 0.1372635283765948, "grad_norm": 2.2247599668267615, "learning_rate": 4.385964912280702e-06, "loss": 1.2023, "step": 26 }, { "epoch": 0.14254289485261767, "grad_norm": 2.038589631126849, "learning_rate": 4.56140350877193e-06, "loss": 1.1412, "step": 27 }, { "epoch": 0.14782226132864057, "grad_norm": 2.0910541565756002, "learning_rate": 4.736842105263158e-06, "loss": 1.1989, "step": 28 }, { "epoch": 0.15310162780466344, "grad_norm": 1.7517601173871857, "learning_rate": 4.912280701754386e-06, "loss": 1.0738, "step": 29 }, { "epoch": 0.1583809942806863, "grad_norm": 1.6413195717279263, "learning_rate": 5.087719298245615e-06, "loss": 1.0853, "step": 30 }, { "epoch": 0.1636603607567092, "grad_norm": 1.688876231888243, "learning_rate": 5.263157894736842e-06, "loss": 1.061, "step": 31 }, { "epoch": 0.16893972723273207, "grad_norm": 1.5670006557231517, "learning_rate": 5.438596491228071e-06, "loss": 1.1496, "step": 32 }, { "epoch": 0.17421909370875496, "grad_norm": 1.7072674079415744, "learning_rate": 5.6140350877192985e-06, "loss": 1.1424, "step": 33 }, { "epoch": 0.17949846018477783, "grad_norm": 1.6307968487984736, "learning_rate": 5.789473684210527e-06, "loss": 1.0655, "step": 34 }, { "epoch": 0.1847778266608007, "grad_norm": 1.6358139036334913, "learning_rate": 5.964912280701755e-06, "loss": 1.0241, "step": 35 }, { "epoch": 0.1900571931368236, "grad_norm": 1.4688899759958145, "learning_rate": 6.140350877192983e-06, "loss": 1.0076, "step": 36 }, { "epoch": 0.19533655961284646, "grad_norm": 1.5254490097810096, "learning_rate": 6.31578947368421e-06, "loss": 1.0991, "step": 37 }, { "epoch": 0.20061592608886933, "grad_norm": 1.5005708253602146, "learning_rate": 6.491228070175439e-06, "loss": 1.0824, "step": 38 }, { "epoch": 0.20589529256489222, "grad_norm": 1.420109858670133, "learning_rate": 6.666666666666667e-06, "loss": 0.994, "step": 39 }, { "epoch": 0.2111746590409151, "grad_norm": 1.4491053784050478, "learning_rate": 6.842105263157896e-06, "loss": 1.0583, "step": 40 }, { "epoch": 0.21645402551693796, "grad_norm": 1.448588096978498, "learning_rate": 7.017543859649123e-06, "loss": 0.9892, "step": 41 }, { "epoch": 0.22173339199296085, "grad_norm": 1.4526199178661001, "learning_rate": 7.192982456140352e-06, "loss": 1.0051, "step": 42 }, { "epoch": 0.22701275846898372, "grad_norm": 1.4693112027438093, "learning_rate": 7.368421052631579e-06, "loss": 0.9833, "step": 43 }, { "epoch": 0.2322921249450066, "grad_norm": 1.3739429688061344, "learning_rate": 7.5438596491228074e-06, "loss": 0.9793, "step": 44 }, { "epoch": 0.23757149142102948, "grad_norm": 1.4247570212621312, "learning_rate": 7.719298245614036e-06, "loss": 0.9366, "step": 45 }, { "epoch": 0.24285085789705235, "grad_norm": 1.2987600676752833, "learning_rate": 7.894736842105265e-06, "loss": 0.9006, "step": 46 }, { "epoch": 0.24813022437307522, "grad_norm": 1.5006141932812773, "learning_rate": 8.070175438596492e-06, "loss": 0.9785, "step": 47 }, { "epoch": 0.2534095908490981, "grad_norm": 1.468806773441352, "learning_rate": 8.24561403508772e-06, "loss": 0.9941, "step": 48 }, { "epoch": 0.258688957325121, "grad_norm": 1.3084024864816892, "learning_rate": 8.421052631578948e-06, "loss": 0.938, "step": 49 }, { "epoch": 0.2639683238011439, "grad_norm": 1.4542464523472705, "learning_rate": 8.596491228070176e-06, "loss": 1.009, "step": 50 }, { "epoch": 0.2692476902771667, "grad_norm": 1.3097725384782457, "learning_rate": 8.771929824561405e-06, "loss": 0.9557, "step": 51 }, { "epoch": 0.2745270567531896, "grad_norm": 1.3424191425040415, "learning_rate": 8.947368421052632e-06, "loss": 0.9773, "step": 52 }, { "epoch": 0.2798064232292125, "grad_norm": 1.3120126450980685, "learning_rate": 9.12280701754386e-06, "loss": 0.979, "step": 53 }, { "epoch": 0.28508578970523535, "grad_norm": 1.4326002194117426, "learning_rate": 9.298245614035088e-06, "loss": 0.9909, "step": 54 }, { "epoch": 0.29036515618125824, "grad_norm": 1.3411895985410853, "learning_rate": 9.473684210526315e-06, "loss": 0.9343, "step": 55 }, { "epoch": 0.29564452265728114, "grad_norm": 1.4092777694950307, "learning_rate": 9.649122807017545e-06, "loss": 0.9711, "step": 56 }, { "epoch": 0.300923889133304, "grad_norm": 1.3183008246582104, "learning_rate": 9.824561403508772e-06, "loss": 1.0189, "step": 57 }, { "epoch": 0.3062032556093269, "grad_norm": 1.4351454680178788, "learning_rate": 1e-05, "loss": 0.9581, "step": 58 }, { "epoch": 0.31148262208534977, "grad_norm": 1.4327154249987661, "learning_rate": 9.999905136743635e-06, "loss": 1.0108, "step": 59 }, { "epoch": 0.3167619885613726, "grad_norm": 1.2783140310829089, "learning_rate": 9.999620550574155e-06, "loss": 0.8755, "step": 60 }, { "epoch": 0.3220413550373955, "grad_norm": 1.3286747156393754, "learning_rate": 9.999146252290264e-06, "loss": 0.9038, "step": 61 }, { "epoch": 0.3273207215134184, "grad_norm": 1.332121876828825, "learning_rate": 9.99848225988936e-06, "loss": 0.9326, "step": 62 }, { "epoch": 0.33260008798944124, "grad_norm": 1.279293629368409, "learning_rate": 9.99762859856683e-06, "loss": 0.8677, "step": 63 }, { "epoch": 0.33787945446546414, "grad_norm": 1.273872943436948, "learning_rate": 9.996585300715117e-06, "loss": 0.9299, "step": 64 }, { "epoch": 0.34315882094148703, "grad_norm": 1.2562758551505886, "learning_rate": 9.995352405922467e-06, "loss": 0.9017, "step": 65 }, { "epoch": 0.3484381874175099, "grad_norm": 1.4032802781936289, "learning_rate": 9.99392996097145e-06, "loss": 0.9452, "step": 66 }, { "epoch": 0.35371755389353277, "grad_norm": 1.4699498993510196, "learning_rate": 9.992318019837171e-06, "loss": 1.0884, "step": 67 }, { "epoch": 0.35899692036955566, "grad_norm": 1.2352131609899204, "learning_rate": 9.990516643685222e-06, "loss": 0.9171, "step": 68 }, { "epoch": 0.36427628684557856, "grad_norm": 1.3192425912595864, "learning_rate": 9.988525900869366e-06, "loss": 0.9075, "step": 69 }, { "epoch": 0.3695556533216014, "grad_norm": 1.2967022211363584, "learning_rate": 9.98634586692894e-06, "loss": 0.885, "step": 70 }, { "epoch": 0.3748350197976243, "grad_norm": 1.3926263384486055, "learning_rate": 9.983976624585996e-06, "loss": 0.9215, "step": 71 }, { "epoch": 0.3801143862736472, "grad_norm": 1.2279876199917352, "learning_rate": 9.981418263742148e-06, "loss": 0.9121, "step": 72 }, { "epoch": 0.38539375274967, "grad_norm": 1.3296649614901273, "learning_rate": 9.978670881475173e-06, "loss": 0.8719, "step": 73 }, { "epoch": 0.3906731192256929, "grad_norm": 1.4366980284008448, "learning_rate": 9.975734582035323e-06, "loss": 0.9158, "step": 74 }, { "epoch": 0.3959524857017158, "grad_norm": 1.4161336324128757, "learning_rate": 9.972609476841368e-06, "loss": 0.9519, "step": 75 }, { "epoch": 0.40123185217773866, "grad_norm": 1.2765268199027533, "learning_rate": 9.96929568447637e-06, "loss": 0.8892, "step": 76 }, { "epoch": 0.40651121865376155, "grad_norm": 1.2903936251174384, "learning_rate": 9.965793330683182e-06, "loss": 0.8749, "step": 77 }, { "epoch": 0.41179058512978445, "grad_norm": 1.328926357616079, "learning_rate": 9.96210254835968e-06, "loss": 0.8218, "step": 78 }, { "epoch": 0.4170699516058073, "grad_norm": 1.192377478319799, "learning_rate": 9.958223477553715e-06, "loss": 0.8215, "step": 79 }, { "epoch": 0.4223493180818302, "grad_norm": 1.2953681293753618, "learning_rate": 9.954156265457801e-06, "loss": 0.8103, "step": 80 }, { "epoch": 0.4276286845578531, "grad_norm": 1.3072992620223463, "learning_rate": 9.949901066403536e-06, "loss": 0.8992, "step": 81 }, { "epoch": 0.4329080510338759, "grad_norm": 1.2374067876062618, "learning_rate": 9.945458041855732e-06, "loss": 0.8831, "step": 82 }, { "epoch": 0.4381874175098988, "grad_norm": 1.3974745936753212, "learning_rate": 9.940827360406297e-06, "loss": 0.8843, "step": 83 }, { "epoch": 0.4434667839859217, "grad_norm": 1.2232655465327642, "learning_rate": 9.936009197767847e-06, "loss": 0.8714, "step": 84 }, { "epoch": 0.44874615046194455, "grad_norm": 1.3329454900801005, "learning_rate": 9.931003736767013e-06, "loss": 0.9267, "step": 85 }, { "epoch": 0.45402551693796744, "grad_norm": 1.2551849569644793, "learning_rate": 9.925811167337533e-06, "loss": 0.8765, "step": 86 }, { "epoch": 0.45930488341399034, "grad_norm": 1.324099190921721, "learning_rate": 9.920431686513023e-06, "loss": 0.8835, "step": 87 }, { "epoch": 0.4645842498900132, "grad_norm": 1.2343457957658, "learning_rate": 9.91486549841951e-06, "loss": 0.8557, "step": 88 }, { "epoch": 0.4698636163660361, "grad_norm": 1.357007540882203, "learning_rate": 9.909112814267686e-06, "loss": 0.93, "step": 89 }, { "epoch": 0.47514298284205897, "grad_norm": 1.2685007078756392, "learning_rate": 9.903173852344889e-06, "loss": 0.8493, "step": 90 }, { "epoch": 0.4804223493180818, "grad_norm": 1.3198835038669643, "learning_rate": 9.89704883800683e-06, "loss": 0.9375, "step": 91 }, { "epoch": 0.4857017157941047, "grad_norm": 1.2921827299558808, "learning_rate": 9.890738003669029e-06, "loss": 0.8502, "step": 92 }, { "epoch": 0.4909810822701276, "grad_norm": 1.3792411082761877, "learning_rate": 9.884241588798004e-06, "loss": 0.8722, "step": 93 }, { "epoch": 0.49626044874615044, "grad_norm": 1.3853454707822626, "learning_rate": 9.877559839902185e-06, "loss": 0.9781, "step": 94 }, { "epoch": 0.5015398152221734, "grad_norm": 1.4954228020473375, "learning_rate": 9.870693010522552e-06, "loss": 0.9494, "step": 95 }, { "epoch": 0.5068191816981962, "grad_norm": 1.4537142950730755, "learning_rate": 9.863641361223025e-06, "loss": 0.8948, "step": 96 }, { "epoch": 0.5120985481742191, "grad_norm": 1.3353911864277976, "learning_rate": 9.85640515958057e-06, "loss": 0.9305, "step": 97 }, { "epoch": 0.517377914650242, "grad_norm": 1.3069189595420478, "learning_rate": 9.848984680175049e-06, "loss": 1.0241, "step": 98 }, { "epoch": 0.5226572811262649, "grad_norm": 1.4160480924828072, "learning_rate": 9.841380204578795e-06, "loss": 0.8869, "step": 99 }, { "epoch": 0.5279366476022878, "grad_norm": 1.6379023507410095, "learning_rate": 9.833592021345938e-06, "loss": 0.8854, "step": 100 }, { "epoch": 0.5332160140783107, "grad_norm": 1.4984036893204795, "learning_rate": 9.825620426001446e-06, "loss": 0.9243, "step": 101 }, { "epoch": 0.5384953805543334, "grad_norm": 1.3234999163374, "learning_rate": 9.817465721029916e-06, "loss": 0.8645, "step": 102 }, { "epoch": 0.5437747470303563, "grad_norm": 1.3367013313050653, "learning_rate": 9.809128215864096e-06, "loss": 0.9064, "step": 103 }, { "epoch": 0.5490541135063792, "grad_norm": 1.2888959675698541, "learning_rate": 9.800608226873143e-06, "loss": 0.8828, "step": 104 }, { "epoch": 0.5543334799824021, "grad_norm": 1.3417129361301388, "learning_rate": 9.791906077350613e-06, "loss": 0.8687, "step": 105 }, { "epoch": 0.559612846458425, "grad_norm": 1.387011742779051, "learning_rate": 9.783022097502204e-06, "loss": 1.0081, "step": 106 }, { "epoch": 0.5648922129344479, "grad_norm": 1.2844930485660748, "learning_rate": 9.773956624433224e-06, "loss": 0.8988, "step": 107 }, { "epoch": 0.5701715794104707, "grad_norm": 1.362695293650949, "learning_rate": 9.764710002135784e-06, "loss": 0.8097, "step": 108 }, { "epoch": 0.5754509458864936, "grad_norm": 1.311448593659273, "learning_rate": 9.755282581475769e-06, "loss": 0.8707, "step": 109 }, { "epoch": 0.5807303123625165, "grad_norm": 1.4708655509314155, "learning_rate": 9.745674720179507e-06, "loss": 0.913, "step": 110 }, { "epoch": 0.5860096788385394, "grad_norm": 1.3142546545330323, "learning_rate": 9.735886782820202e-06, "loss": 0.8495, "step": 111 }, { "epoch": 0.5912890453145623, "grad_norm": 1.2448969618431651, "learning_rate": 9.7259191408041e-06, "loss": 0.7694, "step": 112 }, { "epoch": 0.5965684117905852, "grad_norm": 1.2817154139311548, "learning_rate": 9.715772172356388e-06, "loss": 0.8874, "step": 113 }, { "epoch": 0.601847778266608, "grad_norm": 1.303527565876824, "learning_rate": 9.705446262506858e-06, "loss": 0.9887, "step": 114 }, { "epoch": 0.6071271447426309, "grad_norm": 1.3162139512016877, "learning_rate": 9.694941803075285e-06, "loss": 0.8731, "step": 115 }, { "epoch": 0.6124065112186537, "grad_norm": 1.3631926446929201, "learning_rate": 9.684259192656554e-06, "loss": 0.8773, "step": 116 }, { "epoch": 0.6176858776946766, "grad_norm": 1.354454946440946, "learning_rate": 9.673398836605554e-06, "loss": 0.9415, "step": 117 }, { "epoch": 0.6229652441706995, "grad_norm": 1.2806247492822374, "learning_rate": 9.66236114702178e-06, "loss": 0.8214, "step": 118 }, { "epoch": 0.6282446106467224, "grad_norm": 1.3067756329426423, "learning_rate": 9.651146542733702e-06, "loss": 0.9561, "step": 119 }, { "epoch": 0.6335239771227452, "grad_norm": 1.3982461215765278, "learning_rate": 9.639755449282874e-06, "loss": 0.8812, "step": 120 }, { "epoch": 0.6388033435987681, "grad_norm": 1.2734112863554599, "learning_rate": 9.628188298907782e-06, "loss": 0.803, "step": 121 }, { "epoch": 0.644082710074791, "grad_norm": 1.4042153797470949, "learning_rate": 9.616445530527448e-06, "loss": 0.8159, "step": 122 }, { "epoch": 0.6493620765508139, "grad_norm": 1.230725944316371, "learning_rate": 9.60452758972477e-06, "loss": 0.8846, "step": 123 }, { "epoch": 0.6546414430268368, "grad_norm": 1.242349305278167, "learning_rate": 9.592434928729617e-06, "loss": 0.7621, "step": 124 }, { "epoch": 0.6599208095028597, "grad_norm": 1.4468948851039833, "learning_rate": 9.58016800640167e-06, "loss": 0.9327, "step": 125 }, { "epoch": 0.6652001759788825, "grad_norm": 1.3595406863398718, "learning_rate": 9.567727288213005e-06, "loss": 0.8629, "step": 126 }, { "epoch": 0.6704795424549054, "grad_norm": 1.3391481209651412, "learning_rate": 9.555113246230443e-06, "loss": 0.943, "step": 127 }, { "epoch": 0.6757589089309283, "grad_norm": 1.3816914835623122, "learning_rate": 9.542326359097619e-06, "loss": 0.8841, "step": 128 }, { "epoch": 0.6810382754069512, "grad_norm": 1.2580871478999183, "learning_rate": 9.529367112016836e-06, "loss": 0.8039, "step": 129 }, { "epoch": 0.6863176418829741, "grad_norm": 1.3901966106075496, "learning_rate": 9.516235996730645e-06, "loss": 0.8409, "step": 130 }, { "epoch": 0.691597008358997, "grad_norm": 1.2698547636051096, "learning_rate": 9.502933511503187e-06, "loss": 0.8499, "step": 131 }, { "epoch": 0.6968763748350199, "grad_norm": 1.390987513649665, "learning_rate": 9.489460161101291e-06, "loss": 0.8549, "step": 132 }, { "epoch": 0.7021557413110426, "grad_norm": 1.3670503603402828, "learning_rate": 9.475816456775313e-06, "loss": 0.8436, "step": 133 }, { "epoch": 0.7074351077870655, "grad_norm": 1.2624531025923733, "learning_rate": 9.46200291623974e-06, "loss": 0.853, "step": 134 }, { "epoch": 0.7127144742630884, "grad_norm": 1.2250644357723437, "learning_rate": 9.44802006365355e-06, "loss": 0.8283, "step": 135 }, { "epoch": 0.7179938407391113, "grad_norm": 1.2734440256518054, "learning_rate": 9.43386842960031e-06, "loss": 0.8474, "step": 136 }, { "epoch": 0.7232732072151342, "grad_norm": 1.3509174047115824, "learning_rate": 9.419548551068061e-06, "loss": 0.8657, "step": 137 }, { "epoch": 0.7285525736911571, "grad_norm": 1.370913524291918, "learning_rate": 9.405060971428924e-06, "loss": 0.8915, "step": 138 }, { "epoch": 0.7338319401671799, "grad_norm": 1.2433881056858749, "learning_rate": 9.39040624041849e-06, "loss": 1.024, "step": 139 }, { "epoch": 0.7391113066432028, "grad_norm": 1.1648887790445894, "learning_rate": 9.375584914114963e-06, "loss": 0.7808, "step": 140 }, { "epoch": 0.7443906731192257, "grad_norm": 1.3435997997706464, "learning_rate": 9.360597554918055e-06, "loss": 0.9497, "step": 141 }, { "epoch": 0.7496700395952486, "grad_norm": 1.401286458253954, "learning_rate": 9.345444731527642e-06, "loss": 0.927, "step": 142 }, { "epoch": 0.7549494060712715, "grad_norm": 1.2883022833187427, "learning_rate": 9.330127018922195e-06, "loss": 0.9708, "step": 143 }, { "epoch": 0.7602287725472944, "grad_norm": 1.2683832198503506, "learning_rate": 9.31464499833695e-06, "loss": 0.8359, "step": 144 }, { "epoch": 0.7655081390233172, "grad_norm": 1.2280565258082867, "learning_rate": 9.298999257241862e-06, "loss": 0.8085, "step": 145 }, { "epoch": 0.77078750549934, "grad_norm": 1.3268019274693263, "learning_rate": 9.283190389319315e-06, "loss": 0.8619, "step": 146 }, { "epoch": 0.776066871975363, "grad_norm": 1.3405013825655892, "learning_rate": 9.26721899444158e-06, "loss": 0.8757, "step": 147 }, { "epoch": 0.7813462384513858, "grad_norm": 1.3113438616437634, "learning_rate": 9.251085678648072e-06, "loss": 0.8586, "step": 148 }, { "epoch": 0.7866256049274087, "grad_norm": 1.2692947439879936, "learning_rate": 9.234791054122336e-06, "loss": 0.8315, "step": 149 }, { "epoch": 0.7919049714034316, "grad_norm": 1.288784803692807, "learning_rate": 9.218335739168833e-06, "loss": 0.8138, "step": 150 }, { "epoch": 0.7971843378794544, "grad_norm": 1.2853502663214944, "learning_rate": 9.201720358189464e-06, "loss": 0.8953, "step": 151 }, { "epoch": 0.8024637043554773, "grad_norm": 1.3844299853873043, "learning_rate": 9.18494554165989e-06, "loss": 0.8527, "step": 152 }, { "epoch": 0.8077430708315002, "grad_norm": 1.2348194951175424, "learning_rate": 9.168011926105598e-06, "loss": 0.7873, "step": 153 }, { "epoch": 0.8130224373075231, "grad_norm": 1.4444636219923, "learning_rate": 9.150920154077753e-06, "loss": 0.9273, "step": 154 }, { "epoch": 0.818301803783546, "grad_norm": 1.3415281982191147, "learning_rate": 9.133670874128818e-06, "loss": 0.8165, "step": 155 }, { "epoch": 0.8235811702595689, "grad_norm": 1.4171257480887072, "learning_rate": 9.116264740787937e-06, "loss": 0.903, "step": 156 }, { "epoch": 0.8288605367355917, "grad_norm": 1.4009278155261338, "learning_rate": 9.098702414536107e-06, "loss": 0.8654, "step": 157 }, { "epoch": 0.8341399032116146, "grad_norm": 1.3353961972609718, "learning_rate": 9.08098456178111e-06, "loss": 0.8821, "step": 158 }, { "epoch": 0.8394192696876375, "grad_norm": 1.465378569130035, "learning_rate": 9.06311185483223e-06, "loss": 0.8585, "step": 159 }, { "epoch": 0.8446986361636604, "grad_norm": 1.3247392983018136, "learning_rate": 9.045084971874738e-06, "loss": 0.8409, "step": 160 }, { "epoch": 0.8499780026396833, "grad_norm": 1.3094980178064088, "learning_rate": 9.026904596944163e-06, "loss": 0.8423, "step": 161 }, { "epoch": 0.8552573691157062, "grad_norm": 1.3167256777304588, "learning_rate": 9.008571419900334e-06, "loss": 0.8123, "step": 162 }, { "epoch": 0.8605367355917289, "grad_norm": 1.3199382134511854, "learning_rate": 8.990086136401199e-06, "loss": 0.8171, "step": 163 }, { "epoch": 0.8658161020677518, "grad_norm": 1.2759439255391014, "learning_rate": 8.97144944787643e-06, "loss": 0.7351, "step": 164 }, { "epoch": 0.8710954685437747, "grad_norm": 1.3371401649586945, "learning_rate": 8.952662061500817e-06, "loss": 0.9156, "step": 165 }, { "epoch": 0.8763748350197976, "grad_norm": 1.2566756220149857, "learning_rate": 8.933724690167417e-06, "loss": 0.9278, "step": 166 }, { "epoch": 0.8816542014958205, "grad_norm": 1.2918012041625928, "learning_rate": 8.914638052460515e-06, "loss": 0.796, "step": 167 }, { "epoch": 0.8869335679718434, "grad_norm": 1.3420464341954295, "learning_rate": 8.895402872628352e-06, "loss": 0.8289, "step": 168 }, { "epoch": 0.8922129344478662, "grad_norm": 1.386536525968897, "learning_rate": 8.87601988055565e-06, "loss": 0.9055, "step": 169 }, { "epoch": 0.8974923009238891, "grad_norm": 1.2764097483742913, "learning_rate": 8.856489811735904e-06, "loss": 0.8221, "step": 170 }, { "epoch": 0.902771667399912, "grad_norm": 1.3242925889713713, "learning_rate": 8.836813407243485e-06, "loss": 0.8052, "step": 171 }, { "epoch": 0.9080510338759349, "grad_norm": 1.135434673976009, "learning_rate": 8.816991413705515e-06, "loss": 0.8048, "step": 172 }, { "epoch": 0.9133304003519578, "grad_norm": 1.3633873651508777, "learning_rate": 8.797024583273536e-06, "loss": 0.8403, "step": 173 }, { "epoch": 0.9186097668279807, "grad_norm": 1.4254861868193114, "learning_rate": 8.776913673594968e-06, "loss": 0.8558, "step": 174 }, { "epoch": 0.9238891333040036, "grad_norm": 1.3944884567839855, "learning_rate": 8.756659447784367e-06, "loss": 0.8265, "step": 175 }, { "epoch": 0.9291684997800264, "grad_norm": 1.2473776372568752, "learning_rate": 8.736262674394455e-06, "loss": 0.8558, "step": 176 }, { "epoch": 0.9344478662560493, "grad_norm": 1.1199152507629353, "learning_rate": 8.715724127386971e-06, "loss": 0.7684, "step": 177 }, { "epoch": 0.9397272327320721, "grad_norm": 1.4769772323796146, "learning_rate": 8.695044586103297e-06, "loss": 0.8404, "step": 178 }, { "epoch": 0.945006599208095, "grad_norm": 1.2812768021421608, "learning_rate": 8.674224835234879e-06, "loss": 0.855, "step": 179 }, { "epoch": 0.9502859656841179, "grad_norm": 1.4074704240057607, "learning_rate": 8.653265664793466e-06, "loss": 0.8966, "step": 180 }, { "epoch": 0.9555653321601408, "grad_norm": 1.3552977566183917, "learning_rate": 8.632167870081122e-06, "loss": 0.8983, "step": 181 }, { "epoch": 0.9608446986361636, "grad_norm": 1.2662415913666043, "learning_rate": 8.610932251660046e-06, "loss": 0.7676, "step": 182 }, { "epoch": 0.9661240651121865, "grad_norm": 1.3122048439005143, "learning_rate": 8.58955961532221e-06, "loss": 0.8486, "step": 183 }, { "epoch": 0.9714034315882094, "grad_norm": 1.2880133358543706, "learning_rate": 8.568050772058763e-06, "loss": 1.0695, "step": 184 }, { "epoch": 0.9766827980642323, "grad_norm": 1.2876646312084281, "learning_rate": 8.546406538029268e-06, "loss": 0.8744, "step": 185 }, { "epoch": 0.9819621645402552, "grad_norm": 1.3907216667545839, "learning_rate": 8.524627734530738e-06, "loss": 0.8009, "step": 186 }, { "epoch": 0.9872415310162781, "grad_norm": 1.298714668518304, "learning_rate": 8.502715187966455e-06, "loss": 0.8211, "step": 187 }, { "epoch": 0.9925208974923009, "grad_norm": 1.4112133312678243, "learning_rate": 8.480669729814635e-06, "loss": 0.8909, "step": 188 }, { "epoch": 0.9978002639683238, "grad_norm": 1.2429484880228319, "learning_rate": 8.458492196596852e-06, "loss": 0.7842, "step": 189 }, { "epoch": 1.0, "grad_norm": 1.2429484880228319, "learning_rate": 8.436183429846314e-06, "loss": 0.8917, "step": 190 }, { "epoch": 1.005279366476023, "grad_norm": 2.3017818904176828, "learning_rate": 8.413744276075928e-06, "loss": 0.7453, "step": 191 }, { "epoch": 1.0105587329520458, "grad_norm": 1.223195818545867, "learning_rate": 8.39117558674617e-06, "loss": 0.6252, "step": 192 }, { "epoch": 1.0158380994280687, "grad_norm": 1.2015392058187855, "learning_rate": 8.368478218232787e-06, "loss": 0.6357, "step": 193 }, { "epoch": 1.0211174659040916, "grad_norm": 1.2194373310662718, "learning_rate": 8.345653031794292e-06, "loss": 0.6568, "step": 194 }, { "epoch": 1.0263968323801145, "grad_norm": 1.251074042866813, "learning_rate": 8.32270089353929e-06, "loss": 0.6674, "step": 195 }, { "epoch": 1.0316761988561374, "grad_norm": 1.3553593206962593, "learning_rate": 8.299622674393615e-06, "loss": 0.7704, "step": 196 }, { "epoch": 1.03695556533216, "grad_norm": 1.3610141965233205, "learning_rate": 8.27641925006727e-06, "loss": 0.6893, "step": 197 }, { "epoch": 1.042234931808183, "grad_norm": 1.4430956370832788, "learning_rate": 8.25309150102121e-06, "loss": 0.6783, "step": 198 }, { "epoch": 1.0475142982842058, "grad_norm": 1.3237087640173875, "learning_rate": 8.229640312433938e-06, "loss": 0.6328, "step": 199 }, { "epoch": 1.0527936647602287, "grad_norm": 1.4354224675777918, "learning_rate": 8.206066574167893e-06, "loss": 0.7054, "step": 200 }, { "epoch": 1.0580730312362516, "grad_norm": 1.3820711783982724, "learning_rate": 8.182371180735708e-06, "loss": 0.6596, "step": 201 }, { "epoch": 1.0633523977122745, "grad_norm": 1.5892680249474918, "learning_rate": 8.158555031266255e-06, "loss": 0.7119, "step": 202 }, { "epoch": 1.0686317641882974, "grad_norm": 1.3831340041775368, "learning_rate": 8.134619029470535e-06, "loss": 0.6956, "step": 203 }, { "epoch": 1.0739111306643203, "grad_norm": 1.4618391544645484, "learning_rate": 8.110564083607371e-06, "loss": 0.6927, "step": 204 }, { "epoch": 1.0791904971403432, "grad_norm": 1.399225289114619, "learning_rate": 8.086391106448965e-06, "loss": 0.6719, "step": 205 }, { "epoch": 1.084469863616366, "grad_norm": 1.39102454168437, "learning_rate": 8.06210101524625e-06, "loss": 0.6677, "step": 206 }, { "epoch": 1.089749230092389, "grad_norm": 1.4013045019864605, "learning_rate": 8.037694731694085e-06, "loss": 0.6807, "step": 207 }, { "epoch": 1.095028596568412, "grad_norm": 1.3292251495775314, "learning_rate": 8.013173181896283e-06, "loss": 0.685, "step": 208 }, { "epoch": 1.1003079630444346, "grad_norm": 1.468780970931853, "learning_rate": 7.988537296330468e-06, "loss": 0.6559, "step": 209 }, { "epoch": 1.1055873295204575, "grad_norm": 1.2560486698645472, "learning_rate": 7.963788009812775e-06, "loss": 0.5966, "step": 210 }, { "epoch": 1.1108666959964804, "grad_norm": 1.3263436068368955, "learning_rate": 7.938926261462366e-06, "loss": 0.6426, "step": 211 }, { "epoch": 1.1161460624725033, "grad_norm": 1.4262360139572436, "learning_rate": 7.913952994665805e-06, "loss": 0.7044, "step": 212 }, { "epoch": 1.1214254289485261, "grad_norm": 1.413724127688339, "learning_rate": 7.888869157041257e-06, "loss": 0.6892, "step": 213 }, { "epoch": 1.126704795424549, "grad_norm": 1.3691991114364659, "learning_rate": 7.863675700402527e-06, "loss": 0.6913, "step": 214 }, { "epoch": 1.131984161900572, "grad_norm": 1.5050181189304115, "learning_rate": 7.838373580722952e-06, "loss": 0.7563, "step": 215 }, { "epoch": 1.1372635283765948, "grad_norm": 1.280216474266895, "learning_rate": 7.812963758099118e-06, "loss": 0.6034, "step": 216 }, { "epoch": 1.1425428948526177, "grad_norm": 1.3442258228040502, "learning_rate": 7.787447196714428e-06, "loss": 0.703, "step": 217 }, { "epoch": 1.1478222613286406, "grad_norm": 1.374879774404637, "learning_rate": 7.76182486480253e-06, "loss": 0.6622, "step": 218 }, { "epoch": 1.1531016278046635, "grad_norm": 1.1079594025474235, "learning_rate": 7.736097734610557e-06, "loss": 0.6343, "step": 219 }, { "epoch": 1.1583809942806864, "grad_norm": 1.4099196984187832, "learning_rate": 7.710266782362248e-06, "loss": 0.7379, "step": 220 }, { "epoch": 1.163660360756709, "grad_norm": 1.3722807994126047, "learning_rate": 7.684332988220901e-06, "loss": 0.7447, "step": 221 }, { "epoch": 1.168939727232732, "grad_norm": 1.495776876658676, "learning_rate": 7.658297336252181e-06, "loss": 0.6477, "step": 222 }, { "epoch": 1.1742190937087549, "grad_norm": 1.3604596279976626, "learning_rate": 7.63216081438678e-06, "loss": 0.7295, "step": 223 }, { "epoch": 1.1794984601847778, "grad_norm": 1.3235758656247603, "learning_rate": 7.605924414382926e-06, "loss": 0.6585, "step": 224 }, { "epoch": 1.1847778266608007, "grad_norm": 1.4440449502234758, "learning_rate": 7.579589131788756e-06, "loss": 0.6244, "step": 225 }, { "epoch": 1.1900571931368236, "grad_norm": 1.3928149968149692, "learning_rate": 7.553155965904535e-06, "loss": 0.637, "step": 226 }, { "epoch": 1.1953365596128465, "grad_norm": 1.4032725114348137, "learning_rate": 7.526625919744741e-06, "loss": 0.6644, "step": 227 }, { "epoch": 1.2006159260888694, "grad_norm": 1.3266237278115651, "learning_rate": 7.500000000000001e-06, "loss": 0.6354, "step": 228 }, { "epoch": 1.2058952925648923, "grad_norm": 1.5117455894482101, "learning_rate": 7.473279216998896e-06, "loss": 0.634, "step": 229 }, { "epoch": 1.2111746590409151, "grad_norm": 1.5645913721329012, "learning_rate": 7.4464645846696186e-06, "loss": 0.8021, "step": 230 }, { "epoch": 1.216454025516938, "grad_norm": 1.595585311092696, "learning_rate": 7.419557120501508e-06, "loss": 0.6831, "step": 231 }, { "epoch": 1.221733391992961, "grad_norm": 1.3739554991120078, "learning_rate": 7.392557845506433e-06, "loss": 0.6571, "step": 232 }, { "epoch": 1.2270127584689838, "grad_norm": 1.383758484574002, "learning_rate": 7.365467784180051e-06, "loss": 0.6015, "step": 233 }, { "epoch": 1.2322921249450065, "grad_norm": 1.2935886046335063, "learning_rate": 7.3382879644629345e-06, "loss": 0.684, "step": 234 }, { "epoch": 1.2375714914210294, "grad_norm": 1.4930967440370626, "learning_rate": 7.311019417701567e-06, "loss": 0.618, "step": 235 }, { "epoch": 1.2428508578970523, "grad_norm": 1.4340994519601895, "learning_rate": 7.283663178609204e-06, "loss": 0.6676, "step": 236 }, { "epoch": 1.2481302243730752, "grad_norm": 1.332079262932709, "learning_rate": 7.256220285226615e-06, "loss": 0.6518, "step": 237 }, { "epoch": 1.253409590849098, "grad_norm": 1.4124012184704442, "learning_rate": 7.2286917788826926e-06, "loss": 0.7255, "step": 238 }, { "epoch": 1.258688957325121, "grad_norm": 1.435352374027868, "learning_rate": 7.201078704154938e-06, "loss": 0.6427, "step": 239 }, { "epoch": 1.2639683238011439, "grad_norm": 1.3129409573708608, "learning_rate": 7.173382108829826e-06, "loss": 0.6435, "step": 240 }, { "epoch": 1.2692476902771668, "grad_norm": 1.3029674291538322, "learning_rate": 7.145603043863045e-06, "loss": 0.6018, "step": 241 }, { "epoch": 1.2745270567531897, "grad_norm": 1.4384137050989114, "learning_rate": 7.117742563339622e-06, "loss": 0.6399, "step": 242 }, { "epoch": 1.2798064232292126, "grad_norm": 1.404893956226061, "learning_rate": 7.089801724433918e-06, "loss": 0.6591, "step": 243 }, { "epoch": 1.2850857897052355, "grad_norm": 1.3631545091800101, "learning_rate": 7.061781587369518e-06, "loss": 0.661, "step": 244 }, { "epoch": 1.2903651561812581, "grad_norm": 1.3722052435590018, "learning_rate": 7.033683215379002e-06, "loss": 0.7329, "step": 245 }, { "epoch": 1.2956445226572813, "grad_norm": 1.3373371627523003, "learning_rate": 7.005507674663594e-06, "loss": 0.6853, "step": 246 }, { "epoch": 1.300923889133304, "grad_norm": 1.347515123739391, "learning_rate": 6.977256034352713e-06, "loss": 0.6356, "step": 247 }, { "epoch": 1.3062032556093268, "grad_norm": 1.3251248469180115, "learning_rate": 6.948929366463397e-06, "loss": 0.6542, "step": 248 }, { "epoch": 1.3114826220853497, "grad_norm": 1.413403738053324, "learning_rate": 6.9205287458596305e-06, "loss": 0.6732, "step": 249 }, { "epoch": 1.3167619885613726, "grad_norm": 1.3195739239798052, "learning_rate": 6.892055250211552e-06, "loss": 0.6157, "step": 250 }, { "epoch": 1.3220413550373955, "grad_norm": 1.398192094348221, "learning_rate": 6.86350995995457e-06, "loss": 0.6903, "step": 251 }, { "epoch": 1.3273207215134184, "grad_norm": 1.4424925485085278, "learning_rate": 6.834893958248361e-06, "loss": 0.6967, "step": 252 }, { "epoch": 1.3326000879894413, "grad_norm": 1.3715769454036013, "learning_rate": 6.806208330935766e-06, "loss": 0.6402, "step": 253 }, { "epoch": 1.3378794544654642, "grad_norm": 1.3832262511831421, "learning_rate": 6.77745416650159e-06, "loss": 0.6684, "step": 254 }, { "epoch": 1.343158820941487, "grad_norm": 1.3503209557607232, "learning_rate": 6.748632556031306e-06, "loss": 0.7828, "step": 255 }, { "epoch": 1.34843818741751, "grad_norm": 1.3619508308924722, "learning_rate": 6.719744593169642e-06, "loss": 0.6583, "step": 256 }, { "epoch": 1.3537175538935329, "grad_norm": 1.543700428502048, "learning_rate": 6.690791374079086e-06, "loss": 0.6687, "step": 257 }, { "epoch": 1.3589969203695555, "grad_norm": 1.3454959558325137, "learning_rate": 6.6617739973982985e-06, "loss": 0.6109, "step": 258 }, { "epoch": 1.3642762868455787, "grad_norm": 1.3091432151076758, "learning_rate": 6.6326935642004165e-06, "loss": 0.6819, "step": 259 }, { "epoch": 1.3695556533216013, "grad_norm": 1.4211398893275302, "learning_rate": 6.6035511779512764e-06, "loss": 0.6106, "step": 260 }, { "epoch": 1.3748350197976242, "grad_norm": 1.3056584370485818, "learning_rate": 6.57434794446754e-06, "loss": 0.6348, "step": 261 }, { "epoch": 1.3801143862736471, "grad_norm": 1.432042689389407, "learning_rate": 6.545084971874738e-06, "loss": 0.6428, "step": 262 }, { "epoch": 1.38539375274967, "grad_norm": 1.3426884980712488, "learning_rate": 6.515763370565218e-06, "loss": 0.6076, "step": 263 }, { "epoch": 1.390673119225693, "grad_norm": 1.2615012752998496, "learning_rate": 6.486384253156014e-06, "loss": 0.7665, "step": 264 }, { "epoch": 1.3959524857017158, "grad_norm": 1.3453647581013601, "learning_rate": 6.456948734446624e-06, "loss": 0.6377, "step": 265 }, { "epoch": 1.4012318521777387, "grad_norm": 1.4161701822318469, "learning_rate": 6.427457931376712e-06, "loss": 0.6732, "step": 266 }, { "epoch": 1.4065112186537616, "grad_norm": 1.32784380027798, "learning_rate": 6.39791296298372e-06, "loss": 0.628, "step": 267 }, { "epoch": 1.4117905851297845, "grad_norm": 1.3941543313635256, "learning_rate": 6.368314950360416e-06, "loss": 0.6554, "step": 268 }, { "epoch": 1.4170699516058072, "grad_norm": 1.4362476191327336, "learning_rate": 6.3386650166123406e-06, "loss": 0.7686, "step": 269 }, { "epoch": 1.4223493180818303, "grad_norm": 1.4890178566624934, "learning_rate": 6.308964286815203e-06, "loss": 0.6515, "step": 270 }, { "epoch": 1.427628684557853, "grad_norm": 1.52171364834682, "learning_rate": 6.279213887972179e-06, "loss": 0.6851, "step": 271 }, { "epoch": 1.4329080510338759, "grad_norm": 1.170459513215867, "learning_rate": 6.249414948971154e-06, "loss": 0.634, "step": 272 }, { "epoch": 1.4381874175098988, "grad_norm": 1.2892794847690134, "learning_rate": 6.219568600541886e-06, "loss": 0.5732, "step": 273 }, { "epoch": 1.4434667839859217, "grad_norm": 1.4147522223651536, "learning_rate": 6.189675975213094e-06, "loss": 0.6505, "step": 274 }, { "epoch": 1.4487461504619445, "grad_norm": 1.2766098284530847, "learning_rate": 6.159738207269491e-06, "loss": 0.613, "step": 275 }, { "epoch": 1.4540255169379674, "grad_norm": 1.3170993095662313, "learning_rate": 6.129756432708739e-06, "loss": 0.6058, "step": 276 }, { "epoch": 1.4593048834139903, "grad_norm": 1.3490297596989358, "learning_rate": 6.099731789198344e-06, "loss": 0.7526, "step": 277 }, { "epoch": 1.4645842498900132, "grad_norm": 1.49015141935795, "learning_rate": 6.0696654160324875e-06, "loss": 0.6664, "step": 278 }, { "epoch": 1.4698636163660361, "grad_norm": 1.353677527773509, "learning_rate": 6.039558454088796e-06, "loss": 0.6508, "step": 279 }, { "epoch": 1.475142982842059, "grad_norm": 1.3542791249145698, "learning_rate": 6.009412045785051e-06, "loss": 0.6868, "step": 280 }, { "epoch": 1.480422349318082, "grad_norm": 1.239412403087578, "learning_rate": 5.9792273350358354e-06, "loss": 0.6542, "step": 281 }, { "epoch": 1.4857017157941046, "grad_norm": 1.3657653754563595, "learning_rate": 5.9490054672091305e-06, "loss": 0.695, "step": 282 }, { "epoch": 1.4909810822701277, "grad_norm": 1.4092314752807444, "learning_rate": 5.918747589082853e-06, "loss": 0.6472, "step": 283 }, { "epoch": 1.4962604487461504, "grad_norm": 1.518575708392721, "learning_rate": 5.888454848801345e-06, "loss": 0.6623, "step": 284 }, { "epoch": 1.5015398152221735, "grad_norm": 1.4295896368916283, "learning_rate": 5.8581283958317995e-06, "loss": 0.7579, "step": 285 }, { "epoch": 1.5068191816981962, "grad_norm": 1.479897530210997, "learning_rate": 5.82776938092065e-06, "loss": 0.7334, "step": 286 }, { "epoch": 1.512098548174219, "grad_norm": 1.4366013380091691, "learning_rate": 5.797378956049905e-06, "loss": 0.6739, "step": 287 }, { "epoch": 1.517377914650242, "grad_norm": 1.4716566746195219, "learning_rate": 5.766958274393428e-06, "loss": 0.7233, "step": 288 }, { "epoch": 1.5226572811262649, "grad_norm": 1.3374013752311613, "learning_rate": 5.736508490273189e-06, "loss": 0.6999, "step": 289 }, { "epoch": 1.5279366476022878, "grad_norm": 1.404502862270622, "learning_rate": 5.706030759115458e-06, "loss": 0.6502, "step": 290 }, { "epoch": 1.5332160140783107, "grad_norm": 1.3895925622506242, "learning_rate": 5.675526237406965e-06, "loss": 0.6693, "step": 291 }, { "epoch": 1.5384953805543335, "grad_norm": 1.3933211625692163, "learning_rate": 5.644996082651018e-06, "loss": 0.6272, "step": 292 }, { "epoch": 1.5437747470303562, "grad_norm": 1.2462836635087724, "learning_rate": 5.614441453323571e-06, "loss": 0.6725, "step": 293 }, { "epoch": 1.5490541135063793, "grad_norm": 1.4996013016049565, "learning_rate": 5.583863508829281e-06, "loss": 0.6956, "step": 294 }, { "epoch": 1.554333479982402, "grad_norm": 1.3766256340590475, "learning_rate": 5.553263409457504e-06, "loss": 0.659, "step": 295 }, { "epoch": 1.5596128464584251, "grad_norm": 1.3350837109105465, "learning_rate": 5.522642316338268e-06, "loss": 0.6357, "step": 296 }, { "epoch": 1.5648922129344478, "grad_norm": 1.3570996604619927, "learning_rate": 5.492001391398214e-06, "loss": 0.6544, "step": 297 }, { "epoch": 1.5701715794104707, "grad_norm": 1.4608558691508997, "learning_rate": 5.46134179731651e-06, "loss": 0.6512, "step": 298 }, { "epoch": 1.5754509458864936, "grad_norm": 1.2494448543139998, "learning_rate": 5.430664697480731e-06, "loss": 0.5658, "step": 299 }, { "epoch": 1.5807303123625165, "grad_norm": 1.444693017380396, "learning_rate": 5.399971255942708e-06, "loss": 0.6901, "step": 300 }, { "epoch": 1.5860096788385394, "grad_norm": 1.4186391329903683, "learning_rate": 5.36926263737437e-06, "loss": 0.8807, "step": 301 }, { "epoch": 1.5912890453145623, "grad_norm": 1.29633534515009, "learning_rate": 5.338540007023538e-06, "loss": 0.6461, "step": 302 }, { "epoch": 1.5965684117905852, "grad_norm": 1.4448726879769416, "learning_rate": 5.3078045306697154e-06, "loss": 0.6523, "step": 303 }, { "epoch": 1.6018477782666078, "grad_norm": 1.266507195220378, "learning_rate": 5.27705737457985e-06, "loss": 0.6408, "step": 304 }, { "epoch": 1.607127144742631, "grad_norm": 1.3540583386343656, "learning_rate": 5.246299705464085e-06, "loss": 0.6488, "step": 305 }, { "epoch": 1.6124065112186536, "grad_norm": 1.343878144578292, "learning_rate": 5.2155326904314795e-06, "loss": 0.6031, "step": 306 }, { "epoch": 1.6176858776946768, "grad_norm": 1.390922633295502, "learning_rate": 5.184757496945726e-06, "loss": 0.6732, "step": 307 }, { "epoch": 1.6229652441706994, "grad_norm": 1.303700297184845, "learning_rate": 5.153975292780852e-06, "loss": 0.644, "step": 308 }, { "epoch": 1.6282446106467225, "grad_norm": 1.4719857684130002, "learning_rate": 5.123187245976912e-06, "loss": 0.6542, "step": 309 }, { "epoch": 1.6335239771227452, "grad_norm": 1.5316116004451763, "learning_rate": 5.09239452479565e-06, "loss": 0.6741, "step": 310 }, { "epoch": 1.6388033435987681, "grad_norm": 1.5058092447545324, "learning_rate": 5.061598297676192e-06, "loss": 0.6624, "step": 311 }, { "epoch": 1.644082710074791, "grad_norm": 1.2957852805869594, "learning_rate": 5.030799733190694e-06, "loss": 0.6866, "step": 312 }, { "epoch": 1.649362076550814, "grad_norm": 1.3465817125883073, "learning_rate": 5e-06, "loss": 0.665, "step": 313 }, { "epoch": 1.6546414430268368, "grad_norm": 1.2222436930506864, "learning_rate": 4.9692002668093075e-06, "loss": 0.5887, "step": 314 }, { "epoch": 1.6599208095028597, "grad_norm": 1.441331154425715, "learning_rate": 4.9384017023238085e-06, "loss": 0.673, "step": 315 }, { "epoch": 1.6652001759788826, "grad_norm": 1.3514338153223537, "learning_rate": 4.907605475204352e-06, "loss": 0.7095, "step": 316 }, { "epoch": 1.6704795424549053, "grad_norm": 1.4614586482457859, "learning_rate": 4.876812754023092e-06, "loss": 0.7205, "step": 317 }, { "epoch": 1.6757589089309284, "grad_norm": 1.3928056564895086, "learning_rate": 4.846024707219149e-06, "loss": 0.6358, "step": 318 }, { "epoch": 1.681038275406951, "grad_norm": 1.3113749307682454, "learning_rate": 4.815242503054277e-06, "loss": 0.6465, "step": 319 }, { "epoch": 1.6863176418829742, "grad_norm": 1.4316497180240197, "learning_rate": 4.784467309568524e-06, "loss": 0.6794, "step": 320 }, { "epoch": 1.6915970083589968, "grad_norm": 1.3763481895692722, "learning_rate": 4.753700294535916e-06, "loss": 0.7105, "step": 321 }, { "epoch": 1.69687637483502, "grad_norm": 1.3560535615138942, "learning_rate": 4.7229426254201504e-06, "loss": 0.6566, "step": 322 }, { "epoch": 1.7021557413110426, "grad_norm": 1.3113897207300194, "learning_rate": 4.692195469330286e-06, "loss": 0.753, "step": 323 }, { "epoch": 1.7074351077870655, "grad_norm": 1.2314416333529012, "learning_rate": 4.661459992976463e-06, "loss": 0.6087, "step": 324 }, { "epoch": 1.7127144742630884, "grad_norm": 1.357070932304121, "learning_rate": 4.630737362625631e-06, "loss": 0.678, "step": 325 }, { "epoch": 1.7179938407391113, "grad_norm": 1.3907841932602958, "learning_rate": 4.6000287440572925e-06, "loss": 0.6819, "step": 326 }, { "epoch": 1.7232732072151342, "grad_norm": 1.3821824046618116, "learning_rate": 4.569335302519271e-06, "loss": 0.6329, "step": 327 }, { "epoch": 1.7285525736911571, "grad_norm": 1.4473432204015564, "learning_rate": 4.53865820268349e-06, "loss": 0.7144, "step": 328 }, { "epoch": 1.73383194016718, "grad_norm": 1.4376742031177947, "learning_rate": 4.507998608601787e-06, "loss": 0.6086, "step": 329 }, { "epoch": 1.7391113066432027, "grad_norm": 1.2849628847256984, "learning_rate": 4.477357683661734e-06, "loss": 0.6101, "step": 330 }, { "epoch": 1.7443906731192258, "grad_norm": 1.3554057763386258, "learning_rate": 4.446736590542497e-06, "loss": 0.5833, "step": 331 }, { "epoch": 1.7496700395952485, "grad_norm": 1.3213798453951964, "learning_rate": 4.41613649117072e-06, "loss": 0.6859, "step": 332 }, { "epoch": 1.7549494060712716, "grad_norm": 1.319837554365992, "learning_rate": 4.3855585466764305e-06, "loss": 0.655, "step": 333 }, { "epoch": 1.7602287725472943, "grad_norm": 1.3686144434660683, "learning_rate": 4.355003917348985e-06, "loss": 0.6474, "step": 334 }, { "epoch": 1.7655081390233172, "grad_norm": 1.3793264604803168, "learning_rate": 4.324473762593037e-06, "loss": 0.5843, "step": 335 }, { "epoch": 1.77078750549934, "grad_norm": 1.3441234479337094, "learning_rate": 4.293969240884545e-06, "loss": 0.5984, "step": 336 }, { "epoch": 1.776066871975363, "grad_norm": 1.237308449464165, "learning_rate": 4.263491509726812e-06, "loss": 0.6477, "step": 337 }, { "epoch": 1.7813462384513858, "grad_norm": 1.3355474025021052, "learning_rate": 4.233041725606573e-06, "loss": 0.636, "step": 338 }, { "epoch": 1.7866256049274087, "grad_norm": 1.3458947073703338, "learning_rate": 4.202621043950096e-06, "loss": 0.6152, "step": 339 }, { "epoch": 1.7919049714034316, "grad_norm": 1.3724772310082562, "learning_rate": 4.17223061907935e-06, "loss": 0.6669, "step": 340 }, { "epoch": 1.7971843378794543, "grad_norm": 1.3927314177261432, "learning_rate": 4.141871604168201e-06, "loss": 0.6871, "step": 341 }, { "epoch": 1.8024637043554774, "grad_norm": 1.425898039985732, "learning_rate": 4.111545151198657e-06, "loss": 0.6479, "step": 342 }, { "epoch": 1.8077430708315, "grad_norm": 1.4786764449830878, "learning_rate": 4.081252410917148e-06, "loss": 0.6758, "step": 343 }, { "epoch": 1.8130224373075232, "grad_norm": 1.5596067837918601, "learning_rate": 4.050994532790871e-06, "loss": 0.6792, "step": 344 }, { "epoch": 1.818301803783546, "grad_norm": 1.3295616520702254, "learning_rate": 4.020772664964166e-06, "loss": 0.6447, "step": 345 }, { "epoch": 1.823581170259569, "grad_norm": 1.356711049558864, "learning_rate": 3.99058795421495e-06, "loss": 0.6988, "step": 346 }, { "epoch": 1.8288605367355917, "grad_norm": 1.3459848599920097, "learning_rate": 3.960441545911205e-06, "loss": 0.6793, "step": 347 }, { "epoch": 1.8341399032116146, "grad_norm": 1.2796136680768018, "learning_rate": 3.930334583967514e-06, "loss": 0.6404, "step": 348 }, { "epoch": 1.8394192696876375, "grad_norm": 1.430373997763793, "learning_rate": 3.9002682108016585e-06, "loss": 0.7089, "step": 349 }, { "epoch": 1.8446986361636604, "grad_norm": 1.443868769930965, "learning_rate": 3.870243567291263e-06, "loss": 0.6088, "step": 350 }, { "epoch": 1.8499780026396833, "grad_norm": 1.3354422816955691, "learning_rate": 3.840261792730511e-06, "loss": 0.6469, "step": 351 }, { "epoch": 1.8552573691157062, "grad_norm": 1.5007921332211551, "learning_rate": 3.8103240247869077e-06, "loss": 0.6917, "step": 352 }, { "epoch": 1.860536735591729, "grad_norm": 1.4369902254697013, "learning_rate": 3.7804313994581143e-06, "loss": 0.6745, "step": 353 }, { "epoch": 1.8658161020677517, "grad_norm": 1.2954980945001948, "learning_rate": 3.7505850510288455e-06, "loss": 0.6402, "step": 354 }, { "epoch": 1.8710954685437748, "grad_norm": 1.3479250337805435, "learning_rate": 3.720786112027822e-06, "loss": 0.6281, "step": 355 }, { "epoch": 1.8763748350197975, "grad_norm": 1.3528354522984527, "learning_rate": 3.6910357131847986e-06, "loss": 0.6253, "step": 356 }, { "epoch": 1.8816542014958206, "grad_norm": 1.2410984546080153, "learning_rate": 3.6613349833876607e-06, "loss": 0.5576, "step": 357 }, { "epoch": 1.8869335679718433, "grad_norm": 1.1827340907861352, "learning_rate": 3.6316850496395863e-06, "loss": 0.5936, "step": 358 }, { "epoch": 1.8922129344478662, "grad_norm": 1.2980573086194132, "learning_rate": 3.602087037016281e-06, "loss": 0.8214, "step": 359 }, { "epoch": 1.897492300923889, "grad_norm": 1.4315757982637016, "learning_rate": 3.5725420686232903e-06, "loss": 0.6522, "step": 360 }, { "epoch": 1.902771667399912, "grad_norm": 1.4091204255580805, "learning_rate": 3.5430512655533774e-06, "loss": 0.5795, "step": 361 }, { "epoch": 1.9080510338759349, "grad_norm": 1.3444722372985694, "learning_rate": 3.513615746843987e-06, "loss": 0.7231, "step": 362 }, { "epoch": 1.9133304003519578, "grad_norm": 1.5246355682127404, "learning_rate": 3.484236629434783e-06, "loss": 0.6603, "step": 363 }, { "epoch": 1.9186097668279807, "grad_norm": 1.415464008217028, "learning_rate": 3.4549150281252635e-06, "loss": 0.6775, "step": 364 }, { "epoch": 1.9238891333040036, "grad_norm": 1.3192883237623132, "learning_rate": 3.4256520555324613e-06, "loss": 0.6316, "step": 365 }, { "epoch": 1.9291684997800265, "grad_norm": 1.428352611949904, "learning_rate": 3.3964488220487252e-06, "loss": 0.6544, "step": 366 }, { "epoch": 1.9344478662560491, "grad_norm": 1.5172404820075067, "learning_rate": 3.3673064357995844e-06, "loss": 0.5938, "step": 367 }, { "epoch": 1.9397272327320723, "grad_norm": 1.3819196499385575, "learning_rate": 3.3382260026017027e-06, "loss": 0.658, "step": 368 }, { "epoch": 1.945006599208095, "grad_norm": 1.2219098246955071, "learning_rate": 3.3092086259209144e-06, "loss": 0.6436, "step": 369 }, { "epoch": 1.950285965684118, "grad_norm": 1.3112983916512726, "learning_rate": 3.2802554068303595e-06, "loss": 0.6277, "step": 370 }, { "epoch": 1.9555653321601407, "grad_norm": 1.3181733253990144, "learning_rate": 3.2513674439686945e-06, "loss": 0.6051, "step": 371 }, { "epoch": 1.9608446986361636, "grad_norm": 1.3950428699097805, "learning_rate": 3.22254583349841e-06, "loss": 0.6047, "step": 372 }, { "epoch": 1.9661240651121865, "grad_norm": 1.401575375536184, "learning_rate": 3.1937916690642356e-06, "loss": 0.7536, "step": 373 }, { "epoch": 1.9714034315882094, "grad_norm": 1.4675958168712424, "learning_rate": 3.16510604175164e-06, "loss": 0.6029, "step": 374 }, { "epoch": 1.9766827980642323, "grad_norm": 1.2821606191811943, "learning_rate": 3.13649004004543e-06, "loss": 0.5921, "step": 375 }, { "epoch": 1.9819621645402552, "grad_norm": 1.352642460803231, "learning_rate": 3.107944749788449e-06, "loss": 0.6541, "step": 376 }, { "epoch": 1.987241531016278, "grad_norm": 1.4500846871984014, "learning_rate": 3.0794712541403716e-06, "loss": 0.5958, "step": 377 }, { "epoch": 1.9925208974923008, "grad_norm": 1.344043810326557, "learning_rate": 3.0510706335366034e-06, "loss": 0.678, "step": 378 }, { "epoch": 1.9978002639683239, "grad_norm": 1.3311569208901686, "learning_rate": 3.0227439656472878e-06, "loss": 0.5696, "step": 379 }, { "epoch": 2.0, "grad_norm": 1.3311569208901686, "learning_rate": 2.9944923253364066e-06, "loss": 0.6124, "step": 380 }, { "epoch": 2.0052793664760227, "grad_norm": 2.2453037842894434, "learning_rate": 2.966316784621e-06, "loss": 0.5021, "step": 381 }, { "epoch": 2.010558732952046, "grad_norm": 1.448400627279987, "learning_rate": 2.9382184126304834e-06, "loss": 0.4754, "step": 382 }, { "epoch": 2.0158380994280685, "grad_norm": 1.3928060588224582, "learning_rate": 2.910198275566085e-06, "loss": 0.5552, "step": 383 }, { "epoch": 2.0211174659040916, "grad_norm": 1.3053890423029602, "learning_rate": 2.8822574366603804e-06, "loss": 0.5096, "step": 384 }, { "epoch": 2.0263968323801143, "grad_norm": 1.312006902737542, "learning_rate": 2.8543969561369556e-06, "loss": 0.4384, "step": 385 }, { "epoch": 2.0316761988561374, "grad_norm": 1.3424294511811377, "learning_rate": 2.8266178911701757e-06, "loss": 0.4524, "step": 386 }, { "epoch": 2.03695556533216, "grad_norm": 1.353115219108087, "learning_rate": 2.798921295845064e-06, "loss": 0.5847, "step": 387 }, { "epoch": 2.042234931808183, "grad_norm": 1.359151667072332, "learning_rate": 2.771308221117309e-06, "loss": 0.5274, "step": 388 }, { "epoch": 2.047514298284206, "grad_norm": 1.1804464420852299, "learning_rate": 2.743779714773386e-06, "loss": 0.4868, "step": 389 }, { "epoch": 2.052793664760229, "grad_norm": 1.2257778269575734, "learning_rate": 2.7163368213907975e-06, "loss": 0.4974, "step": 390 }, { "epoch": 2.0580730312362516, "grad_norm": 1.3592658717787198, "learning_rate": 2.6889805822984348e-06, "loss": 0.429, "step": 391 }, { "epoch": 2.0633523977122747, "grad_norm": 1.4335615689021757, "learning_rate": 2.6617120355370667e-06, "loss": 0.4936, "step": 392 }, { "epoch": 2.0686317641882974, "grad_norm": 1.4971943650916089, "learning_rate": 2.6345322158199503e-06, "loss": 0.4891, "step": 393 }, { "epoch": 2.07391113066432, "grad_norm": 1.3630600653303417, "learning_rate": 2.607442154493568e-06, "loss": 0.4339, "step": 394 }, { "epoch": 2.079190497140343, "grad_norm": 1.3452492502199729, "learning_rate": 2.5804428794984926e-06, "loss": 0.4788, "step": 395 }, { "epoch": 2.084469863616366, "grad_norm": 1.3717266175336726, "learning_rate": 2.5535354153303827e-06, "loss": 0.4589, "step": 396 }, { "epoch": 2.089749230092389, "grad_norm": 1.4172021390970797, "learning_rate": 2.526720783001107e-06, "loss": 0.474, "step": 397 }, { "epoch": 2.0950285965684117, "grad_norm": 1.4012306931647633, "learning_rate": 2.5000000000000015e-06, "loss": 0.4703, "step": 398 }, { "epoch": 2.100307963044435, "grad_norm": 1.3223744927738885, "learning_rate": 2.473374080255261e-06, "loss": 0.4927, "step": 399 }, { "epoch": 2.1055873295204575, "grad_norm": 1.4340111903752237, "learning_rate": 2.4468440340954664e-06, "loss": 0.4911, "step": 400 }, { "epoch": 2.1108666959964806, "grad_norm": 1.4908200109241228, "learning_rate": 2.4204108682112443e-06, "loss": 0.4923, "step": 401 }, { "epoch": 2.1161460624725033, "grad_norm": 1.4498803710381125, "learning_rate": 2.3940755856170744e-06, "loss": 0.5264, "step": 402 }, { "epoch": 2.1214254289485264, "grad_norm": 1.5579303480326079, "learning_rate": 2.3678391856132203e-06, "loss": 0.4671, "step": 403 }, { "epoch": 2.126704795424549, "grad_norm": 1.3682695560666365, "learning_rate": 2.341702663747819e-06, "loss": 0.4791, "step": 404 }, { "epoch": 2.131984161900572, "grad_norm": 1.3740119474315011, "learning_rate": 2.3156670117790996e-06, "loss": 0.5081, "step": 405 }, { "epoch": 2.137263528376595, "grad_norm": 1.4378876285894175, "learning_rate": 2.289733217637753e-06, "loss": 0.6887, "step": 406 }, { "epoch": 2.1425428948526175, "grad_norm": 1.3945669735187922, "learning_rate": 2.2639022653894443e-06, "loss": 0.4747, "step": 407 }, { "epoch": 2.1478222613286406, "grad_norm": 1.5102801606024971, "learning_rate": 2.238175135197471e-06, "loss": 0.4772, "step": 408 }, { "epoch": 2.1531016278046633, "grad_norm": 1.343039872751995, "learning_rate": 2.2125528032855727e-06, "loss": 0.4662, "step": 409 }, { "epoch": 2.1583809942806864, "grad_norm": 1.7777099063240305, "learning_rate": 2.1870362419008844e-06, "loss": 0.4426, "step": 410 }, { "epoch": 2.163660360756709, "grad_norm": 1.3840206283915173, "learning_rate": 2.1616264192770496e-06, "loss": 0.451, "step": 411 }, { "epoch": 2.168939727232732, "grad_norm": 1.2930950541912372, "learning_rate": 2.136324299597474e-06, "loss": 0.523, "step": 412 }, { "epoch": 2.174219093708755, "grad_norm": 1.4431983442764567, "learning_rate": 2.1111308429587446e-06, "loss": 0.4051, "step": 413 }, { "epoch": 2.179498460184778, "grad_norm": 1.2550043379116107, "learning_rate": 2.0860470053341957e-06, "loss": 0.499, "step": 414 }, { "epoch": 2.1847778266608007, "grad_norm": 1.4359151954027813, "learning_rate": 2.061073738537635e-06, "loss": 0.4591, "step": 415 }, { "epoch": 2.190057193136824, "grad_norm": 1.3446535561143784, "learning_rate": 2.0362119901872262e-06, "loss": 0.4076, "step": 416 }, { "epoch": 2.1953365596128465, "grad_norm": 1.2459969483698727, "learning_rate": 2.011462703669532e-06, "loss": 0.4957, "step": 417 }, { "epoch": 2.200615926088869, "grad_norm": 1.2992424145598012, "learning_rate": 1.9868268181037186e-06, "loss": 0.463, "step": 418 }, { "epoch": 2.2058952925648923, "grad_norm": 1.4602888158466671, "learning_rate": 1.9623052683059164e-06, "loss": 0.4719, "step": 419 }, { "epoch": 2.211174659040915, "grad_norm": 1.3759597249907445, "learning_rate": 1.937898984753751e-06, "loss": 0.4951, "step": 420 }, { "epoch": 2.216454025516938, "grad_norm": 1.412150518484592, "learning_rate": 1.913608893551036e-06, "loss": 0.52, "step": 421 }, { "epoch": 2.2217333919929607, "grad_norm": 1.485490691650101, "learning_rate": 1.8894359163926312e-06, "loss": 0.444, "step": 422 }, { "epoch": 2.227012758468984, "grad_norm": 1.3670795340613098, "learning_rate": 1.865380970529469e-06, "loss": 0.5399, "step": 423 }, { "epoch": 2.2322921249450065, "grad_norm": 1.3525729496527066, "learning_rate": 1.8414449687337467e-06, "loss": 0.5159, "step": 424 }, { "epoch": 2.2375714914210296, "grad_norm": 1.4059006780837846, "learning_rate": 1.8176288192642944e-06, "loss": 0.5099, "step": 425 }, { "epoch": 2.2428508578970523, "grad_norm": 1.2761502912826002, "learning_rate": 1.7939334258321094e-06, "loss": 0.4717, "step": 426 }, { "epoch": 2.2481302243730754, "grad_norm": 1.4040503918498035, "learning_rate": 1.7703596875660645e-06, "loss": 0.4469, "step": 427 }, { "epoch": 2.253409590849098, "grad_norm": 1.2908543753758535, "learning_rate": 1.746908498978791e-06, "loss": 0.485, "step": 428 }, { "epoch": 2.2586889573251208, "grad_norm": 1.3759893346792271, "learning_rate": 1.7235807499327335e-06, "loss": 0.5101, "step": 429 }, { "epoch": 2.263968323801144, "grad_norm": 1.4728227490351313, "learning_rate": 1.7003773256063882e-06, "loss": 0.5347, "step": 430 }, { "epoch": 2.2692476902771666, "grad_norm": 1.489969037850193, "learning_rate": 1.6772991064607113e-06, "loss": 0.4467, "step": 431 }, { "epoch": 2.2745270567531897, "grad_norm": 1.417700490467943, "learning_rate": 1.6543469682057105e-06, "loss": 0.481, "step": 432 }, { "epoch": 2.2798064232292123, "grad_norm": 1.386346425394359, "learning_rate": 1.6315217817672142e-06, "loss": 0.4621, "step": 433 }, { "epoch": 2.2850857897052355, "grad_norm": 1.2589332584059243, "learning_rate": 1.60882441325383e-06, "loss": 0.5175, "step": 434 }, { "epoch": 2.290365156181258, "grad_norm": 1.3880552404176263, "learning_rate": 1.5862557239240729e-06, "loss": 0.4387, "step": 435 }, { "epoch": 2.2956445226572813, "grad_norm": 1.360835274455909, "learning_rate": 1.5638165701536866e-06, "loss": 0.5115, "step": 436 }, { "epoch": 2.300923889133304, "grad_norm": 1.3843299657391916, "learning_rate": 1.54150780340315e-06, "loss": 0.4899, "step": 437 }, { "epoch": 2.306203255609327, "grad_norm": 1.3835648119835473, "learning_rate": 1.5193302701853674e-06, "loss": 0.4664, "step": 438 }, { "epoch": 2.3114826220853497, "grad_norm": 1.4640561341524838, "learning_rate": 1.4972848120335453e-06, "loss": 0.4609, "step": 439 }, { "epoch": 2.316761988561373, "grad_norm": 1.421258969771172, "learning_rate": 1.475372265469265e-06, "loss": 0.4763, "step": 440 }, { "epoch": 2.3220413550373955, "grad_norm": 1.2751966894937146, "learning_rate": 1.453593461970733e-06, "loss": 0.4701, "step": 441 }, { "epoch": 2.327320721513418, "grad_norm": 1.340345062295216, "learning_rate": 1.4319492279412388e-06, "loss": 0.4832, "step": 442 }, { "epoch": 2.3326000879894413, "grad_norm": 1.4159387316202012, "learning_rate": 1.410440384677791e-06, "loss": 0.42, "step": 443 }, { "epoch": 2.337879454465464, "grad_norm": 1.2648065651654534, "learning_rate": 1.389067748339954e-06, "loss": 0.4404, "step": 444 }, { "epoch": 2.343158820941487, "grad_norm": 1.3711182773880273, "learning_rate": 1.3678321299188802e-06, "loss": 0.4388, "step": 445 }, { "epoch": 2.3484381874175098, "grad_norm": 1.3906995330557852, "learning_rate": 1.3467343352065349e-06, "loss": 0.5312, "step": 446 }, { "epoch": 2.353717553893533, "grad_norm": 1.397658739943819, "learning_rate": 1.3257751647651223e-06, "loss": 0.4675, "step": 447 }, { "epoch": 2.3589969203695555, "grad_norm": 1.4627906940091926, "learning_rate": 1.3049554138967052e-06, "loss": 0.4395, "step": 448 }, { "epoch": 2.3642762868455787, "grad_norm": 1.381779631997442, "learning_rate": 1.2842758726130283e-06, "loss": 0.569, "step": 449 }, { "epoch": 2.3695556533216013, "grad_norm": 1.4402592673817487, "learning_rate": 1.2637373256055445e-06, "loss": 0.4903, "step": 450 }, { "epoch": 2.3748350197976245, "grad_norm": 1.3333412823689215, "learning_rate": 1.2433405522156334e-06, "loss": 0.4824, "step": 451 }, { "epoch": 2.380114386273647, "grad_norm": 1.4106276931723192, "learning_rate": 1.2230863264050308e-06, "loss": 0.487, "step": 452 }, { "epoch": 2.3853937527496702, "grad_norm": 1.3557673199870695, "learning_rate": 1.202975416726464e-06, "loss": 0.5265, "step": 453 }, { "epoch": 2.390673119225693, "grad_norm": 1.4343260095491823, "learning_rate": 1.1830085862944851e-06, "loss": 0.449, "step": 454 }, { "epoch": 2.3959524857017156, "grad_norm": 1.446767986226991, "learning_rate": 1.163186592756515e-06, "loss": 0.4699, "step": 455 }, { "epoch": 2.4012318521777387, "grad_norm": 1.424117946851896, "learning_rate": 1.1435101882640964e-06, "loss": 0.4514, "step": 456 }, { "epoch": 2.4065112186537614, "grad_norm": 1.5365862656275142, "learning_rate": 1.1239801194443507e-06, "loss": 0.4373, "step": 457 }, { "epoch": 2.4117905851297845, "grad_norm": 1.2648145260275343, "learning_rate": 1.1045971273716476e-06, "loss": 0.4329, "step": 458 }, { "epoch": 2.417069951605807, "grad_norm": 1.3311965815866447, "learning_rate": 1.085361947539486e-06, "loss": 0.4769, "step": 459 }, { "epoch": 2.4223493180818303, "grad_norm": 1.3975266216041633, "learning_rate": 1.066275309832584e-06, "loss": 0.472, "step": 460 }, { "epoch": 2.427628684557853, "grad_norm": 1.3252996893653324, "learning_rate": 1.0473379384991833e-06, "loss": 0.4243, "step": 461 }, { "epoch": 2.432908051033876, "grad_norm": 1.4139994351082152, "learning_rate": 1.02855055212357e-06, "loss": 0.4785, "step": 462 }, { "epoch": 2.4381874175098988, "grad_norm": 1.3372117034643396, "learning_rate": 1.0099138635988026e-06, "loss": 0.4215, "step": 463 }, { "epoch": 2.443466783985922, "grad_norm": 1.3606372944047547, "learning_rate": 9.91428580099667e-07, "loss": 0.4413, "step": 464 }, { "epoch": 2.4487461504619445, "grad_norm": 1.318507119449003, "learning_rate": 9.73095403055837e-07, "loss": 0.415, "step": 465 }, { "epoch": 2.4540255169379677, "grad_norm": 1.2123750888837692, "learning_rate": 9.549150281252633e-07, "loss": 0.4886, "step": 466 }, { "epoch": 2.4593048834139903, "grad_norm": 1.3528666735660853, "learning_rate": 9.368881451677725e-07, "loss": 0.4838, "step": 467 }, { "epoch": 2.464584249890013, "grad_norm": 1.3418881482008247, "learning_rate": 9.190154382188921e-07, "loss": 0.4466, "step": 468 }, { "epoch": 2.469863616366036, "grad_norm": 1.3296751447001665, "learning_rate": 9.01297585463895e-07, "loss": 0.4595, "step": 469 }, { "epoch": 2.475142982842059, "grad_norm": 1.4089808402301305, "learning_rate": 8.837352592120646e-07, "loss": 0.4365, "step": 470 }, { "epoch": 2.480422349318082, "grad_norm": 1.402157984134382, "learning_rate": 8.663291258711831e-07, "loss": 0.4776, "step": 471 }, { "epoch": 2.4857017157941046, "grad_norm": 1.334493882835527, "learning_rate": 8.490798459222477e-07, "loss": 0.446, "step": 472 }, { "epoch": 2.4909810822701277, "grad_norm": 1.4469360566334513, "learning_rate": 8.31988073894403e-07, "loss": 0.5585, "step": 473 }, { "epoch": 2.4962604487461504, "grad_norm": 1.3547614775330397, "learning_rate": 8.150544583401116e-07, "loss": 0.4951, "step": 474 }, { "epoch": 2.5015398152221735, "grad_norm": 1.4317273864472844, "learning_rate": 7.98279641810537e-07, "loss": 0.4658, "step": 475 }, { "epoch": 2.506819181698196, "grad_norm": 1.3305013073280645, "learning_rate": 7.816642608311692e-07, "loss": 0.5777, "step": 476 }, { "epoch": 2.512098548174219, "grad_norm": 1.3136690941260454, "learning_rate": 7.652089458776651e-07, "loss": 0.499, "step": 477 }, { "epoch": 2.517377914650242, "grad_norm": 1.3874569359917572, "learning_rate": 7.489143213519301e-07, "loss": 0.5347, "step": 478 }, { "epoch": 2.522657281126265, "grad_norm": 1.447251018353361, "learning_rate": 7.327810055584211e-07, "loss": 0.435, "step": 479 }, { "epoch": 2.5279366476022878, "grad_norm": 1.3496149617623217, "learning_rate": 7.168096106806871e-07, "loss": 0.4171, "step": 480 }, { "epoch": 2.5332160140783104, "grad_norm": 1.249564521876932, "learning_rate": 7.010007427581378e-07, "loss": 0.4364, "step": 481 }, { "epoch": 2.5384953805543335, "grad_norm": 1.16664480296305, "learning_rate": 6.853550016630517e-07, "loss": 0.4704, "step": 482 }, { "epoch": 2.543774747030356, "grad_norm": 1.360240021411605, "learning_rate": 6.698729810778065e-07, "loss": 0.4452, "step": 483 }, { "epoch": 2.5490541135063793, "grad_norm": 1.342403768766378, "learning_rate": 6.545552684723583e-07, "loss": 0.4693, "step": 484 }, { "epoch": 2.554333479982402, "grad_norm": 1.3325018963351474, "learning_rate": 6.394024450819458e-07, "loss": 0.6651, "step": 485 }, { "epoch": 2.559612846458425, "grad_norm": 1.3913165441700324, "learning_rate": 6.244150858850368e-07, "loss": 0.4975, "step": 486 }, { "epoch": 2.564892212934448, "grad_norm": 1.2628490579653824, "learning_rate": 6.095937595815104e-07, "loss": 0.492, "step": 487 }, { "epoch": 2.570171579410471, "grad_norm": 1.2910442047354849, "learning_rate": 5.949390285710777e-07, "loss": 0.4534, "step": 488 }, { "epoch": 2.5754509458864936, "grad_norm": 1.3249433260471921, "learning_rate": 5.804514489319402e-07, "loss": 0.487, "step": 489 }, { "epoch": 2.5807303123625163, "grad_norm": 1.4072341388559009, "learning_rate": 5.661315703996905e-07, "loss": 0.4675, "step": 490 }, { "epoch": 2.5860096788385394, "grad_norm": 1.4015735453825087, "learning_rate": 5.519799363464523e-07, "loss": 0.4845, "step": 491 }, { "epoch": 2.5912890453145625, "grad_norm": 1.4217891239143823, "learning_rate": 5.379970837602611e-07, "loss": 0.4998, "step": 492 }, { "epoch": 2.596568411790585, "grad_norm": 1.3648246182499375, "learning_rate": 5.241835432246888e-07, "loss": 0.5176, "step": 493 }, { "epoch": 2.601847778266608, "grad_norm": 1.414958969535076, "learning_rate": 5.105398388987098e-07, "loss": 0.4967, "step": 494 }, { "epoch": 2.607127144742631, "grad_norm": 1.3790275298806813, "learning_rate": 4.970664884968135e-07, "loss": 0.4526, "step": 495 }, { "epoch": 2.6124065112186536, "grad_norm": 1.4260512881673313, "learning_rate": 4.837640032693558e-07, "loss": 0.4988, "step": 496 }, { "epoch": 2.6176858776946768, "grad_norm": 1.4375522094160005, "learning_rate": 4.7063288798316397e-07, "loss": 0.5034, "step": 497 }, { "epoch": 2.6229652441706994, "grad_norm": 1.2795249845130867, "learning_rate": 4.576736409023813e-07, "loss": 0.4697, "step": 498 }, { "epoch": 2.6282446106467225, "grad_norm": 1.341408752696913, "learning_rate": 4.448867537695578e-07, "loss": 0.4577, "step": 499 }, { "epoch": 2.633523977122745, "grad_norm": 1.388819232620493, "learning_rate": 4.322727117869951e-07, "loss": 0.4578, "step": 500 }, { "epoch": 2.6388033435987683, "grad_norm": 1.4259575315112532, "learning_rate": 4.198319935983325e-07, "loss": 0.432, "step": 501 }, { "epoch": 2.644082710074791, "grad_norm": 1.3388679638482945, "learning_rate": 4.0756507127038494e-07, "loss": 0.4297, "step": 502 }, { "epoch": 2.6493620765508137, "grad_norm": 1.343061176539468, "learning_rate": 3.9547241027523164e-07, "loss": 0.4731, "step": 503 }, { "epoch": 2.654641443026837, "grad_norm": 1.3358210629083995, "learning_rate": 3.8355446947255293e-07, "loss": 0.3901, "step": 504 }, { "epoch": 2.65992080950286, "grad_norm": 1.2293751714391306, "learning_rate": 3.71811701092219e-07, "loss": 0.4707, "step": 505 }, { "epoch": 2.6652001759788826, "grad_norm": 1.3129862851940244, "learning_rate": 3.602445507171276e-07, "loss": 0.4352, "step": 506 }, { "epoch": 2.6704795424549053, "grad_norm": 1.4034927380523827, "learning_rate": 3.488534572662994e-07, "loss": 0.4641, "step": 507 }, { "epoch": 2.6757589089309284, "grad_norm": 1.2489349783536317, "learning_rate": 3.3763885297822153e-07, "loss": 0.4681, "step": 508 }, { "epoch": 2.681038275406951, "grad_norm": 1.338257980632932, "learning_rate": 3.266011633944477e-07, "loss": 0.4466, "step": 509 }, { "epoch": 2.686317641882974, "grad_norm": 1.340765808641228, "learning_rate": 3.1574080734344757e-07, "loss": 0.4427, "step": 510 }, { "epoch": 2.691597008358997, "grad_norm": 1.3488826792679693, "learning_rate": 3.0505819692471797e-07, "loss": 0.4425, "step": 511 }, { "epoch": 2.69687637483502, "grad_norm": 1.2448047137351241, "learning_rate": 2.9455373749314285e-07, "loss": 0.5045, "step": 512 }, { "epoch": 2.7021557413110426, "grad_norm": 1.327982717505523, "learning_rate": 2.842278276436128e-07, "loss": 0.4434, "step": 513 }, { "epoch": 2.7074351077870658, "grad_norm": 1.3317380787717712, "learning_rate": 2.7408085919590265e-07, "loss": 0.4685, "step": 514 }, { "epoch": 2.7127144742630884, "grad_norm": 1.418889669781179, "learning_rate": 2.6411321717979886e-07, "loss": 0.4459, "step": 515 }, { "epoch": 2.717993840739111, "grad_norm": 1.4212898616431393, "learning_rate": 2.5432527982049424e-07, "loss": 0.4436, "step": 516 }, { "epoch": 2.723273207215134, "grad_norm": 1.372505212467702, "learning_rate": 2.447174185242324e-07, "loss": 0.4942, "step": 517 }, { "epoch": 2.7285525736911573, "grad_norm": 1.429171310323447, "learning_rate": 2.3528999786421758e-07, "loss": 0.4636, "step": 518 }, { "epoch": 2.73383194016718, "grad_norm": 1.338261834173231, "learning_rate": 2.2604337556677846e-07, "loss": 0.4656, "step": 519 }, { "epoch": 2.7391113066432027, "grad_norm": 1.324849203082196, "learning_rate": 2.1697790249779638e-07, "loss": 0.4788, "step": 520 }, { "epoch": 2.744390673119226, "grad_norm": 1.350337658446117, "learning_rate": 2.080939226493889e-07, "loss": 0.4846, "step": 521 }, { "epoch": 2.7496700395952485, "grad_norm": 1.3500445059182393, "learning_rate": 1.9939177312685963e-07, "loss": 0.4419, "step": 522 }, { "epoch": 2.7549494060712716, "grad_norm": 1.375463337098407, "learning_rate": 1.908717841359048e-07, "loss": 0.4687, "step": 523 }, { "epoch": 2.7602287725472943, "grad_norm": 1.4823623308521863, "learning_rate": 1.825342789700846e-07, "loss": 0.4779, "step": 524 }, { "epoch": 2.765508139023317, "grad_norm": 1.4161590791252803, "learning_rate": 1.7437957399855488e-07, "loss": 0.4685, "step": 525 }, { "epoch": 2.77078750549934, "grad_norm": 1.2986729971436586, "learning_rate": 1.664079786540629e-07, "loss": 0.4771, "step": 526 }, { "epoch": 2.776066871975363, "grad_norm": 1.3827623556792923, "learning_rate": 1.5861979542120598e-07, "loss": 0.4634, "step": 527 }, { "epoch": 2.781346238451386, "grad_norm": 1.3339794328080277, "learning_rate": 1.510153198249531e-07, "loss": 0.4435, "step": 528 }, { "epoch": 2.7866256049274085, "grad_norm": 1.3094435959629867, "learning_rate": 1.435948404194304e-07, "loss": 0.4537, "step": 529 }, { "epoch": 2.7919049714034316, "grad_norm": 1.395651401211996, "learning_rate": 1.363586387769761e-07, "loss": 0.4729, "step": 530 }, { "epoch": 2.7971843378794543, "grad_norm": 1.3803361937212524, "learning_rate": 1.2930698947744957e-07, "loss": 0.5551, "step": 531 }, { "epoch": 2.8024637043554774, "grad_norm": 1.403170004866064, "learning_rate": 1.22440160097817e-07, "loss": 0.4659, "step": 532 }, { "epoch": 2.8077430708315, "grad_norm": 1.4326502602660136, "learning_rate": 1.157584112019966e-07, "loss": 0.4532, "step": 533 }, { "epoch": 2.813022437307523, "grad_norm": 1.413170997771312, "learning_rate": 1.0926199633097156e-07, "loss": 0.5077, "step": 534 }, { "epoch": 2.818301803783546, "grad_norm": 1.4978501520243235, "learning_rate": 1.0295116199317057e-07, "loss": 0.4524, "step": 535 }, { "epoch": 2.823581170259569, "grad_norm": 1.3811339585897102, "learning_rate": 9.682614765511134e-08, "loss": 0.4267, "step": 536 }, { "epoch": 2.8288605367355917, "grad_norm": 1.2532370949093539, "learning_rate": 9.08871857323157e-08, "loss": 0.5053, "step": 537 }, { "epoch": 2.8341399032116144, "grad_norm": 1.3788001439277469, "learning_rate": 8.513450158049109e-08, "loss": 0.4377, "step": 538 }, { "epoch": 2.8394192696876375, "grad_norm": 1.3386655810754182, "learning_rate": 7.956831348697791e-08, "loss": 0.5369, "step": 539 }, { "epoch": 2.8446986361636606, "grad_norm": 1.4031016680793478, "learning_rate": 7.418883266246734e-08, "loss": 0.4692, "step": 540 }, { "epoch": 2.8499780026396833, "grad_norm": 1.4223612412457645, "learning_rate": 6.899626323298714e-08, "loss": 0.4975, "step": 541 }, { "epoch": 2.855257369115706, "grad_norm": 1.468568132224329, "learning_rate": 6.399080223215503e-08, "loss": 0.4278, "step": 542 }, { "epoch": 2.860536735591729, "grad_norm": 1.2905228846748407, "learning_rate": 5.917263959370312e-08, "loss": 0.4459, "step": 543 }, { "epoch": 2.8658161020677517, "grad_norm": 1.30438415852383, "learning_rate": 5.454195814427021e-08, "loss": 0.4881, "step": 544 }, { "epoch": 2.871095468543775, "grad_norm": 1.4658032992281533, "learning_rate": 5.009893359646523e-08, "loss": 0.422, "step": 545 }, { "epoch": 2.8763748350197975, "grad_norm": 1.403126172079016, "learning_rate": 4.584373454219859e-08, "loss": 0.4298, "step": 546 }, { "epoch": 2.8816542014958206, "grad_norm": 1.305238740579575, "learning_rate": 4.177652244628627e-08, "loss": 0.447, "step": 547 }, { "epoch": 2.8869335679718433, "grad_norm": 1.4226875729692003, "learning_rate": 3.7897451640321326e-08, "loss": 0.537, "step": 548 }, { "epoch": 2.8922129344478664, "grad_norm": 1.3043310631991534, "learning_rate": 3.4206669316819155e-08, "loss": 0.495, "step": 549 }, { "epoch": 2.897492300923889, "grad_norm": 1.4379823363129705, "learning_rate": 3.0704315523631956e-08, "loss": 0.4139, "step": 550 }, { "epoch": 2.9027716673999118, "grad_norm": 1.3544951690739686, "learning_rate": 2.7390523158633552e-08, "loss": 0.5213, "step": 551 }, { "epoch": 2.908051033875935, "grad_norm": 1.5078288720536737, "learning_rate": 2.426541796467785e-08, "loss": 0.4605, "step": 552 }, { "epoch": 2.913330400351958, "grad_norm": 1.35610421146911, "learning_rate": 2.1329118524827662e-08, "loss": 0.4708, "step": 553 }, { "epoch": 2.9186097668279807, "grad_norm": 1.4726737456407306, "learning_rate": 1.8581736257852756e-08, "loss": 0.4868, "step": 554 }, { "epoch": 2.9238891333040034, "grad_norm": 1.353334569052309, "learning_rate": 1.6023375414004894e-08, "loss": 0.4867, "step": 555 }, { "epoch": 2.9291684997800265, "grad_norm": 1.3416226125226052, "learning_rate": 1.3654133071059894e-08, "loss": 0.4396, "step": 556 }, { "epoch": 2.934447866256049, "grad_norm": 1.226026839315963, "learning_rate": 1.1474099130635575e-08, "loss": 0.4901, "step": 557 }, { "epoch": 2.9397272327320723, "grad_norm": 1.3404474468548127, "learning_rate": 9.48335631477948e-09, "loss": 0.4651, "step": 558 }, { "epoch": 2.945006599208095, "grad_norm": 1.3923456514703572, "learning_rate": 7.681980162830283e-09, "loss": 0.5356, "step": 559 }, { "epoch": 2.950285965684118, "grad_norm": 1.2760505962703481, "learning_rate": 6.070039028550634e-09, "loss": 0.5385, "step": 560 }, { "epoch": 2.9555653321601407, "grad_norm": 2.2171211350170843, "learning_rate": 4.647594077534235e-09, "loss": 0.5793, "step": 561 }, { "epoch": 2.960844698636164, "grad_norm": 1.5115950710680348, "learning_rate": 3.41469928488547e-09, "loss": 0.5072, "step": 562 }, { "epoch": 2.9661240651121865, "grad_norm": 1.386901569403929, "learning_rate": 2.371401433170495e-09, "loss": 0.4858, "step": 563 }, { "epoch": 2.971403431588209, "grad_norm": 1.4637428485442363, "learning_rate": 1.5177401106419853e-09, "loss": 0.4478, "step": 564 }, { "epoch": 2.9766827980642323, "grad_norm": 1.4551106718059847, "learning_rate": 8.537477097364522e-10, "loss": 0.4673, "step": 565 }, { "epoch": 2.9819621645402554, "grad_norm": 1.3415126297649902, "learning_rate": 3.7944942584688947e-10, "loss": 0.4997, "step": 566 }, { "epoch": 2.987241531016278, "grad_norm": 1.2328749287439438, "learning_rate": 9.486325636576254e-11, "loss": 0.5271, "step": 567 }, { "epoch": 2.987241531016278, "step": 567, "total_flos": 4.6211312738788966e+17, "train_loss": 0.7107904120832944, "train_runtime": 62456.134, "train_samples_per_second": 0.437, "train_steps_per_second": 0.009 } ], "logging_steps": 1, "max_steps": 567, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.6211312738788966e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }