{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.34584125886218225, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006916825177243646, "grad_norm": 0.17447508871555328, "learning_rate": 0.0, "loss": 2.4721, "step": 1 }, { "epoch": 0.0013833650354487291, "grad_norm": 0.24757890403270721, "learning_rate": 1.3698630136986302e-06, "loss": 2.7788, "step": 2 }, { "epoch": 0.0020750475531730937, "grad_norm": 0.20169955492019653, "learning_rate": 2.7397260273972604e-06, "loss": 2.817, "step": 3 }, { "epoch": 0.0027667300708974583, "grad_norm": 0.2597520053386688, "learning_rate": 4.10958904109589e-06, "loss": 2.8883, "step": 4 }, { "epoch": 0.0034584125886218224, "grad_norm": 0.2476697564125061, "learning_rate": 5.479452054794521e-06, "loss": 2.5194, "step": 5 }, { "epoch": 0.004150095106346187, "grad_norm": 0.19448921084403992, "learning_rate": 6.849315068493151e-06, "loss": 2.0166, "step": 6 }, { "epoch": 0.0048417776240705515, "grad_norm": 0.30129846930503845, "learning_rate": 8.21917808219178e-06, "loss": 2.0576, "step": 7 }, { "epoch": 0.0055334601417949165, "grad_norm": 0.29668128490448, "learning_rate": 9.589041095890411e-06, "loss": 2.0307, "step": 8 }, { "epoch": 0.006225142659519281, "grad_norm": 0.20030352473258972, "learning_rate": 1.0958904109589042e-05, "loss": 2.6198, "step": 9 }, { "epoch": 0.006916825177243645, "grad_norm": 0.2806167006492615, "learning_rate": 1.2328767123287671e-05, "loss": 2.7184, "step": 10 }, { "epoch": 0.00760850769496801, "grad_norm": 0.3155820965766907, "learning_rate": 1.3698630136986302e-05, "loss": 2.5599, "step": 11 }, { "epoch": 0.008300190212692375, "grad_norm": 0.37342140078544617, "learning_rate": 1.5068493150684931e-05, "loss": 1.6116, "step": 12 }, { "epoch": 0.008991872730416739, "grad_norm": 0.3110700845718384, "learning_rate": 1.643835616438356e-05, "loss": 2.03, "step": 13 }, { "epoch": 0.009683555248141103, "grad_norm": 0.22983220219612122, "learning_rate": 1.780821917808219e-05, "loss": 3.0022, "step": 14 }, { "epoch": 0.010375237765865467, "grad_norm": 0.20027688145637512, "learning_rate": 1.9178082191780822e-05, "loss": 2.3026, "step": 15 }, { "epoch": 0.011066920283589833, "grad_norm": 0.2387463003396988, "learning_rate": 2.0547945205479453e-05, "loss": 2.9633, "step": 16 }, { "epoch": 0.011758602801314197, "grad_norm": 0.21235717833042145, "learning_rate": 2.1917808219178083e-05, "loss": 2.8526, "step": 17 }, { "epoch": 0.012450285319038561, "grad_norm": 0.29500314593315125, "learning_rate": 2.328767123287671e-05, "loss": 2.5277, "step": 18 }, { "epoch": 0.013141967836762926, "grad_norm": 0.20916298031806946, "learning_rate": 2.4657534246575342e-05, "loss": 2.3433, "step": 19 }, { "epoch": 0.01383365035448729, "grad_norm": 0.29092636704444885, "learning_rate": 2.6027397260273973e-05, "loss": 2.6725, "step": 20 }, { "epoch": 0.014525332872211656, "grad_norm": 0.2366064488887787, "learning_rate": 2.7397260273972603e-05, "loss": 2.7656, "step": 21 }, { "epoch": 0.01521701538993602, "grad_norm": 0.23627543449401855, "learning_rate": 2.8767123287671234e-05, "loss": 2.5008, "step": 22 }, { "epoch": 0.015908697907660384, "grad_norm": 0.30645817518234253, "learning_rate": 3.0136986301369862e-05, "loss": 1.8266, "step": 23 }, { "epoch": 0.01660038042538475, "grad_norm": 0.5485689043998718, "learning_rate": 3.1506849315068496e-05, "loss": 2.7961, "step": 24 }, { "epoch": 0.017292062943109112, "grad_norm": 0.2701663076877594, "learning_rate": 3.287671232876712e-05, "loss": 2.7765, "step": 25 }, { "epoch": 0.017983745460833478, "grad_norm": 0.28117069602012634, "learning_rate": 3.424657534246575e-05, "loss": 2.7523, "step": 26 }, { "epoch": 0.01867542797855784, "grad_norm": 0.38187164068222046, "learning_rate": 3.561643835616438e-05, "loss": 2.2493, "step": 27 }, { "epoch": 0.019367110496282206, "grad_norm": 0.23057831823825836, "learning_rate": 3.698630136986301e-05, "loss": 2.8409, "step": 28 }, { "epoch": 0.020058793014006572, "grad_norm": 0.314023494720459, "learning_rate": 3.8356164383561644e-05, "loss": 2.7994, "step": 29 }, { "epoch": 0.020750475531730934, "grad_norm": 0.5843497514724731, "learning_rate": 3.9726027397260274e-05, "loss": 2.8692, "step": 30 }, { "epoch": 0.0214421580494553, "grad_norm": 0.3582310974597931, "learning_rate": 4.1095890410958905e-05, "loss": 2.5466, "step": 31 }, { "epoch": 0.022133840567179666, "grad_norm": 0.2534823715686798, "learning_rate": 4.2465753424657536e-05, "loss": 2.5193, "step": 32 }, { "epoch": 0.02282552308490403, "grad_norm": 0.34849318861961365, "learning_rate": 4.383561643835617e-05, "loss": 1.7904, "step": 33 }, { "epoch": 0.023517205602628394, "grad_norm": 0.35676291584968567, "learning_rate": 4.520547945205479e-05, "loss": 1.6514, "step": 34 }, { "epoch": 0.024208888120352757, "grad_norm": 0.24594038724899292, "learning_rate": 4.657534246575342e-05, "loss": 2.3192, "step": 35 }, { "epoch": 0.024900570638077123, "grad_norm": 0.3562544882297516, "learning_rate": 4.794520547945205e-05, "loss": 2.8913, "step": 36 }, { "epoch": 0.02559225315580149, "grad_norm": 0.4177575409412384, "learning_rate": 4.9315068493150684e-05, "loss": 2.4487, "step": 37 }, { "epoch": 0.02628393567352585, "grad_norm": 0.3126346170902252, "learning_rate": 5.068493150684932e-05, "loss": 1.4528, "step": 38 }, { "epoch": 0.026975618191250217, "grad_norm": 0.30482178926467896, "learning_rate": 5.2054794520547945e-05, "loss": 2.6085, "step": 39 }, { "epoch": 0.02766730070897458, "grad_norm": 0.31493324041366577, "learning_rate": 5.342465753424658e-05, "loss": 1.8078, "step": 40 }, { "epoch": 0.028358983226698945, "grad_norm": 0.4693777859210968, "learning_rate": 5.479452054794521e-05, "loss": 1.363, "step": 41 }, { "epoch": 0.02905066574442331, "grad_norm": 0.6538149118423462, "learning_rate": 5.616438356164384e-05, "loss": 1.2077, "step": 42 }, { "epoch": 0.029742348262147673, "grad_norm": 0.32594457268714905, "learning_rate": 5.753424657534247e-05, "loss": 2.2301, "step": 43 }, { "epoch": 0.03043403077987204, "grad_norm": 0.35883522033691406, "learning_rate": 5.89041095890411e-05, "loss": 2.5199, "step": 44 }, { "epoch": 0.0311257132975964, "grad_norm": 0.3142642080783844, "learning_rate": 6.0273972602739724e-05, "loss": 2.0425, "step": 45 }, { "epoch": 0.03181739581532077, "grad_norm": 0.31059974431991577, "learning_rate": 6.164383561643835e-05, "loss": 2.6123, "step": 46 }, { "epoch": 0.03250907833304513, "grad_norm": 0.24734480679035187, "learning_rate": 6.301369863013699e-05, "loss": 2.5415, "step": 47 }, { "epoch": 0.0332007608507695, "grad_norm": 0.3828027546405792, "learning_rate": 6.438356164383562e-05, "loss": 2.3858, "step": 48 }, { "epoch": 0.03389244336849386, "grad_norm": 0.32175979018211365, "learning_rate": 6.575342465753424e-05, "loss": 2.135, "step": 49 }, { "epoch": 0.034584125886218224, "grad_norm": 0.5965486764907837, "learning_rate": 6.712328767123288e-05, "loss": 2.2111, "step": 50 }, { "epoch": 0.03527580840394259, "grad_norm": 0.5561261177062988, "learning_rate": 6.84931506849315e-05, "loss": 1.725, "step": 51 }, { "epoch": 0.035967490921666956, "grad_norm": 0.46931296586990356, "learning_rate": 6.986301369863014e-05, "loss": 2.5344, "step": 52 }, { "epoch": 0.03665917343939132, "grad_norm": 0.4662770926952362, "learning_rate": 7.123287671232876e-05, "loss": 2.4031, "step": 53 }, { "epoch": 0.03735085595711568, "grad_norm": 0.3315083980560303, "learning_rate": 7.26027397260274e-05, "loss": 2.0916, "step": 54 }, { "epoch": 0.03804253847484005, "grad_norm": 0.36718690395355225, "learning_rate": 7.397260273972603e-05, "loss": 2.6542, "step": 55 }, { "epoch": 0.03873422099256441, "grad_norm": 0.43732333183288574, "learning_rate": 7.534246575342466e-05, "loss": 2.3609, "step": 56 }, { "epoch": 0.039425903510288775, "grad_norm": 0.5333709716796875, "learning_rate": 7.671232876712329e-05, "loss": 2.0736, "step": 57 }, { "epoch": 0.040117586028013144, "grad_norm": 0.609667181968689, "learning_rate": 7.808219178082192e-05, "loss": 1.9863, "step": 58 }, { "epoch": 0.040809268545737507, "grad_norm": 0.38461530208587646, "learning_rate": 7.945205479452055e-05, "loss": 2.0244, "step": 59 }, { "epoch": 0.04150095106346187, "grad_norm": 0.37384161353111267, "learning_rate": 8.082191780821919e-05, "loss": 2.5084, "step": 60 }, { "epoch": 0.04219263358118624, "grad_norm": 0.575130820274353, "learning_rate": 8.219178082191781e-05, "loss": 1.5943, "step": 61 }, { "epoch": 0.0428843160989106, "grad_norm": 0.47650450468063354, "learning_rate": 8.356164383561645e-05, "loss": 2.4752, "step": 62 }, { "epoch": 0.04357599861663496, "grad_norm": 0.36863836646080017, "learning_rate": 8.493150684931507e-05, "loss": 2.5023, "step": 63 }, { "epoch": 0.04426768113435933, "grad_norm": 0.3590114116668701, "learning_rate": 8.630136986301371e-05, "loss": 2.1982, "step": 64 }, { "epoch": 0.044959363652083695, "grad_norm": 0.4362975060939789, "learning_rate": 8.767123287671233e-05, "loss": 2.0463, "step": 65 }, { "epoch": 0.04565104616980806, "grad_norm": 0.39568379521369934, "learning_rate": 8.904109589041096e-05, "loss": 2.5595, "step": 66 }, { "epoch": 0.04634272868753242, "grad_norm": 0.41195258498191833, "learning_rate": 9.041095890410958e-05, "loss": 2.348, "step": 67 }, { "epoch": 0.04703441120525679, "grad_norm": 0.5394885540008545, "learning_rate": 9.178082191780822e-05, "loss": 1.2385, "step": 68 }, { "epoch": 0.04772609372298115, "grad_norm": 0.41482311487197876, "learning_rate": 9.315068493150684e-05, "loss": 2.7036, "step": 69 }, { "epoch": 0.048417776240705514, "grad_norm": 0.4505786597728729, "learning_rate": 9.452054794520548e-05, "loss": 1.7636, "step": 70 }, { "epoch": 0.04910945875842988, "grad_norm": 0.26118066906929016, "learning_rate": 9.58904109589041e-05, "loss": 1.7056, "step": 71 }, { "epoch": 0.049801141276154245, "grad_norm": 0.7240022420883179, "learning_rate": 9.726027397260274e-05, "loss": 2.3171, "step": 72 }, { "epoch": 0.05049282379387861, "grad_norm": 0.5030315518379211, "learning_rate": 9.863013698630137e-05, "loss": 2.1896, "step": 73 }, { "epoch": 0.05118450631160298, "grad_norm": 0.7940630316734314, "learning_rate": 0.0001, "loss": 1.4996, "step": 74 }, { "epoch": 0.05187618882932734, "grad_norm": 0.45342186093330383, "learning_rate": 9.992716678805537e-05, "loss": 2.1487, "step": 75 }, { "epoch": 0.0525678713470517, "grad_norm": 0.4451451301574707, "learning_rate": 9.985433357611071e-05, "loss": 2.1561, "step": 76 }, { "epoch": 0.053259553864776064, "grad_norm": 0.39434245228767395, "learning_rate": 9.978150036416607e-05, "loss": 1.8114, "step": 77 }, { "epoch": 0.053951236382500434, "grad_norm": 0.5808930397033691, "learning_rate": 9.970866715222141e-05, "loss": 2.0528, "step": 78 }, { "epoch": 0.054642918900224796, "grad_norm": 0.4741731882095337, "learning_rate": 9.963583394027677e-05, "loss": 1.988, "step": 79 }, { "epoch": 0.05533460141794916, "grad_norm": 0.38925132155418396, "learning_rate": 9.956300072833212e-05, "loss": 1.1629, "step": 80 }, { "epoch": 0.05602628393567353, "grad_norm": 0.5677796602249146, "learning_rate": 9.949016751638748e-05, "loss": 1.7325, "step": 81 }, { "epoch": 0.05671796645339789, "grad_norm": 0.5454609990119934, "learning_rate": 9.941733430444284e-05, "loss": 2.0386, "step": 82 }, { "epoch": 0.05740964897112225, "grad_norm": 0.42700880765914917, "learning_rate": 9.934450109249819e-05, "loss": 2.1646, "step": 83 }, { "epoch": 0.05810133148884662, "grad_norm": 0.4880882203578949, "learning_rate": 9.927166788055353e-05, "loss": 1.8193, "step": 84 }, { "epoch": 0.058793014006570984, "grad_norm": 0.4928286373615265, "learning_rate": 9.919883466860888e-05, "loss": 1.6243, "step": 85 }, { "epoch": 0.05948469652429535, "grad_norm": 0.5179658532142639, "learning_rate": 9.912600145666424e-05, "loss": 1.6846, "step": 86 }, { "epoch": 0.060176379042019716, "grad_norm": 0.6409149765968323, "learning_rate": 9.905316824471959e-05, "loss": 0.8419, "step": 87 }, { "epoch": 0.06086806155974408, "grad_norm": 0.7547211050987244, "learning_rate": 9.898033503277495e-05, "loss": 2.399, "step": 88 }, { "epoch": 0.06155974407746844, "grad_norm": 0.412672758102417, "learning_rate": 9.890750182083031e-05, "loss": 2.2212, "step": 89 }, { "epoch": 0.0622514265951928, "grad_norm": 0.5434851050376892, "learning_rate": 9.883466860888566e-05, "loss": 2.3083, "step": 90 }, { "epoch": 0.06294310911291717, "grad_norm": 0.4428962767124176, "learning_rate": 9.876183539694101e-05, "loss": 2.3586, "step": 91 }, { "epoch": 0.06363479163064154, "grad_norm": 0.643883466720581, "learning_rate": 9.868900218499635e-05, "loss": 2.4555, "step": 92 }, { "epoch": 0.0643264741483659, "grad_norm": 1.8360555171966553, "learning_rate": 9.861616897305172e-05, "loss": 2.3463, "step": 93 }, { "epoch": 0.06501815666609026, "grad_norm": 0.5649489164352417, "learning_rate": 9.854333576110706e-05, "loss": 1.9054, "step": 94 }, { "epoch": 0.06570983918381462, "grad_norm": 0.5043310523033142, "learning_rate": 9.847050254916242e-05, "loss": 2.0604, "step": 95 }, { "epoch": 0.066401521701539, "grad_norm": 0.49205994606018066, "learning_rate": 9.839766933721779e-05, "loss": 2.5182, "step": 96 }, { "epoch": 0.06709320421926336, "grad_norm": 0.9012492299079895, "learning_rate": 9.832483612527313e-05, "loss": 1.9895, "step": 97 }, { "epoch": 0.06778488673698772, "grad_norm": 0.576653778553009, "learning_rate": 9.825200291332848e-05, "loss": 2.5201, "step": 98 }, { "epoch": 0.06847656925471209, "grad_norm": 0.4745285212993622, "learning_rate": 9.817916970138383e-05, "loss": 1.9348, "step": 99 }, { "epoch": 0.06916825177243645, "grad_norm": 0.5546420216560364, "learning_rate": 9.810633648943919e-05, "loss": 1.6732, "step": 100 }, { "epoch": 0.06985993429016081, "grad_norm": 0.8806080222129822, "learning_rate": 9.803350327749454e-05, "loss": 1.713, "step": 101 }, { "epoch": 0.07055161680788519, "grad_norm": 0.6029040217399597, "learning_rate": 9.79606700655499e-05, "loss": 1.5178, "step": 102 }, { "epoch": 0.07124329932560955, "grad_norm": 0.56731778383255, "learning_rate": 9.788783685360526e-05, "loss": 1.6848, "step": 103 }, { "epoch": 0.07193498184333391, "grad_norm": 0.5833808183670044, "learning_rate": 9.781500364166059e-05, "loss": 1.703, "step": 104 }, { "epoch": 0.07262666436105827, "grad_norm": 0.724471390247345, "learning_rate": 9.774217042971595e-05, "loss": 2.227, "step": 105 }, { "epoch": 0.07331834687878264, "grad_norm": 0.722097635269165, "learning_rate": 9.76693372177713e-05, "loss": 1.2173, "step": 106 }, { "epoch": 0.074010029396507, "grad_norm": 0.7538579702377319, "learning_rate": 9.759650400582666e-05, "loss": 1.8546, "step": 107 }, { "epoch": 0.07470171191423136, "grad_norm": 0.7746326327323914, "learning_rate": 9.752367079388201e-05, "loss": 2.4067, "step": 108 }, { "epoch": 0.07539339443195574, "grad_norm": 0.5197188258171082, "learning_rate": 9.745083758193737e-05, "loss": 1.9193, "step": 109 }, { "epoch": 0.0760850769496801, "grad_norm": 0.4536001682281494, "learning_rate": 9.737800436999273e-05, "loss": 2.2905, "step": 110 }, { "epoch": 0.07677675946740446, "grad_norm": 0.6520904302597046, "learning_rate": 9.730517115804807e-05, "loss": 2.012, "step": 111 }, { "epoch": 0.07746844198512882, "grad_norm": 0.5670090913772583, "learning_rate": 9.723233794610343e-05, "loss": 2.0289, "step": 112 }, { "epoch": 0.07816012450285319, "grad_norm": 0.8609398603439331, "learning_rate": 9.715950473415877e-05, "loss": 1.332, "step": 113 }, { "epoch": 0.07885180702057755, "grad_norm": 0.5880535244941711, "learning_rate": 9.708667152221414e-05, "loss": 1.7267, "step": 114 }, { "epoch": 0.07954348953830193, "grad_norm": 0.6090431213378906, "learning_rate": 9.701383831026948e-05, "loss": 1.3968, "step": 115 }, { "epoch": 0.08023517205602629, "grad_norm": 0.5701532363891602, "learning_rate": 9.694100509832484e-05, "loss": 2.1829, "step": 116 }, { "epoch": 0.08092685457375065, "grad_norm": 0.6294519305229187, "learning_rate": 9.68681718863802e-05, "loss": 1.7385, "step": 117 }, { "epoch": 0.08161853709147501, "grad_norm": 0.5141008496284485, "learning_rate": 9.679533867443554e-05, "loss": 1.8417, "step": 118 }, { "epoch": 0.08231021960919938, "grad_norm": 1.0640853643417358, "learning_rate": 9.67225054624909e-05, "loss": 2.1031, "step": 119 }, { "epoch": 0.08300190212692374, "grad_norm": 0.6235936284065247, "learning_rate": 9.664967225054625e-05, "loss": 1.9911, "step": 120 }, { "epoch": 0.0836935846446481, "grad_norm": 0.8041340112686157, "learning_rate": 9.657683903860161e-05, "loss": 1.9172, "step": 121 }, { "epoch": 0.08438526716237248, "grad_norm": 0.6447221040725708, "learning_rate": 9.650400582665696e-05, "loss": 2.0157, "step": 122 }, { "epoch": 0.08507694968009684, "grad_norm": 0.7299740314483643, "learning_rate": 9.643117261471232e-05, "loss": 1.9157, "step": 123 }, { "epoch": 0.0857686321978212, "grad_norm": 0.8950564861297607, "learning_rate": 9.635833940276767e-05, "loss": 1.4313, "step": 124 }, { "epoch": 0.08646031471554556, "grad_norm": 0.5062771439552307, "learning_rate": 9.628550619082301e-05, "loss": 1.7742, "step": 125 }, { "epoch": 0.08715199723326993, "grad_norm": 0.7482561469078064, "learning_rate": 9.621267297887837e-05, "loss": 1.6122, "step": 126 }, { "epoch": 0.08784367975099429, "grad_norm": 0.4244556725025177, "learning_rate": 9.613983976693372e-05, "loss": 1.3825, "step": 127 }, { "epoch": 0.08853536226871866, "grad_norm": 0.5349308252334595, "learning_rate": 9.606700655498908e-05, "loss": 1.9771, "step": 128 }, { "epoch": 0.08922704478644303, "grad_norm": 0.5429610013961792, "learning_rate": 9.599417334304443e-05, "loss": 2.5704, "step": 129 }, { "epoch": 0.08991872730416739, "grad_norm": 0.4875735938549042, "learning_rate": 9.592134013109979e-05, "loss": 2.0568, "step": 130 }, { "epoch": 0.09061040982189175, "grad_norm": 0.5956497192382812, "learning_rate": 9.584850691915514e-05, "loss": 2.3604, "step": 131 }, { "epoch": 0.09130209233961611, "grad_norm": 1.0286632776260376, "learning_rate": 9.577567370721049e-05, "loss": 1.278, "step": 132 }, { "epoch": 0.09199377485734048, "grad_norm": 0.5342454314231873, "learning_rate": 9.570284049526585e-05, "loss": 1.7806, "step": 133 }, { "epoch": 0.09268545737506484, "grad_norm": 0.5517177581787109, "learning_rate": 9.56300072833212e-05, "loss": 2.0241, "step": 134 }, { "epoch": 0.09337713989278922, "grad_norm": 0.6514694094657898, "learning_rate": 9.555717407137656e-05, "loss": 2.002, "step": 135 }, { "epoch": 0.09406882241051358, "grad_norm": 0.6356580257415771, "learning_rate": 9.54843408594319e-05, "loss": 2.2756, "step": 136 }, { "epoch": 0.09476050492823794, "grad_norm": 0.6672503352165222, "learning_rate": 9.541150764748726e-05, "loss": 1.4608, "step": 137 }, { "epoch": 0.0954521874459623, "grad_norm": 0.8465030193328857, "learning_rate": 9.533867443554261e-05, "loss": 2.0107, "step": 138 }, { "epoch": 0.09614386996368667, "grad_norm": 0.6064625978469849, "learning_rate": 9.526584122359796e-05, "loss": 2.1404, "step": 139 }, { "epoch": 0.09683555248141103, "grad_norm": 0.6728728413581848, "learning_rate": 9.519300801165332e-05, "loss": 1.7548, "step": 140 }, { "epoch": 0.0975272349991354, "grad_norm": 0.7992367744445801, "learning_rate": 9.512017479970867e-05, "loss": 1.5071, "step": 141 }, { "epoch": 0.09821891751685977, "grad_norm": 0.466144859790802, "learning_rate": 9.504734158776403e-05, "loss": 1.5908, "step": 142 }, { "epoch": 0.09891060003458413, "grad_norm": 0.694807767868042, "learning_rate": 9.497450837581938e-05, "loss": 1.6261, "step": 143 }, { "epoch": 0.09960228255230849, "grad_norm": 0.5561632513999939, "learning_rate": 9.490167516387472e-05, "loss": 1.2933, "step": 144 }, { "epoch": 0.10029396507003285, "grad_norm": 0.6609554886817932, "learning_rate": 9.482884195193008e-05, "loss": 1.7889, "step": 145 }, { "epoch": 0.10098564758775722, "grad_norm": 0.9272559285163879, "learning_rate": 9.475600873998543e-05, "loss": 1.2951, "step": 146 }, { "epoch": 0.10167733010548158, "grad_norm": 1.5346007347106934, "learning_rate": 9.46831755280408e-05, "loss": 1.5395, "step": 147 }, { "epoch": 0.10236901262320595, "grad_norm": 0.8031420111656189, "learning_rate": 9.461034231609614e-05, "loss": 2.2035, "step": 148 }, { "epoch": 0.10306069514093032, "grad_norm": 0.5305848121643066, "learning_rate": 9.45375091041515e-05, "loss": 1.784, "step": 149 }, { "epoch": 0.10375237765865468, "grad_norm": 0.6742457151412964, "learning_rate": 9.446467589220685e-05, "loss": 2.0042, "step": 150 }, { "epoch": 0.10444406017637904, "grad_norm": 0.7282371520996094, "learning_rate": 9.43918426802622e-05, "loss": 2.2441, "step": 151 }, { "epoch": 0.1051357426941034, "grad_norm": 0.5512478351593018, "learning_rate": 9.431900946831756e-05, "loss": 2.0307, "step": 152 }, { "epoch": 0.10582742521182777, "grad_norm": 0.7224892377853394, "learning_rate": 9.42461762563729e-05, "loss": 2.4692, "step": 153 }, { "epoch": 0.10651910772955213, "grad_norm": 0.7150198817253113, "learning_rate": 9.417334304442827e-05, "loss": 2.0194, "step": 154 }, { "epoch": 0.1072107902472765, "grad_norm": 0.5024279356002808, "learning_rate": 9.410050983248361e-05, "loss": 2.0461, "step": 155 }, { "epoch": 0.10790247276500087, "grad_norm": 1.0661766529083252, "learning_rate": 9.402767662053898e-05, "loss": 1.3458, "step": 156 }, { "epoch": 0.10859415528272523, "grad_norm": 0.577153205871582, "learning_rate": 9.395484340859432e-05, "loss": 1.517, "step": 157 }, { "epoch": 0.10928583780044959, "grad_norm": 0.6876221895217896, "learning_rate": 9.388201019664967e-05, "loss": 1.9622, "step": 158 }, { "epoch": 0.10997752031817395, "grad_norm": 0.5576820373535156, "learning_rate": 9.380917698470503e-05, "loss": 1.6063, "step": 159 }, { "epoch": 0.11066920283589832, "grad_norm": 0.8604760766029358, "learning_rate": 9.373634377276038e-05, "loss": 1.8693, "step": 160 }, { "epoch": 0.1113608853536227, "grad_norm": 0.6998944282531738, "learning_rate": 9.366351056081574e-05, "loss": 1.8942, "step": 161 }, { "epoch": 0.11205256787134706, "grad_norm": 0.7106878757476807, "learning_rate": 9.359067734887109e-05, "loss": 1.9999, "step": 162 }, { "epoch": 0.11274425038907142, "grad_norm": 1.167450189590454, "learning_rate": 9.351784413692645e-05, "loss": 1.26, "step": 163 }, { "epoch": 0.11343593290679578, "grad_norm": 0.6208682060241699, "learning_rate": 9.34450109249818e-05, "loss": 1.7881, "step": 164 }, { "epoch": 0.11412761542452014, "grad_norm": 0.7046381235122681, "learning_rate": 9.337217771303714e-05, "loss": 1.7403, "step": 165 }, { "epoch": 0.1148192979422445, "grad_norm": 0.6957246661186218, "learning_rate": 9.32993445010925e-05, "loss": 2.1148, "step": 166 }, { "epoch": 0.11551098045996887, "grad_norm": 0.6169411540031433, "learning_rate": 9.322651128914785e-05, "loss": 1.9182, "step": 167 }, { "epoch": 0.11620266297769324, "grad_norm": 0.8539403080940247, "learning_rate": 9.315367807720321e-05, "loss": 1.7015, "step": 168 }, { "epoch": 0.1168943454954176, "grad_norm": 1.037312626838684, "learning_rate": 9.308084486525856e-05, "loss": 1.3205, "step": 169 }, { "epoch": 0.11758602801314197, "grad_norm": 0.5971755981445312, "learning_rate": 9.300801165331392e-05, "loss": 1.1845, "step": 170 }, { "epoch": 0.11827771053086633, "grad_norm": 0.5809158086776733, "learning_rate": 9.293517844136927e-05, "loss": 1.5852, "step": 171 }, { "epoch": 0.1189693930485907, "grad_norm": 0.6503626704216003, "learning_rate": 9.286234522942462e-05, "loss": 1.9045, "step": 172 }, { "epoch": 0.11966107556631506, "grad_norm": 0.5208730101585388, "learning_rate": 9.278951201747998e-05, "loss": 1.7599, "step": 173 }, { "epoch": 0.12035275808403943, "grad_norm": 0.76451176404953, "learning_rate": 9.271667880553533e-05, "loss": 2.2624, "step": 174 }, { "epoch": 0.1210444406017638, "grad_norm": 0.5927959084510803, "learning_rate": 9.264384559359069e-05, "loss": 1.5911, "step": 175 }, { "epoch": 0.12173612311948816, "grad_norm": 0.6512097716331482, "learning_rate": 9.257101238164603e-05, "loss": 1.3409, "step": 176 }, { "epoch": 0.12242780563721252, "grad_norm": 0.4837232530117035, "learning_rate": 9.24981791697014e-05, "loss": 1.2417, "step": 177 }, { "epoch": 0.12311948815493688, "grad_norm": 0.6251150369644165, "learning_rate": 9.242534595775674e-05, "loss": 2.1377, "step": 178 }, { "epoch": 0.12381117067266124, "grad_norm": 0.5964468717575073, "learning_rate": 9.235251274581209e-05, "loss": 1.616, "step": 179 }, { "epoch": 0.1245028531903856, "grad_norm": 0.8185293674468994, "learning_rate": 9.227967953386745e-05, "loss": 2.372, "step": 180 }, { "epoch": 0.12519453570810998, "grad_norm": 0.7391765117645264, "learning_rate": 9.22068463219228e-05, "loss": 2.0473, "step": 181 }, { "epoch": 0.12588621822583435, "grad_norm": 0.6867517828941345, "learning_rate": 9.213401310997816e-05, "loss": 2.2592, "step": 182 }, { "epoch": 0.1265779007435587, "grad_norm": 0.6889393329620361, "learning_rate": 9.206117989803351e-05, "loss": 1.8863, "step": 183 }, { "epoch": 0.12726958326128307, "grad_norm": 0.6404229402542114, "learning_rate": 9.198834668608885e-05, "loss": 1.5859, "step": 184 }, { "epoch": 0.12796126577900743, "grad_norm": 2.46905255317688, "learning_rate": 9.191551347414422e-05, "loss": 2.4798, "step": 185 }, { "epoch": 0.1286529482967318, "grad_norm": 0.7490037083625793, "learning_rate": 9.184268026219956e-05, "loss": 2.1623, "step": 186 }, { "epoch": 0.12934463081445616, "grad_norm": 0.5947558283805847, "learning_rate": 9.176984705025492e-05, "loss": 2.2375, "step": 187 }, { "epoch": 0.13003631333218052, "grad_norm": 0.4829116463661194, "learning_rate": 9.169701383831027e-05, "loss": 1.66, "step": 188 }, { "epoch": 0.13072799584990488, "grad_norm": 0.5807337164878845, "learning_rate": 9.162418062636563e-05, "loss": 1.3098, "step": 189 }, { "epoch": 0.13141967836762924, "grad_norm": 0.8253926038742065, "learning_rate": 9.155134741442098e-05, "loss": 1.135, "step": 190 }, { "epoch": 0.13211136088535363, "grad_norm": 0.7517785429954529, "learning_rate": 9.147851420247633e-05, "loss": 1.6342, "step": 191 }, { "epoch": 0.132803043403078, "grad_norm": 0.791852593421936, "learning_rate": 9.140568099053169e-05, "loss": 1.3013, "step": 192 }, { "epoch": 0.13349472592080236, "grad_norm": 0.6530910730361938, "learning_rate": 9.133284777858704e-05, "loss": 2.4537, "step": 193 }, { "epoch": 0.13418640843852672, "grad_norm": 0.8071674704551697, "learning_rate": 9.12600145666424e-05, "loss": 1.1286, "step": 194 }, { "epoch": 0.13487809095625108, "grad_norm": 0.5800924301147461, "learning_rate": 9.118718135469774e-05, "loss": 1.913, "step": 195 }, { "epoch": 0.13556977347397545, "grad_norm": 0.6227284073829651, "learning_rate": 9.11143481427531e-05, "loss": 2.1569, "step": 196 }, { "epoch": 0.1362614559916998, "grad_norm": 0.7267847657203674, "learning_rate": 9.104151493080845e-05, "loss": 2.4129, "step": 197 }, { "epoch": 0.13695313850942417, "grad_norm": 0.6935005784034729, "learning_rate": 9.09686817188638e-05, "loss": 1.2993, "step": 198 }, { "epoch": 0.13764482102714853, "grad_norm": 0.5625126957893372, "learning_rate": 9.089584850691916e-05, "loss": 1.6137, "step": 199 }, { "epoch": 0.1383365035448729, "grad_norm": 0.6450534462928772, "learning_rate": 9.082301529497451e-05, "loss": 2.0705, "step": 200 }, { "epoch": 0.13902818606259726, "grad_norm": 0.7700170874595642, "learning_rate": 9.075018208302987e-05, "loss": 1.2858, "step": 201 }, { "epoch": 0.13971986858032162, "grad_norm": 0.4889370799064636, "learning_rate": 9.067734887108522e-05, "loss": 1.1423, "step": 202 }, { "epoch": 0.14041155109804598, "grad_norm": 0.5464507341384888, "learning_rate": 9.060451565914058e-05, "loss": 1.0928, "step": 203 }, { "epoch": 0.14110323361577037, "grad_norm": 0.7002052664756775, "learning_rate": 9.053168244719593e-05, "loss": 2.154, "step": 204 }, { "epoch": 0.14179491613349474, "grad_norm": 0.73245769739151, "learning_rate": 9.045884923525127e-05, "loss": 2.3437, "step": 205 }, { "epoch": 0.1424865986512191, "grad_norm": 0.6388504505157471, "learning_rate": 9.038601602330664e-05, "loss": 1.6209, "step": 206 }, { "epoch": 0.14317828116894346, "grad_norm": 0.5278307199478149, "learning_rate": 9.031318281136198e-05, "loss": 1.1904, "step": 207 }, { "epoch": 0.14386996368666782, "grad_norm": 0.6711840033531189, "learning_rate": 9.024034959941734e-05, "loss": 1.7129, "step": 208 }, { "epoch": 0.14456164620439219, "grad_norm": 0.6358078122138977, "learning_rate": 9.016751638747269e-05, "loss": 1.6655, "step": 209 }, { "epoch": 0.14525332872211655, "grad_norm": 0.6745946407318115, "learning_rate": 9.009468317552805e-05, "loss": 1.8217, "step": 210 }, { "epoch": 0.1459450112398409, "grad_norm": 2.227377414703369, "learning_rate": 9.00218499635834e-05, "loss": 3.1095, "step": 211 }, { "epoch": 0.14663669375756527, "grad_norm": 0.5836493968963623, "learning_rate": 8.994901675163875e-05, "loss": 1.9259, "step": 212 }, { "epoch": 0.14732837627528964, "grad_norm": 0.6573939323425293, "learning_rate": 8.987618353969411e-05, "loss": 1.9532, "step": 213 }, { "epoch": 0.148020058793014, "grad_norm": 0.8865386843681335, "learning_rate": 8.980335032774946e-05, "loss": 1.7339, "step": 214 }, { "epoch": 0.14871174131073836, "grad_norm": 0.7364823818206787, "learning_rate": 8.973051711580482e-05, "loss": 1.2418, "step": 215 }, { "epoch": 0.14940342382846272, "grad_norm": 0.7573681473731995, "learning_rate": 8.965768390386016e-05, "loss": 1.5159, "step": 216 }, { "epoch": 0.1500951063461871, "grad_norm": 0.6876881718635559, "learning_rate": 8.958485069191553e-05, "loss": 2.2091, "step": 217 }, { "epoch": 0.15078678886391148, "grad_norm": 0.6091864705085754, "learning_rate": 8.951201747997087e-05, "loss": 1.2208, "step": 218 }, { "epoch": 0.15147847138163584, "grad_norm": 0.7913519740104675, "learning_rate": 8.943918426802622e-05, "loss": 2.3047, "step": 219 }, { "epoch": 0.1521701538993602, "grad_norm": 1.3490211963653564, "learning_rate": 8.936635105608158e-05, "loss": 1.7839, "step": 220 }, { "epoch": 0.15286183641708456, "grad_norm": 0.8339107036590576, "learning_rate": 8.929351784413693e-05, "loss": 2.1117, "step": 221 }, { "epoch": 0.15355351893480892, "grad_norm": 1.5490026473999023, "learning_rate": 8.922068463219229e-05, "loss": 1.8909, "step": 222 }, { "epoch": 0.1542452014525333, "grad_norm": 0.7403117418289185, "learning_rate": 8.914785142024764e-05, "loss": 1.838, "step": 223 }, { "epoch": 0.15493688397025765, "grad_norm": 0.9296958446502686, "learning_rate": 8.907501820830299e-05, "loss": 1.8148, "step": 224 }, { "epoch": 0.155628566487982, "grad_norm": 0.6639947891235352, "learning_rate": 8.900218499635835e-05, "loss": 1.8576, "step": 225 }, { "epoch": 0.15632024900570637, "grad_norm": 0.7341317534446716, "learning_rate": 8.89293517844137e-05, "loss": 1.9323, "step": 226 }, { "epoch": 0.15701193152343074, "grad_norm": 0.5162379145622253, "learning_rate": 8.885651857246906e-05, "loss": 2.1562, "step": 227 }, { "epoch": 0.1577036140411551, "grad_norm": 0.6806768774986267, "learning_rate": 8.87836853605244e-05, "loss": 1.8393, "step": 228 }, { "epoch": 0.15839529655887946, "grad_norm": 0.5701708197593689, "learning_rate": 8.871085214857976e-05, "loss": 1.4365, "step": 229 }, { "epoch": 0.15908697907660385, "grad_norm": 0.7863550186157227, "learning_rate": 8.863801893663511e-05, "loss": 2.071, "step": 230 }, { "epoch": 0.15977866159432821, "grad_norm": 0.6164423823356628, "learning_rate": 8.856518572469046e-05, "loss": 2.0573, "step": 231 }, { "epoch": 0.16047034411205258, "grad_norm": 0.8759453892707825, "learning_rate": 8.849235251274582e-05, "loss": 1.8708, "step": 232 }, { "epoch": 0.16116202662977694, "grad_norm": 1.3226007223129272, "learning_rate": 8.841951930080117e-05, "loss": 1.4042, "step": 233 }, { "epoch": 0.1618537091475013, "grad_norm": 0.5054848790168762, "learning_rate": 8.834668608885653e-05, "loss": 1.2018, "step": 234 }, { "epoch": 0.16254539166522566, "grad_norm": 0.6680052876472473, "learning_rate": 8.827385287691188e-05, "loss": 2.1112, "step": 235 }, { "epoch": 0.16323707418295003, "grad_norm": 0.838471531867981, "learning_rate": 8.820101966496724e-05, "loss": 2.3499, "step": 236 }, { "epoch": 0.1639287567006744, "grad_norm": 0.6450280547142029, "learning_rate": 8.812818645302258e-05, "loss": 1.5009, "step": 237 }, { "epoch": 0.16462043921839875, "grad_norm": 0.5540733933448792, "learning_rate": 8.805535324107793e-05, "loss": 1.7475, "step": 238 }, { "epoch": 0.1653121217361231, "grad_norm": 0.6765146255493164, "learning_rate": 8.798252002913329e-05, "loss": 2.0798, "step": 239 }, { "epoch": 0.16600380425384748, "grad_norm": 0.6925728917121887, "learning_rate": 8.790968681718864e-05, "loss": 1.9599, "step": 240 }, { "epoch": 0.16669548677157184, "grad_norm": 0.787634015083313, "learning_rate": 8.7836853605244e-05, "loss": 1.5637, "step": 241 }, { "epoch": 0.1673871692892962, "grad_norm": 0.7191415429115295, "learning_rate": 8.776402039329935e-05, "loss": 2.1707, "step": 242 }, { "epoch": 0.1680788518070206, "grad_norm": 0.7958317995071411, "learning_rate": 8.769118718135471e-05, "loss": 2.147, "step": 243 }, { "epoch": 0.16877053432474495, "grad_norm": 0.6804454922676086, "learning_rate": 8.761835396941004e-05, "loss": 1.9832, "step": 244 }, { "epoch": 0.16946221684246932, "grad_norm": 0.8284922242164612, "learning_rate": 8.75455207574654e-05, "loss": 1.2555, "step": 245 }, { "epoch": 0.17015389936019368, "grad_norm": 0.7180774211883545, "learning_rate": 8.747268754552075e-05, "loss": 1.8228, "step": 246 }, { "epoch": 0.17084558187791804, "grad_norm": 0.8632107377052307, "learning_rate": 8.739985433357611e-05, "loss": 1.1625, "step": 247 }, { "epoch": 0.1715372643956424, "grad_norm": 0.5004351139068604, "learning_rate": 8.732702112163147e-05, "loss": 1.6948, "step": 248 }, { "epoch": 0.17222894691336676, "grad_norm": 0.6219859719276428, "learning_rate": 8.725418790968682e-05, "loss": 1.6748, "step": 249 }, { "epoch": 0.17292062943109113, "grad_norm": 0.8690136075019836, "learning_rate": 8.718135469774218e-05, "loss": 1.7746, "step": 250 }, { "epoch": 0.1736123119488155, "grad_norm": 0.6689561605453491, "learning_rate": 8.710852148579752e-05, "loss": 1.1883, "step": 251 }, { "epoch": 0.17430399446653985, "grad_norm": 1.0592402219772339, "learning_rate": 8.703568827385288e-05, "loss": 1.6322, "step": 252 }, { "epoch": 0.17499567698426421, "grad_norm": 0.721960723400116, "learning_rate": 8.696285506190823e-05, "loss": 1.8509, "step": 253 }, { "epoch": 0.17568735950198858, "grad_norm": 0.7862136960029602, "learning_rate": 8.689002184996359e-05, "loss": 1.5465, "step": 254 }, { "epoch": 0.17637904201971294, "grad_norm": 0.62255859375, "learning_rate": 8.681718863801895e-05, "loss": 1.7961, "step": 255 }, { "epoch": 0.17707072453743733, "grad_norm": 0.8009974956512451, "learning_rate": 8.67443554260743e-05, "loss": 1.7475, "step": 256 }, { "epoch": 0.1777624070551617, "grad_norm": 0.6300957202911377, "learning_rate": 8.667152221412966e-05, "loss": 1.8628, "step": 257 }, { "epoch": 0.17845408957288605, "grad_norm": 0.5678866505622864, "learning_rate": 8.659868900218499e-05, "loss": 1.2443, "step": 258 }, { "epoch": 0.17914577209061042, "grad_norm": 0.6319582462310791, "learning_rate": 8.652585579024035e-05, "loss": 1.3356, "step": 259 }, { "epoch": 0.17983745460833478, "grad_norm": 0.5843179821968079, "learning_rate": 8.64530225782957e-05, "loss": 1.9961, "step": 260 }, { "epoch": 0.18052913712605914, "grad_norm": 0.6211555600166321, "learning_rate": 8.638018936635106e-05, "loss": 1.1243, "step": 261 }, { "epoch": 0.1812208196437835, "grad_norm": 0.7266420722007751, "learning_rate": 8.630735615440642e-05, "loss": 2.2338, "step": 262 }, { "epoch": 0.18191250216150787, "grad_norm": 0.8187587857246399, "learning_rate": 8.623452294246177e-05, "loss": 2.0237, "step": 263 }, { "epoch": 0.18260418467923223, "grad_norm": 0.9444418549537659, "learning_rate": 8.616168973051712e-05, "loss": 1.6278, "step": 264 }, { "epoch": 0.1832958671969566, "grad_norm": 0.5889193415641785, "learning_rate": 8.608885651857246e-05, "loss": 2.1453, "step": 265 }, { "epoch": 0.18398754971468095, "grad_norm": 0.6819151639938354, "learning_rate": 8.601602330662782e-05, "loss": 1.2219, "step": 266 }, { "epoch": 0.18467923223240532, "grad_norm": 0.8448613286018372, "learning_rate": 8.594319009468317e-05, "loss": 2.1193, "step": 267 }, { "epoch": 0.18537091475012968, "grad_norm": 0.7247136235237122, "learning_rate": 8.587035688273853e-05, "loss": 1.6155, "step": 268 }, { "epoch": 0.18606259726785407, "grad_norm": 0.7371733784675598, "learning_rate": 8.57975236707939e-05, "loss": 1.5466, "step": 269 }, { "epoch": 0.18675427978557843, "grad_norm": 0.6676841974258423, "learning_rate": 8.572469045884924e-05, "loss": 2.1909, "step": 270 }, { "epoch": 0.1874459623033028, "grad_norm": 0.7091754078865051, "learning_rate": 8.565185724690459e-05, "loss": 2.3795, "step": 271 }, { "epoch": 0.18813764482102716, "grad_norm": 0.7999275922775269, "learning_rate": 8.557902403495994e-05, "loss": 2.0659, "step": 272 }, { "epoch": 0.18882932733875152, "grad_norm": 0.5176097750663757, "learning_rate": 8.55061908230153e-05, "loss": 1.4036, "step": 273 }, { "epoch": 0.18952100985647588, "grad_norm": 0.6579016447067261, "learning_rate": 8.543335761107065e-05, "loss": 1.9072, "step": 274 }, { "epoch": 0.19021269237420024, "grad_norm": 1.0604159832000732, "learning_rate": 8.536052439912601e-05, "loss": 2.1682, "step": 275 }, { "epoch": 0.1909043748919246, "grad_norm": 0.7157823443412781, "learning_rate": 8.528769118718137e-05, "loss": 1.9188, "step": 276 }, { "epoch": 0.19159605740964897, "grad_norm": 0.691941499710083, "learning_rate": 8.521485797523672e-05, "loss": 1.8626, "step": 277 }, { "epoch": 0.19228773992737333, "grad_norm": 0.5831330418586731, "learning_rate": 8.514202476329206e-05, "loss": 1.9975, "step": 278 }, { "epoch": 0.1929794224450977, "grad_norm": 0.6630810499191284, "learning_rate": 8.506919155134741e-05, "loss": 2.0809, "step": 279 }, { "epoch": 0.19367110496282205, "grad_norm": 0.601836621761322, "learning_rate": 8.499635833940277e-05, "loss": 1.2491, "step": 280 }, { "epoch": 0.19436278748054642, "grad_norm": 0.7532071471214294, "learning_rate": 8.492352512745812e-05, "loss": 2.0672, "step": 281 }, { "epoch": 0.1950544699982708, "grad_norm": 0.70450758934021, "learning_rate": 8.485069191551348e-05, "loss": 2.1466, "step": 282 }, { "epoch": 0.19574615251599517, "grad_norm": 0.6915740966796875, "learning_rate": 8.477785870356884e-05, "loss": 1.4086, "step": 283 }, { "epoch": 0.19643783503371953, "grad_norm": 0.949400007724762, "learning_rate": 8.470502549162418e-05, "loss": 1.6344, "step": 284 }, { "epoch": 0.1971295175514439, "grad_norm": 0.7051184177398682, "learning_rate": 8.463219227967954e-05, "loss": 1.4375, "step": 285 }, { "epoch": 0.19782120006916826, "grad_norm": 0.5827087163925171, "learning_rate": 8.455935906773488e-05, "loss": 1.4801, "step": 286 }, { "epoch": 0.19851288258689262, "grad_norm": 0.6058622002601624, "learning_rate": 8.448652585579024e-05, "loss": 1.4378, "step": 287 }, { "epoch": 0.19920456510461698, "grad_norm": 0.7779908180236816, "learning_rate": 8.441369264384559e-05, "loss": 1.6377, "step": 288 }, { "epoch": 0.19989624762234134, "grad_norm": 0.6972940564155579, "learning_rate": 8.434085943190095e-05, "loss": 1.9437, "step": 289 }, { "epoch": 0.2005879301400657, "grad_norm": 0.7289730906486511, "learning_rate": 8.426802621995631e-05, "loss": 1.5402, "step": 290 }, { "epoch": 0.20127961265779007, "grad_norm": 0.6566206216812134, "learning_rate": 8.419519300801165e-05, "loss": 1.4304, "step": 291 }, { "epoch": 0.20197129517551443, "grad_norm": 0.97121262550354, "learning_rate": 8.412235979606701e-05, "loss": 1.7801, "step": 292 }, { "epoch": 0.2026629776932388, "grad_norm": 0.7539506554603577, "learning_rate": 8.404952658412236e-05, "loss": 1.5326, "step": 293 }, { "epoch": 0.20335466021096316, "grad_norm": 0.8037034869194031, "learning_rate": 8.397669337217772e-05, "loss": 1.5559, "step": 294 }, { "epoch": 0.20404634272868752, "grad_norm": 1.2110254764556885, "learning_rate": 8.390386016023307e-05, "loss": 1.7529, "step": 295 }, { "epoch": 0.2047380252464119, "grad_norm": 0.9396728873252869, "learning_rate": 8.383102694828843e-05, "loss": 2.0101, "step": 296 }, { "epoch": 0.20542970776413627, "grad_norm": 0.7678546905517578, "learning_rate": 8.375819373634379e-05, "loss": 2.0231, "step": 297 }, { "epoch": 0.20612139028186063, "grad_norm": 1.1070085763931274, "learning_rate": 8.368536052439912e-05, "loss": 1.7999, "step": 298 }, { "epoch": 0.206813072799585, "grad_norm": 0.7589883804321289, "learning_rate": 8.361252731245448e-05, "loss": 1.6723, "step": 299 }, { "epoch": 0.20750475531730936, "grad_norm": 0.6468162536621094, "learning_rate": 8.353969410050983e-05, "loss": 1.3266, "step": 300 }, { "epoch": 0.20819643783503372, "grad_norm": 0.7720943689346313, "learning_rate": 8.346686088856519e-05, "loss": 1.5897, "step": 301 }, { "epoch": 0.20888812035275808, "grad_norm": 0.7349941730499268, "learning_rate": 8.339402767662054e-05, "loss": 1.9242, "step": 302 }, { "epoch": 0.20957980287048245, "grad_norm": 0.7902266979217529, "learning_rate": 8.33211944646759e-05, "loss": 2.2367, "step": 303 }, { "epoch": 0.2102714853882068, "grad_norm": 0.822661280632019, "learning_rate": 8.324836125273125e-05, "loss": 2.02, "step": 304 }, { "epoch": 0.21096316790593117, "grad_norm": 0.6001129746437073, "learning_rate": 8.31755280407866e-05, "loss": 1.4506, "step": 305 }, { "epoch": 0.21165485042365553, "grad_norm": 0.7674906253814697, "learning_rate": 8.310269482884196e-05, "loss": 1.9657, "step": 306 }, { "epoch": 0.2123465329413799, "grad_norm": 0.5808192491531372, "learning_rate": 8.30298616168973e-05, "loss": 1.8088, "step": 307 }, { "epoch": 0.21303821545910426, "grad_norm": 0.7100119590759277, "learning_rate": 8.295702840495266e-05, "loss": 1.9353, "step": 308 }, { "epoch": 0.21372989797682865, "grad_norm": 0.7359102964401245, "learning_rate": 8.288419519300801e-05, "loss": 1.7844, "step": 309 }, { "epoch": 0.214421580494553, "grad_norm": 0.7489526867866516, "learning_rate": 8.281136198106337e-05, "loss": 1.6238, "step": 310 }, { "epoch": 0.21511326301227737, "grad_norm": 0.8063543438911438, "learning_rate": 8.273852876911872e-05, "loss": 1.6634, "step": 311 }, { "epoch": 0.21580494553000173, "grad_norm": 0.7559211850166321, "learning_rate": 8.266569555717407e-05, "loss": 1.9663, "step": 312 }, { "epoch": 0.2164966280477261, "grad_norm": 0.7710449695587158, "learning_rate": 8.259286234522943e-05, "loss": 2.0552, "step": 313 }, { "epoch": 0.21718831056545046, "grad_norm": 1.0111525058746338, "learning_rate": 8.252002913328478e-05, "loss": 1.668, "step": 314 }, { "epoch": 0.21787999308317482, "grad_norm": 0.7205057144165039, "learning_rate": 8.244719592134014e-05, "loss": 2.1546, "step": 315 }, { "epoch": 0.21857167560089918, "grad_norm": 0.9559133052825928, "learning_rate": 8.237436270939549e-05, "loss": 1.6799, "step": 316 }, { "epoch": 0.21926335811862355, "grad_norm": 0.7418698668479919, "learning_rate": 8.230152949745085e-05, "loss": 1.2123, "step": 317 }, { "epoch": 0.2199550406363479, "grad_norm": 0.7784197330474854, "learning_rate": 8.22286962855062e-05, "loss": 1.1066, "step": 318 }, { "epoch": 0.22064672315407227, "grad_norm": 0.754654586315155, "learning_rate": 8.215586307356154e-05, "loss": 2.0185, "step": 319 }, { "epoch": 0.22133840567179663, "grad_norm": 0.6510013341903687, "learning_rate": 8.20830298616169e-05, "loss": 1.8316, "step": 320 }, { "epoch": 0.222030088189521, "grad_norm": 1.541455626487732, "learning_rate": 8.201019664967225e-05, "loss": 1.3254, "step": 321 }, { "epoch": 0.2227217707072454, "grad_norm": 0.8958970308303833, "learning_rate": 8.193736343772761e-05, "loss": 1.9616, "step": 322 }, { "epoch": 0.22341345322496975, "grad_norm": 0.6273507475852966, "learning_rate": 8.186453022578296e-05, "loss": 1.13, "step": 323 }, { "epoch": 0.2241051357426941, "grad_norm": 0.7866650223731995, "learning_rate": 8.17916970138383e-05, "loss": 1.784, "step": 324 }, { "epoch": 0.22479681826041847, "grad_norm": 0.9650758504867554, "learning_rate": 8.171886380189367e-05, "loss": 0.9438, "step": 325 }, { "epoch": 0.22548850077814284, "grad_norm": 0.7491368055343628, "learning_rate": 8.164603058994901e-05, "loss": 1.1472, "step": 326 }, { "epoch": 0.2261801832958672, "grad_norm": 0.7516188025474548, "learning_rate": 8.157319737800438e-05, "loss": 1.841, "step": 327 }, { "epoch": 0.22687186581359156, "grad_norm": 0.8555276989936829, "learning_rate": 8.150036416605972e-05, "loss": 2.0819, "step": 328 }, { "epoch": 0.22756354833131592, "grad_norm": 0.8025880455970764, "learning_rate": 8.142753095411508e-05, "loss": 1.871, "step": 329 }, { "epoch": 0.22825523084904029, "grad_norm": 0.8006131052970886, "learning_rate": 8.135469774217043e-05, "loss": 1.8293, "step": 330 }, { "epoch": 0.22894691336676465, "grad_norm": 0.6898099184036255, "learning_rate": 8.128186453022578e-05, "loss": 1.9701, "step": 331 }, { "epoch": 0.229638595884489, "grad_norm": 0.7827674150466919, "learning_rate": 8.120903131828114e-05, "loss": 1.9634, "step": 332 }, { "epoch": 0.23033027840221337, "grad_norm": 0.7763857245445251, "learning_rate": 8.113619810633649e-05, "loss": 1.4908, "step": 333 }, { "epoch": 0.23102196091993774, "grad_norm": 0.7040197253227234, "learning_rate": 8.106336489439185e-05, "loss": 1.9854, "step": 334 }, { "epoch": 0.23171364343766213, "grad_norm": 1.0337374210357666, "learning_rate": 8.09905316824472e-05, "loss": 2.1542, "step": 335 }, { "epoch": 0.2324053259553865, "grad_norm": 0.7228108048439026, "learning_rate": 8.091769847050256e-05, "loss": 1.9267, "step": 336 }, { "epoch": 0.23309700847311085, "grad_norm": 0.6841875910758972, "learning_rate": 8.08448652585579e-05, "loss": 1.769, "step": 337 }, { "epoch": 0.2337886909908352, "grad_norm": 0.6749361753463745, "learning_rate": 8.077203204661325e-05, "loss": 1.7758, "step": 338 }, { "epoch": 0.23448037350855958, "grad_norm": 0.6411470174789429, "learning_rate": 8.069919883466861e-05, "loss": 1.2938, "step": 339 }, { "epoch": 0.23517205602628394, "grad_norm": 0.6457031965255737, "learning_rate": 8.062636562272396e-05, "loss": 1.3416, "step": 340 }, { "epoch": 0.2358637385440083, "grad_norm": 0.7273378372192383, "learning_rate": 8.055353241077932e-05, "loss": 1.5468, "step": 341 }, { "epoch": 0.23655542106173266, "grad_norm": 0.7910048365592957, "learning_rate": 8.048069919883467e-05, "loss": 1.8577, "step": 342 }, { "epoch": 0.23724710357945702, "grad_norm": 0.7172124981880188, "learning_rate": 8.040786598689003e-05, "loss": 2.2877, "step": 343 }, { "epoch": 0.2379387860971814, "grad_norm": 0.6511913537979126, "learning_rate": 8.033503277494538e-05, "loss": 1.0914, "step": 344 }, { "epoch": 0.23863046861490575, "grad_norm": 0.6148791909217834, "learning_rate": 8.026219956300073e-05, "loss": 1.3388, "step": 345 }, { "epoch": 0.2393221511326301, "grad_norm": 0.7384858131408691, "learning_rate": 8.018936635105609e-05, "loss": 1.8186, "step": 346 }, { "epoch": 0.24001383365035447, "grad_norm": 0.793538510799408, "learning_rate": 8.011653313911143e-05, "loss": 1.5962, "step": 347 }, { "epoch": 0.24070551616807886, "grad_norm": 0.6316525936126709, "learning_rate": 8.00436999271668e-05, "loss": 1.4712, "step": 348 }, { "epoch": 0.24139719868580323, "grad_norm": 0.7671139240264893, "learning_rate": 7.997086671522214e-05, "loss": 2.2086, "step": 349 }, { "epoch": 0.2420888812035276, "grad_norm": 0.7423043251037598, "learning_rate": 7.98980335032775e-05, "loss": 2.1409, "step": 350 }, { "epoch": 0.24278056372125195, "grad_norm": 0.834575891494751, "learning_rate": 7.982520029133285e-05, "loss": 1.5166, "step": 351 }, { "epoch": 0.24347224623897631, "grad_norm": 1.0157177448272705, "learning_rate": 7.97523670793882e-05, "loss": 1.7054, "step": 352 }, { "epoch": 0.24416392875670068, "grad_norm": 0.8765892386436462, "learning_rate": 7.967953386744356e-05, "loss": 2.4397, "step": 353 }, { "epoch": 0.24485561127442504, "grad_norm": 0.6456695795059204, "learning_rate": 7.960670065549891e-05, "loss": 1.2428, "step": 354 }, { "epoch": 0.2455472937921494, "grad_norm": 0.813552737236023, "learning_rate": 7.953386744355427e-05, "loss": 1.7926, "step": 355 }, { "epoch": 0.24623897630987376, "grad_norm": 0.6702203750610352, "learning_rate": 7.946103423160962e-05, "loss": 1.6654, "step": 356 }, { "epoch": 0.24693065882759813, "grad_norm": 0.9113941788673401, "learning_rate": 7.938820101966498e-05, "loss": 2.1639, "step": 357 }, { "epoch": 0.2476223413453225, "grad_norm": 0.6006965041160583, "learning_rate": 7.931536780772032e-05, "loss": 1.815, "step": 358 }, { "epoch": 0.24831402386304685, "grad_norm": 0.9116283059120178, "learning_rate": 7.924253459577567e-05, "loss": 1.5244, "step": 359 }, { "epoch": 0.2490057063807712, "grad_norm": 0.8563610315322876, "learning_rate": 7.916970138383103e-05, "loss": 2.1287, "step": 360 }, { "epoch": 0.2496973888984956, "grad_norm": 0.7254488468170166, "learning_rate": 7.909686817188638e-05, "loss": 1.1997, "step": 361 }, { "epoch": 0.25038907141621997, "grad_norm": 0.8156582117080688, "learning_rate": 7.902403495994174e-05, "loss": 2.1275, "step": 362 }, { "epoch": 0.2510807539339443, "grad_norm": 0.7504958510398865, "learning_rate": 7.895120174799709e-05, "loss": 2.1381, "step": 363 }, { "epoch": 0.2517724364516687, "grad_norm": 0.6650155782699585, "learning_rate": 7.887836853605244e-05, "loss": 0.9823, "step": 364 }, { "epoch": 0.252464118969393, "grad_norm": 0.6706410646438599, "learning_rate": 7.88055353241078e-05, "loss": 1.1854, "step": 365 }, { "epoch": 0.2531558014871174, "grad_norm": 1.0841103792190552, "learning_rate": 7.873270211216315e-05, "loss": 1.4933, "step": 366 }, { "epoch": 0.25384748400484175, "grad_norm": 0.7275784611701965, "learning_rate": 7.86598689002185e-05, "loss": 2.162, "step": 367 }, { "epoch": 0.25453916652256614, "grad_norm": 0.994987964630127, "learning_rate": 7.858703568827385e-05, "loss": 1.6992, "step": 368 }, { "epoch": 0.25523084904029053, "grad_norm": 1.160749912261963, "learning_rate": 7.851420247632922e-05, "loss": 1.2521, "step": 369 }, { "epoch": 0.25592253155801487, "grad_norm": 0.8106790781021118, "learning_rate": 7.844136926438456e-05, "loss": 1.5949, "step": 370 }, { "epoch": 0.25661421407573926, "grad_norm": 0.5805374383926392, "learning_rate": 7.836853605243991e-05, "loss": 1.713, "step": 371 }, { "epoch": 0.2573058965934636, "grad_norm": 0.6805906295776367, "learning_rate": 7.829570284049527e-05, "loss": 1.7713, "step": 372 }, { "epoch": 0.257997579111188, "grad_norm": 0.6964860558509827, "learning_rate": 7.822286962855062e-05, "loss": 1.8013, "step": 373 }, { "epoch": 0.2586892616289123, "grad_norm": 1.2079120874404907, "learning_rate": 7.815003641660598e-05, "loss": 2.0171, "step": 374 }, { "epoch": 0.2593809441466367, "grad_norm": 0.689630925655365, "learning_rate": 7.807720320466133e-05, "loss": 1.8816, "step": 375 }, { "epoch": 0.26007262666436104, "grad_norm": 0.9079475402832031, "learning_rate": 7.800436999271669e-05, "loss": 1.5992, "step": 376 }, { "epoch": 0.26076430918208543, "grad_norm": 0.4846475422382355, "learning_rate": 7.793153678077204e-05, "loss": 0.9038, "step": 377 }, { "epoch": 0.26145599169980976, "grad_norm": 0.6311324238777161, "learning_rate": 7.785870356882738e-05, "loss": 1.757, "step": 378 }, { "epoch": 0.26214767421753415, "grad_norm": 0.6497640609741211, "learning_rate": 7.778587035688274e-05, "loss": 1.799, "step": 379 }, { "epoch": 0.2628393567352585, "grad_norm": 0.6967574954032898, "learning_rate": 7.771303714493809e-05, "loss": 2.0541, "step": 380 }, { "epoch": 0.2635310392529829, "grad_norm": 0.8538269400596619, "learning_rate": 7.764020393299345e-05, "loss": 1.4781, "step": 381 }, { "epoch": 0.26422272177070727, "grad_norm": 0.747353196144104, "learning_rate": 7.75673707210488e-05, "loss": 1.063, "step": 382 }, { "epoch": 0.2649144042884316, "grad_norm": 0.7110087871551514, "learning_rate": 7.749453750910416e-05, "loss": 1.6301, "step": 383 }, { "epoch": 0.265606086806156, "grad_norm": 0.8608129620552063, "learning_rate": 7.742170429715951e-05, "loss": 1.7459, "step": 384 }, { "epoch": 0.26629776932388033, "grad_norm": 0.903325617313385, "learning_rate": 7.734887108521486e-05, "loss": 1.0401, "step": 385 }, { "epoch": 0.2669894518416047, "grad_norm": 0.6887355446815491, "learning_rate": 7.727603787327022e-05, "loss": 1.6558, "step": 386 }, { "epoch": 0.26768113435932905, "grad_norm": 0.8748055100440979, "learning_rate": 7.720320466132557e-05, "loss": 2.2414, "step": 387 }, { "epoch": 0.26837281687705344, "grad_norm": 0.7315294146537781, "learning_rate": 7.713037144938093e-05, "loss": 1.7651, "step": 388 }, { "epoch": 0.2690644993947778, "grad_norm": 0.7685129642486572, "learning_rate": 7.705753823743627e-05, "loss": 1.64, "step": 389 }, { "epoch": 0.26975618191250217, "grad_norm": 0.7387799620628357, "learning_rate": 7.698470502549163e-05, "loss": 1.655, "step": 390 }, { "epoch": 0.2704478644302265, "grad_norm": 0.6962683200836182, "learning_rate": 7.691187181354698e-05, "loss": 2.0759, "step": 391 }, { "epoch": 0.2711395469479509, "grad_norm": 0.6928436160087585, "learning_rate": 7.683903860160233e-05, "loss": 1.902, "step": 392 }, { "epoch": 0.27183122946567523, "grad_norm": 0.6286187767982483, "learning_rate": 7.676620538965769e-05, "loss": 1.1205, "step": 393 }, { "epoch": 0.2725229119833996, "grad_norm": 0.8087384104728699, "learning_rate": 7.669337217771304e-05, "loss": 1.5928, "step": 394 }, { "epoch": 0.273214594501124, "grad_norm": 0.6950211524963379, "learning_rate": 7.66205389657684e-05, "loss": 1.9022, "step": 395 }, { "epoch": 0.27390627701884834, "grad_norm": 0.716234028339386, "learning_rate": 7.654770575382375e-05, "loss": 1.9103, "step": 396 }, { "epoch": 0.27459795953657273, "grad_norm": 1.0911515951156616, "learning_rate": 7.647487254187911e-05, "loss": 2.1727, "step": 397 }, { "epoch": 0.27528964205429707, "grad_norm": 1.0012110471725464, "learning_rate": 7.640203932993446e-05, "loss": 2.0671, "step": 398 }, { "epoch": 0.27598132457202146, "grad_norm": 0.7271398305892944, "learning_rate": 7.63292061179898e-05, "loss": 1.8487, "step": 399 }, { "epoch": 0.2766730070897458, "grad_norm": 0.9112443923950195, "learning_rate": 7.625637290604516e-05, "loss": 1.6583, "step": 400 }, { "epoch": 0.2773646896074702, "grad_norm": 0.8109308481216431, "learning_rate": 7.618353969410051e-05, "loss": 1.6202, "step": 401 }, { "epoch": 0.2780563721251945, "grad_norm": 0.7934839129447937, "learning_rate": 7.611070648215587e-05, "loss": 2.2605, "step": 402 }, { "epoch": 0.2787480546429189, "grad_norm": 0.7166281342506409, "learning_rate": 7.603787327021122e-05, "loss": 1.5321, "step": 403 }, { "epoch": 0.27943973716064324, "grad_norm": 0.6386390924453735, "learning_rate": 7.596504005826657e-05, "loss": 0.9431, "step": 404 }, { "epoch": 0.28013141967836763, "grad_norm": 1.2669974565505981, "learning_rate": 7.589220684632193e-05, "loss": 0.7634, "step": 405 }, { "epoch": 0.28082310219609197, "grad_norm": 0.7032380700111389, "learning_rate": 7.581937363437728e-05, "loss": 1.9926, "step": 406 }, { "epoch": 0.28151478471381636, "grad_norm": 0.648324191570282, "learning_rate": 7.574654042243264e-05, "loss": 1.2027, "step": 407 }, { "epoch": 0.28220646723154075, "grad_norm": 0.7746976613998413, "learning_rate": 7.567370721048798e-05, "loss": 1.3615, "step": 408 }, { "epoch": 0.2828981497492651, "grad_norm": 0.8174494504928589, "learning_rate": 7.560087399854335e-05, "loss": 1.9945, "step": 409 }, { "epoch": 0.28358983226698947, "grad_norm": 0.9701315760612488, "learning_rate": 7.55280407865987e-05, "loss": 1.4894, "step": 410 }, { "epoch": 0.2842815147847138, "grad_norm": 0.9906868934631348, "learning_rate": 7.545520757465404e-05, "loss": 1.7937, "step": 411 }, { "epoch": 0.2849731973024382, "grad_norm": 0.6380108594894409, "learning_rate": 7.53823743627094e-05, "loss": 1.6794, "step": 412 }, { "epoch": 0.28566487982016253, "grad_norm": 0.6367055177688599, "learning_rate": 7.530954115076475e-05, "loss": 1.4019, "step": 413 }, { "epoch": 0.2863565623378869, "grad_norm": 0.9779865741729736, "learning_rate": 7.523670793882011e-05, "loss": 1.8048, "step": 414 }, { "epoch": 0.28704824485561126, "grad_norm": 0.6246545910835266, "learning_rate": 7.516387472687546e-05, "loss": 1.7138, "step": 415 }, { "epoch": 0.28773992737333565, "grad_norm": 0.7418220639228821, "learning_rate": 7.509104151493082e-05, "loss": 1.9917, "step": 416 }, { "epoch": 0.28843160989106, "grad_norm": 0.7883859276771545, "learning_rate": 7.501820830298617e-05, "loss": 1.2134, "step": 417 }, { "epoch": 0.28912329240878437, "grad_norm": 0.8116041421890259, "learning_rate": 7.494537509104151e-05, "loss": 1.1634, "step": 418 }, { "epoch": 0.2898149749265087, "grad_norm": 0.7708969712257385, "learning_rate": 7.487254187909688e-05, "loss": 1.8895, "step": 419 }, { "epoch": 0.2905066574442331, "grad_norm": 1.3700909614562988, "learning_rate": 7.479970866715222e-05, "loss": 1.1294, "step": 420 }, { "epoch": 0.2911983399619575, "grad_norm": 0.8107597827911377, "learning_rate": 7.472687545520758e-05, "loss": 1.0097, "step": 421 }, { "epoch": 0.2918900224796818, "grad_norm": 0.8277159929275513, "learning_rate": 7.465404224326293e-05, "loss": 1.0633, "step": 422 }, { "epoch": 0.2925817049974062, "grad_norm": 0.75505131483078, "learning_rate": 7.458120903131829e-05, "loss": 1.4971, "step": 423 }, { "epoch": 0.29327338751513055, "grad_norm": 0.7761143445968628, "learning_rate": 7.450837581937363e-05, "loss": 1.569, "step": 424 }, { "epoch": 0.29396507003285494, "grad_norm": 0.8429370522499084, "learning_rate": 7.443554260742899e-05, "loss": 2.147, "step": 425 }, { "epoch": 0.29465675255057927, "grad_norm": 0.7506592273712158, "learning_rate": 7.436270939548435e-05, "loss": 2.1514, "step": 426 }, { "epoch": 0.29534843506830366, "grad_norm": 0.6654348969459534, "learning_rate": 7.42898761835397e-05, "loss": 1.7697, "step": 427 }, { "epoch": 0.296040117586028, "grad_norm": 1.179040551185608, "learning_rate": 7.421704297159506e-05, "loss": 1.6906, "step": 428 }, { "epoch": 0.2967318001037524, "grad_norm": 0.6145846247673035, "learning_rate": 7.41442097596504e-05, "loss": 1.8308, "step": 429 }, { "epoch": 0.2974234826214767, "grad_norm": 0.7995002269744873, "learning_rate": 7.407137654770577e-05, "loss": 1.7942, "step": 430 }, { "epoch": 0.2981151651392011, "grad_norm": 0.707095742225647, "learning_rate": 7.39985433357611e-05, "loss": 1.6325, "step": 431 }, { "epoch": 0.29880684765692545, "grad_norm": 0.5955487489700317, "learning_rate": 7.392571012381646e-05, "loss": 1.3466, "step": 432 }, { "epoch": 0.29949853017464984, "grad_norm": 0.9258183240890503, "learning_rate": 7.385287691187182e-05, "loss": 1.8544, "step": 433 }, { "epoch": 0.3001902126923742, "grad_norm": 0.8435434103012085, "learning_rate": 7.378004369992717e-05, "loss": 1.8884, "step": 434 }, { "epoch": 0.30088189521009856, "grad_norm": 0.7008845210075378, "learning_rate": 7.370721048798253e-05, "loss": 1.8409, "step": 435 }, { "epoch": 0.30157357772782295, "grad_norm": 0.7846685647964478, "learning_rate": 7.363437727603788e-05, "loss": 1.8821, "step": 436 }, { "epoch": 0.3022652602455473, "grad_norm": 0.7767228484153748, "learning_rate": 7.356154406409324e-05, "loss": 2.0034, "step": 437 }, { "epoch": 0.3029569427632717, "grad_norm": 0.787028431892395, "learning_rate": 7.348871085214857e-05, "loss": 1.8664, "step": 438 }, { "epoch": 0.303648625280996, "grad_norm": 0.6583303809165955, "learning_rate": 7.341587764020393e-05, "loss": 1.1991, "step": 439 }, { "epoch": 0.3043403077987204, "grad_norm": 0.8681322932243347, "learning_rate": 7.33430444282593e-05, "loss": 1.7124, "step": 440 }, { "epoch": 0.30503199031644473, "grad_norm": 0.9781901240348816, "learning_rate": 7.327021121631464e-05, "loss": 2.0128, "step": 441 }, { "epoch": 0.3057236728341691, "grad_norm": 0.6712149381637573, "learning_rate": 7.319737800437e-05, "loss": 1.4835, "step": 442 }, { "epoch": 0.30641535535189346, "grad_norm": 0.7322747111320496, "learning_rate": 7.312454479242535e-05, "loss": 1.1381, "step": 443 }, { "epoch": 0.30710703786961785, "grad_norm": 0.8576450347900391, "learning_rate": 7.30517115804807e-05, "loss": 1.8142, "step": 444 }, { "epoch": 0.3077987203873422, "grad_norm": 0.6087522506713867, "learning_rate": 7.297887836853605e-05, "loss": 1.2711, "step": 445 }, { "epoch": 0.3084904029050666, "grad_norm": 0.8685612082481384, "learning_rate": 7.290604515659141e-05, "loss": 1.2365, "step": 446 }, { "epoch": 0.30918208542279096, "grad_norm": 0.7040896415710449, "learning_rate": 7.283321194464677e-05, "loss": 1.6385, "step": 447 }, { "epoch": 0.3098737679405153, "grad_norm": 0.943147599697113, "learning_rate": 7.276037873270212e-05, "loss": 1.4699, "step": 448 }, { "epoch": 0.3105654504582397, "grad_norm": 0.6641246676445007, "learning_rate": 7.268754552075748e-05, "loss": 1.6489, "step": 449 }, { "epoch": 0.311257132975964, "grad_norm": 0.7420269250869751, "learning_rate": 7.261471230881282e-05, "loss": 2.1525, "step": 450 }, { "epoch": 0.3119488154936884, "grad_norm": 0.7198522090911865, "learning_rate": 7.254187909686817e-05, "loss": 2.01, "step": 451 }, { "epoch": 0.31264049801141275, "grad_norm": 0.8209108114242554, "learning_rate": 7.246904588492352e-05, "loss": 1.5205, "step": 452 }, { "epoch": 0.31333218052913714, "grad_norm": 1.0207217931747437, "learning_rate": 7.239621267297888e-05, "loss": 1.589, "step": 453 }, { "epoch": 0.3140238630468615, "grad_norm": 0.7939510345458984, "learning_rate": 7.232337946103424e-05, "loss": 1.9908, "step": 454 }, { "epoch": 0.31471554556458586, "grad_norm": 0.6668927073478699, "learning_rate": 7.225054624908959e-05, "loss": 1.7476, "step": 455 }, { "epoch": 0.3154072280823102, "grad_norm": 0.8068589568138123, "learning_rate": 7.217771303714495e-05, "loss": 1.7593, "step": 456 }, { "epoch": 0.3160989106000346, "grad_norm": 0.6828214526176453, "learning_rate": 7.21048798252003e-05, "loss": 1.5636, "step": 457 }, { "epoch": 0.3167905931177589, "grad_norm": 0.7853332757949829, "learning_rate": 7.203204661325565e-05, "loss": 1.1448, "step": 458 }, { "epoch": 0.3174822756354833, "grad_norm": 1.0938345193862915, "learning_rate": 7.195921340131099e-05, "loss": 1.7145, "step": 459 }, { "epoch": 0.3181739581532077, "grad_norm": 0.7735335826873779, "learning_rate": 7.188638018936635e-05, "loss": 1.2118, "step": 460 }, { "epoch": 0.31886564067093204, "grad_norm": 0.6759296655654907, "learning_rate": 7.181354697742171e-05, "loss": 1.1214, "step": 461 }, { "epoch": 0.31955732318865643, "grad_norm": 0.8163008689880371, "learning_rate": 7.174071376547706e-05, "loss": 1.9071, "step": 462 }, { "epoch": 0.32024900570638076, "grad_norm": 0.8205277919769287, "learning_rate": 7.166788055353242e-05, "loss": 1.0775, "step": 463 }, { "epoch": 0.32094068822410515, "grad_norm": 0.712774395942688, "learning_rate": 7.159504734158776e-05, "loss": 1.8912, "step": 464 }, { "epoch": 0.3216323707418295, "grad_norm": 0.7423759698867798, "learning_rate": 7.152221412964312e-05, "loss": 2.0655, "step": 465 }, { "epoch": 0.3223240532595539, "grad_norm": 0.6948613524436951, "learning_rate": 7.144938091769847e-05, "loss": 2.1957, "step": 466 }, { "epoch": 0.3230157357772782, "grad_norm": 0.6164250373840332, "learning_rate": 7.137654770575383e-05, "loss": 0.9997, "step": 467 }, { "epoch": 0.3237074182950026, "grad_norm": 0.7659111022949219, "learning_rate": 7.130371449380919e-05, "loss": 1.4006, "step": 468 }, { "epoch": 0.32439910081272694, "grad_norm": 0.830442488193512, "learning_rate": 7.123088128186454e-05, "loss": 1.6562, "step": 469 }, { "epoch": 0.3250907833304513, "grad_norm": 0.649169921875, "learning_rate": 7.11580480699199e-05, "loss": 1.8275, "step": 470 }, { "epoch": 0.32578246584817566, "grad_norm": 0.7854671478271484, "learning_rate": 7.108521485797523e-05, "loss": 0.3147, "step": 471 }, { "epoch": 0.32647414836590005, "grad_norm": 1.0381858348846436, "learning_rate": 7.101238164603059e-05, "loss": 2.0842, "step": 472 }, { "epoch": 0.32716583088362444, "grad_norm": 0.7317079305648804, "learning_rate": 7.093954843408594e-05, "loss": 2.1326, "step": 473 }, { "epoch": 0.3278575134013488, "grad_norm": 0.73940110206604, "learning_rate": 7.08667152221413e-05, "loss": 1.9647, "step": 474 }, { "epoch": 0.32854919591907317, "grad_norm": 0.8855507969856262, "learning_rate": 7.079388201019666e-05, "loss": 1.9308, "step": 475 }, { "epoch": 0.3292408784367975, "grad_norm": 0.7795459628105164, "learning_rate": 7.072104879825201e-05, "loss": 1.9297, "step": 476 }, { "epoch": 0.3299325609545219, "grad_norm": 0.765084981918335, "learning_rate": 7.064821558630737e-05, "loss": 2.3063, "step": 477 }, { "epoch": 0.3306242434722462, "grad_norm": 0.7164610624313354, "learning_rate": 7.05753823743627e-05, "loss": 1.8509, "step": 478 }, { "epoch": 0.3313159259899706, "grad_norm": 0.6976935863494873, "learning_rate": 7.050254916241806e-05, "loss": 1.8351, "step": 479 }, { "epoch": 0.33200760850769495, "grad_norm": 1.1228684186935425, "learning_rate": 7.042971595047341e-05, "loss": 1.4274, "step": 480 }, { "epoch": 0.33269929102541934, "grad_norm": 0.7229692935943604, "learning_rate": 7.035688273852877e-05, "loss": 1.5104, "step": 481 }, { "epoch": 0.3333909735431437, "grad_norm": 0.7485585808753967, "learning_rate": 7.028404952658413e-05, "loss": 1.1417, "step": 482 }, { "epoch": 0.33408265606086807, "grad_norm": 1.7345014810562134, "learning_rate": 7.021121631463948e-05, "loss": 1.6614, "step": 483 }, { "epoch": 0.3347743385785924, "grad_norm": 0.7559076547622681, "learning_rate": 7.013838310269483e-05, "loss": 1.8508, "step": 484 }, { "epoch": 0.3354660210963168, "grad_norm": 1.2972337007522583, "learning_rate": 7.006554989075018e-05, "loss": 1.9322, "step": 485 }, { "epoch": 0.3361577036140412, "grad_norm": 0.9719124436378479, "learning_rate": 6.999271667880554e-05, "loss": 1.6138, "step": 486 }, { "epoch": 0.3368493861317655, "grad_norm": 0.7772285342216492, "learning_rate": 6.991988346686089e-05, "loss": 2.1904, "step": 487 }, { "epoch": 0.3375410686494899, "grad_norm": 0.8187295198440552, "learning_rate": 6.984705025491625e-05, "loss": 1.4893, "step": 488 }, { "epoch": 0.33823275116721424, "grad_norm": 0.8635819554328918, "learning_rate": 6.977421704297161e-05, "loss": 2.0885, "step": 489 }, { "epoch": 0.33892443368493863, "grad_norm": 1.2180774211883545, "learning_rate": 6.970138383102696e-05, "loss": 1.1084, "step": 490 }, { "epoch": 0.33961611620266297, "grad_norm": 0.8329484462738037, "learning_rate": 6.96285506190823e-05, "loss": 1.3746, "step": 491 }, { "epoch": 0.34030779872038736, "grad_norm": 0.7301535606384277, "learning_rate": 6.955571740713765e-05, "loss": 1.2109, "step": 492 }, { "epoch": 0.3409994812381117, "grad_norm": 0.740479052066803, "learning_rate": 6.948288419519301e-05, "loss": 1.9807, "step": 493 }, { "epoch": 0.3416911637558361, "grad_norm": 0.9610648155212402, "learning_rate": 6.941005098324836e-05, "loss": 1.5924, "step": 494 }, { "epoch": 0.3423828462735604, "grad_norm": 0.7890329360961914, "learning_rate": 6.933721777130372e-05, "loss": 1.7208, "step": 495 }, { "epoch": 0.3430745287912848, "grad_norm": 0.7466354370117188, "learning_rate": 6.926438455935908e-05, "loss": 1.8483, "step": 496 }, { "epoch": 0.34376621130900914, "grad_norm": 0.8114244341850281, "learning_rate": 6.919155134741443e-05, "loss": 1.9642, "step": 497 }, { "epoch": 0.34445789382673353, "grad_norm": 0.8441615104675293, "learning_rate": 6.911871813546978e-05, "loss": 1.6589, "step": 498 }, { "epoch": 0.3451495763444579, "grad_norm": 0.771629273891449, "learning_rate": 6.904588492352512e-05, "loss": 2.1515, "step": 499 }, { "epoch": 0.34584125886218225, "grad_norm": 0.7773415446281433, "learning_rate": 6.897305171158048e-05, "loss": 1.1782, "step": 500 }, { "epoch": 0.34584125886218225, "eval_loss": 1.5188031196594238, "eval_runtime": 636.0478, "eval_samples_per_second": 2.02, "eval_steps_per_second": 1.011, "step": 500 } ], "logging_steps": 1, "max_steps": 1446, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.596867200925696e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }