{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9816849816849818, "eval_steps": 500, "global_step": 408, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007326007326007326, "grad_norm": 30.21496593413568, "learning_rate": 4.7619047619047623e-07, "loss": 1.0635175704956055, "step": 1 }, { "epoch": 0.014652014652014652, "grad_norm": 27.043345585334517, "learning_rate": 9.523809523809525e-07, "loss": 1.0006253719329834, "step": 2 }, { "epoch": 0.02197802197802198, "grad_norm": 21.646848717902948, "learning_rate": 1.4285714285714286e-06, "loss": 0.9898864030838013, "step": 3 }, { "epoch": 0.029304029304029304, "grad_norm": 22.348396709879044, "learning_rate": 1.904761904761905e-06, "loss": 0.988436758518219, "step": 4 }, { "epoch": 0.03663003663003663, "grad_norm": 19.157410690764603, "learning_rate": 2.380952380952381e-06, "loss": 0.9326452016830444, "step": 5 }, { "epoch": 0.04395604395604396, "grad_norm": 18.08116663695173, "learning_rate": 2.8571428571428573e-06, "loss": 0.8227804899215698, "step": 6 }, { "epoch": 0.05128205128205128, "grad_norm": 16.123821453273436, "learning_rate": 3.3333333333333333e-06, "loss": 0.816413938999176, "step": 7 }, { "epoch": 0.05860805860805861, "grad_norm": 10.059553232587936, "learning_rate": 3.80952380952381e-06, "loss": 0.6918408274650574, "step": 8 }, { "epoch": 0.06593406593406594, "grad_norm": 10.100146604450146, "learning_rate": 4.2857142857142855e-06, "loss": 0.709736704826355, "step": 9 }, { "epoch": 0.07326007326007326, "grad_norm": 3.6849384254746336, "learning_rate": 4.761904761904762e-06, "loss": 0.5924822688102722, "step": 10 }, { "epoch": 0.08058608058608059, "grad_norm": 2.515001245028391, "learning_rate": 5.2380952380952384e-06, "loss": 0.5829637050628662, "step": 11 }, { "epoch": 0.08791208791208792, "grad_norm": 2.7142652174838964, "learning_rate": 5.7142857142857145e-06, "loss": 0.5520739555358887, "step": 12 }, { "epoch": 0.09523809523809523, "grad_norm": 2.396783666272162, "learning_rate": 6.1904761904761914e-06, "loss": 0.5408250689506531, "step": 13 }, { "epoch": 0.10256410256410256, "grad_norm": 2.613664759900683, "learning_rate": 6.666666666666667e-06, "loss": 0.5364675521850586, "step": 14 }, { "epoch": 0.10989010989010989, "grad_norm": 2.140953850448383, "learning_rate": 7.1428571428571436e-06, "loss": 0.5406008958816528, "step": 15 }, { "epoch": 0.11721611721611722, "grad_norm": 2.1637811311224513, "learning_rate": 7.61904761904762e-06, "loss": 0.5351678133010864, "step": 16 }, { "epoch": 0.12454212454212454, "grad_norm": 1.889685988846562, "learning_rate": 8.095238095238097e-06, "loss": 0.5272159576416016, "step": 17 }, { "epoch": 0.13186813186813187, "grad_norm": 1.610391433650919, "learning_rate": 8.571428571428571e-06, "loss": 0.4894131124019623, "step": 18 }, { "epoch": 0.1391941391941392, "grad_norm": 1.5780714267255904, "learning_rate": 9.047619047619049e-06, "loss": 0.4680541753768921, "step": 19 }, { "epoch": 0.14652014652014653, "grad_norm": 1.3826646451572844, "learning_rate": 9.523809523809525e-06, "loss": 0.41832777857780457, "step": 20 }, { "epoch": 0.15384615384615385, "grad_norm": 1.9876445386481647, "learning_rate": 1e-05, "loss": 0.5307973623275757, "step": 21 }, { "epoch": 0.16117216117216118, "grad_norm": 1.989790222306354, "learning_rate": 9.999835253787472e-06, "loss": 0.4524080753326416, "step": 22 }, { "epoch": 0.1684981684981685, "grad_norm": 1.7132167306761634, "learning_rate": 9.99934102600642e-06, "loss": 0.4481462240219116, "step": 23 }, { "epoch": 0.17582417582417584, "grad_norm": 1.675845803554774, "learning_rate": 9.998517349225698e-06, "loss": 0.470619797706604, "step": 24 }, { "epoch": 0.18315018315018314, "grad_norm": 1.8889444953942198, "learning_rate": 9.997364277724362e-06, "loss": 0.4533703625202179, "step": 25 }, { "epoch": 0.19047619047619047, "grad_norm": 1.2961235388879089, "learning_rate": 9.99588188748808e-06, "loss": 0.4506123661994934, "step": 26 }, { "epoch": 0.1978021978021978, "grad_norm": 1.3960705328975553, "learning_rate": 9.994070276204115e-06, "loss": 0.5040400624275208, "step": 27 }, { "epoch": 0.20512820512820512, "grad_norm": 1.42158083553959, "learning_rate": 9.991929563254913e-06, "loss": 0.4825146794319153, "step": 28 }, { "epoch": 0.21245421245421245, "grad_norm": 1.1661369187587884, "learning_rate": 9.989459889710214e-06, "loss": 0.4361911714076996, "step": 29 }, { "epoch": 0.21978021978021978, "grad_norm": 1.2574093229638947, "learning_rate": 9.986661418317759e-06, "loss": 0.45438122749328613, "step": 30 }, { "epoch": 0.2271062271062271, "grad_norm": 1.3348666437119017, "learning_rate": 9.983534333492575e-06, "loss": 0.4379500448703766, "step": 31 }, { "epoch": 0.23443223443223443, "grad_norm": 1.246184387917552, "learning_rate": 9.980078841304817e-06, "loss": 0.4118569791316986, "step": 32 }, { "epoch": 0.24175824175824176, "grad_norm": 1.179330565300909, "learning_rate": 9.97629516946618e-06, "loss": 0.4063628315925598, "step": 33 }, { "epoch": 0.2490842490842491, "grad_norm": 1.1385770353147593, "learning_rate": 9.97218356731491e-06, "loss": 0.4152410626411438, "step": 34 }, { "epoch": 0.2564102564102564, "grad_norm": 1.6513827173956053, "learning_rate": 9.967744305799358e-06, "loss": 0.4543665647506714, "step": 35 }, { "epoch": 0.26373626373626374, "grad_norm": 1.1284245393981716, "learning_rate": 9.962977677460132e-06, "loss": 0.42496830224990845, "step": 36 }, { "epoch": 0.27106227106227104, "grad_norm": 1.0753146429921163, "learning_rate": 9.957883996410821e-06, "loss": 0.42909127473831177, "step": 37 }, { "epoch": 0.2783882783882784, "grad_norm": 1.295900042465006, "learning_rate": 9.952463598317286e-06, "loss": 0.3935036063194275, "step": 38 }, { "epoch": 0.2857142857142857, "grad_norm": 1.134829504686702, "learning_rate": 9.946716840375552e-06, "loss": 0.44124358892440796, "step": 39 }, { "epoch": 0.29304029304029305, "grad_norm": 1.1702725203395377, "learning_rate": 9.940644101288259e-06, "loss": 0.4439253509044647, "step": 40 }, { "epoch": 0.30036630036630035, "grad_norm": 1.256616648991797, "learning_rate": 9.934245781239714e-06, "loss": 0.3892786502838135, "step": 41 }, { "epoch": 0.3076923076923077, "grad_norm": 1.1589721621440654, "learning_rate": 9.927522301869515e-06, "loss": 0.36586904525756836, "step": 42 }, { "epoch": 0.315018315018315, "grad_norm": 1.2097406758159928, "learning_rate": 9.920474106244764e-06, "loss": 0.3926905393600464, "step": 43 }, { "epoch": 0.32234432234432236, "grad_norm": 1.1180887734407854, "learning_rate": 9.913101658830879e-06, "loss": 0.39513468742370605, "step": 44 }, { "epoch": 0.32967032967032966, "grad_norm": 1.489392826555109, "learning_rate": 9.905405445460972e-06, "loss": 0.4130535125732422, "step": 45 }, { "epoch": 0.336996336996337, "grad_norm": 1.154514234942715, "learning_rate": 9.897385973303845e-06, "loss": 0.4013521373271942, "step": 46 }, { "epoch": 0.3443223443223443, "grad_norm": 1.1719627644397432, "learning_rate": 9.889043770830566e-06, "loss": 0.4277796745300293, "step": 47 }, { "epoch": 0.3516483516483517, "grad_norm": 1.5740413108726932, "learning_rate": 9.880379387779637e-06, "loss": 0.38313156366348267, "step": 48 }, { "epoch": 0.358974358974359, "grad_norm": 1.1734369672836624, "learning_rate": 9.871393395120774e-06, "loss": 0.4053453803062439, "step": 49 }, { "epoch": 0.3663003663003663, "grad_norm": 1.4855170661495627, "learning_rate": 9.862086385017283e-06, "loss": 0.37065786123275757, "step": 50 }, { "epoch": 0.37362637362637363, "grad_norm": 1.2470491770738514, "learning_rate": 9.852458970787027e-06, "loss": 0.3932054042816162, "step": 51 }, { "epoch": 0.38095238095238093, "grad_norm": 1.1200733413865462, "learning_rate": 9.842511786862018e-06, "loss": 0.3844960331916809, "step": 52 }, { "epoch": 0.3882783882783883, "grad_norm": 1.2353594939698733, "learning_rate": 9.832245488746612e-06, "loss": 0.4030640125274658, "step": 53 }, { "epoch": 0.3956043956043956, "grad_norm": 1.2526400425035524, "learning_rate": 9.821660752974294e-06, "loss": 0.41372472047805786, "step": 54 }, { "epoch": 0.40293040293040294, "grad_norm": 1.1578568131070235, "learning_rate": 9.81075827706312e-06, "loss": 0.38878971338272095, "step": 55 }, { "epoch": 0.41025641025641024, "grad_norm": 1.1789063859126412, "learning_rate": 9.799538779469734e-06, "loss": 0.3679278492927551, "step": 56 }, { "epoch": 0.4175824175824176, "grad_norm": 1.0793218213230338, "learning_rate": 9.78800299954203e-06, "loss": 0.35747843980789185, "step": 57 }, { "epoch": 0.4249084249084249, "grad_norm": 1.4979207988768952, "learning_rate": 9.776151697470431e-06, "loss": 0.38858622312545776, "step": 58 }, { "epoch": 0.43223443223443225, "grad_norm": 1.5776030032036932, "learning_rate": 9.763985654237785e-06, "loss": 0.410378634929657, "step": 59 }, { "epoch": 0.43956043956043955, "grad_norm": 1.1687321357395992, "learning_rate": 9.751505671567914e-06, "loss": 0.3829571008682251, "step": 60 }, { "epoch": 0.4468864468864469, "grad_norm": 1.1022315738152733, "learning_rate": 9.738712571872765e-06, "loss": 0.38535329699516296, "step": 61 }, { "epoch": 0.4542124542124542, "grad_norm": 1.031148812173931, "learning_rate": 9.725607198198227e-06, "loss": 0.37591585516929626, "step": 62 }, { "epoch": 0.46153846153846156, "grad_norm": 1.1465009384770484, "learning_rate": 9.712190414168573e-06, "loss": 0.43732309341430664, "step": 63 }, { "epoch": 0.46886446886446886, "grad_norm": 3.901087295216021, "learning_rate": 9.698463103929542e-06, "loss": 0.35739046335220337, "step": 64 }, { "epoch": 0.47619047619047616, "grad_norm": 1.1832020160855363, "learning_rate": 9.684426172090084e-06, "loss": 0.39645135402679443, "step": 65 }, { "epoch": 0.4835164835164835, "grad_norm": 1.1501088941718765, "learning_rate": 9.670080543662742e-06, "loss": 0.3675802946090698, "step": 66 }, { "epoch": 0.4908424908424908, "grad_norm": 1.2208405532006774, "learning_rate": 9.655427164002692e-06, "loss": 0.39301833510398865, "step": 67 }, { "epoch": 0.4981684981684982, "grad_norm": 1.1252142850079323, "learning_rate": 9.640466998745456e-06, "loss": 0.4137623906135559, "step": 68 }, { "epoch": 0.5054945054945055, "grad_norm": 1.0526557640749403, "learning_rate": 9.625201033743262e-06, "loss": 0.3448682129383087, "step": 69 }, { "epoch": 0.5128205128205128, "grad_norm": 1.147790880025079, "learning_rate": 9.609630275000072e-06, "loss": 0.4187518358230591, "step": 70 }, { "epoch": 0.5201465201465202, "grad_norm": 1.1265458933551666, "learning_rate": 9.5937557486053e-06, "loss": 0.3735639452934265, "step": 71 }, { "epoch": 0.5274725274725275, "grad_norm": 1.1089149618339371, "learning_rate": 9.577578500666187e-06, "loss": 0.40197598934173584, "step": 72 }, { "epoch": 0.5347985347985348, "grad_norm": 1.3993390218741097, "learning_rate": 9.561099597238862e-06, "loss": 0.40231800079345703, "step": 73 }, { "epoch": 0.5421245421245421, "grad_norm": 1.1143458075824666, "learning_rate": 9.544320124258093e-06, "loss": 0.3674388527870178, "step": 74 }, { "epoch": 0.5494505494505495, "grad_norm": 1.0051401204214154, "learning_rate": 9.527241187465735e-06, "loss": 0.40350422263145447, "step": 75 }, { "epoch": 0.5567765567765568, "grad_norm": 1.1152468221193563, "learning_rate": 9.509863912337843e-06, "loss": 0.4011390507221222, "step": 76 }, { "epoch": 0.5641025641025641, "grad_norm": 1.259123019280062, "learning_rate": 9.492189444010522e-06, "loss": 0.3963205814361572, "step": 77 }, { "epoch": 0.5714285714285714, "grad_norm": 1.0882181942002715, "learning_rate": 9.47421894720446e-06, "loss": 0.3761591613292694, "step": 78 }, { "epoch": 0.5787545787545788, "grad_norm": 1.3171244527835357, "learning_rate": 9.455953606148172e-06, "loss": 0.37985581159591675, "step": 79 }, { "epoch": 0.5860805860805861, "grad_norm": 1.1576335484597036, "learning_rate": 9.437394624499957e-06, "loss": 0.38125094771385193, "step": 80 }, { "epoch": 0.5934065934065934, "grad_norm": 1.340371799783884, "learning_rate": 9.418543225268598e-06, "loss": 0.38609158992767334, "step": 81 }, { "epoch": 0.6007326007326007, "grad_norm": 1.062259591126727, "learning_rate": 9.399400650732735e-06, "loss": 0.36670905351638794, "step": 82 }, { "epoch": 0.608058608058608, "grad_norm": 1.2435607193732192, "learning_rate": 9.379968162359034e-06, "loss": 0.39242708683013916, "step": 83 }, { "epoch": 0.6153846153846154, "grad_norm": 1.0441743485299124, "learning_rate": 9.36024704071904e-06, "loss": 0.3625451326370239, "step": 84 }, { "epoch": 0.6227106227106227, "grad_norm": 1.1226141976759751, "learning_rate": 9.340238585404787e-06, "loss": 0.3682897090911865, "step": 85 }, { "epoch": 0.63003663003663, "grad_norm": 1.1834442757500532, "learning_rate": 9.319944114943171e-06, "loss": 0.3925570845603943, "step": 86 }, { "epoch": 0.6373626373626373, "grad_norm": 1.2543334036025904, "learning_rate": 9.299364966709051e-06, "loss": 0.40064454078674316, "step": 87 }, { "epoch": 0.6446886446886447, "grad_norm": 1.1281264939516502, "learning_rate": 9.278502496837116e-06, "loss": 0.350555956363678, "step": 88 }, { "epoch": 0.652014652014652, "grad_norm": 0.9918889297129596, "learning_rate": 9.257358080132524e-06, "loss": 0.3880164623260498, "step": 89 }, { "epoch": 0.6593406593406593, "grad_norm": 1.2461108393741742, "learning_rate": 9.235933109980302e-06, "loss": 0.36218661069869995, "step": 90 }, { "epoch": 0.6666666666666666, "grad_norm": 1.1159489708058996, "learning_rate": 9.214228998253526e-06, "loss": 0.39029616117477417, "step": 91 }, { "epoch": 0.673992673992674, "grad_norm": 1.1373218288826976, "learning_rate": 9.192247175220276e-06, "loss": 0.3715818524360657, "step": 92 }, { "epoch": 0.6813186813186813, "grad_norm": 1.1336385214393307, "learning_rate": 9.16998908944939e-06, "loss": 0.3895593285560608, "step": 93 }, { "epoch": 0.6886446886446886, "grad_norm": 1.2322915227119142, "learning_rate": 9.147456207714998e-06, "loss": 0.3933688700199127, "step": 94 }, { "epoch": 0.6959706959706959, "grad_norm": 1.110226455606631, "learning_rate": 9.124650014899868e-06, "loss": 0.37298864126205444, "step": 95 }, { "epoch": 0.7032967032967034, "grad_norm": 1.0929399186170907, "learning_rate": 9.101572013897555e-06, "loss": 0.35858720541000366, "step": 96 }, { "epoch": 0.7106227106227107, "grad_norm": 1.1200942735792472, "learning_rate": 9.078223725513366e-06, "loss": 0.35287725925445557, "step": 97 }, { "epoch": 0.717948717948718, "grad_norm": 1.107219943504411, "learning_rate": 9.05460668836413e-06, "loss": 0.3863588869571686, "step": 98 }, { "epoch": 0.7252747252747253, "grad_norm": 1.0734987755886534, "learning_rate": 9.030722458776815e-06, "loss": 0.35608792304992676, "step": 99 }, { "epoch": 0.7326007326007326, "grad_norm": 0.9382920838540864, "learning_rate": 9.006572610685969e-06, "loss": 0.3744190037250519, "step": 100 }, { "epoch": 0.73992673992674, "grad_norm": 1.2299021078001484, "learning_rate": 8.982158735529991e-06, "loss": 0.38028016686439514, "step": 101 }, { "epoch": 0.7472527472527473, "grad_norm": 1.0711983498848636, "learning_rate": 8.957482442146271e-06, "loss": 0.35936033725738525, "step": 102 }, { "epoch": 0.7545787545787546, "grad_norm": 1.0380773275659636, "learning_rate": 8.932545356665157e-06, "loss": 0.3793637454509735, "step": 103 }, { "epoch": 0.7619047619047619, "grad_norm": 1.1144376402102996, "learning_rate": 8.907349122402803e-06, "loss": 0.3640226721763611, "step": 104 }, { "epoch": 0.7692307692307693, "grad_norm": 1.1640128905082725, "learning_rate": 8.881895399752873e-06, "loss": 0.3372342586517334, "step": 105 }, { "epoch": 0.7765567765567766, "grad_norm": 1.199009223038314, "learning_rate": 8.85618586607713e-06, "loss": 0.37812405824661255, "step": 106 }, { "epoch": 0.7838827838827839, "grad_norm": 1.0273110860584846, "learning_rate": 8.83022221559489e-06, "loss": 0.35580265522003174, "step": 107 }, { "epoch": 0.7912087912087912, "grad_norm": 1.0516706336456754, "learning_rate": 8.80400615927139e-06, "loss": 0.3262220025062561, "step": 108 }, { "epoch": 0.7985347985347986, "grad_norm": 1.0704201377081104, "learning_rate": 8.777539424705022e-06, "loss": 0.3796083927154541, "step": 109 }, { "epoch": 0.8058608058608059, "grad_norm": 1.1209202523652468, "learning_rate": 8.750823756013498e-06, "loss": 0.3779366612434387, "step": 110 }, { "epoch": 0.8131868131868132, "grad_norm": 1.0413754375402127, "learning_rate": 8.72386091371891e-06, "loss": 0.34237805008888245, "step": 111 }, { "epoch": 0.8205128205128205, "grad_norm": 1.1829628343503562, "learning_rate": 8.696652674631716e-06, "loss": 0.3707228899002075, "step": 112 }, { "epoch": 0.8278388278388278, "grad_norm": 1.0593662409475815, "learning_rate": 8.669200831733655e-06, "loss": 0.3644367456436157, "step": 113 }, { "epoch": 0.8351648351648352, "grad_norm": 0.9884645087059205, "learning_rate": 8.64150719405958e-06, "loss": 0.354566752910614, "step": 114 }, { "epoch": 0.8424908424908425, "grad_norm": 1.1284969366067186, "learning_rate": 8.613573586578262e-06, "loss": 0.34911516308784485, "step": 115 }, { "epoch": 0.8498168498168498, "grad_norm": 1.0401559892838286, "learning_rate": 8.585401850072114e-06, "loss": 0.3552972078323364, "step": 116 }, { "epoch": 0.8571428571428571, "grad_norm": 1.096500663411754, "learning_rate": 8.55699384101589e-06, "loss": 0.3714224100112915, "step": 117 }, { "epoch": 0.8644688644688645, "grad_norm": 0.9686738471210327, "learning_rate": 8.528351431454352e-06, "loss": 0.3586280941963196, "step": 118 }, { "epoch": 0.8717948717948718, "grad_norm": 1.1418772784157138, "learning_rate": 8.499476508878894e-06, "loss": 0.37895333766937256, "step": 119 }, { "epoch": 0.8791208791208791, "grad_norm": 1.102623279686654, "learning_rate": 8.470370976103171e-06, "loss": 0.33410561084747314, "step": 120 }, { "epoch": 0.8864468864468864, "grad_norm": 1.2084381697438025, "learning_rate": 8.441036751137697e-06, "loss": 0.34863168001174927, "step": 121 }, { "epoch": 0.8937728937728938, "grad_norm": 1.0221326248807063, "learning_rate": 8.411475767063454e-06, "loss": 0.36963531374931335, "step": 122 }, { "epoch": 0.9010989010989011, "grad_norm": 0.9696301679675473, "learning_rate": 8.381689971904514e-06, "loss": 0.382631778717041, "step": 123 }, { "epoch": 0.9084249084249084, "grad_norm": 1.138279150305765, "learning_rate": 8.35168132849965e-06, "loss": 0.3976040780544281, "step": 124 }, { "epoch": 0.9157509157509157, "grad_norm": 1.0494402894524364, "learning_rate": 8.321451814372998e-06, "loss": 0.37572136521339417, "step": 125 }, { "epoch": 0.9230769230769231, "grad_norm": 1.0892258482751453, "learning_rate": 8.29100342160374e-06, "loss": 0.3367949426174164, "step": 126 }, { "epoch": 0.9304029304029304, "grad_norm": 1.5742935797211806, "learning_rate": 8.260338156694836e-06, "loss": 0.3577330410480499, "step": 127 }, { "epoch": 0.9377289377289377, "grad_norm": 1.0337951916800752, "learning_rate": 8.229458040440783e-06, "loss": 0.38907700777053833, "step": 128 }, { "epoch": 0.945054945054945, "grad_norm": 1.0463672741030141, "learning_rate": 8.198365107794457e-06, "loss": 0.35719990730285645, "step": 129 }, { "epoch": 0.9523809523809523, "grad_norm": 1.150609447800333, "learning_rate": 8.167061407733018e-06, "loss": 0.3516216278076172, "step": 130 }, { "epoch": 0.9597069597069597, "grad_norm": 1.0134109975745493, "learning_rate": 8.135549003122871e-06, "loss": 0.37219393253326416, "step": 131 }, { "epoch": 0.967032967032967, "grad_norm": 1.1888953837381901, "learning_rate": 8.103829970583742e-06, "loss": 0.3526732325553894, "step": 132 }, { "epoch": 0.9743589743589743, "grad_norm": 0.9504435010655483, "learning_rate": 8.071906400351823e-06, "loss": 0.3242747187614441, "step": 133 }, { "epoch": 0.9816849816849816, "grad_norm": 1.1620841266168631, "learning_rate": 8.039780396142023e-06, "loss": 0.3983690142631531, "step": 134 }, { "epoch": 0.989010989010989, "grad_norm": 1.080002653239463, "learning_rate": 8.007454075009352e-06, "loss": 0.3187519907951355, "step": 135 }, { "epoch": 0.9963369963369964, "grad_norm": 1.0777271233101429, "learning_rate": 7.974929567209399e-06, "loss": 0.33882084488868713, "step": 136 }, { "epoch": 1.0, "grad_norm": 2.182267386409593, "learning_rate": 7.942209016057954e-06, "loss": 0.36792832612991333, "step": 137 }, { "epoch": 1.0073260073260073, "grad_norm": 1.0686033471580987, "learning_rate": 7.909294577789765e-06, "loss": 0.2894274592399597, "step": 138 }, { "epoch": 1.0146520146520146, "grad_norm": 1.0153401594780536, "learning_rate": 7.87618842141645e-06, "loss": 0.30820950865745544, "step": 139 }, { "epoch": 1.021978021978022, "grad_norm": 1.0573547123823617, "learning_rate": 7.842892728583557e-06, "loss": 0.2707104980945587, "step": 140 }, { "epoch": 1.0293040293040292, "grad_norm": 1.068424976718832, "learning_rate": 7.809409693426803e-06, "loss": 0.2870473861694336, "step": 141 }, { "epoch": 1.0366300366300367, "grad_norm": 1.108505986461051, "learning_rate": 7.775741522427477e-06, "loss": 0.27766191959381104, "step": 142 }, { "epoch": 1.043956043956044, "grad_norm": 1.0788011444539796, "learning_rate": 7.741890434267043e-06, "loss": 0.2834435999393463, "step": 143 }, { "epoch": 1.0512820512820513, "grad_norm": 1.2267316092589455, "learning_rate": 7.707858659680924e-06, "loss": 0.2835862338542938, "step": 144 }, { "epoch": 1.0586080586080586, "grad_norm": 1.3229066354461252, "learning_rate": 7.67364844131151e-06, "loss": 0.27477383613586426, "step": 145 }, { "epoch": 1.065934065934066, "grad_norm": 1.2384024803974207, "learning_rate": 7.63926203356036e-06, "loss": 0.2960669994354248, "step": 146 }, { "epoch": 1.0732600732600732, "grad_norm": 1.18044187380908, "learning_rate": 7.604701702439652e-06, "loss": 0.2972232699394226, "step": 147 }, { "epoch": 1.0805860805860805, "grad_norm": 1.2009282809760102, "learning_rate": 7.5699697254228496e-06, "loss": 0.2825266420841217, "step": 148 }, { "epoch": 1.0879120879120878, "grad_norm": 1.019088056663372, "learning_rate": 7.535068391294618e-06, "loss": 0.29965272545814514, "step": 149 }, { "epoch": 1.0952380952380953, "grad_norm": 1.1839639402543083, "learning_rate": 7.500000000000001e-06, "loss": 0.29271137714385986, "step": 150 }, { "epoch": 1.1025641025641026, "grad_norm": 1.0809429287242711, "learning_rate": 7.464766862492856e-06, "loss": 0.2796112298965454, "step": 151 }, { "epoch": 1.10989010989011, "grad_norm": 1.1992697090491269, "learning_rate": 7.42937130058357e-06, "loss": 0.28620994091033936, "step": 152 }, { "epoch": 1.1172161172161172, "grad_norm": 1.1027252290575018, "learning_rate": 7.393815646786047e-06, "loss": 0.2959006428718567, "step": 153 }, { "epoch": 1.1245421245421245, "grad_norm": 1.0120783087707164, "learning_rate": 7.358102244164003e-06, "loss": 0.29200655221939087, "step": 154 }, { "epoch": 1.1318681318681318, "grad_norm": 1.1552817802815585, "learning_rate": 7.322233446176571e-06, "loss": 0.29077982902526855, "step": 155 }, { "epoch": 1.1391941391941391, "grad_norm": 1.1540271695101154, "learning_rate": 7.286211616523193e-06, "loss": 0.29103994369506836, "step": 156 }, { "epoch": 1.1465201465201464, "grad_norm": 1.0723601056663603, "learning_rate": 7.250039128987874e-06, "loss": 0.2747235894203186, "step": 157 }, { "epoch": 1.1538461538461537, "grad_norm": 1.1487058706441489, "learning_rate": 7.213718367282737e-06, "loss": 0.30069500207901, "step": 158 }, { "epoch": 1.1611721611721613, "grad_norm": 1.3318093229671866, "learning_rate": 7.177251724890957e-06, "loss": 0.3071703314781189, "step": 159 }, { "epoch": 1.1684981684981686, "grad_norm": 1.1534532452958286, "learning_rate": 7.14064160490902e-06, "loss": 0.28914874792099, "step": 160 }, { "epoch": 1.1758241758241759, "grad_norm": 1.1547701660303746, "learning_rate": 7.103890419888367e-06, "loss": 0.3017809987068176, "step": 161 }, { "epoch": 1.1831501831501832, "grad_norm": 1.2947227089667261, "learning_rate": 7.067000591676416e-06, "loss": 0.3369286060333252, "step": 162 }, { "epoch": 1.1904761904761905, "grad_norm": 1.1140895594029996, "learning_rate": 7.029974551256957e-06, "loss": 0.27125978469848633, "step": 163 }, { "epoch": 1.1978021978021978, "grad_norm": 1.0122537303440942, "learning_rate": 6.992814738589958e-06, "loss": 0.28907060623168945, "step": 164 }, { "epoch": 1.205128205128205, "grad_norm": 1.025657300579659, "learning_rate": 6.95552360245078e-06, "loss": 0.2677385210990906, "step": 165 }, { "epoch": 1.2124542124542124, "grad_norm": 1.0823638919225964, "learning_rate": 6.9181036002687985e-06, "loss": 0.27675288915634155, "step": 166 }, { "epoch": 1.2197802197802199, "grad_norm": 1.0413163959584528, "learning_rate": 6.880557197965465e-06, "loss": 0.3112404942512512, "step": 167 }, { "epoch": 1.2271062271062272, "grad_norm": 1.0645554729563804, "learning_rate": 6.84288686979181e-06, "loss": 0.2850072383880615, "step": 168 }, { "epoch": 1.2344322344322345, "grad_norm": 0.8982420186922085, "learning_rate": 6.805095098165388e-06, "loss": 0.26976898312568665, "step": 169 }, { "epoch": 1.2417582417582418, "grad_norm": 1.1333676300011672, "learning_rate": 6.767184373506698e-06, "loss": 0.2745368480682373, "step": 170 }, { "epoch": 1.249084249084249, "grad_norm": 1.0709277161683297, "learning_rate": 6.7291571940750575e-06, "loss": 0.2779650390148163, "step": 171 }, { "epoch": 1.2564102564102564, "grad_norm": 1.0868456188418263, "learning_rate": 6.6910160658039835e-06, "loss": 0.3108050227165222, "step": 172 }, { "epoch": 1.2637362637362637, "grad_norm": 1.1874804808558082, "learning_rate": 6.652763502136044e-06, "loss": 0.27616167068481445, "step": 173 }, { "epoch": 1.271062271062271, "grad_norm": 1.0270742547414067, "learning_rate": 6.614402023857231e-06, "loss": 0.26505476236343384, "step": 174 }, { "epoch": 1.2783882783882783, "grad_norm": 0.9281377603059646, "learning_rate": 6.57593415893085e-06, "loss": 0.26481738686561584, "step": 175 }, { "epoch": 1.2857142857142856, "grad_norm": 1.0894098307492133, "learning_rate": 6.5373624423309165e-06, "loss": 0.27870288491249084, "step": 176 }, { "epoch": 1.293040293040293, "grad_norm": 1.074181253190212, "learning_rate": 6.498689415875121e-06, "loss": 0.29720550775527954, "step": 177 }, { "epoch": 1.3003663003663004, "grad_norm": 1.0913740336999906, "learning_rate": 6.459917628057319e-06, "loss": 0.2793238162994385, "step": 178 }, { "epoch": 1.3076923076923077, "grad_norm": 1.1164998009762015, "learning_rate": 6.421049633879588e-06, "loss": 0.3088657855987549, "step": 179 }, { "epoch": 1.315018315018315, "grad_norm": 0.9299860686646102, "learning_rate": 6.3820879946838585e-06, "loss": 0.2543247938156128, "step": 180 }, { "epoch": 1.3223443223443223, "grad_norm": 1.1571048592937423, "learning_rate": 6.3430352779831275e-06, "loss": 0.2909772992134094, "step": 181 }, { "epoch": 1.3296703296703296, "grad_norm": 1.094026131808152, "learning_rate": 6.303894057292261e-06, "loss": 0.2640184164047241, "step": 182 }, { "epoch": 1.3369963369963371, "grad_norm": 0.9363985633369026, "learning_rate": 6.264666911958404e-06, "loss": 0.2943224012851715, "step": 183 }, { "epoch": 1.3443223443223444, "grad_norm": 1.0969037309510354, "learning_rate": 6.225356426991007e-06, "loss": 0.24726328253746033, "step": 184 }, { "epoch": 1.3516483516483517, "grad_norm": 1.0885385789514508, "learning_rate": 6.185965192891472e-06, "loss": 0.2971659302711487, "step": 185 }, { "epoch": 1.358974358974359, "grad_norm": 1.0158089498674099, "learning_rate": 6.146495805482451e-06, "loss": 0.29360431432724, "step": 186 }, { "epoch": 1.3663003663003663, "grad_norm": 1.3591604878047607, "learning_rate": 6.106950865736777e-06, "loss": 0.2744835615158081, "step": 187 }, { "epoch": 1.3736263736263736, "grad_norm": 1.0213784235417054, "learning_rate": 6.067332979606069e-06, "loss": 0.2648021876811981, "step": 188 }, { "epoch": 1.380952380952381, "grad_norm": 1.0816098136800893, "learning_rate": 6.027644757849004e-06, "loss": 0.32366281747817993, "step": 189 }, { "epoch": 1.3882783882783882, "grad_norm": 1.0884347912067143, "learning_rate": 5.987888815859266e-06, "loss": 0.2866742014884949, "step": 190 }, { "epoch": 1.3956043956043955, "grad_norm": 0.915669390965943, "learning_rate": 5.948067773493205e-06, "loss": 0.30587273836135864, "step": 191 }, { "epoch": 1.4029304029304028, "grad_norm": 1.0927318653030724, "learning_rate": 5.908184254897183e-06, "loss": 0.314590185880661, "step": 192 }, { "epoch": 1.4102564102564101, "grad_norm": 1.1120641499678694, "learning_rate": 5.8682408883346535e-06, "loss": 0.30427268147468567, "step": 193 }, { "epoch": 1.4175824175824177, "grad_norm": 1.004808225175699, "learning_rate": 5.828240306012957e-06, "loss": 0.26962751150131226, "step": 194 }, { "epoch": 1.424908424908425, "grad_norm": 1.1800129602480383, "learning_rate": 5.788185143909868e-06, "loss": 0.3093605637550354, "step": 195 }, { "epoch": 1.4322344322344323, "grad_norm": 1.1081211032827851, "learning_rate": 5.74807804159989e-06, "loss": 0.2661038935184479, "step": 196 }, { "epoch": 1.4395604395604396, "grad_norm": 1.0397696208232112, "learning_rate": 5.7079216420803e-06, "loss": 0.26260125637054443, "step": 197 }, { "epoch": 1.4468864468864469, "grad_norm": 1.0675565712457995, "learning_rate": 5.66771859159699e-06, "loss": 0.30148035287857056, "step": 198 }, { "epoch": 1.4542124542124542, "grad_norm": 1.0592600391219584, "learning_rate": 5.6274715394700805e-06, "loss": 0.2963045537471771, "step": 199 }, { "epoch": 1.4615384615384617, "grad_norm": 1.1774403611908744, "learning_rate": 5.587183137919332e-06, "loss": 0.3152458071708679, "step": 200 }, { "epoch": 1.468864468864469, "grad_norm": 1.0787649372567354, "learning_rate": 5.546856041889374e-06, "loss": 0.27110299468040466, "step": 201 }, { "epoch": 1.4761904761904763, "grad_norm": 0.9830650827059149, "learning_rate": 5.5064929088747324e-06, "loss": 0.28375378251075745, "step": 202 }, { "epoch": 1.4835164835164836, "grad_norm": 1.134296905235914, "learning_rate": 5.46609639874473e-06, "loss": 0.29090097546577454, "step": 203 }, { "epoch": 1.4908424908424909, "grad_norm": 1.282298463772494, "learning_rate": 5.425669173568179e-06, "loss": 0.3167083263397217, "step": 204 }, { "epoch": 1.4981684981684982, "grad_norm": 1.0742616101894822, "learning_rate": 5.385213897437975e-06, "loss": 0.25165435671806335, "step": 205 }, { "epoch": 1.5054945054945055, "grad_norm": 1.1328142095892046, "learning_rate": 5.344733236295525e-06, "loss": 0.27617213129997253, "step": 206 }, { "epoch": 1.5128205128205128, "grad_norm": 1.0219367549951983, "learning_rate": 5.30422985775507e-06, "loss": 0.25958195328712463, "step": 207 }, { "epoch": 1.52014652014652, "grad_norm": 1.0847411482729885, "learning_rate": 5.263706430927895e-06, "loss": 0.2788093686103821, "step": 208 }, { "epoch": 1.5274725274725274, "grad_norm": 1.2394999596242242, "learning_rate": 5.223165626246432e-06, "loss": 0.30942243337631226, "step": 209 }, { "epoch": 1.5347985347985347, "grad_norm": 1.0406322641722239, "learning_rate": 5.182610115288296e-06, "loss": 0.3002859950065613, "step": 210 }, { "epoch": 1.542124542124542, "grad_norm": 1.0478051687617, "learning_rate": 5.142042570600212e-06, "loss": 0.2850934565067291, "step": 211 }, { "epoch": 1.5494505494505495, "grad_norm": 1.082184087642848, "learning_rate": 5.101465665521919e-06, "loss": 0.29597795009613037, "step": 212 }, { "epoch": 1.5567765567765568, "grad_norm": 1.1107024805480228, "learning_rate": 5.060882074009988e-06, "loss": 0.3078245520591736, "step": 213 }, { "epoch": 1.564102564102564, "grad_norm": 1.0296929677102222, "learning_rate": 5.020294470461615e-06, "loss": 0.2921220064163208, "step": 214 }, { "epoch": 1.5714285714285714, "grad_norm": 1.0540827764915854, "learning_rate": 4.979705529538385e-06, "loss": 0.2733391523361206, "step": 215 }, { "epoch": 1.578754578754579, "grad_norm": 0.9652888358034895, "learning_rate": 4.939117925990013e-06, "loss": 0.2933793365955353, "step": 216 }, { "epoch": 1.5860805860805862, "grad_norm": 1.1196595116135142, "learning_rate": 4.8985343344780815e-06, "loss": 0.31104928255081177, "step": 217 }, { "epoch": 1.5934065934065935, "grad_norm": 1.0078579598513866, "learning_rate": 4.857957429399788e-06, "loss": 0.2975787818431854, "step": 218 }, { "epoch": 1.6007326007326008, "grad_norm": 1.003543299759328, "learning_rate": 4.817389884711706e-06, "loss": 0.2911156117916107, "step": 219 }, { "epoch": 1.6080586080586081, "grad_norm": 1.117174701168215, "learning_rate": 4.7768343737535694e-06, "loss": 0.28502243757247925, "step": 220 }, { "epoch": 1.6153846153846154, "grad_norm": 1.2387468209978119, "learning_rate": 4.736293569072108e-06, "loss": 0.3383062779903412, "step": 221 }, { "epoch": 1.6227106227106227, "grad_norm": 0.9751319932294236, "learning_rate": 4.695770142244931e-06, "loss": 0.26932916045188904, "step": 222 }, { "epoch": 1.63003663003663, "grad_norm": 1.0756446276168852, "learning_rate": 4.655266763704476e-06, "loss": 0.2709274888038635, "step": 223 }, { "epoch": 1.6373626373626373, "grad_norm": 1.0873882111356061, "learning_rate": 4.614786102562026e-06, "loss": 0.30510762333869934, "step": 224 }, { "epoch": 1.6446886446886446, "grad_norm": 0.9991578718833676, "learning_rate": 4.574330826431822e-06, "loss": 0.271587610244751, "step": 225 }, { "epoch": 1.652014652014652, "grad_norm": 0.9959466132377315, "learning_rate": 4.533903601255272e-06, "loss": 0.27231907844543457, "step": 226 }, { "epoch": 1.6593406593406592, "grad_norm": 1.1774838310535634, "learning_rate": 4.493507091125269e-06, "loss": 0.2669839560985565, "step": 227 }, { "epoch": 1.6666666666666665, "grad_norm": 1.0881926934718549, "learning_rate": 4.4531439581106295e-06, "loss": 0.2904966175556183, "step": 228 }, { "epoch": 1.673992673992674, "grad_norm": 1.1146102161723215, "learning_rate": 4.412816862080668e-06, "loss": 0.2829638123512268, "step": 229 }, { "epoch": 1.6813186813186813, "grad_norm": 1.1410084762917667, "learning_rate": 4.37252846052992e-06, "loss": 0.31257230043411255, "step": 230 }, { "epoch": 1.6886446886446886, "grad_norm": 1.1796551533789368, "learning_rate": 4.332281408403011e-06, "loss": 0.2940324544906616, "step": 231 }, { "epoch": 1.695970695970696, "grad_norm": 0.9380068545214617, "learning_rate": 4.292078357919701e-06, "loss": 0.27315038442611694, "step": 232 }, { "epoch": 1.7032967032967035, "grad_norm": 1.1309971396055292, "learning_rate": 4.2519219584001106e-06, "loss": 0.31711578369140625, "step": 233 }, { "epoch": 1.7106227106227108, "grad_norm": 1.0898708429146375, "learning_rate": 4.2118148560901325e-06, "loss": 0.27645236253738403, "step": 234 }, { "epoch": 1.717948717948718, "grad_norm": 0.9890674690676207, "learning_rate": 4.171759693987046e-06, "loss": 0.2982947528362274, "step": 235 }, { "epoch": 1.7252747252747254, "grad_norm": 1.0308735193031042, "learning_rate": 4.131759111665349e-06, "loss": 0.27972856163978577, "step": 236 }, { "epoch": 1.7326007326007327, "grad_norm": 0.9974962409078789, "learning_rate": 4.091815745102818e-06, "loss": 0.2757408916950226, "step": 237 }, { "epoch": 1.73992673992674, "grad_norm": 1.040632796534077, "learning_rate": 4.051932226506797e-06, "loss": 0.2961726784706116, "step": 238 }, { "epoch": 1.7472527472527473, "grad_norm": 1.0688391666784938, "learning_rate": 4.0121111841407345e-06, "loss": 0.2576867938041687, "step": 239 }, { "epoch": 1.7545787545787546, "grad_norm": 1.044156456246768, "learning_rate": 3.9723552421509975e-06, "loss": 0.27895694971084595, "step": 240 }, { "epoch": 1.7619047619047619, "grad_norm": 0.9743851064122274, "learning_rate": 3.932667020393933e-06, "loss": 0.2776602804660797, "step": 241 }, { "epoch": 1.7692307692307692, "grad_norm": 0.9129387125371341, "learning_rate": 3.8930491342632235e-06, "loss": 0.2480865716934204, "step": 242 }, { "epoch": 1.7765567765567765, "grad_norm": 1.0565234983933456, "learning_rate": 3.853504194517551e-06, "loss": 0.290374755859375, "step": 243 }, { "epoch": 1.7838827838827838, "grad_norm": 1.0245173104357006, "learning_rate": 3.814034807108529e-06, "loss": 0.27799317240715027, "step": 244 }, { "epoch": 1.791208791208791, "grad_norm": 1.1247482481535258, "learning_rate": 3.774643573008995e-06, "loss": 0.287639319896698, "step": 245 }, { "epoch": 1.7985347985347986, "grad_norm": 1.2831629362435393, "learning_rate": 3.7353330880415963e-06, "loss": 0.2996615171432495, "step": 246 }, { "epoch": 1.8058608058608059, "grad_norm": 0.9631002049148443, "learning_rate": 3.6961059427077407e-06, "loss": 0.2891607880592346, "step": 247 }, { "epoch": 1.8131868131868132, "grad_norm": 1.0326234643935235, "learning_rate": 3.656964722016875e-06, "loss": 0.2577850818634033, "step": 248 }, { "epoch": 1.8205128205128205, "grad_norm": 0.9666099123254006, "learning_rate": 3.617912005316142e-06, "loss": 0.25415879487991333, "step": 249 }, { "epoch": 1.8278388278388278, "grad_norm": 0.9577101943470598, "learning_rate": 3.578950366120414e-06, "loss": 0.2674415111541748, "step": 250 }, { "epoch": 1.8351648351648353, "grad_norm": 1.2104620184418453, "learning_rate": 3.540082371942682e-06, "loss": 0.29486364126205444, "step": 251 }, { "epoch": 1.8424908424908426, "grad_norm": 1.1130183230560342, "learning_rate": 3.5013105841248794e-06, "loss": 0.28867706656455994, "step": 252 }, { "epoch": 1.84981684981685, "grad_norm": 0.960795493916683, "learning_rate": 3.462637557669084e-06, "loss": 0.27347224950790405, "step": 253 }, { "epoch": 1.8571428571428572, "grad_norm": 1.053963606829607, "learning_rate": 3.424065841069152e-06, "loss": 0.27742648124694824, "step": 254 }, { "epoch": 1.8644688644688645, "grad_norm": 1.073637011494531, "learning_rate": 3.3855979761427705e-06, "loss": 0.30111193656921387, "step": 255 }, { "epoch": 1.8717948717948718, "grad_norm": 1.006804490181391, "learning_rate": 3.347236497863957e-06, "loss": 0.2510993182659149, "step": 256 }, { "epoch": 1.879120879120879, "grad_norm": 0.9797151960888058, "learning_rate": 3.308983934196018e-06, "loss": 0.2800617218017578, "step": 257 }, { "epoch": 1.8864468864468864, "grad_norm": 1.1964125842584994, "learning_rate": 3.2708428059249437e-06, "loss": 0.28070536255836487, "step": 258 }, { "epoch": 1.8937728937728937, "grad_norm": 1.0143522113609016, "learning_rate": 3.2328156264933043e-06, "loss": 0.28813984990119934, "step": 259 }, { "epoch": 1.901098901098901, "grad_norm": 1.0887942458637438, "learning_rate": 3.194904901834613e-06, "loss": 0.27424943447113037, "step": 260 }, { "epoch": 1.9084249084249083, "grad_norm": 0.9073919323524343, "learning_rate": 3.1571131302081916e-06, "loss": 0.2768292725086212, "step": 261 }, { "epoch": 1.9157509157509156, "grad_norm": 0.9850360881093612, "learning_rate": 3.1194428020345375e-06, "loss": 0.318487286567688, "step": 262 }, { "epoch": 1.9230769230769231, "grad_norm": 1.0090745199197444, "learning_rate": 3.081896399731202e-06, "loss": 0.2533684968948364, "step": 263 }, { "epoch": 1.9304029304029304, "grad_norm": 1.1058329729628358, "learning_rate": 3.044476397549221e-06, "loss": 0.27426987886428833, "step": 264 }, { "epoch": 1.9377289377289377, "grad_norm": 1.0176581083684877, "learning_rate": 3.0071852614100427e-06, "loss": 0.27716314792633057, "step": 265 }, { "epoch": 1.945054945054945, "grad_norm": 1.0536498699210162, "learning_rate": 2.9700254487430448e-06, "loss": 0.2791192829608917, "step": 266 }, { "epoch": 1.9523809523809523, "grad_norm": 1.1327721722858968, "learning_rate": 2.9329994083235857e-06, "loss": 0.2645677626132965, "step": 267 }, { "epoch": 1.9597069597069599, "grad_norm": 1.078652769506114, "learning_rate": 2.896109580111634e-06, "loss": 0.27191221714019775, "step": 268 }, { "epoch": 1.9670329670329672, "grad_norm": 1.1837421082764947, "learning_rate": 2.8593583950909833e-06, "loss": 0.2969869375228882, "step": 269 }, { "epoch": 1.9743589743589745, "grad_norm": 0.9930141875119173, "learning_rate": 2.8227482751090445e-06, "loss": 0.25214335322380066, "step": 270 }, { "epoch": 1.9816849816849818, "grad_norm": 1.0475298808452358, "learning_rate": 2.786281632717264e-06, "loss": 0.29422906041145325, "step": 271 }, { "epoch": 1.989010989010989, "grad_norm": 1.0442095186311762, "learning_rate": 2.749960871012129e-06, "loss": 0.30069488286972046, "step": 272 }, { "epoch": 1.9963369963369964, "grad_norm": 0.9446344835083768, "learning_rate": 2.7137883834768076e-06, "loss": 0.29935750365257263, "step": 273 }, { "epoch": 2.0, "grad_norm": 1.8903335357771034, "learning_rate": 2.6777665538234292e-06, "loss": 0.26212257146835327, "step": 274 }, { "epoch": 2.0073260073260073, "grad_norm": 1.0872778312217004, "learning_rate": 2.641897755835997e-06, "loss": 0.2080368995666504, "step": 275 }, { "epoch": 2.0146520146520146, "grad_norm": 1.153142895526473, "learning_rate": 2.6061843532139563e-06, "loss": 0.22341328859329224, "step": 276 }, { "epoch": 2.021978021978022, "grad_norm": 0.9779095973602308, "learning_rate": 2.5706286994164315e-06, "loss": 0.22445546090602875, "step": 277 }, { "epoch": 2.029304029304029, "grad_norm": 1.1817760186238315, "learning_rate": 2.5352331375071437e-06, "loss": 0.23708981275558472, "step": 278 }, { "epoch": 2.0366300366300365, "grad_norm": 1.023239338905551, "learning_rate": 2.5000000000000015e-06, "loss": 0.22113129496574402, "step": 279 }, { "epoch": 2.043956043956044, "grad_norm": 1.1289343692866138, "learning_rate": 2.464931608705384e-06, "loss": 0.24936091899871826, "step": 280 }, { "epoch": 2.051282051282051, "grad_norm": 1.0298366936737875, "learning_rate": 2.430030274577151e-06, "loss": 0.22182141244411469, "step": 281 }, { "epoch": 2.0586080586080584, "grad_norm": 0.9322342047480516, "learning_rate": 2.3952982975603494e-06, "loss": 0.24618740379810333, "step": 282 }, { "epoch": 2.065934065934066, "grad_norm": 1.301143268306206, "learning_rate": 2.3607379664396414e-06, "loss": 0.22918632626533508, "step": 283 }, { "epoch": 2.0732600732600734, "grad_norm": 1.0677424151131676, "learning_rate": 2.3263515586884935e-06, "loss": 0.2351371943950653, "step": 284 }, { "epoch": 2.0805860805860807, "grad_norm": 1.122503313580565, "learning_rate": 2.2921413403190774e-06, "loss": 0.23036789894104004, "step": 285 }, { "epoch": 2.087912087912088, "grad_norm": 1.0871719769548138, "learning_rate": 2.25810956573296e-06, "loss": 0.24664072692394257, "step": 286 }, { "epoch": 2.0952380952380953, "grad_norm": 1.072946930604994, "learning_rate": 2.224258477572524e-06, "loss": 0.2169453501701355, "step": 287 }, { "epoch": 2.1025641025641026, "grad_norm": 1.1552824570087117, "learning_rate": 2.190590306573198e-06, "loss": 0.23637241125106812, "step": 288 }, { "epoch": 2.10989010989011, "grad_norm": 1.0605290517940913, "learning_rate": 2.1571072714164445e-06, "loss": 0.22414323687553406, "step": 289 }, { "epoch": 2.1172161172161172, "grad_norm": 1.043154934258016, "learning_rate": 2.1238115785835512e-06, "loss": 0.22473719716072083, "step": 290 }, { "epoch": 2.1245421245421245, "grad_norm": 1.2211410010717425, "learning_rate": 2.0907054222102367e-06, "loss": 0.21707648038864136, "step": 291 }, { "epoch": 2.131868131868132, "grad_norm": 1.0642724049563004, "learning_rate": 2.0577909839420468e-06, "loss": 0.21333038806915283, "step": 292 }, { "epoch": 2.139194139194139, "grad_norm": 1.0298997037587465, "learning_rate": 2.0250704327906025e-06, "loss": 0.23482055962085724, "step": 293 }, { "epoch": 2.1465201465201464, "grad_norm": 1.2521151418866294, "learning_rate": 1.9925459249906488e-06, "loss": 0.24669216573238373, "step": 294 }, { "epoch": 2.1538461538461537, "grad_norm": 1.0015347552107015, "learning_rate": 1.9602196038579774e-06, "loss": 0.21570074558258057, "step": 295 }, { "epoch": 2.161172161172161, "grad_norm": 1.1363173380860716, "learning_rate": 1.9280935996481792e-06, "loss": 0.21716558933258057, "step": 296 }, { "epoch": 2.1684981684981683, "grad_norm": 1.1171710714186258, "learning_rate": 1.8961700294162578e-06, "loss": 0.2021581530570984, "step": 297 }, { "epoch": 2.1758241758241756, "grad_norm": 1.0929444504935302, "learning_rate": 1.8644509968771302e-06, "loss": 0.2062225341796875, "step": 298 }, { "epoch": 2.183150183150183, "grad_norm": 1.041045504034978, "learning_rate": 1.832938592266984e-06, "loss": 0.2139657437801361, "step": 299 }, { "epoch": 2.1904761904761907, "grad_norm": 1.057190385371437, "learning_rate": 1.8016348922055448e-06, "loss": 0.1996728926897049, "step": 300 }, { "epoch": 2.197802197802198, "grad_norm": 1.1946122105787467, "learning_rate": 1.7705419595592193e-06, "loss": 0.25132063031196594, "step": 301 }, { "epoch": 2.2051282051282053, "grad_norm": 0.9259953268131886, "learning_rate": 1.7396618433051648e-06, "loss": 0.21299096941947937, "step": 302 }, { "epoch": 2.2124542124542126, "grad_norm": 1.213328353826304, "learning_rate": 1.7089965783962608e-06, "loss": 0.2140902727842331, "step": 303 }, { "epoch": 2.21978021978022, "grad_norm": 0.9761515672225444, "learning_rate": 1.6785481856270042e-06, "loss": 0.22363431751728058, "step": 304 }, { "epoch": 2.227106227106227, "grad_norm": 1.155337550086334, "learning_rate": 1.6483186715003523e-06, "loss": 0.23464494943618774, "step": 305 }, { "epoch": 2.2344322344322345, "grad_norm": 1.1140172781174638, "learning_rate": 1.618310028095486e-06, "loss": 0.21518093347549438, "step": 306 }, { "epoch": 2.241758241758242, "grad_norm": 1.2179937873095439, "learning_rate": 1.5885242329365448e-06, "loss": 0.2589687705039978, "step": 307 }, { "epoch": 2.249084249084249, "grad_norm": 1.083087208307822, "learning_rate": 1.5589632488623053e-06, "loss": 0.2202206552028656, "step": 308 }, { "epoch": 2.2564102564102564, "grad_norm": 1.1247253618959971, "learning_rate": 1.5296290238968303e-06, "loss": 0.2516007423400879, "step": 309 }, { "epoch": 2.2637362637362637, "grad_norm": 1.029732329010538, "learning_rate": 1.500523491121108e-06, "loss": 0.21486574411392212, "step": 310 }, { "epoch": 2.271062271062271, "grad_norm": 1.522644892476846, "learning_rate": 1.47164856854565e-06, "loss": 0.22302576899528503, "step": 311 }, { "epoch": 2.2783882783882783, "grad_norm": 1.0835410135357464, "learning_rate": 1.4430061589841122e-06, "loss": 0.2299138456583023, "step": 312 }, { "epoch": 2.2857142857142856, "grad_norm": 1.1608076314086253, "learning_rate": 1.4145981499278877e-06, "loss": 0.2284044623374939, "step": 313 }, { "epoch": 2.293040293040293, "grad_norm": 1.007806828890649, "learning_rate": 1.386426413421738e-06, "loss": 0.21568208932876587, "step": 314 }, { "epoch": 2.3003663003663, "grad_norm": 1.110614593649845, "learning_rate": 1.3584928059404207e-06, "loss": 0.21330109238624573, "step": 315 }, { "epoch": 2.3076923076923075, "grad_norm": 0.9269591001624151, "learning_rate": 1.3307991682663463e-06, "loss": 0.2049589455127716, "step": 316 }, { "epoch": 2.315018315018315, "grad_norm": 1.030660764247251, "learning_rate": 1.303347325368285e-06, "loss": 0.23808613419532776, "step": 317 }, { "epoch": 2.3223443223443225, "grad_norm": 1.0741881929381032, "learning_rate": 1.2761390862810907e-06, "loss": 0.2274160087108612, "step": 318 }, { "epoch": 2.32967032967033, "grad_norm": 1.23151478025897, "learning_rate": 1.2491762439865034e-06, "loss": 0.22255614399909973, "step": 319 }, { "epoch": 2.336996336996337, "grad_norm": 1.120344003470698, "learning_rate": 1.2224605752949786e-06, "loss": 0.2374870479106903, "step": 320 }, { "epoch": 2.3443223443223444, "grad_norm": 1.5560919363814774, "learning_rate": 1.1959938407286099e-06, "loss": 0.2206483781337738, "step": 321 }, { "epoch": 2.3516483516483517, "grad_norm": 0.9655470657059726, "learning_rate": 1.1697777844051105e-06, "loss": 0.20176303386688232, "step": 322 }, { "epoch": 2.358974358974359, "grad_norm": 1.096436902395159, "learning_rate": 1.143814133922872e-06, "loss": 0.22084000706672668, "step": 323 }, { "epoch": 2.3663003663003663, "grad_norm": 1.065227284623936, "learning_rate": 1.1181046002471292e-06, "loss": 0.20460768043994904, "step": 324 }, { "epoch": 2.3736263736263736, "grad_norm": 1.0697198399954104, "learning_rate": 1.0926508775971995e-06, "loss": 0.21005329489707947, "step": 325 }, { "epoch": 2.380952380952381, "grad_norm": 1.119132681089307, "learning_rate": 1.0674546433348453e-06, "loss": 0.23384399712085724, "step": 326 }, { "epoch": 2.3882783882783882, "grad_norm": 1.0612726585808296, "learning_rate": 1.04251755785373e-06, "loss": 0.2149549424648285, "step": 327 }, { "epoch": 2.3956043956043955, "grad_norm": 1.1103936104585082, "learning_rate": 1.0178412644700093e-06, "loss": 0.23808100819587708, "step": 328 }, { "epoch": 2.402930402930403, "grad_norm": 1.1803986673849467, "learning_rate": 9.934273893140335e-07, "loss": 0.21827144920825958, "step": 329 }, { "epoch": 2.41025641025641, "grad_norm": 1.7607496045758002, "learning_rate": 9.692775412231863e-07, "loss": 0.2156897485256195, "step": 330 }, { "epoch": 2.4175824175824174, "grad_norm": 1.0855693076540742, "learning_rate": 9.453933116358715e-07, "loss": 0.22576552629470825, "step": 331 }, { "epoch": 2.4249084249084247, "grad_norm": 1.0458722036651498, "learning_rate": 9.21776274486636e-07, "loss": 0.2029896229505539, "step": 332 }, { "epoch": 2.4322344322344325, "grad_norm": 1.0316218724135404, "learning_rate": 8.984279861024453e-07, "loss": 0.21579168736934662, "step": 333 }, { "epoch": 2.4395604395604398, "grad_norm": 1.0481067715510672, "learning_rate": 8.753499851001341e-07, "loss": 0.2032783031463623, "step": 334 }, { "epoch": 2.446886446886447, "grad_norm": 1.0255499821271086, "learning_rate": 8.525437922850033e-07, "loss": 0.21618527173995972, "step": 335 }, { "epoch": 2.4542124542124544, "grad_norm": 1.2612515831683861, "learning_rate": 8.30010910550611e-07, "loss": 0.22778677940368652, "step": 336 }, { "epoch": 2.4615384615384617, "grad_norm": 1.0389223185126732, "learning_rate": 8.077528247797234e-07, "loss": 0.21174968779087067, "step": 337 }, { "epoch": 2.468864468864469, "grad_norm": 1.056290930403573, "learning_rate": 7.857710017464737e-07, "loss": 0.2388148009777069, "step": 338 }, { "epoch": 2.4761904761904763, "grad_norm": 1.069550154228755, "learning_rate": 7.640668900196985e-07, "loss": 0.22967322170734406, "step": 339 }, { "epoch": 2.4835164835164836, "grad_norm": 1.1293770611150251, "learning_rate": 7.426419198674773e-07, "loss": 0.22522856295108795, "step": 340 }, { "epoch": 2.490842490842491, "grad_norm": 1.044662314903284, "learning_rate": 7.214975031628856e-07, "loss": 0.22356775403022766, "step": 341 }, { "epoch": 2.498168498168498, "grad_norm": 1.8203551388983328, "learning_rate": 7.006350332909495e-07, "loss": 0.23231935501098633, "step": 342 }, { "epoch": 2.5054945054945055, "grad_norm": 1.3212997188494113, "learning_rate": 6.800558850568295e-07, "loss": 0.2266559600830078, "step": 343 }, { "epoch": 2.5128205128205128, "grad_norm": 0.9853631772097284, "learning_rate": 6.597614145952136e-07, "loss": 0.22157558798789978, "step": 344 }, { "epoch": 2.52014652014652, "grad_norm": 0.9528136427940737, "learning_rate": 6.397529592809615e-07, "loss": 0.21530809998512268, "step": 345 }, { "epoch": 2.5274725274725274, "grad_norm": 0.973961814473209, "learning_rate": 6.20031837640967e-07, "loss": 0.21502554416656494, "step": 346 }, { "epoch": 2.5347985347985347, "grad_norm": 0.9865830937894688, "learning_rate": 6.005993492672657e-07, "loss": 0.23088479042053223, "step": 347 }, { "epoch": 2.542124542124542, "grad_norm": 1.031234686449057, "learning_rate": 5.814567747314049e-07, "loss": 0.2098679542541504, "step": 348 }, { "epoch": 2.5494505494505493, "grad_norm": 1.056068279344729, "learning_rate": 5.626053755000421e-07, "loss": 0.21406947076320648, "step": 349 }, { "epoch": 2.5567765567765566, "grad_norm": 0.9374826580586869, "learning_rate": 5.440463938518304e-07, "loss": 0.21536314487457275, "step": 350 }, { "epoch": 2.564102564102564, "grad_norm": 1.2706543007223374, "learning_rate": 5.25781052795541e-07, "loss": 0.21673786640167236, "step": 351 }, { "epoch": 2.571428571428571, "grad_norm": 1.0671257757787735, "learning_rate": 5.078105559894791e-07, "loss": 0.21061310172080994, "step": 352 }, { "epoch": 2.578754578754579, "grad_norm": 0.9344178285162987, "learning_rate": 4.901360876621597e-07, "loss": 0.20954422652721405, "step": 353 }, { "epoch": 2.586080586080586, "grad_norm": 0.9853291224784689, "learning_rate": 4.727588125342669e-07, "loss": 0.2262524664402008, "step": 354 }, { "epoch": 2.5934065934065935, "grad_norm": 1.1287752995473084, "learning_rate": 4.5567987574190677e-07, "loss": 0.2586604356765747, "step": 355 }, { "epoch": 2.600732600732601, "grad_norm": 1.0263217600262091, "learning_rate": 4.3890040276114044e-07, "loss": 0.21056286990642548, "step": 356 }, { "epoch": 2.608058608058608, "grad_norm": 1.059496022645036, "learning_rate": 4.224214993338149e-07, "loss": 0.2316097766160965, "step": 357 }, { "epoch": 2.6153846153846154, "grad_norm": 1.0445066802035055, "learning_rate": 4.062442513947007e-07, "loss": 0.2548368275165558, "step": 358 }, { "epoch": 2.6227106227106227, "grad_norm": 1.0677332887683673, "learning_rate": 3.903697249999289e-07, "loss": 0.21376073360443115, "step": 359 }, { "epoch": 2.63003663003663, "grad_norm": 1.0466606342655096, "learning_rate": 3.747989662567403e-07, "loss": 0.24471336603164673, "step": 360 }, { "epoch": 2.6373626373626373, "grad_norm": 1.120513746514606, "learning_rate": 3.595330012545445e-07, "loss": 0.23034799098968506, "step": 361 }, { "epoch": 2.6446886446886446, "grad_norm": 1.0798320309567475, "learning_rate": 3.445728359973094e-07, "loss": 0.20568513870239258, "step": 362 }, { "epoch": 2.652014652014652, "grad_norm": 1.103445307324226, "learning_rate": 3.299194563372604e-07, "loss": 0.24212944507598877, "step": 363 }, { "epoch": 2.659340659340659, "grad_norm": 1.0111358189326647, "learning_rate": 3.1557382790991686e-07, "loss": 0.216970294713974, "step": 364 }, { "epoch": 2.6666666666666665, "grad_norm": 1.0318010970274858, "learning_rate": 3.015368960704584e-07, "loss": 0.23717445135116577, "step": 365 }, { "epoch": 2.6739926739926743, "grad_norm": 1.1278229722297761, "learning_rate": 2.878095858314278e-07, "loss": 0.21703487634658813, "step": 366 }, { "epoch": 2.6813186813186816, "grad_norm": 1.0063580032703539, "learning_rate": 2.743928018017744e-07, "loss": 0.19310849905014038, "step": 367 }, { "epoch": 2.688644688644689, "grad_norm": 1.0725897334713086, "learning_rate": 2.612874281272371e-07, "loss": 0.21045850217342377, "step": 368 }, { "epoch": 2.695970695970696, "grad_norm": 1.1432574967176703, "learning_rate": 2.4849432843208786e-07, "loss": 0.23722723126411438, "step": 369 }, { "epoch": 2.7032967032967035, "grad_norm": 1.1944305105743942, "learning_rate": 2.3601434576221548e-07, "loss": 0.23865890502929688, "step": 370 }, { "epoch": 2.7106227106227108, "grad_norm": 1.0134760129247256, "learning_rate": 2.2384830252957068e-07, "loss": 0.22348350286483765, "step": 371 }, { "epoch": 2.717948717948718, "grad_norm": 0.9433770405793843, "learning_rate": 2.1199700045797077e-07, "loss": 0.2057940810918808, "step": 372 }, { "epoch": 2.7252747252747254, "grad_norm": 1.101087652221433, "learning_rate": 2.0046122053026697e-07, "loss": 0.21743959188461304, "step": 373 }, { "epoch": 2.7326007326007327, "grad_norm": 1.025514441353977, "learning_rate": 1.8924172293688148e-07, "loss": 0.21851155161857605, "step": 374 }, { "epoch": 2.73992673992674, "grad_norm": 1.0627298559202316, "learning_rate": 1.7833924702570725e-07, "loss": 0.2246307134628296, "step": 375 }, { "epoch": 2.7472527472527473, "grad_norm": 1.0363820350724882, "learning_rate": 1.677545112533896e-07, "loss": 0.20863071084022522, "step": 376 }, { "epoch": 2.7545787545787546, "grad_norm": 0.9833790666928801, "learning_rate": 1.5748821313798124e-07, "loss": 0.2220936417579651, "step": 377 }, { "epoch": 2.761904761904762, "grad_norm": 1.1353105880296717, "learning_rate": 1.4754102921297363e-07, "loss": 0.22596627473831177, "step": 378 }, { "epoch": 2.769230769230769, "grad_norm": 1.098762154582319, "learning_rate": 1.3791361498271704e-07, "loss": 0.23951691389083862, "step": 379 }, { "epoch": 2.7765567765567765, "grad_norm": 1.0516458633934982, "learning_rate": 1.2860660487922616e-07, "loss": 0.22323718667030334, "step": 380 }, { "epoch": 2.7838827838827838, "grad_norm": 1.0766560868652186, "learning_rate": 1.196206122203647e-07, "loss": 0.2271801382303238, "step": 381 }, { "epoch": 2.791208791208791, "grad_norm": 1.012988527675345, "learning_rate": 1.1095622916943494e-07, "loss": 0.2283664345741272, "step": 382 }, { "epoch": 2.7985347985347984, "grad_norm": 1.0479928682412392, "learning_rate": 1.0261402669615505e-07, "loss": 0.24376513063907623, "step": 383 }, { "epoch": 2.8058608058608057, "grad_norm": 1.0034449700163197, "learning_rate": 9.459455453902866e-08, "loss": 0.223099946975708, "step": 384 }, { "epoch": 2.813186813186813, "grad_norm": 0.909745107199666, "learning_rate": 8.68983411691221e-08, "loss": 0.20832499861717224, "step": 385 }, { "epoch": 2.8205128205128203, "grad_norm": 0.9952167120384509, "learning_rate": 7.952589375523567e-08, "loss": 0.21737533807754517, "step": 386 }, { "epoch": 2.8278388278388276, "grad_norm": 0.9659352926198512, "learning_rate": 7.247769813048644e-08, "loss": 0.22568273544311523, "step": 387 }, { "epoch": 2.8351648351648353, "grad_norm": 1.048929026784002, "learning_rate": 6.575421876028721e-08, "loss": 0.22502297163009644, "step": 388 }, { "epoch": 2.8424908424908426, "grad_norm": 1.0019261712810552, "learning_rate": 5.935589871174208e-08, "loss": 0.21102303266525269, "step": 389 }, { "epoch": 2.84981684981685, "grad_norm": 1.1048123277921031, "learning_rate": 5.3283159624448745e-08, "loss": 0.2348283976316452, "step": 390 }, { "epoch": 2.857142857142857, "grad_norm": 1.0414285166534145, "learning_rate": 4.753640168271456e-08, "loss": 0.22936861217021942, "step": 391 }, { "epoch": 2.8644688644688645, "grad_norm": 1.2429631645211932, "learning_rate": 4.211600358917989e-08, "loss": 0.24630418419837952, "step": 392 }, { "epoch": 2.871794871794872, "grad_norm": 0.9924039773066393, "learning_rate": 3.702232253986804e-08, "loss": 0.22691383957862854, "step": 393 }, { "epoch": 2.879120879120879, "grad_norm": 1.0577599551717047, "learning_rate": 3.2255694200643003e-08, "loss": 0.22667211294174194, "step": 394 }, { "epoch": 2.8864468864468864, "grad_norm": 1.1382843631032735, "learning_rate": 2.7816432685091598e-08, "loss": 0.20482704043388367, "step": 395 }, { "epoch": 2.8937728937728937, "grad_norm": 1.0580508429061475, "learning_rate": 2.370483053382111e-08, "loss": 0.23122745752334595, "step": 396 }, { "epoch": 2.901098901098901, "grad_norm": 1.0549784779989113, "learning_rate": 1.992115869518474e-08, "loss": 0.23232720792293549, "step": 397 }, { "epoch": 2.9084249084249083, "grad_norm": 1.0136739757000526, "learning_rate": 1.6465666507425314e-08, "loss": 0.24128326773643494, "step": 398 }, { "epoch": 2.9157509157509156, "grad_norm": 1.1068489652877105, "learning_rate": 1.333858168224178e-08, "loss": 0.24579691886901855, "step": 399 }, { "epoch": 2.9230769230769234, "grad_norm": 1.0482106591579268, "learning_rate": 1.0540110289786742e-08, "loss": 0.19634608924388885, "step": 400 }, { "epoch": 2.9304029304029307, "grad_norm": 0.9986056076887907, "learning_rate": 8.07043674508623e-09, "loss": 0.2212928682565689, "step": 401 }, { "epoch": 2.937728937728938, "grad_norm": 1.0118052358258176, "learning_rate": 5.929723795884967e-09, "loss": 0.21850597858428955, "step": 402 }, { "epoch": 2.9450549450549453, "grad_norm": 0.9704495600656273, "learning_rate": 4.1181125119221785e-09, "loss": 0.21226581931114197, "step": 403 }, { "epoch": 2.9523809523809526, "grad_norm": 0.966690163811677, "learning_rate": 2.635722275638464e-09, "loss": 0.21004317700862885, "step": 404 }, { "epoch": 2.95970695970696, "grad_norm": 1.6033220723893016, "learning_rate": 1.4826507743032071e-09, "loss": 0.20844367146492004, "step": 405 }, { "epoch": 2.967032967032967, "grad_norm": 1.076763214226366, "learning_rate": 6.589739935819461e-10, "loss": 0.21316523849964142, "step": 406 }, { "epoch": 2.9743589743589745, "grad_norm": 1.1351171984487796, "learning_rate": 1.6474621252704494e-10, "loss": 0.21295702457427979, "step": 407 }, { "epoch": 2.9816849816849818, "grad_norm": 0.9892972289109477, "learning_rate": 0.0, "loss": 0.19279634952545166, "step": 408 } ], "logging_steps": 1, "max_steps": 408, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 179589737414656.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }