{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1638, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003663003663003663, "grad_norm": 2.7161898612976074, "learning_rate": 1.0000000000000002e-06, "loss": 2.6832668781280518, "step": 2 }, { "epoch": 0.007326007326007326, "grad_norm": 1.0459221601486206, "learning_rate": 3e-06, "loss": 1.6646876335144043, "step": 4 }, { "epoch": 0.01098901098901099, "grad_norm": 0.2796992063522339, "learning_rate": 5e-06, "loss": 1.8732850551605225, "step": 6 }, { "epoch": 0.014652014652014652, "grad_norm": 0.3104994297027588, "learning_rate": 7.000000000000001e-06, "loss": 1.9880081415176392, "step": 8 }, { "epoch": 0.018315018315018316, "grad_norm": 0.17194640636444092, "learning_rate": 9e-06, "loss": 2.0404136180877686, "step": 10 }, { "epoch": 0.02197802197802198, "grad_norm": 0.6366355419158936, "learning_rate": 1.1000000000000001e-05, "loss": 1.7833327054977417, "step": 12 }, { "epoch": 0.02564102564102564, "grad_norm": 0.19716234505176544, "learning_rate": 1.3000000000000001e-05, "loss": 1.6680744886398315, "step": 14 }, { "epoch": 0.029304029304029304, "grad_norm": 0.3166082799434662, "learning_rate": 1.5e-05, "loss": 1.5191091299057007, "step": 16 }, { "epoch": 0.03296703296703297, "grad_norm": 0.6405833959579468, "learning_rate": 1.7000000000000003e-05, "loss": 1.437489628791809, "step": 18 }, { "epoch": 0.03663003663003663, "grad_norm": 0.10038357973098755, "learning_rate": 1.9e-05, "loss": 1.6290624141693115, "step": 20 }, { "epoch": 0.040293040293040296, "grad_norm": 0.311852365732193, "learning_rate": 2.1e-05, "loss": 0.8650764226913452, "step": 22 }, { "epoch": 0.04395604395604396, "grad_norm": 0.24845249950885773, "learning_rate": 2.3000000000000003e-05, "loss": 0.9759135842323303, "step": 24 }, { "epoch": 0.047619047619047616, "grad_norm": 0.32957103848457336, "learning_rate": 2.5e-05, "loss": 1.30423903465271, "step": 26 }, { "epoch": 0.05128205128205128, "grad_norm": 0.2035657912492752, "learning_rate": 2.7000000000000002e-05, "loss": 1.0214941501617432, "step": 28 }, { "epoch": 0.054945054945054944, "grad_norm": 0.4259459972381592, "learning_rate": 2.9e-05, "loss": 1.1116687059402466, "step": 30 }, { "epoch": 0.05860805860805861, "grad_norm": 0.295806884765625, "learning_rate": 3.1e-05, "loss": 1.0386732816696167, "step": 32 }, { "epoch": 0.06227106227106227, "grad_norm": 0.05838385224342346, "learning_rate": 3.3e-05, "loss": 1.1950305700302124, "step": 34 }, { "epoch": 0.06593406593406594, "grad_norm": 0.0978633388876915, "learning_rate": 3.5e-05, "loss": 1.5224722623825073, "step": 36 }, { "epoch": 0.0695970695970696, "grad_norm": 0.3066957890987396, "learning_rate": 3.7e-05, "loss": 1.309075951576233, "step": 38 }, { "epoch": 0.07326007326007326, "grad_norm": 0.160082146525383, "learning_rate": 3.9000000000000006e-05, "loss": 1.3190542459487915, "step": 40 }, { "epoch": 0.07692307692307693, "grad_norm": 0.34093159437179565, "learning_rate": 4.1e-05, "loss": 0.985637903213501, "step": 42 }, { "epoch": 0.08058608058608059, "grad_norm": 0.3740093410015106, "learning_rate": 4.3e-05, "loss": 1.4261013269424438, "step": 44 }, { "epoch": 0.08424908424908426, "grad_norm": 0.2005496323108673, "learning_rate": 4.5e-05, "loss": 1.5266393423080444, "step": 46 }, { "epoch": 0.08791208791208792, "grad_norm": 0.13309991359710693, "learning_rate": 4.7e-05, "loss": 0.9458646774291992, "step": 48 }, { "epoch": 0.09157509157509157, "grad_norm": 0.15211951732635498, "learning_rate": 4.9e-05, "loss": 1.4107502698898315, "step": 50 }, { "epoch": 0.09523809523809523, "grad_norm": 0.22298896312713623, "learning_rate": 4.9999955969752164e-05, "loss": 0.7173015475273132, "step": 52 }, { "epoch": 0.0989010989010989, "grad_norm": 0.2911456227302551, "learning_rate": 4.999960372880343e-05, "loss": 0.8890910744667053, "step": 54 }, { "epoch": 0.10256410256410256, "grad_norm": 0.9632299542427063, "learning_rate": 4.9998899252420356e-05, "loss": 1.2817891836166382, "step": 56 }, { "epoch": 0.10622710622710622, "grad_norm": 0.5119697451591492, "learning_rate": 4.9997842551631656e-05, "loss": 1.1215670108795166, "step": 58 }, { "epoch": 0.10989010989010989, "grad_norm": 0.1322525441646576, "learning_rate": 4.999643364298017e-05, "loss": 1.3118717670440674, "step": 60 }, { "epoch": 0.11355311355311355, "grad_norm": 0.1301015466451645, "learning_rate": 4.9994672548522613e-05, "loss": 1.3311526775360107, "step": 62 }, { "epoch": 0.11721611721611722, "grad_norm": 0.11371763795614243, "learning_rate": 4.999255929582926e-05, "loss": 1.3023815155029297, "step": 64 }, { "epoch": 0.12087912087912088, "grad_norm": 0.14048679172992706, "learning_rate": 4.9990093917983465e-05, "loss": 1.2759833335876465, "step": 66 }, { "epoch": 0.12454212454212454, "grad_norm": 0.23039552569389343, "learning_rate": 4.9987276453581165e-05, "loss": 1.2587000131607056, "step": 68 }, { "epoch": 0.1282051282051282, "grad_norm": 0.4443025290966034, "learning_rate": 4.998410694673029e-05, "loss": 1.1879582405090332, "step": 70 }, { "epoch": 0.13186813186813187, "grad_norm": 0.22435733675956726, "learning_rate": 4.998058544705005e-05, "loss": 1.3695639371871948, "step": 72 }, { "epoch": 0.13553113553113552, "grad_norm": 0.22466352581977844, "learning_rate": 4.997671200967017e-05, "loss": 1.5553536415100098, "step": 74 }, { "epoch": 0.1391941391941392, "grad_norm": 0.05474912002682686, "learning_rate": 4.997248669523002e-05, "loss": 1.0360654592514038, "step": 76 }, { "epoch": 0.14285714285714285, "grad_norm": 0.17411059141159058, "learning_rate": 4.9967909569877686e-05, "loss": 1.3734983205795288, "step": 78 }, { "epoch": 0.14652014652014653, "grad_norm": 0.22640125453472137, "learning_rate": 4.99629807052689e-05, "loss": 1.5512079000473022, "step": 80 }, { "epoch": 0.15018315018315018, "grad_norm": 0.16909095644950867, "learning_rate": 4.995770017856595e-05, "loss": 1.1615397930145264, "step": 82 }, { "epoch": 0.15384615384615385, "grad_norm": 0.14065229892730713, "learning_rate": 4.995206807243644e-05, "loss": 1.3165048360824585, "step": 84 }, { "epoch": 0.1575091575091575, "grad_norm": 0.2557665705680847, "learning_rate": 4.994608447505203e-05, "loss": 1.456904411315918, "step": 86 }, { "epoch": 0.16117216117216118, "grad_norm": 0.18825113773345947, "learning_rate": 4.993974948008705e-05, "loss": 0.7387548685073853, "step": 88 }, { "epoch": 0.16483516483516483, "grad_norm": 0.09088045358657837, "learning_rate": 4.9933063186717006e-05, "loss": 0.8501173257827759, "step": 90 }, { "epoch": 0.1684981684981685, "grad_norm": 0.31493327021598816, "learning_rate": 4.992602569961704e-05, "loss": 1.2714766263961792, "step": 92 }, { "epoch": 0.17216117216117216, "grad_norm": 0.3955460786819458, "learning_rate": 4.991863712896033e-05, "loss": 1.29978609085083, "step": 94 }, { "epoch": 0.17582417582417584, "grad_norm": 0.22290943562984467, "learning_rate": 4.991089759041628e-05, "loss": 1.1851716041564941, "step": 96 }, { "epoch": 0.1794871794871795, "grad_norm": 0.1698511391878128, "learning_rate": 4.99028072051488e-05, "loss": 1.2791502475738525, "step": 98 }, { "epoch": 0.18315018315018314, "grad_norm": 0.2091524451971054, "learning_rate": 4.989436609981437e-05, "loss": 1.143870234489441, "step": 100 }, { "epoch": 0.18681318681318682, "grad_norm": 0.39920395612716675, "learning_rate": 4.988557440656004e-05, "loss": 1.2132782936096191, "step": 102 }, { "epoch": 0.19047619047619047, "grad_norm": 0.23149904608726501, "learning_rate": 4.987643226302138e-05, "loss": 0.8638072609901428, "step": 104 }, { "epoch": 0.19413919413919414, "grad_norm": 0.8971022367477417, "learning_rate": 4.9866939812320326e-05, "loss": 1.0543807744979858, "step": 106 }, { "epoch": 0.1978021978021978, "grad_norm": 0.14404769241809845, "learning_rate": 4.9857097203062955e-05, "loss": 1.27614426612854, "step": 108 }, { "epoch": 0.20146520146520147, "grad_norm": 0.20676514506340027, "learning_rate": 4.984690458933711e-05, "loss": 1.1264268159866333, "step": 110 }, { "epoch": 0.20512820512820512, "grad_norm": 0.2630727291107178, "learning_rate": 4.983636213071004e-05, "loss": 1.1916111707687378, "step": 112 }, { "epoch": 0.2087912087912088, "grad_norm": 0.3776465356349945, "learning_rate": 4.982546999222587e-05, "loss": 0.6021360158920288, "step": 114 }, { "epoch": 0.21245421245421245, "grad_norm": 0.3124425709247589, "learning_rate": 4.981422834440303e-05, "loss": 1.2633193731307983, "step": 116 }, { "epoch": 0.21611721611721613, "grad_norm": 0.21436505019664764, "learning_rate": 4.98026373632316e-05, "loss": 1.2517153024673462, "step": 118 }, { "epoch": 0.21978021978021978, "grad_norm": 0.165570467710495, "learning_rate": 4.97906972301705e-05, "loss": 1.329060673713684, "step": 120 }, { "epoch": 0.22344322344322345, "grad_norm": 0.1781710535287857, "learning_rate": 4.9778408132144715e-05, "loss": 1.2554113864898682, "step": 122 }, { "epoch": 0.2271062271062271, "grad_norm": 0.4390529692173004, "learning_rate": 4.976577026154235e-05, "loss": 0.9952642321586609, "step": 124 }, { "epoch": 0.23076923076923078, "grad_norm": 1.2797257900238037, "learning_rate": 4.9752783816211576e-05, "loss": 0.872045636177063, "step": 126 }, { "epoch": 0.23443223443223443, "grad_norm": 0.3392629325389862, "learning_rate": 4.973944899945758e-05, "loss": 1.1586322784423828, "step": 128 }, { "epoch": 0.23809523809523808, "grad_norm": 0.216371089220047, "learning_rate": 4.9725766020039395e-05, "loss": 1.2613385915756226, "step": 130 }, { "epoch": 0.24175824175824176, "grad_norm": 0.44755294919013977, "learning_rate": 4.971173509216656e-05, "loss": 1.3555878400802612, "step": 132 }, { "epoch": 0.2454212454212454, "grad_norm": 0.20577383041381836, "learning_rate": 4.969735643549583e-05, "loss": 1.2522915601730347, "step": 134 }, { "epoch": 0.2490842490842491, "grad_norm": 0.33351951837539673, "learning_rate": 4.968263027512773e-05, "loss": 1.2353262901306152, "step": 136 }, { "epoch": 0.25274725274725274, "grad_norm": 0.12553884088993073, "learning_rate": 4.966755684160301e-05, "loss": 1.057889699935913, "step": 138 }, { "epoch": 0.2564102564102564, "grad_norm": 0.17903219163417816, "learning_rate": 4.9652136370899035e-05, "loss": 1.23538076877594, "step": 140 }, { "epoch": 0.2600732600732601, "grad_norm": 0.3907378315925598, "learning_rate": 4.963636910442611e-05, "loss": 1.2505638599395752, "step": 142 }, { "epoch": 0.26373626373626374, "grad_norm": 0.1998424232006073, "learning_rate": 4.96202552890237e-05, "loss": 1.2176811695098877, "step": 144 }, { "epoch": 0.2673992673992674, "grad_norm": 0.4392535388469696, "learning_rate": 4.960379517695654e-05, "loss": 1.3697282075881958, "step": 146 }, { "epoch": 0.27106227106227104, "grad_norm": 0.3896879255771637, "learning_rate": 4.958698902591072e-05, "loss": 1.2809630632400513, "step": 148 }, { "epoch": 0.27472527472527475, "grad_norm": 0.3242920935153961, "learning_rate": 4.9569837098989626e-05, "loss": 0.9012686014175415, "step": 150 }, { "epoch": 0.2783882783882784, "grad_norm": 0.13880665600299835, "learning_rate": 4.9552339664709807e-05, "loss": 0.6081559658050537, "step": 152 }, { "epoch": 0.28205128205128205, "grad_norm": 0.27750471234321594, "learning_rate": 4.9534496996996845e-05, "loss": 1.0718085765838623, "step": 154 }, { "epoch": 0.2857142857142857, "grad_norm": 0.4269710183143616, "learning_rate": 4.951630937518096e-05, "loss": 1.3045586347579956, "step": 156 }, { "epoch": 0.2893772893772894, "grad_norm": 0.36114686727523804, "learning_rate": 4.949777708399273e-05, "loss": 1.24015212059021, "step": 158 }, { "epoch": 0.29304029304029305, "grad_norm": 0.18885226547718048, "learning_rate": 4.947890041355858e-05, "loss": 0.9190669655799866, "step": 160 }, { "epoch": 0.2967032967032967, "grad_norm": 0.42237764596939087, "learning_rate": 4.9459679659396257e-05, "loss": 1.4927023649215698, "step": 162 }, { "epoch": 0.30036630036630035, "grad_norm": 0.1873409003019333, "learning_rate": 4.944011512241021e-05, "loss": 1.0130228996276855, "step": 164 }, { "epoch": 0.304029304029304, "grad_norm": 0.290294885635376, "learning_rate": 4.942020710888684e-05, "loss": 1.3621708154678345, "step": 166 }, { "epoch": 0.3076923076923077, "grad_norm": 0.7741953134536743, "learning_rate": 4.939995593048979e-05, "loss": 1.1007283926010132, "step": 168 }, { "epoch": 0.31135531135531136, "grad_norm": 0.4494468867778778, "learning_rate": 4.937936190425495e-05, "loss": 1.2320328950881958, "step": 170 }, { "epoch": 0.315018315018315, "grad_norm": 1.186848759651184, "learning_rate": 4.9358425352585616e-05, "loss": 1.0239619016647339, "step": 172 }, { "epoch": 0.31868131868131866, "grad_norm": 0.1502193659543991, "learning_rate": 4.933714660324735e-05, "loss": 0.816228449344635, "step": 174 }, { "epoch": 0.32234432234432236, "grad_norm": 0.24374300241470337, "learning_rate": 4.931552598936287e-05, "loss": 1.370795726776123, "step": 176 }, { "epoch": 0.326007326007326, "grad_norm": 0.16081377863883972, "learning_rate": 4.929356384940688e-05, "loss": 0.8959931135177612, "step": 178 }, { "epoch": 0.32967032967032966, "grad_norm": 0.21084611117839813, "learning_rate": 4.927126052720071e-05, "loss": 1.447354793548584, "step": 180 }, { "epoch": 0.3333333333333333, "grad_norm": 0.13184507191181183, "learning_rate": 4.924861637190698e-05, "loss": 0.954731285572052, "step": 182 }, { "epoch": 0.336996336996337, "grad_norm": 0.2652509808540344, "learning_rate": 4.922563173802409e-05, "loss": 1.2110737562179565, "step": 184 }, { "epoch": 0.34065934065934067, "grad_norm": 0.34187304973602295, "learning_rate": 4.9202306985380734e-05, "loss": 1.2186378240585327, "step": 186 }, { "epoch": 0.3443223443223443, "grad_norm": 0.08439934998750687, "learning_rate": 4.917864247913018e-05, "loss": 1.155535101890564, "step": 188 }, { "epoch": 0.34798534798534797, "grad_norm": 0.22483469545841217, "learning_rate": 4.9154638589744646e-05, "loss": 1.2381874322891235, "step": 190 }, { "epoch": 0.3516483516483517, "grad_norm": 10.98539924621582, "learning_rate": 4.913029569300942e-05, "loss": 1.0877535343170166, "step": 192 }, { "epoch": 0.3553113553113553, "grad_norm": 0.09030856937170029, "learning_rate": 4.9105614170017034e-05, "loss": 1.2255364656448364, "step": 194 }, { "epoch": 0.358974358974359, "grad_norm": 0.22610144317150116, "learning_rate": 4.908059440716127e-05, "loss": 1.2344918251037598, "step": 196 }, { "epoch": 0.3626373626373626, "grad_norm": 0.6382534503936768, "learning_rate": 4.9055236796131115e-05, "loss": 0.8511998653411865, "step": 198 }, { "epoch": 0.3663003663003663, "grad_norm": 0.22709640860557556, "learning_rate": 4.902954173390464e-05, "loss": 0.9911755323410034, "step": 200 }, { "epoch": 0.36996336996337, "grad_norm": 0.2304632067680359, "learning_rate": 4.900350962274275e-05, "loss": 1.4108072519302368, "step": 202 }, { "epoch": 0.37362637362637363, "grad_norm": 0.2119007110595703, "learning_rate": 4.897714087018296e-05, "loss": 1.1905288696289062, "step": 204 }, { "epoch": 0.3772893772893773, "grad_norm": 0.2922574579715729, "learning_rate": 4.895043588903292e-05, "loss": 0.7706769704818726, "step": 206 }, { "epoch": 0.38095238095238093, "grad_norm": 0.6245097517967224, "learning_rate": 4.892339509736404e-05, "loss": 1.1153967380523682, "step": 208 }, { "epoch": 0.38461538461538464, "grad_norm": 0.36032259464263916, "learning_rate": 4.889601891850486e-05, "loss": 1.3866379261016846, "step": 210 }, { "epoch": 0.3882783882783883, "grad_norm": 0.14774373173713684, "learning_rate": 4.886830778103452e-05, "loss": 1.0544565916061401, "step": 212 }, { "epoch": 0.39194139194139194, "grad_norm": 0.2895050346851349, "learning_rate": 4.884026211877596e-05, "loss": 1.1082898378372192, "step": 214 }, { "epoch": 0.3956043956043956, "grad_norm": 0.2137245386838913, "learning_rate": 4.881188237078919e-05, "loss": 1.2029824256896973, "step": 216 }, { "epoch": 0.3992673992673993, "grad_norm": 0.3577767014503479, "learning_rate": 4.878316898136437e-05, "loss": 1.2338331937789917, "step": 218 }, { "epoch": 0.40293040293040294, "grad_norm": 0.09472601860761642, "learning_rate": 4.875412240001491e-05, "loss": 0.6112377047538757, "step": 220 }, { "epoch": 0.4065934065934066, "grad_norm": 0.1504858434200287, "learning_rate": 4.872474308147037e-05, "loss": 1.3192267417907715, "step": 222 }, { "epoch": 0.41025641025641024, "grad_norm": 0.3618045449256897, "learning_rate": 4.869503148566939e-05, "loss": 1.0624542236328125, "step": 224 }, { "epoch": 0.4139194139194139, "grad_norm": 0.13315747678279877, "learning_rate": 4.866498807775247e-05, "loss": 1.2139613628387451, "step": 226 }, { "epoch": 0.4175824175824176, "grad_norm": 0.2177940309047699, "learning_rate": 4.8634613328054674e-05, "loss": 1.2820316553115845, "step": 228 }, { "epoch": 0.42124542124542125, "grad_norm": 0.3730919063091278, "learning_rate": 4.8603907712098305e-05, "loss": 1.2036633491516113, "step": 230 }, { "epoch": 0.4249084249084249, "grad_norm": 0.5930754542350769, "learning_rate": 4.8572871710585424e-05, "loss": 0.9775714874267578, "step": 232 }, { "epoch": 0.42857142857142855, "grad_norm": 5.749536991119385, "learning_rate": 4.854150580939035e-05, "loss": 1.4643810987472534, "step": 234 }, { "epoch": 0.43223443223443225, "grad_norm": 0.18177424371242523, "learning_rate": 4.850981049955203e-05, "loss": 0.99868243932724, "step": 236 }, { "epoch": 0.4358974358974359, "grad_norm": 0.5519245266914368, "learning_rate": 4.847778627726636e-05, "loss": 1.051274299621582, "step": 238 }, { "epoch": 0.43956043956043955, "grad_norm": 0.24456854164600372, "learning_rate": 4.844543364387844e-05, "loss": 0.8957317471504211, "step": 240 }, { "epoch": 0.4432234432234432, "grad_norm": 0.32938405871391296, "learning_rate": 4.8412753105874703e-05, "loss": 0.8530710339546204, "step": 242 }, { "epoch": 0.4468864468864469, "grad_norm": 0.08594862371683121, "learning_rate": 4.837974517487496e-05, "loss": 0.560033917427063, "step": 244 }, { "epoch": 0.45054945054945056, "grad_norm": 0.12811991572380066, "learning_rate": 4.8346410367624465e-05, "loss": 1.2348781824111938, "step": 246 }, { "epoch": 0.4542124542124542, "grad_norm": 0.21825870871543884, "learning_rate": 4.831274920598574e-05, "loss": 0.8636214733123779, "step": 248 }, { "epoch": 0.45787545787545786, "grad_norm": 0.6492200493812561, "learning_rate": 4.8278762216930456e-05, "loss": 1.246092677116394, "step": 250 }, { "epoch": 0.46153846153846156, "grad_norm": 0.16339761018753052, "learning_rate": 4.8244449932531195e-05, "loss": 1.1555366516113281, "step": 252 }, { "epoch": 0.4652014652014652, "grad_norm": 0.12176702171564102, "learning_rate": 4.820981288995307e-05, "loss": 0.9462042450904846, "step": 254 }, { "epoch": 0.46886446886446886, "grad_norm": 0.10259034484624863, "learning_rate": 4.8174851631445354e-05, "loss": 1.2327078580856323, "step": 256 }, { "epoch": 0.4725274725274725, "grad_norm": 0.2983081638813019, "learning_rate": 4.8139566704332984e-05, "loss": 1.2545617818832397, "step": 258 }, { "epoch": 0.47619047619047616, "grad_norm": 0.18848823010921478, "learning_rate": 4.810395866100797e-05, "loss": 0.7314871549606323, "step": 260 }, { "epoch": 0.47985347985347987, "grad_norm": 1.1380740404129028, "learning_rate": 4.8068028058920795e-05, "loss": 1.1386513710021973, "step": 262 }, { "epoch": 0.4835164835164835, "grad_norm": 0.19013704359531403, "learning_rate": 4.803177546057163e-05, "loss": 1.207440972328186, "step": 264 }, { "epoch": 0.48717948717948717, "grad_norm": 0.49117130041122437, "learning_rate": 4.799520143350158e-05, "loss": 1.478100299835205, "step": 266 }, { "epoch": 0.4908424908424908, "grad_norm": 0.32450738549232483, "learning_rate": 4.795830655028376e-05, "loss": 0.7695617079734802, "step": 268 }, { "epoch": 0.4945054945054945, "grad_norm": 0.14151829481124878, "learning_rate": 4.792109138851435e-05, "loss": 1.180545449256897, "step": 270 }, { "epoch": 0.4981684981684982, "grad_norm": 0.21694627404212952, "learning_rate": 4.7883556530803554e-05, "loss": 0.8736183643341064, "step": 272 }, { "epoch": 0.5018315018315018, "grad_norm": 0.9315363764762878, "learning_rate": 4.7845702564766475e-05, "loss": 1.2287445068359375, "step": 274 }, { "epoch": 0.5054945054945055, "grad_norm": 0.12385514378547668, "learning_rate": 4.7807530083013906e-05, "loss": 0.814042329788208, "step": 276 }, { "epoch": 0.5091575091575091, "grad_norm": 0.10513313859701157, "learning_rate": 4.776903968314308e-05, "loss": 0.8786470890045166, "step": 278 }, { "epoch": 0.5128205128205128, "grad_norm": 0.213555246591568, "learning_rate": 4.7730231967728275e-05, "loss": 1.2300586700439453, "step": 280 }, { "epoch": 0.5164835164835165, "grad_norm": 0.20062805712223053, "learning_rate": 4.769110754431142e-05, "loss": 1.2230390310287476, "step": 282 }, { "epoch": 0.5201465201465202, "grad_norm": 0.21544235944747925, "learning_rate": 4.765166702539256e-05, "loss": 1.2219314575195312, "step": 284 }, { "epoch": 0.5238095238095238, "grad_norm": 0.22437822818756104, "learning_rate": 4.761191102842027e-05, "loss": 0.9741434454917908, "step": 286 }, { "epoch": 0.5274725274725275, "grad_norm": 0.09989945590496063, "learning_rate": 4.757184017578198e-05, "loss": 1.2340394258499146, "step": 288 }, { "epoch": 0.5311355311355311, "grad_norm": 0.14188872277736664, "learning_rate": 4.7531455094794284e-05, "loss": 1.197536587715149, "step": 290 }, { "epoch": 0.5347985347985348, "grad_norm": 0.1335064321756363, "learning_rate": 4.7490756417693036e-05, "loss": 0.7367426753044128, "step": 292 }, { "epoch": 0.5384615384615384, "grad_norm": 0.02857016585767269, "learning_rate": 4.7449744781623526e-05, "loss": 0.9376294016838074, "step": 294 }, { "epoch": 0.5421245421245421, "grad_norm": 0.11979032307863235, "learning_rate": 4.740842082863043e-05, "loss": 1.0236124992370605, "step": 296 }, { "epoch": 0.5457875457875457, "grad_norm": 0.19949960708618164, "learning_rate": 4.736678520564786e-05, "loss": 1.290779709815979, "step": 298 }, { "epoch": 0.5494505494505495, "grad_norm": 0.12269338220357895, "learning_rate": 4.732483856448913e-05, "loss": 1.1912894248962402, "step": 300 }, { "epoch": 0.5531135531135531, "grad_norm": 0.15550191700458527, "learning_rate": 4.7282581561836644e-05, "loss": 1.1734073162078857, "step": 302 }, { "epoch": 0.5567765567765568, "grad_norm": 0.19052956998348236, "learning_rate": 4.724001485923153e-05, "loss": 0.9569897055625916, "step": 304 }, { "epoch": 0.5604395604395604, "grad_norm": 0.3564753830432892, "learning_rate": 4.7197139123063366e-05, "loss": 0.9688905477523804, "step": 306 }, { "epoch": 0.5641025641025641, "grad_norm": 0.25113749504089355, "learning_rate": 4.715395502455967e-05, "loss": 1.3545844554901123, "step": 308 }, { "epoch": 0.5677655677655677, "grad_norm": 0.19413875043392181, "learning_rate": 4.711046323977545e-05, "loss": 0.9748039245605469, "step": 310 }, { "epoch": 0.5714285714285714, "grad_norm": 0.20445436239242554, "learning_rate": 4.70666644495826e-05, "loss": 1.2018651962280273, "step": 312 }, { "epoch": 0.575091575091575, "grad_norm": 0.1748535931110382, "learning_rate": 4.702255933965924e-05, "loss": 1.1204524040222168, "step": 314 }, { "epoch": 0.5787545787545788, "grad_norm": 0.13978832960128784, "learning_rate": 4.697814860047895e-05, "loss": 1.273799180984497, "step": 316 }, { "epoch": 0.5824175824175825, "grad_norm": 0.1604635864496231, "learning_rate": 4.6933432927300054e-05, "loss": 1.1062840223312378, "step": 318 }, { "epoch": 0.5860805860805861, "grad_norm": 0.1707131415605545, "learning_rate": 4.6888413020154626e-05, "loss": 1.3164299726486206, "step": 320 }, { "epoch": 0.5897435897435898, "grad_norm": 0.13679248094558716, "learning_rate": 4.6843089583837586e-05, "loss": 1.5054590702056885, "step": 322 }, { "epoch": 0.5934065934065934, "grad_norm": 0.19371454417705536, "learning_rate": 4.6797463327895676e-05, "loss": 1.2403850555419922, "step": 324 }, { "epoch": 0.5970695970695971, "grad_norm": 0.09500681608915329, "learning_rate": 4.6751534966616314e-05, "loss": 1.3421348333358765, "step": 326 }, { "epoch": 0.6007326007326007, "grad_norm": 0.1618986278772354, "learning_rate": 4.670530521901645e-05, "loss": 1.2023552656173706, "step": 328 }, { "epoch": 0.6043956043956044, "grad_norm": 0.13862641155719757, "learning_rate": 4.6658774808831284e-05, "loss": 1.1014868021011353, "step": 330 }, { "epoch": 0.608058608058608, "grad_norm": 0.2911272644996643, "learning_rate": 4.6611944464502935e-05, "loss": 1.1684032678604126, "step": 332 }, { "epoch": 0.6117216117216118, "grad_norm": 0.24178026616573334, "learning_rate": 4.6564814919169075e-05, "loss": 1.2577779293060303, "step": 334 }, { "epoch": 0.6153846153846154, "grad_norm": 0.5293629169464111, "learning_rate": 4.651738691065139e-05, "loss": 0.8592604994773865, "step": 336 }, { "epoch": 0.6190476190476191, "grad_norm": 0.09567166119813919, "learning_rate": 4.646966118144407e-05, "loss": 1.2142037153244019, "step": 338 }, { "epoch": 0.6227106227106227, "grad_norm": 0.13777339458465576, "learning_rate": 4.642163847870221e-05, "loss": 1.207306981086731, "step": 340 }, { "epoch": 0.6263736263736264, "grad_norm": 0.22208669781684875, "learning_rate": 4.637331955423002e-05, "loss": 0.5593523979187012, "step": 342 }, { "epoch": 0.63003663003663, "grad_norm": 0.15060071647167206, "learning_rate": 4.6324705164469174e-05, "loss": 1.4146814346313477, "step": 344 }, { "epoch": 0.6336996336996337, "grad_norm": 0.2521788775920868, "learning_rate": 4.6275796070486874e-05, "loss": 0.6819853782653809, "step": 346 }, { "epoch": 0.6373626373626373, "grad_norm": 0.1835089921951294, "learning_rate": 4.622659303796397e-05, "loss": 1.135895013809204, "step": 348 }, { "epoch": 0.6410256410256411, "grad_norm": 0.30718883872032166, "learning_rate": 4.6177096837183016e-05, "loss": 0.8732522130012512, "step": 350 }, { "epoch": 0.6446886446886447, "grad_norm": 0.19664013385772705, "learning_rate": 4.612730824301611e-05, "loss": 0.9108962416648865, "step": 352 }, { "epoch": 0.6483516483516484, "grad_norm": 0.24300748109817505, "learning_rate": 4.6077228034912865e-05, "loss": 0.944155216217041, "step": 354 }, { "epoch": 0.652014652014652, "grad_norm": 0.25140050053596497, "learning_rate": 4.602685699688814e-05, "loss": 1.1503783464431763, "step": 356 }, { "epoch": 0.6556776556776557, "grad_norm": 0.18550491333007812, "learning_rate": 4.5976195917509804e-05, "loss": 1.1416871547698975, "step": 358 }, { "epoch": 0.6593406593406593, "grad_norm": 0.19337521493434906, "learning_rate": 4.592524558988638e-05, "loss": 0.6880902647972107, "step": 360 }, { "epoch": 0.663003663003663, "grad_norm": 0.27510422468185425, "learning_rate": 4.58740068116546e-05, "loss": 0.9372468590736389, "step": 362 }, { "epoch": 0.6666666666666666, "grad_norm": 0.14954572916030884, "learning_rate": 4.582248038496698e-05, "loss": 0.9180594682693481, "step": 364 }, { "epoch": 0.6703296703296703, "grad_norm": 0.23267677426338196, "learning_rate": 4.577066711647918e-05, "loss": 1.1724467277526855, "step": 366 }, { "epoch": 0.673992673992674, "grad_norm": 0.1276102066040039, "learning_rate": 4.571856781733748e-05, "loss": 1.0390164852142334, "step": 368 }, { "epoch": 0.6776556776556777, "grad_norm": 0.1551157683134079, "learning_rate": 4.566618330316596e-05, "loss": 1.081437587738037, "step": 370 }, { "epoch": 0.6813186813186813, "grad_norm": 0.3087083399295807, "learning_rate": 4.561351439405384e-05, "loss": 1.1742217540740967, "step": 372 }, { "epoch": 0.684981684981685, "grad_norm": 0.2865961790084839, "learning_rate": 4.5560561914542576e-05, "loss": 1.1755157709121704, "step": 374 }, { "epoch": 0.6886446886446886, "grad_norm": 0.20178958773612976, "learning_rate": 4.550732669361298e-05, "loss": 0.8584067225456238, "step": 376 }, { "epoch": 0.6923076923076923, "grad_norm": 0.14505843818187714, "learning_rate": 4.54538095646722e-05, "loss": 0.8162437081336975, "step": 378 }, { "epoch": 0.6959706959706959, "grad_norm": 0.35326001048088074, "learning_rate": 4.540001136554077e-05, "loss": 1.0263890027999878, "step": 380 }, { "epoch": 0.6996336996336996, "grad_norm": 0.2113528698682785, "learning_rate": 4.534593293843936e-05, "loss": 0.9698024392127991, "step": 382 }, { "epoch": 0.7032967032967034, "grad_norm": 0.13257572054862976, "learning_rate": 4.529157512997571e-05, "loss": 1.1605135202407837, "step": 384 }, { "epoch": 0.706959706959707, "grad_norm": 0.17257475852966309, "learning_rate": 4.5236938791131305e-05, "loss": 1.0823811292648315, "step": 386 }, { "epoch": 0.7106227106227107, "grad_norm": 0.2746966779232025, "learning_rate": 4.518202477724808e-05, "loss": 0.8808259963989258, "step": 388 }, { "epoch": 0.7142857142857143, "grad_norm": 0.11813390254974365, "learning_rate": 4.5126833948015016e-05, "loss": 1.0819435119628906, "step": 390 }, { "epoch": 0.717948717948718, "grad_norm": 0.2048182636499405, "learning_rate": 4.5071367167454687e-05, "loss": 1.1645246744155884, "step": 392 }, { "epoch": 0.7216117216117216, "grad_norm": 2.1287009716033936, "learning_rate": 4.5015625303909755e-05, "loss": 1.1096913814544678, "step": 394 }, { "epoch": 0.7252747252747253, "grad_norm": 0.11994423717260361, "learning_rate": 4.495960923002935e-05, "loss": 1.223901391029358, "step": 396 }, { "epoch": 0.7289377289377289, "grad_norm": 0.15119600296020508, "learning_rate": 4.49033198227554e-05, "loss": 0.9063436388969421, "step": 398 }, { "epoch": 0.7326007326007326, "grad_norm": 0.11098281294107437, "learning_rate": 4.4846757963308936e-05, "loss": 0.8366504907608032, "step": 400 }, { "epoch": 0.7362637362637363, "grad_norm": 0.14989100396633148, "learning_rate": 4.478992453717626e-05, "loss": 1.228022813796997, "step": 402 }, { "epoch": 0.73992673992674, "grad_norm": 0.2334737479686737, "learning_rate": 4.4732820434095123e-05, "loss": 0.8357920050621033, "step": 404 }, { "epoch": 0.7435897435897436, "grad_norm": 0.3819234073162079, "learning_rate": 4.4675446548040754e-05, "loss": 1.0126510858535767, "step": 406 }, { "epoch": 0.7472527472527473, "grad_norm": 0.19562920928001404, "learning_rate": 4.46178037772119e-05, "loss": 1.012734055519104, "step": 408 }, { "epoch": 0.7509157509157509, "grad_norm": 0.304485559463501, "learning_rate": 4.4559893024016726e-05, "loss": 0.8644341230392456, "step": 410 }, { "epoch": 0.7545787545787546, "grad_norm": 0.24467211961746216, "learning_rate": 4.450171519505873e-05, "loss": 1.152502179145813, "step": 412 }, { "epoch": 0.7582417582417582, "grad_norm": 0.8111533522605896, "learning_rate": 4.4443271201122514e-05, "loss": 0.9966916441917419, "step": 414 }, { "epoch": 0.7619047619047619, "grad_norm": 0.13030032813549042, "learning_rate": 4.4384561957159565e-05, "loss": 1.2616826295852661, "step": 416 }, { "epoch": 0.7655677655677655, "grad_norm": 0.09772861003875732, "learning_rate": 4.43255883822739e-05, "loss": 0.6672307252883911, "step": 418 }, { "epoch": 0.7692307692307693, "grad_norm": 0.16410967707633972, "learning_rate": 4.4266351399707664e-05, "loss": 1.214950680732727, "step": 420 }, { "epoch": 0.7728937728937729, "grad_norm": 0.12338349223136902, "learning_rate": 4.420685193682672e-05, "loss": 0.9765850305557251, "step": 422 }, { "epoch": 0.7765567765567766, "grad_norm": 0.5074856281280518, "learning_rate": 4.4147090925106104e-05, "loss": 0.4896080195903778, "step": 424 }, { "epoch": 0.7802197802197802, "grad_norm": 0.2849983870983124, "learning_rate": 4.4087069300115444e-05, "loss": 0.7668413519859314, "step": 426 }, { "epoch": 0.7838827838827839, "grad_norm": 0.36542126536369324, "learning_rate": 4.4026788001504314e-05, "loss": 1.045650839805603, "step": 428 }, { "epoch": 0.7875457875457875, "grad_norm": 0.1004275232553482, "learning_rate": 4.396624797298754e-05, "loss": 1.1941821575164795, "step": 430 }, { "epoch": 0.7912087912087912, "grad_norm": 0.1513642817735672, "learning_rate": 4.390545016233039e-05, "loss": 1.2807530164718628, "step": 432 }, { "epoch": 0.7948717948717948, "grad_norm": 0.3031829595565796, "learning_rate": 4.3844395521333786e-05, "loss": 0.8745837807655334, "step": 434 }, { "epoch": 0.7985347985347986, "grad_norm": 0.1763853281736374, "learning_rate": 4.378308500581934e-05, "loss": 0.9577867984771729, "step": 436 }, { "epoch": 0.8021978021978022, "grad_norm": 0.21650274097919464, "learning_rate": 4.372151957561447e-05, "loss": 0.8710334300994873, "step": 438 }, { "epoch": 0.8058608058608059, "grad_norm": 0.3157196640968323, "learning_rate": 4.36597001945373e-05, "loss": 1.2961158752441406, "step": 440 }, { "epoch": 0.8095238095238095, "grad_norm": 0.11482734233140945, "learning_rate": 4.3597627830381606e-05, "loss": 1.1874325275421143, "step": 442 }, { "epoch": 0.8131868131868132, "grad_norm": 0.15268991887569427, "learning_rate": 4.353530345490167e-05, "loss": 1.1880759000778198, "step": 444 }, { "epoch": 0.8168498168498168, "grad_norm": 0.21278268098831177, "learning_rate": 4.347272804379705e-05, "loss": 1.206059455871582, "step": 446 }, { "epoch": 0.8205128205128205, "grad_norm": 0.179881751537323, "learning_rate": 4.340990257669732e-05, "loss": 1.2088541984558105, "step": 448 }, { "epoch": 0.8241758241758241, "grad_norm": 0.14933490753173828, "learning_rate": 4.334682803714672e-05, "loss": 1.2412981986999512, "step": 450 }, { "epoch": 0.8278388278388278, "grad_norm": 0.1529897153377533, "learning_rate": 4.328350541258876e-05, "loss": 0.9919160008430481, "step": 452 }, { "epoch": 0.8315018315018315, "grad_norm": 0.10920170694589615, "learning_rate": 4.321993569435078e-05, "loss": 0.49135756492614746, "step": 454 }, { "epoch": 0.8351648351648352, "grad_norm": 0.4436364471912384, "learning_rate": 4.315611987762841e-05, "loss": 0.8858435750007629, "step": 456 }, { "epoch": 0.8388278388278388, "grad_norm": 0.22913309931755066, "learning_rate": 4.309205896146999e-05, "loss": 0.8232947587966919, "step": 458 }, { "epoch": 0.8424908424908425, "grad_norm": 0.2811645567417145, "learning_rate": 4.302775394876096e-05, "loss": 1.0056540966033936, "step": 460 }, { "epoch": 0.8461538461538461, "grad_norm": 0.22459489107131958, "learning_rate": 4.29632058462081e-05, "loss": 1.2183864116668701, "step": 462 }, { "epoch": 0.8498168498168498, "grad_norm": 0.12809278070926666, "learning_rate": 4.2898415664323844e-05, "loss": 1.1671696901321411, "step": 464 }, { "epoch": 0.8534798534798534, "grad_norm": 0.03261662647128105, "learning_rate": 4.2833384417410395e-05, "loss": 1.187354564666748, "step": 466 }, { "epoch": 0.8571428571428571, "grad_norm": 0.1845274269580841, "learning_rate": 4.276811312354389e-05, "loss": 0.8790689706802368, "step": 468 }, { "epoch": 0.8608058608058609, "grad_norm": 0.6504915952682495, "learning_rate": 4.270260280455843e-05, "loss": 0.8886659145355225, "step": 470 }, { "epoch": 0.8644688644688645, "grad_norm": 0.19828136265277863, "learning_rate": 4.263685448603012e-05, "loss": 1.1550533771514893, "step": 472 }, { "epoch": 0.8681318681318682, "grad_norm": 0.4623855948448181, "learning_rate": 4.257086919726097e-05, "loss": 0.7794157862663269, "step": 474 }, { "epoch": 0.8717948717948718, "grad_norm": 0.2206961214542389, "learning_rate": 4.25046479712628e-05, "loss": 1.087639331817627, "step": 476 }, { "epoch": 0.8754578754578755, "grad_norm": 0.23512773215770721, "learning_rate": 4.2438191844741105e-05, "loss": 1.0371439456939697, "step": 478 }, { "epoch": 0.8791208791208791, "grad_norm": 0.7811533808708191, "learning_rate": 4.2371501858078753e-05, "loss": 1.055543065071106, "step": 480 }, { "epoch": 0.8827838827838828, "grad_norm": 0.22808873653411865, "learning_rate": 4.230457905531976e-05, "loss": 1.1782468557357788, "step": 482 }, { "epoch": 0.8864468864468864, "grad_norm": 0.1753520965576172, "learning_rate": 4.22374244841529e-05, "loss": 1.244563341140747, "step": 484 }, { "epoch": 0.8901098901098901, "grad_norm": 1.1950969696044922, "learning_rate": 4.217003919589535e-05, "loss": 0.8924474120140076, "step": 486 }, { "epoch": 0.8937728937728938, "grad_norm": 0.42523816227912903, "learning_rate": 4.210242424547617e-05, "loss": 1.136575698852539, "step": 488 }, { "epoch": 0.8974358974358975, "grad_norm": 0.2354390025138855, "learning_rate": 4.203458069141985e-05, "loss": 1.0925524234771729, "step": 490 }, { "epoch": 0.9010989010989011, "grad_norm": 0.3065359592437744, "learning_rate": 4.196650959582973e-05, "loss": 1.031598687171936, "step": 492 }, { "epoch": 0.9047619047619048, "grad_norm": 0.40230098366737366, "learning_rate": 4.1898212024371304e-05, "loss": 0.5824300646781921, "step": 494 }, { "epoch": 0.9084249084249084, "grad_norm": 0.1176103800535202, "learning_rate": 4.1829689046255616e-05, "loss": 1.2608321905136108, "step": 496 }, { "epoch": 0.9120879120879121, "grad_norm": 0.1900339424610138, "learning_rate": 4.1760941734222505e-05, "loss": 1.117556095123291, "step": 498 }, { "epoch": 0.9157509157509157, "grad_norm": 0.13478179275989532, "learning_rate": 4.1691971164523764e-05, "loss": 0.7983730435371399, "step": 500 }, { "epoch": 0.9194139194139194, "grad_norm": 0.11394549906253815, "learning_rate": 4.1622778416906375e-05, "loss": 0.8523120284080505, "step": 502 }, { "epoch": 0.9230769230769231, "grad_norm": 0.15726390480995178, "learning_rate": 4.15533645745955e-05, "loss": 0.9240443110466003, "step": 504 }, { "epoch": 0.9267399267399268, "grad_norm": 0.07845434546470642, "learning_rate": 4.148373072427762e-05, "loss": 0.8336247205734253, "step": 506 }, { "epoch": 0.9304029304029304, "grad_norm": 0.1776849627494812, "learning_rate": 4.1413877956083456e-05, "loss": 1.1461174488067627, "step": 508 }, { "epoch": 0.9340659340659341, "grad_norm": 0.21315138041973114, "learning_rate": 4.1343807363570964e-05, "loss": 1.2062344551086426, "step": 510 }, { "epoch": 0.9377289377289377, "grad_norm": 0.1842324286699295, "learning_rate": 4.127352004370814e-05, "loss": 1.1556131839752197, "step": 512 }, { "epoch": 0.9413919413919414, "grad_norm": 0.17686887085437775, "learning_rate": 4.12030170968559e-05, "loss": 0.9388005137443542, "step": 514 }, { "epoch": 0.945054945054945, "grad_norm": 0.20578397810459137, "learning_rate": 4.113229962675085e-05, "loss": 1.1634795665740967, "step": 516 }, { "epoch": 0.9487179487179487, "grad_norm": 0.19318969547748566, "learning_rate": 4.1061368740488e-05, "loss": 1.1986818313598633, "step": 518 }, { "epoch": 0.9523809523809523, "grad_norm": 0.25902265310287476, "learning_rate": 4.09902255485034e-05, "loss": 1.1979715824127197, "step": 520 }, { "epoch": 0.9560439560439561, "grad_norm": 0.3866836726665497, "learning_rate": 4.091887116455681e-05, "loss": 0.8659937381744385, "step": 522 }, { "epoch": 0.9597069597069597, "grad_norm": 0.4504384994506836, "learning_rate": 4.084730670571424e-05, "loss": 1.0120433568954468, "step": 524 }, { "epoch": 0.9633699633699634, "grad_norm": 0.2555679380893707, "learning_rate": 4.0775533292330464e-05, "loss": 0.9460458755493164, "step": 526 }, { "epoch": 0.967032967032967, "grad_norm": 0.3483099639415741, "learning_rate": 4.070355204803145e-05, "loss": 0.6675710082054138, "step": 528 }, { "epoch": 0.9706959706959707, "grad_norm": 0.09682053327560425, "learning_rate": 4.0631364099696815e-05, "loss": 0.90069580078125, "step": 530 }, { "epoch": 0.9743589743589743, "grad_norm": 0.7320610880851746, "learning_rate": 4.055897057744219e-05, "loss": 1.3395118713378906, "step": 532 }, { "epoch": 0.978021978021978, "grad_norm": 0.24255181849002838, "learning_rate": 4.048637261460145e-05, "loss": 0.6177163124084473, "step": 534 }, { "epoch": 0.9816849816849816, "grad_norm": 0.42302218079566956, "learning_rate": 4.0413571347709074e-05, "loss": 0.8449323177337646, "step": 536 }, { "epoch": 0.9853479853479854, "grad_norm": 0.1898687183856964, "learning_rate": 4.034056791648228e-05, "loss": 0.7976465225219727, "step": 538 }, { "epoch": 0.989010989010989, "grad_norm": 1.4811292886734009, "learning_rate": 4.0267363463803216e-05, "loss": 1.1151212453842163, "step": 540 }, { "epoch": 0.9926739926739927, "grad_norm": 0.20806598663330078, "learning_rate": 4.019395913570104e-05, "loss": 1.1612093448638916, "step": 542 }, { "epoch": 0.9963369963369964, "grad_norm": 0.4448166787624359, "learning_rate": 4.0120356081334004e-05, "loss": 1.1680574417114258, "step": 544 }, { "epoch": 1.0, "grad_norm": 0.2003720998764038, "learning_rate": 4.004655545297148e-05, "loss": 1.347452163696289, "step": 546 }, { "epoch": 1.0036630036630036, "grad_norm": 0.5149394869804382, "learning_rate": 3.997255840597587e-05, "loss": 0.8998035788536072, "step": 548 }, { "epoch": 1.0073260073260073, "grad_norm": 0.18844829499721527, "learning_rate": 3.9898366098784544e-05, "loss": 1.149839162826538, "step": 550 }, { "epoch": 1.010989010989011, "grad_norm": 0.12711450457572937, "learning_rate": 3.9823979692891734e-05, "loss": 0.6266541481018066, "step": 552 }, { "epoch": 1.0146520146520146, "grad_norm": 0.43699517846107483, "learning_rate": 3.974940035283029e-05, "loss": 1.0944384336471558, "step": 554 }, { "epoch": 1.0183150183150182, "grad_norm": 0.4765096604824066, "learning_rate": 3.967462924615351e-05, "loss": 0.9994142055511475, "step": 556 }, { "epoch": 1.021978021978022, "grad_norm": 0.1308145374059677, "learning_rate": 3.95996675434168e-05, "loss": 1.181485652923584, "step": 558 }, { "epoch": 1.0256410256410255, "grad_norm": 0.17354629933834076, "learning_rate": 3.952451641815942e-05, "loss": 0.8960216641426086, "step": 560 }, { "epoch": 1.0293040293040292, "grad_norm": 0.20918694138526917, "learning_rate": 3.944917704688605e-05, "loss": 1.130763053894043, "step": 562 }, { "epoch": 1.032967032967033, "grad_norm": 1.6546434164047241, "learning_rate": 3.9373650609048404e-05, "loss": 1.109397530555725, "step": 564 }, { "epoch": 1.0366300366300367, "grad_norm": 0.20021019876003265, "learning_rate": 3.929793828702676e-05, "loss": 0.9343792796134949, "step": 566 }, { "epoch": 1.0402930402930404, "grad_norm": 0.8677191734313965, "learning_rate": 3.9222041266111444e-05, "loss": 1.0045840740203857, "step": 568 }, { "epoch": 1.043956043956044, "grad_norm": 0.3973585367202759, "learning_rate": 3.914596073448427e-05, "loss": 1.0684887170791626, "step": 570 }, { "epoch": 1.0476190476190477, "grad_norm": 0.5770029425621033, "learning_rate": 3.906969788319996e-05, "loss": 1.2040116786956787, "step": 572 }, { "epoch": 1.0512820512820513, "grad_norm": 0.4743853509426117, "learning_rate": 3.899325390616748e-05, "loss": 1.0301820039749146, "step": 574 }, { "epoch": 1.054945054945055, "grad_norm": 0.5908330678939819, "learning_rate": 3.891663000013133e-05, "loss": 1.180071473121643, "step": 576 }, { "epoch": 1.0586080586080586, "grad_norm": 0.19290882349014282, "learning_rate": 3.8839827364652875e-05, "loss": 1.1906160116195679, "step": 578 }, { "epoch": 1.0622710622710623, "grad_norm": 0.9853556752204895, "learning_rate": 3.8762847202091486e-05, "loss": 1.186142086982727, "step": 580 }, { "epoch": 1.065934065934066, "grad_norm": 0.2555069625377655, "learning_rate": 3.868569071758577e-05, "loss": 0.9499126076698303, "step": 582 }, { "epoch": 1.0695970695970696, "grad_norm": 0.37080565094947815, "learning_rate": 3.860835911903467e-05, "loss": 1.1149709224700928, "step": 584 }, { "epoch": 1.0732600732600732, "grad_norm": 0.15910615026950836, "learning_rate": 3.853085361707859e-05, "loss": 1.0009230375289917, "step": 586 }, { "epoch": 1.0769230769230769, "grad_norm": 0.24355213344097137, "learning_rate": 3.8453175425080426e-05, "loss": 0.7909836769104004, "step": 588 }, { "epoch": 1.0805860805860805, "grad_norm": 0.2337721288204193, "learning_rate": 3.8375325759106563e-05, "loss": 1.1665717363357544, "step": 590 }, { "epoch": 1.0842490842490842, "grad_norm": 0.16137680411338806, "learning_rate": 3.829730583790782e-05, "loss": 1.009416103363037, "step": 592 }, { "epoch": 1.0879120879120878, "grad_norm": 0.19279745221138, "learning_rate": 3.821911688290043e-05, "loss": 0.9450397491455078, "step": 594 }, { "epoch": 1.0915750915750915, "grad_norm": 0.17870314419269562, "learning_rate": 3.814076011814685e-05, "loss": 0.991208553314209, "step": 596 }, { "epoch": 1.0952380952380953, "grad_norm": 0.3691723346710205, "learning_rate": 3.806223677033664e-05, "loss": 1.0436757802963257, "step": 598 }, { "epoch": 1.098901098901099, "grad_norm": 0.2122497707605362, "learning_rate": 3.798354806876728e-05, "loss": 1.10894775390625, "step": 600 }, { "epoch": 1.1025641025641026, "grad_norm": 0.22092920541763306, "learning_rate": 3.790469524532484e-05, "loss": 0.7955626845359802, "step": 602 }, { "epoch": 1.1062271062271063, "grad_norm": 0.6790117621421814, "learning_rate": 3.782567953446477e-05, "loss": 0.9074943661689758, "step": 604 }, { "epoch": 1.10989010989011, "grad_norm": 0.2356610894203186, "learning_rate": 3.774650217319257e-05, "loss": 0.8648009896278381, "step": 606 }, { "epoch": 1.1135531135531136, "grad_norm": 0.5769211649894714, "learning_rate": 3.766716440104439e-05, "loss": 1.1958070993423462, "step": 608 }, { "epoch": 1.1172161172161172, "grad_norm": 0.2272127866744995, "learning_rate": 3.7587667460067635e-05, "loss": 0.7023400664329529, "step": 610 }, { "epoch": 1.120879120879121, "grad_norm": 0.6303228735923767, "learning_rate": 3.750801259480154e-05, "loss": 1.1139551401138306, "step": 612 }, { "epoch": 1.1245421245421245, "grad_norm": 0.22971026599407196, "learning_rate": 3.7428201052257675e-05, "loss": 0.9638775587081909, "step": 614 }, { "epoch": 1.1282051282051282, "grad_norm": 0.1906343698501587, "learning_rate": 3.7348234081900424e-05, "loss": 1.127274513244629, "step": 616 }, { "epoch": 1.1318681318681318, "grad_norm": 0.29741182923316956, "learning_rate": 3.726811293562739e-05, "loss": 0.36746326088905334, "step": 618 }, { "epoch": 1.1355311355311355, "grad_norm": 1.2210016250610352, "learning_rate": 3.718783886774988e-05, "loss": 1.0633288621902466, "step": 620 }, { "epoch": 1.1391941391941391, "grad_norm": 1.7148534059524536, "learning_rate": 3.7107413134973174e-05, "loss": 0.7120411992073059, "step": 622 }, { "epoch": 1.1428571428571428, "grad_norm": 0.6514428853988647, "learning_rate": 3.702683699637692e-05, "loss": 1.0393530130386353, "step": 624 }, { "epoch": 1.1465201465201464, "grad_norm": 0.6007410287857056, "learning_rate": 3.6946111713395365e-05, "loss": 1.3253600597381592, "step": 626 }, { "epoch": 1.15018315018315, "grad_norm": 0.6678552031517029, "learning_rate": 3.6865238549797686e-05, "loss": 0.4324287176132202, "step": 628 }, { "epoch": 1.1538461538461537, "grad_norm": 0.056091565638780594, "learning_rate": 3.6784218771668125e-05, "loss": 1.0922839641571045, "step": 630 }, { "epoch": 1.1575091575091574, "grad_norm": 0.34849968552589417, "learning_rate": 3.670305364738621e-05, "loss": 1.0799121856689453, "step": 632 }, { "epoch": 1.1611721611721613, "grad_norm": 1.3243259191513062, "learning_rate": 3.662174444760688e-05, "loss": 0.8275938630104065, "step": 634 }, { "epoch": 1.164835164835165, "grad_norm": 1.356533169746399, "learning_rate": 3.6540292445240624e-05, "loss": 0.93868488073349, "step": 636 }, { "epoch": 1.1684981684981686, "grad_norm": 0.4635038673877716, "learning_rate": 3.6458698915433506e-05, "loss": 1.1719251871109009, "step": 638 }, { "epoch": 1.1721611721611722, "grad_norm": 0.04632457718253136, "learning_rate": 3.637696513554725e-05, "loss": 0.7194165587425232, "step": 640 }, { "epoch": 1.1758241758241759, "grad_norm": 0.3217860162258148, "learning_rate": 3.629509238513921e-05, "loss": 1.0931247472763062, "step": 642 }, { "epoch": 1.1794871794871795, "grad_norm": 0.8376574516296387, "learning_rate": 3.621308194594236e-05, "loss": 0.9752073884010315, "step": 644 }, { "epoch": 1.1831501831501832, "grad_norm": 0.19979257881641388, "learning_rate": 3.6130935101845194e-05, "loss": 0.7485665678977966, "step": 646 }, { "epoch": 1.1868131868131868, "grad_norm": 0.2617126703262329, "learning_rate": 3.6048653138871666e-05, "loss": 0.8534201383590698, "step": 648 }, { "epoch": 1.1904761904761905, "grad_norm": 1.090280532836914, "learning_rate": 3.596623734516104e-05, "loss": 1.1257884502410889, "step": 650 }, { "epoch": 1.1941391941391941, "grad_norm": 6.1979570388793945, "learning_rate": 3.588368901094773e-05, "loss": 0.653273344039917, "step": 652 }, { "epoch": 1.1978021978021978, "grad_norm": 0.17688162624835968, "learning_rate": 3.5801009428541096e-05, "loss": 1.249631643295288, "step": 654 }, { "epoch": 1.2014652014652014, "grad_norm": 0.8640678524971008, "learning_rate": 3.571819989230519e-05, "loss": 0.8184079527854919, "step": 656 }, { "epoch": 1.205128205128205, "grad_norm": 0.6149291396141052, "learning_rate": 3.563526169863854e-05, "loss": 0.7101552486419678, "step": 658 }, { "epoch": 1.2087912087912087, "grad_norm": 0.7223177552223206, "learning_rate": 3.555219614595381e-05, "loss": 1.1504517793655396, "step": 660 }, { "epoch": 1.2124542124542124, "grad_norm": 0.6356011629104614, "learning_rate": 3.546900453465752e-05, "loss": 0.970334529876709, "step": 662 }, { "epoch": 1.2161172161172162, "grad_norm": 0.8084374666213989, "learning_rate": 3.538568816712964e-05, "loss": 0.8572604060173035, "step": 664 }, { "epoch": 1.2197802197802199, "grad_norm": 0.2903348505496979, "learning_rate": 3.5302248347703224e-05, "loss": 0.7845436930656433, "step": 666 }, { "epoch": 1.2234432234432235, "grad_norm": 0.9992715716362, "learning_rate": 3.5218686382643994e-05, "loss": 0.8749545812606812, "step": 668 }, { "epoch": 1.2271062271062272, "grad_norm": 0.7143378257751465, "learning_rate": 3.513500358012988e-05, "loss": 0.6878855228424072, "step": 670 }, { "epoch": 1.2307692307692308, "grad_norm": 0.30578601360321045, "learning_rate": 3.5051201250230545e-05, "loss": 1.168808937072754, "step": 672 }, { "epoch": 1.2344322344322345, "grad_norm": 1.2355809211730957, "learning_rate": 3.4967280704886865e-05, "loss": 1.1536543369293213, "step": 674 }, { "epoch": 1.2380952380952381, "grad_norm": 0.3718186914920807, "learning_rate": 3.488324325789044e-05, "loss": 1.1648200750350952, "step": 676 }, { "epoch": 1.2417582417582418, "grad_norm": 0.4209219813346863, "learning_rate": 3.4799090224862924e-05, "loss": 0.7579060792922974, "step": 678 }, { "epoch": 1.2454212454212454, "grad_norm": 2.09708571434021, "learning_rate": 3.471482292323554e-05, "loss": 0.8136189579963684, "step": 680 }, { "epoch": 1.249084249084249, "grad_norm": 0.4596081078052521, "learning_rate": 3.463044267222841e-05, "loss": 1.1541743278503418, "step": 682 }, { "epoch": 1.2527472527472527, "grad_norm": 0.2024964988231659, "learning_rate": 3.454595079282986e-05, "loss": 1.1373684406280518, "step": 684 }, { "epoch": 1.2564102564102564, "grad_norm": 0.5888193845748901, "learning_rate": 3.4461348607775806e-05, "loss": 0.8096006512641907, "step": 686 }, { "epoch": 1.26007326007326, "grad_norm": 0.5350688099861145, "learning_rate": 3.437663744152902e-05, "loss": 1.081048607826233, "step": 688 }, { "epoch": 1.2637362637362637, "grad_norm": 0.4051729738712311, "learning_rate": 3.429181862025839e-05, "loss": 1.0899769067764282, "step": 690 }, { "epoch": 1.2673992673992673, "grad_norm": 1.3433797359466553, "learning_rate": 3.4206893471818155e-05, "loss": 1.1519224643707275, "step": 692 }, { "epoch": 1.271062271062271, "grad_norm": 0.751139223575592, "learning_rate": 3.4121863325727124e-05, "loss": 0.9729434251785278, "step": 694 }, { "epoch": 1.2747252747252746, "grad_norm": 0.7331501245498657, "learning_rate": 3.40367295131479e-05, "loss": 0.9491739869117737, "step": 696 }, { "epoch": 1.2783882783882783, "grad_norm": 0.39379310607910156, "learning_rate": 3.395149336686595e-05, "loss": 0.8585996627807617, "step": 698 }, { "epoch": 1.282051282051282, "grad_norm": 0.2216329425573349, "learning_rate": 3.386615622126883e-05, "loss": 0.6349502801895142, "step": 700 }, { "epoch": 1.2857142857142856, "grad_norm": 1.3201651573181152, "learning_rate": 3.378071941232525e-05, "loss": 0.507042646408081, "step": 702 }, { "epoch": 1.2893772893772895, "grad_norm": 1.1609549522399902, "learning_rate": 3.369518427756417e-05, "loss": 1.2238701581954956, "step": 704 }, { "epoch": 1.293040293040293, "grad_norm": 0.262668639421463, "learning_rate": 3.360955215605385e-05, "loss": 0.954353928565979, "step": 706 }, { "epoch": 1.2967032967032968, "grad_norm": 0.15986381471157074, "learning_rate": 3.35238243883809e-05, "loss": 0.9157785177230835, "step": 708 }, { "epoch": 1.3003663003663004, "grad_norm": 0.24788087606430054, "learning_rate": 3.34380023166293e-05, "loss": 0.523282527923584, "step": 710 }, { "epoch": 1.304029304029304, "grad_norm": 0.5071739554405212, "learning_rate": 3.335208728435935e-05, "loss": 0.8822041749954224, "step": 712 }, { "epoch": 1.3076923076923077, "grad_norm": 0.21843966841697693, "learning_rate": 3.3266080636586685e-05, "loss": 1.1520413160324097, "step": 714 }, { "epoch": 1.3113553113553114, "grad_norm": 0.7570046186447144, "learning_rate": 3.317998371976121e-05, "loss": 1.1054189205169678, "step": 716 }, { "epoch": 1.315018315018315, "grad_norm": 0.24603700637817383, "learning_rate": 3.309379788174598e-05, "loss": 0.8677737712860107, "step": 718 }, { "epoch": 1.3186813186813187, "grad_norm": 0.6697846055030823, "learning_rate": 3.3007524471796136e-05, "loss": 0.8973780870437622, "step": 720 }, { "epoch": 1.3223443223443223, "grad_norm": 0.24531511962413788, "learning_rate": 3.2921164840537784e-05, "loss": 0.8588492274284363, "step": 722 }, { "epoch": 1.326007326007326, "grad_norm": 0.1828172355890274, "learning_rate": 3.283472033994683e-05, "loss": 1.188812255859375, "step": 724 }, { "epoch": 1.3296703296703296, "grad_norm": 0.24933356046676636, "learning_rate": 3.274819232332783e-05, "loss": 1.0235859155654907, "step": 726 }, { "epoch": 1.3333333333333333, "grad_norm": 0.8426433801651001, "learning_rate": 3.2661582145292805e-05, "loss": 1.116140604019165, "step": 728 }, { "epoch": 1.3369963369963371, "grad_norm": 0.518878161907196, "learning_rate": 3.2574891161740014e-05, "loss": 0.6969371438026428, "step": 730 }, { "epoch": 1.3406593406593408, "grad_norm": 0.3753526210784912, "learning_rate": 3.2488120729832745e-05, "loss": 0.7986868023872375, "step": 732 }, { "epoch": 1.3443223443223444, "grad_norm": 0.09369145333766937, "learning_rate": 3.240127220797807e-05, "loss": 0.6143500804901123, "step": 734 }, { "epoch": 1.347985347985348, "grad_norm": 0.7680373787879944, "learning_rate": 3.231434695580558e-05, "loss": 1.102622628211975, "step": 736 }, { "epoch": 1.3516483516483517, "grad_norm": 0.2766784131526947, "learning_rate": 3.222734633414607e-05, "loss": 0.7411299347877502, "step": 738 }, { "epoch": 1.3553113553113554, "grad_norm": 0.4584338665008545, "learning_rate": 3.214027170501029e-05, "loss": 0.9950368404388428, "step": 740 }, { "epoch": 1.358974358974359, "grad_norm": 0.800658106803894, "learning_rate": 3.205312443156755e-05, "loss": 0.5370650887489319, "step": 742 }, { "epoch": 1.3626373626373627, "grad_norm": 0.39628687500953674, "learning_rate": 3.196590587812446e-05, "loss": 1.2178653478622437, "step": 744 }, { "epoch": 1.3663003663003663, "grad_norm": 0.9103190302848816, "learning_rate": 3.1878617410103514e-05, "loss": 1.0132914781570435, "step": 746 }, { "epoch": 1.36996336996337, "grad_norm": 0.24214321374893188, "learning_rate": 3.1791260394021735e-05, "loss": 1.0330907106399536, "step": 748 }, { "epoch": 1.3736263736263736, "grad_norm": 0.22804208099842072, "learning_rate": 3.1703836197469257e-05, "loss": 0.7769557237625122, "step": 750 }, { "epoch": 1.3772893772893773, "grad_norm": 0.31006965041160583, "learning_rate": 3.161634618908797e-05, "loss": 1.099147915840149, "step": 752 }, { "epoch": 1.380952380952381, "grad_norm": 0.2560300827026367, "learning_rate": 3.1528791738550054e-05, "loss": 0.9559687376022339, "step": 754 }, { "epoch": 1.3846153846153846, "grad_norm": 0.21459929645061493, "learning_rate": 3.1441174216536514e-05, "loss": 1.2862838506698608, "step": 756 }, { "epoch": 1.3882783882783882, "grad_norm": 0.6228247880935669, "learning_rate": 3.135349499471579e-05, "loss": 1.1889519691467285, "step": 758 }, { "epoch": 1.3919413919413919, "grad_norm": 0.15515995025634766, "learning_rate": 3.126575544572222e-05, "loss": 1.1298028230667114, "step": 760 }, { "epoch": 1.3956043956043955, "grad_norm": 0.2352827489376068, "learning_rate": 3.117795694313458e-05, "loss": 1.1332722902297974, "step": 762 }, { "epoch": 1.3992673992673992, "grad_norm": 0.15803271532058716, "learning_rate": 3.109010086145456e-05, "loss": 0.689454197883606, "step": 764 }, { "epoch": 1.4029304029304028, "grad_norm": 0.2263651043176651, "learning_rate": 3.1002188576085295e-05, "loss": 0.8705043196678162, "step": 766 }, { "epoch": 1.4065934065934065, "grad_norm": 1.2859166860580444, "learning_rate": 3.091422146330977e-05, "loss": 0.8634616732597351, "step": 768 }, { "epoch": 1.4102564102564101, "grad_norm": 0.38691240549087524, "learning_rate": 3.082620090026932e-05, "loss": 1.1554944515228271, "step": 770 }, { "epoch": 1.4139194139194138, "grad_norm": 0.21298235654830933, "learning_rate": 3.0738128264942046e-05, "loss": 1.1856485605239868, "step": 772 }, { "epoch": 1.4175824175824177, "grad_norm": 0.4227127432823181, "learning_rate": 3.0650004936121254e-05, "loss": 0.9900102615356445, "step": 774 }, { "epoch": 1.4212454212454213, "grad_norm": 0.272560715675354, "learning_rate": 3.0561832293393846e-05, "loss": 1.1388965845108032, "step": 776 }, { "epoch": 1.424908424908425, "grad_norm": 0.23176811635494232, "learning_rate": 3.04736117171188e-05, "loss": 0.8189452886581421, "step": 778 }, { "epoch": 1.4285714285714286, "grad_norm": 0.17611531913280487, "learning_rate": 3.0385344588405422e-05, "loss": 1.3413128852844238, "step": 780 }, { "epoch": 1.4322344322344323, "grad_norm": 0.4652513861656189, "learning_rate": 3.029703228909186e-05, "loss": 1.1679465770721436, "step": 782 }, { "epoch": 1.435897435897436, "grad_norm": 0.16926081478595734, "learning_rate": 3.0208676201723406e-05, "loss": 1.134766697883606, "step": 784 }, { "epoch": 1.4395604395604396, "grad_norm": 3.288715362548828, "learning_rate": 3.0120277709530854e-05, "loss": 0.9238865971565247, "step": 786 }, { "epoch": 1.4432234432234432, "grad_norm": 0.3530486524105072, "learning_rate": 3.003183819640886e-05, "loss": 1.1074001789093018, "step": 788 }, { "epoch": 1.4468864468864469, "grad_norm": 0.0459970124065876, "learning_rate": 2.9943359046894254e-05, "loss": 0.6836336851119995, "step": 790 }, { "epoch": 1.4505494505494505, "grad_norm": 0.23490065336227417, "learning_rate": 2.9854841646144423e-05, "loss": 0.9037283062934875, "step": 792 }, { "epoch": 1.4542124542124542, "grad_norm": 0.35676056146621704, "learning_rate": 2.9766287379915518e-05, "loss": 0.8027743101119995, "step": 794 }, { "epoch": 1.4578754578754578, "grad_norm": 0.17792175710201263, "learning_rate": 2.967769763454089e-05, "loss": 1.1282213926315308, "step": 796 }, { "epoch": 1.4615384615384617, "grad_norm": 0.28798913955688477, "learning_rate": 2.9589073796909282e-05, "loss": 0.7936130166053772, "step": 798 }, { "epoch": 1.4652014652014653, "grad_norm": 0.20934176445007324, "learning_rate": 2.950041725444318e-05, "loss": 0.9876341819763184, "step": 800 }, { "epoch": 1.468864468864469, "grad_norm": 0.4038946032524109, "learning_rate": 2.941172939507706e-05, "loss": 1.5155441761016846, "step": 802 }, { "epoch": 1.4725274725274726, "grad_norm": 0.8771412968635559, "learning_rate": 2.932301160723566e-05, "loss": 1.0064780712127686, "step": 804 }, { "epoch": 1.4761904761904763, "grad_norm": 0.3849470317363739, "learning_rate": 2.923426527981228e-05, "loss": 1.171331524848938, "step": 806 }, { "epoch": 1.47985347985348, "grad_norm": 0.5106516480445862, "learning_rate": 2.9145491802146984e-05, "loss": 1.1589710712432861, "step": 808 }, { "epoch": 1.4835164835164836, "grad_norm": 0.2847084403038025, "learning_rate": 2.905669256400491e-05, "loss": 0.9889826774597168, "step": 810 }, { "epoch": 1.4871794871794872, "grad_norm": 0.20060478150844574, "learning_rate": 2.896786895555444e-05, "loss": 0.8426548838615417, "step": 812 }, { "epoch": 1.4908424908424909, "grad_norm": 0.11503265798091888, "learning_rate": 2.887902236734552e-05, "loss": 1.1940970420837402, "step": 814 }, { "epoch": 1.4945054945054945, "grad_norm": 0.23822751641273499, "learning_rate": 2.879015419028781e-05, "loss": 1.1169782876968384, "step": 816 }, { "epoch": 1.4981684981684982, "grad_norm": 0.1774878203868866, "learning_rate": 2.8701265815628987e-05, "loss": 0.3862011432647705, "step": 818 }, { "epoch": 1.5018315018315018, "grad_norm": 0.2943243980407715, "learning_rate": 2.8612358634932884e-05, "loss": 1.1364233493804932, "step": 820 }, { "epoch": 1.5054945054945055, "grad_norm": 0.34925273060798645, "learning_rate": 2.852343404005778e-05, "loss": 0.8967536687850952, "step": 822 }, { "epoch": 1.5091575091575091, "grad_norm": 0.2488994151353836, "learning_rate": 2.8434493423134544e-05, "loss": 0.8218085169792175, "step": 824 }, { "epoch": 1.5128205128205128, "grad_norm": 0.17446660995483398, "learning_rate": 2.8345538176544918e-05, "loss": 1.0801664590835571, "step": 826 }, { "epoch": 1.5164835164835164, "grad_norm": 0.9011160731315613, "learning_rate": 2.8256569692899627e-05, "loss": 0.9831532835960388, "step": 828 }, { "epoch": 1.52014652014652, "grad_norm": 0.2522432208061218, "learning_rate": 2.8167589365016646e-05, "loss": 0.984779953956604, "step": 830 }, { "epoch": 1.5238095238095237, "grad_norm": 0.2282875031232834, "learning_rate": 2.8078598585899385e-05, "loss": 1.2356276512145996, "step": 832 }, { "epoch": 1.5274725274725274, "grad_norm": 0.45575666427612305, "learning_rate": 2.7989598748714846e-05, "loss": 0.7996046543121338, "step": 834 }, { "epoch": 1.531135531135531, "grad_norm": 0.31805214285850525, "learning_rate": 2.7900591246771855e-05, "loss": 1.1425288915634155, "step": 836 }, { "epoch": 1.5347985347985347, "grad_norm": 0.24859696626663208, "learning_rate": 2.7811577473499224e-05, "loss": 1.1804063320159912, "step": 838 }, { "epoch": 1.5384615384615383, "grad_norm": 0.2666679322719574, "learning_rate": 2.772255882242394e-05, "loss": 1.0411241054534912, "step": 840 }, { "epoch": 1.542124542124542, "grad_norm": 0.20402321219444275, "learning_rate": 2.7633536687149353e-05, "loss": 0.8526805639266968, "step": 842 }, { "epoch": 1.5457875457875456, "grad_norm": 0.38646435737609863, "learning_rate": 2.7544512461333377e-05, "loss": 1.0257073640823364, "step": 844 }, { "epoch": 1.5494505494505495, "grad_norm": 0.19415532052516937, "learning_rate": 2.745548753866663e-05, "loss": 1.1687860488891602, "step": 846 }, { "epoch": 1.5531135531135531, "grad_norm": 0.2800712287425995, "learning_rate": 2.7366463312850655e-05, "loss": 0.5396187901496887, "step": 848 }, { "epoch": 1.5567765567765568, "grad_norm": 0.18754497170448303, "learning_rate": 2.727744117757607e-05, "loss": 1.02944016456604, "step": 850 }, { "epoch": 1.5604395604395604, "grad_norm": 0.2201027274131775, "learning_rate": 2.7188422526500788e-05, "loss": 1.168210744857788, "step": 852 }, { "epoch": 1.564102564102564, "grad_norm": 0.3404369354248047, "learning_rate": 2.709940875322815e-05, "loss": 0.8437097072601318, "step": 854 }, { "epoch": 1.5677655677655677, "grad_norm": 0.35649099946022034, "learning_rate": 2.7010401251285156e-05, "loss": 0.8084161877632141, "step": 856 }, { "epoch": 1.5714285714285714, "grad_norm": 0.47654107213020325, "learning_rate": 2.6921401414100627e-05, "loss": 0.9324872493743896, "step": 858 }, { "epoch": 1.575091575091575, "grad_norm": 0.23765020072460175, "learning_rate": 2.6832410634983356e-05, "loss": 0.6993922591209412, "step": 860 }, { "epoch": 1.578754578754579, "grad_norm": 0.24254556000232697, "learning_rate": 2.6743430307100388e-05, "loss": 1.1566822528839111, "step": 862 }, { "epoch": 1.5824175824175826, "grad_norm": 0.2840297520160675, "learning_rate": 2.665446182345509e-05, "loss": 0.8900972008705139, "step": 864 }, { "epoch": 1.5860805860805862, "grad_norm": 0.23388059437274933, "learning_rate": 2.6565506576865458e-05, "loss": 1.1673542261123657, "step": 866 }, { "epoch": 1.5897435897435899, "grad_norm": 0.5824553966522217, "learning_rate": 2.6476565959942233e-05, "loss": 1.1742522716522217, "step": 868 }, { "epoch": 1.5934065934065935, "grad_norm": 0.2521633803844452, "learning_rate": 2.6387641365067124e-05, "loss": 0.5782104730606079, "step": 870 }, { "epoch": 1.5970695970695972, "grad_norm": 0.16043610870838165, "learning_rate": 2.6298734184371015e-05, "loss": 1.1673752069473267, "step": 872 }, { "epoch": 1.6007326007326008, "grad_norm": 0.1490897536277771, "learning_rate": 2.6209845809712195e-05, "loss": 1.0414141416549683, "step": 874 }, { "epoch": 1.6043956043956045, "grad_norm": 5.497232437133789, "learning_rate": 2.6120977632654485e-05, "loss": 0.7130216360092163, "step": 876 }, { "epoch": 1.6080586080586081, "grad_norm": 0.6886661648750305, "learning_rate": 2.6032131044445563e-05, "loss": 0.9459899663925171, "step": 878 }, { "epoch": 1.6117216117216118, "grad_norm": 0.09610695391893387, "learning_rate": 2.59433074359951e-05, "loss": 0.8761966228485107, "step": 880 }, { "epoch": 1.6153846153846154, "grad_norm": 0.31204506754875183, "learning_rate": 2.5854508197853022e-05, "loss": 1.1188955307006836, "step": 882 }, { "epoch": 1.619047619047619, "grad_norm": 0.3378327190876007, "learning_rate": 2.5765734720187723e-05, "loss": 0.9112301468849182, "step": 884 }, { "epoch": 1.6227106227106227, "grad_norm": 0.22289712727069855, "learning_rate": 2.5676988392764345e-05, "loss": 1.1279692649841309, "step": 886 }, { "epoch": 1.6263736263736264, "grad_norm": 0.20399990677833557, "learning_rate": 2.5588270604922947e-05, "loss": 0.8507078886032104, "step": 888 }, { "epoch": 1.63003663003663, "grad_norm": 0.16419348120689392, "learning_rate": 2.5499582745556828e-05, "loss": 1.1664886474609375, "step": 890 }, { "epoch": 1.6336996336996337, "grad_norm": 0.488471120595932, "learning_rate": 2.541092620309073e-05, "loss": 0.8955670595169067, "step": 892 }, { "epoch": 1.6373626373626373, "grad_norm": 0.2712729573249817, "learning_rate": 2.5322302365459116e-05, "loss": 1.1703094244003296, "step": 894 }, { "epoch": 1.641025641025641, "grad_norm": 0.2993127107620239, "learning_rate": 2.5233712620084494e-05, "loss": 0.7074750065803528, "step": 896 }, { "epoch": 1.6446886446886446, "grad_norm": 0.17085275053977966, "learning_rate": 2.5145158353855592e-05, "loss": 1.122510313987732, "step": 898 }, { "epoch": 1.6483516483516483, "grad_norm": 0.04999161139130592, "learning_rate": 2.505664095310574e-05, "loss": 0.7906731963157654, "step": 900 }, { "epoch": 1.652014652014652, "grad_norm": 0.22062256932258606, "learning_rate": 2.496816180359115e-05, "loss": 0.8734868764877319, "step": 902 }, { "epoch": 1.6556776556776556, "grad_norm": 0.32644936442375183, "learning_rate": 2.4879722290469155e-05, "loss": 1.0106048583984375, "step": 904 }, { "epoch": 1.6593406593406592, "grad_norm": 0.33785051107406616, "learning_rate": 2.4791323798276593e-05, "loss": 1.1063401699066162, "step": 906 }, { "epoch": 1.6630036630036629, "grad_norm": 0.19222113490104675, "learning_rate": 2.4702967710908143e-05, "loss": 0.7271807789802551, "step": 908 }, { "epoch": 1.6666666666666665, "grad_norm": 0.34294626116752625, "learning_rate": 2.4614655411594583e-05, "loss": 1.180138349533081, "step": 910 }, { "epoch": 1.6703296703296702, "grad_norm": 0.08716004341840744, "learning_rate": 2.452638828288121e-05, "loss": 1.2012932300567627, "step": 912 }, { "epoch": 1.673992673992674, "grad_norm": 0.14314715564250946, "learning_rate": 2.4438167706606152e-05, "loss": 1.0735292434692383, "step": 914 }, { "epoch": 1.6776556776556777, "grad_norm": 0.9384481906890869, "learning_rate": 2.434999506387875e-05, "loss": 1.1871505975723267, "step": 916 }, { "epoch": 1.6813186813186813, "grad_norm": 0.8262288570404053, "learning_rate": 2.4261871735057956e-05, "loss": 0.8213975429534912, "step": 918 }, { "epoch": 1.684981684981685, "grad_norm": 0.25928905606269836, "learning_rate": 2.417379909973069e-05, "loss": 1.1863298416137695, "step": 920 }, { "epoch": 1.6886446886446886, "grad_norm": 0.33573785424232483, "learning_rate": 2.408577853669024e-05, "loss": 0.9798559546470642, "step": 922 }, { "epoch": 1.6923076923076923, "grad_norm": 0.254114031791687, "learning_rate": 2.3997811423914717e-05, "loss": 0.8279831409454346, "step": 924 }, { "epoch": 1.695970695970696, "grad_norm": 0.8429379463195801, "learning_rate": 2.390989913854545e-05, "loss": 0.9410126209259033, "step": 926 }, { "epoch": 1.6996336996336996, "grad_norm": 0.13982383906841278, "learning_rate": 2.382204305686543e-05, "loss": 0.7881975173950195, "step": 928 }, { "epoch": 1.7032967032967035, "grad_norm": 0.03225285932421684, "learning_rate": 2.373424455427779e-05, "loss": 0.8376743197441101, "step": 930 }, { "epoch": 1.7069597069597071, "grad_norm": 0.28351178765296936, "learning_rate": 2.364650500528421e-05, "loss": 0.9067142605781555, "step": 932 }, { "epoch": 1.7106227106227108, "grad_norm": 0.1636010706424713, "learning_rate": 2.3558825783463484e-05, "loss": 1.3678433895111084, "step": 934 }, { "epoch": 1.7142857142857144, "grad_norm": 0.29480600357055664, "learning_rate": 2.3471208261449955e-05, "loss": 1.1705397367477417, "step": 936 }, { "epoch": 1.717948717948718, "grad_norm": 0.1829896867275238, "learning_rate": 2.3383653810912033e-05, "loss": 0.4487687945365906, "step": 938 }, { "epoch": 1.7216117216117217, "grad_norm": 0.11328794807195663, "learning_rate": 2.3296163802530745e-05, "loss": 0.4792923629283905, "step": 940 }, { "epoch": 1.7252747252747254, "grad_norm": 0.2216004580259323, "learning_rate": 2.320873960597828e-05, "loss": 1.1090213060379028, "step": 942 }, { "epoch": 1.728937728937729, "grad_norm": 0.147038996219635, "learning_rate": 2.312138258989649e-05, "loss": 0.9103480577468872, "step": 944 }, { "epoch": 1.7326007326007327, "grad_norm": 0.13560520112514496, "learning_rate": 2.3034094121875543e-05, "loss": 1.1171597242355347, "step": 946 }, { "epoch": 1.7362637362637363, "grad_norm": 0.15320324897766113, "learning_rate": 2.2946875568432458e-05, "loss": 1.0631382465362549, "step": 948 }, { "epoch": 1.73992673992674, "grad_norm": 0.1420201063156128, "learning_rate": 2.2859728294989718e-05, "loss": 0.9614072442054749, "step": 950 }, { "epoch": 1.7435897435897436, "grad_norm": 0.10232152044773102, "learning_rate": 2.277265366585394e-05, "loss": 0.9154943227767944, "step": 952 }, { "epoch": 1.7472527472527473, "grad_norm": 0.8972752690315247, "learning_rate": 2.268565304419443e-05, "loss": 1.170873761177063, "step": 954 }, { "epoch": 1.750915750915751, "grad_norm": 0.3512289226055145, "learning_rate": 2.2598727792021933e-05, "loss": 0.741244912147522, "step": 956 }, { "epoch": 1.7545787545787546, "grad_norm": 0.28857314586639404, "learning_rate": 2.2511879270167264e-05, "loss": 1.1468185186386108, "step": 958 }, { "epoch": 1.7582417582417582, "grad_norm": 0.7446520328521729, "learning_rate": 2.2425108838259995e-05, "loss": 0.46506467461586, "step": 960 }, { "epoch": 1.7619047619047619, "grad_norm": 0.3139968514442444, "learning_rate": 2.23384178547072e-05, "loss": 0.9495673775672913, "step": 962 }, { "epoch": 1.7655677655677655, "grad_norm": 0.18298202753067017, "learning_rate": 2.225180767667217e-05, "loss": 1.0209523439407349, "step": 964 }, { "epoch": 1.7692307692307692, "grad_norm": 0.16875436902046204, "learning_rate": 2.2165279660053174e-05, "loss": 1.1537625789642334, "step": 966 }, { "epoch": 1.7728937728937728, "grad_norm": 0.19506409764289856, "learning_rate": 2.2078835159462225e-05, "loss": 0.5657550692558289, "step": 968 }, { "epoch": 1.7765567765567765, "grad_norm": 0.1412108838558197, "learning_rate": 2.1992475528203872e-05, "loss": 0.8089891672134399, "step": 970 }, { "epoch": 1.7802197802197801, "grad_norm": 0.4082830250263214, "learning_rate": 2.1906202118254025e-05, "loss": 0.8698192834854126, "step": 972 }, { "epoch": 1.7838827838827838, "grad_norm": 0.3070997893810272, "learning_rate": 2.1820016280238792e-05, "loss": 1.1678433418273926, "step": 974 }, { "epoch": 1.7875457875457874, "grad_norm": 0.18406766653060913, "learning_rate": 2.1733919363413314e-05, "loss": 1.1347768306732178, "step": 976 }, { "epoch": 1.791208791208791, "grad_norm": 0.16203664243221283, "learning_rate": 2.1647912715640657e-05, "loss": 0.8943782448768616, "step": 978 }, { "epoch": 1.7948717948717947, "grad_norm": 0.17423562705516815, "learning_rate": 2.1561997683370705e-05, "loss": 0.9334428310394287, "step": 980 }, { "epoch": 1.7985347985347986, "grad_norm": 0.15521575510501862, "learning_rate": 2.147617561161911e-05, "loss": 1.121093988418579, "step": 982 }, { "epoch": 1.8021978021978022, "grad_norm": 1.1999096870422363, "learning_rate": 2.1390447843946156e-05, "loss": 1.0399394035339355, "step": 984 }, { "epoch": 1.8058608058608059, "grad_norm": 0.236453577876091, "learning_rate": 2.1304815722435838e-05, "loss": 0.6336957812309265, "step": 986 }, { "epoch": 1.8095238095238095, "grad_norm": 1.987808108329773, "learning_rate": 2.121928058767475e-05, "loss": 0.8018144965171814, "step": 988 }, { "epoch": 1.8131868131868132, "grad_norm": 0.6906500458717346, "learning_rate": 2.113384377873117e-05, "loss": 0.7112327814102173, "step": 990 }, { "epoch": 1.8168498168498168, "grad_norm": 0.5853157043457031, "learning_rate": 2.1048506633134058e-05, "loss": 0.770244300365448, "step": 992 }, { "epoch": 1.8205128205128205, "grad_norm": 0.22558943927288055, "learning_rate": 2.0963270486852116e-05, "loss": 0.8251454830169678, "step": 994 }, { "epoch": 1.8241758241758241, "grad_norm": 0.36891841888427734, "learning_rate": 2.0878136674272874e-05, "loss": 1.0850389003753662, "step": 996 }, { "epoch": 1.8278388278388278, "grad_norm": 0.21534068882465363, "learning_rate": 2.079310652818186e-05, "loss": 0.8296566605567932, "step": 998 }, { "epoch": 1.8315018315018317, "grad_norm": 0.20088708400726318, "learning_rate": 2.070818137974162e-05, "loss": 1.0995657444000244, "step": 1000 }, { "epoch": 1.8351648351648353, "grad_norm": 0.16254781186580658, "learning_rate": 2.0623362558470983e-05, "loss": 1.1204814910888672, "step": 1002 }, { "epoch": 1.838827838827839, "grad_norm": 0.16232743859291077, "learning_rate": 2.05386513922242e-05, "loss": 1.1130619049072266, "step": 1004 }, { "epoch": 1.8424908424908426, "grad_norm": 0.21432489156723022, "learning_rate": 2.0454049207170146e-05, "loss": 1.1204091310501099, "step": 1006 }, { "epoch": 1.8461538461538463, "grad_norm": 0.45065784454345703, "learning_rate": 2.0369557327771594e-05, "loss": 0.7804591655731201, "step": 1008 }, { "epoch": 1.84981684981685, "grad_norm": 0.2610171139240265, "learning_rate": 2.0285177076764462e-05, "loss": 1.076236367225647, "step": 1010 }, { "epoch": 1.8534798534798536, "grad_norm": 0.11059543490409851, "learning_rate": 2.0200909775137085e-05, "loss": 0.7410160899162292, "step": 1012 }, { "epoch": 1.8571428571428572, "grad_norm": 2.333650827407837, "learning_rate": 2.0116756742109577e-05, "loss": 1.1382379531860352, "step": 1014 }, { "epoch": 1.8608058608058609, "grad_norm": 0.16106364130973816, "learning_rate": 2.003271929511314e-05, "loss": 1.1225502490997314, "step": 1016 }, { "epoch": 1.8644688644688645, "grad_norm": 0.1435774266719818, "learning_rate": 1.9948798749769464e-05, "loss": 1.197827696800232, "step": 1018 }, { "epoch": 1.8681318681318682, "grad_norm": 0.04951045662164688, "learning_rate": 1.986499641987013e-05, "loss": 0.9368598461151123, "step": 1020 }, { "epoch": 1.8717948717948718, "grad_norm": 0.17583589255809784, "learning_rate": 1.9781313617356012e-05, "loss": 1.0920844078063965, "step": 1022 }, { "epoch": 1.8754578754578755, "grad_norm": 0.3376821279525757, "learning_rate": 1.9697751652296782e-05, "loss": 0.4992130398750305, "step": 1024 }, { "epoch": 1.879120879120879, "grad_norm": 0.1590043604373932, "learning_rate": 1.961431183287037e-05, "loss": 1.1315664052963257, "step": 1026 }, { "epoch": 1.8827838827838828, "grad_norm": 0.28124603629112244, "learning_rate": 1.9530995465342482e-05, "loss": 0.9077785611152649, "step": 1028 }, { "epoch": 1.8864468864468864, "grad_norm": 0.2012709081172943, "learning_rate": 1.9447803854046192e-05, "loss": 1.1241216659545898, "step": 1030 }, { "epoch": 1.89010989010989, "grad_norm": 0.2831279933452606, "learning_rate": 1.9364738301361473e-05, "loss": 0.9281163811683655, "step": 1032 }, { "epoch": 1.8937728937728937, "grad_norm": 0.45628663897514343, "learning_rate": 1.928180010769482e-05, "loss": 0.7836743593215942, "step": 1034 }, { "epoch": 1.8974358974358974, "grad_norm": 0.26489734649658203, "learning_rate": 1.919899057145891e-05, "loss": 1.1775709390640259, "step": 1036 }, { "epoch": 1.901098901098901, "grad_norm": 0.4411194920539856, "learning_rate": 1.911631098905227e-05, "loss": 1.1143101453781128, "step": 1038 }, { "epoch": 1.9047619047619047, "grad_norm": 0.39963364601135254, "learning_rate": 1.903376265483896e-05, "loss": 1.234065055847168, "step": 1040 }, { "epoch": 1.9084249084249083, "grad_norm": 0.2849297523498535, "learning_rate": 1.895134686112834e-05, "loss": 0.6098147034645081, "step": 1042 }, { "epoch": 1.912087912087912, "grad_norm": 0.17120879888534546, "learning_rate": 1.886906489815482e-05, "loss": 0.8268396258354187, "step": 1044 }, { "epoch": 1.9157509157509156, "grad_norm": 0.2931853234767914, "learning_rate": 1.878691805405765e-05, "loss": 1.0940194129943848, "step": 1046 }, { "epoch": 1.9194139194139193, "grad_norm": 0.16556760668754578, "learning_rate": 1.8704907614860797e-05, "loss": 0.3900573253631592, "step": 1048 }, { "epoch": 1.9230769230769231, "grad_norm": 0.30659088492393494, "learning_rate": 1.8623034864452753e-05, "loss": 0.841820478439331, "step": 1050 }, { "epoch": 1.9267399267399268, "grad_norm": 0.13327482342720032, "learning_rate": 1.8541301084566496e-05, "loss": 0.7997146248817444, "step": 1052 }, { "epoch": 1.9304029304029304, "grad_norm": 0.23069679737091064, "learning_rate": 1.8459707554759385e-05, "loss": 1.1094664335250854, "step": 1054 }, { "epoch": 1.934065934065934, "grad_norm": 0.08997397869825363, "learning_rate": 1.8378255552393126e-05, "loss": 0.737388551235199, "step": 1056 }, { "epoch": 1.9377289377289377, "grad_norm": 0.435207724571228, "learning_rate": 1.8296946352613792e-05, "loss": 0.9677636027336121, "step": 1058 }, { "epoch": 1.9413919413919414, "grad_norm": 0.2596050500869751, "learning_rate": 1.8215781228331884e-05, "loss": 1.0497726202011108, "step": 1060 }, { "epoch": 1.945054945054945, "grad_norm": 0.15544529259204865, "learning_rate": 1.8134761450202316e-05, "loss": 0.7944180369377136, "step": 1062 }, { "epoch": 1.9487179487179487, "grad_norm": 0.3575184643268585, "learning_rate": 1.805388828660463e-05, "loss": 0.7939121127128601, "step": 1064 }, { "epoch": 1.9523809523809523, "grad_norm": 0.38059937953948975, "learning_rate": 1.79731630036231e-05, "loss": 1.1412160396575928, "step": 1066 }, { "epoch": 1.9560439560439562, "grad_norm": 0.15067918598651886, "learning_rate": 1.7892586865026835e-05, "loss": 1.1604868173599243, "step": 1068 }, { "epoch": 1.9597069597069599, "grad_norm": 0.14763464033603668, "learning_rate": 1.7812161132250122e-05, "loss": 0.6316368579864502, "step": 1070 }, { "epoch": 1.9633699633699635, "grad_norm": 0.20061402022838593, "learning_rate": 1.7731887064372617e-05, "loss": 0.7977589964866638, "step": 1072 }, { "epoch": 1.9670329670329672, "grad_norm": 0.20839878916740417, "learning_rate": 1.7651765918099588e-05, "loss": 1.1316888332366943, "step": 1074 }, { "epoch": 1.9706959706959708, "grad_norm": 0.14487165212631226, "learning_rate": 1.757179894774233e-05, "loss": 1.0948164463043213, "step": 1076 }, { "epoch": 1.9743589743589745, "grad_norm": 1.0238404273986816, "learning_rate": 1.7491987405198464e-05, "loss": 0.9682241082191467, "step": 1078 }, { "epoch": 1.978021978021978, "grad_norm": 0.4279444217681885, "learning_rate": 1.7412332539932367e-05, "loss": 0.9370381832122803, "step": 1080 }, { "epoch": 1.9816849816849818, "grad_norm": 0.1554751694202423, "learning_rate": 1.7332835598955615e-05, "loss": 1.0854570865631104, "step": 1082 }, { "epoch": 1.9853479853479854, "grad_norm": 0.18901632726192474, "learning_rate": 1.7253497826807435e-05, "loss": 0.6803427934646606, "step": 1084 }, { "epoch": 1.989010989010989, "grad_norm": 1.4867212772369385, "learning_rate": 1.717432046553523e-05, "loss": 0.965499997138977, "step": 1086 }, { "epoch": 1.9926739926739927, "grad_norm": 0.3673637807369232, "learning_rate": 1.7095304754675168e-05, "loss": 0.9333543181419373, "step": 1088 }, { "epoch": 1.9963369963369964, "grad_norm": 0.16466295719146729, "learning_rate": 1.701645193123272e-05, "loss": 0.887560248374939, "step": 1090 }, { "epoch": 2.0, "grad_norm": 0.21007683873176575, "learning_rate": 1.6937763229663356e-05, "loss": 0.9977954626083374, "step": 1092 }, { "epoch": 2.0036630036630036, "grad_norm": 0.29877015948295593, "learning_rate": 1.685923988185316e-05, "loss": 0.9857615828514099, "step": 1094 }, { "epoch": 2.0073260073260073, "grad_norm": 0.17748361825942993, "learning_rate": 1.6780883117099575e-05, "loss": 1.0911893844604492, "step": 1096 }, { "epoch": 2.010989010989011, "grad_norm": 0.16629448533058167, "learning_rate": 1.6702694162092177e-05, "loss": 1.0311784744262695, "step": 1098 }, { "epoch": 2.0146520146520146, "grad_norm": 0.4006339907646179, "learning_rate": 1.6624674240893452e-05, "loss": 1.078372597694397, "step": 1100 }, { "epoch": 2.0183150183150182, "grad_norm": 0.3369395136833191, "learning_rate": 1.6546824574919572e-05, "loss": 0.7856264114379883, "step": 1102 }, { "epoch": 2.021978021978022, "grad_norm": 0.26995745301246643, "learning_rate": 1.6469146382921407e-05, "loss": 1.084755778312683, "step": 1104 }, { "epoch": 2.0256410256410255, "grad_norm": 0.26533597707748413, "learning_rate": 1.6391640880965338e-05, "loss": 0.7198016047477722, "step": 1106 }, { "epoch": 2.029304029304029, "grad_norm": 0.5049846172332764, "learning_rate": 1.6314309282414244e-05, "loss": 1.1550657749176025, "step": 1108 }, { "epoch": 2.032967032967033, "grad_norm": 0.2585044205188751, "learning_rate": 1.623715279790853e-05, "loss": 0.7946727275848389, "step": 1110 }, { "epoch": 2.0366300366300365, "grad_norm": 0.08100654929876328, "learning_rate": 1.616017263534713e-05, "loss": 0.5124549269676208, "step": 1112 }, { "epoch": 2.04029304029304, "grad_norm": 0.2333383411169052, "learning_rate": 1.608336999986867e-05, "loss": 0.7633625268936157, "step": 1114 }, { "epoch": 2.043956043956044, "grad_norm": 0.6371340751647949, "learning_rate": 1.600674609383253e-05, "loss": 1.0826982259750366, "step": 1116 }, { "epoch": 2.0476190476190474, "grad_norm": 0.38138705492019653, "learning_rate": 1.5930302116800044e-05, "loss": 1.0550010204315186, "step": 1118 }, { "epoch": 2.051282051282051, "grad_norm": 3.1294875144958496, "learning_rate": 1.585403926551573e-05, "loss": 0.7597935199737549, "step": 1120 }, { "epoch": 2.0549450549450547, "grad_norm": 5.722661018371582, "learning_rate": 1.5777958733888565e-05, "loss": 1.1912044286727905, "step": 1122 }, { "epoch": 2.0586080586080584, "grad_norm": 0.21310099959373474, "learning_rate": 1.570206171297324e-05, "loss": 1.0809831619262695, "step": 1124 }, { "epoch": 2.062271062271062, "grad_norm": 0.30974480509757996, "learning_rate": 1.56263493909516e-05, "loss": 0.6720230579376221, "step": 1126 }, { "epoch": 2.065934065934066, "grad_norm": 0.1537218540906906, "learning_rate": 1.555082295311396e-05, "loss": 1.0456634759902954, "step": 1128 }, { "epoch": 2.06959706959707, "grad_norm": 0.44281265139579773, "learning_rate": 1.5475483581840587e-05, "loss": 1.017748236656189, "step": 1130 }, { "epoch": 2.0732600732600734, "grad_norm": 0.24577392637729645, "learning_rate": 1.54003324565832e-05, "loss": 1.0750110149383545, "step": 1132 }, { "epoch": 2.076923076923077, "grad_norm": 0.22100472450256348, "learning_rate": 1.53253707538465e-05, "loss": 1.060890793800354, "step": 1134 }, { "epoch": 2.0805860805860807, "grad_norm": 0.9236302375793457, "learning_rate": 1.5250599647169716e-05, "loss": 0.6415885090827942, "step": 1136 }, { "epoch": 2.0842490842490844, "grad_norm": 0.20396125316619873, "learning_rate": 1.5176020307108276e-05, "loss": 1.0569545030593872, "step": 1138 }, { "epoch": 2.087912087912088, "grad_norm": 0.22348055243492126, "learning_rate": 1.5101633901215456e-05, "loss": 0.9237917065620422, "step": 1140 }, { "epoch": 2.0915750915750917, "grad_norm": 0.18689396977424622, "learning_rate": 1.5027441594024133e-05, "loss": 1.0551191568374634, "step": 1142 }, { "epoch": 2.0952380952380953, "grad_norm": 0.11935965716838837, "learning_rate": 1.4953444547028531e-05, "loss": 0.3609432280063629, "step": 1144 }, { "epoch": 2.098901098901099, "grad_norm": 0.8522807955741882, "learning_rate": 1.4879643918666003e-05, "loss": 0.7986314296722412, "step": 1146 }, { "epoch": 2.1025641025641026, "grad_norm": 0.35904762148857117, "learning_rate": 1.480604086429897e-05, "loss": 0.7680439352989197, "step": 1148 }, { "epoch": 2.1062271062271063, "grad_norm": 0.23995743691921234, "learning_rate": 1.4732636536196794e-05, "loss": 0.7488572597503662, "step": 1150 }, { "epoch": 2.10989010989011, "grad_norm": 0.24136975407600403, "learning_rate": 1.4659432083517726e-05, "loss": 0.9088020324707031, "step": 1152 }, { "epoch": 2.1135531135531136, "grad_norm": 0.25049832463264465, "learning_rate": 1.458642865229093e-05, "loss": 0.6150023937225342, "step": 1154 }, { "epoch": 2.1172161172161172, "grad_norm": 0.2590892016887665, "learning_rate": 1.4513627385398554e-05, "loss": 0.9922336935997009, "step": 1156 }, { "epoch": 2.120879120879121, "grad_norm": 0.6481472849845886, "learning_rate": 1.4441029422557817e-05, "loss": 0.9142146110534668, "step": 1158 }, { "epoch": 2.1245421245421245, "grad_norm": 0.29179754853248596, "learning_rate": 1.4368635900303184e-05, "loss": 0.802727997303009, "step": 1160 }, { "epoch": 2.128205128205128, "grad_norm": 0.5445400476455688, "learning_rate": 1.4296447951968562e-05, "loss": 0.6710273623466492, "step": 1162 }, { "epoch": 2.131868131868132, "grad_norm": 0.22561633586883545, "learning_rate": 1.4224466707669542e-05, "loss": 0.8315181136131287, "step": 1164 }, { "epoch": 2.1355311355311355, "grad_norm": 0.24327363073825836, "learning_rate": 1.4152693294285756e-05, "loss": 0.4054326117038727, "step": 1166 }, { "epoch": 2.139194139194139, "grad_norm": 0.6817838549613953, "learning_rate": 1.4081128835443188e-05, "loss": 0.4102446436882019, "step": 1168 }, { "epoch": 2.142857142857143, "grad_norm": 0.44050848484039307, "learning_rate": 1.400977445149661e-05, "loss": 0.8157304525375366, "step": 1170 }, { "epoch": 2.1465201465201464, "grad_norm": 0.5266426801681519, "learning_rate": 1.3938631259512013e-05, "loss": 1.0504196882247925, "step": 1172 }, { "epoch": 2.15018315018315, "grad_norm": 0.3633262515068054, "learning_rate": 1.3867700373249152e-05, "loss": 1.2162549495697021, "step": 1174 }, { "epoch": 2.1538461538461537, "grad_norm": 0.5713516473770142, "learning_rate": 1.37969829031441e-05, "loss": 0.6878563165664673, "step": 1176 }, { "epoch": 2.1575091575091574, "grad_norm": 0.18287743628025055, "learning_rate": 1.3726479956291872e-05, "loss": 1.0863420963287354, "step": 1178 }, { "epoch": 2.161172161172161, "grad_norm": 0.1208617314696312, "learning_rate": 1.3656192636429043e-05, "loss": 1.089928388595581, "step": 1180 }, { "epoch": 2.1648351648351647, "grad_norm": 0.32237380743026733, "learning_rate": 1.3586122043916538e-05, "loss": 1.0283252000808716, "step": 1182 }, { "epoch": 2.1684981684981683, "grad_norm": 0.18901434540748596, "learning_rate": 1.3516269275722387e-05, "loss": 1.0963468551635742, "step": 1184 }, { "epoch": 2.172161172161172, "grad_norm": 0.11865086108446121, "learning_rate": 1.344663542540451e-05, "loss": 0.1795816421508789, "step": 1186 }, { "epoch": 2.1758241758241756, "grad_norm": 0.35561323165893555, "learning_rate": 1.3377221583093632e-05, "loss": 1.1163209676742554, "step": 1188 }, { "epoch": 2.1794871794871793, "grad_norm": 0.39501580595970154, "learning_rate": 1.3308028835476238e-05, "loss": 0.669342041015625, "step": 1190 }, { "epoch": 2.183150183150183, "grad_norm": 0.6345373392105103, "learning_rate": 1.3239058265777499e-05, "loss": 0.9228946566581726, "step": 1192 }, { "epoch": 2.186813186813187, "grad_norm": 0.30641525983810425, "learning_rate": 1.3170310953744388e-05, "loss": 0.698255181312561, "step": 1194 }, { "epoch": 2.1904761904761907, "grad_norm": 0.21093255281448364, "learning_rate": 1.310178797562871e-05, "loss": 0.8535992503166199, "step": 1196 }, { "epoch": 2.1941391941391943, "grad_norm": 0.20513616502285004, "learning_rate": 1.3033490404170276e-05, "loss": 1.0712019205093384, "step": 1198 }, { "epoch": 2.197802197802198, "grad_norm": 0.4575969874858856, "learning_rate": 1.296541930858015e-05, "loss": 0.6536464691162109, "step": 1200 }, { "epoch": 2.2014652014652016, "grad_norm": 0.22849248349666595, "learning_rate": 1.2897575754523832e-05, "loss": 1.1299998760223389, "step": 1202 }, { "epoch": 2.2051282051282053, "grad_norm": 0.2219730168581009, "learning_rate": 1.2829960804104663e-05, "loss": 1.080318570137024, "step": 1204 }, { "epoch": 2.208791208791209, "grad_norm": 0.20927314460277557, "learning_rate": 1.2762575515847106e-05, "loss": 0.42392417788505554, "step": 1206 }, { "epoch": 2.2124542124542126, "grad_norm": 0.23490868508815765, "learning_rate": 1.2695420944680242e-05, "loss": 0.7045865654945374, "step": 1208 }, { "epoch": 2.2161172161172162, "grad_norm": 0.1882236897945404, "learning_rate": 1.2628498141921243e-05, "loss": 0.4839053452014923, "step": 1210 }, { "epoch": 2.21978021978022, "grad_norm": 0.22759205102920532, "learning_rate": 1.2561808155258897e-05, "loss": 0.9919928908348083, "step": 1212 }, { "epoch": 2.2234432234432235, "grad_norm": 0.22987626492977142, "learning_rate": 1.2495352028737201e-05, "loss": 0.7579973340034485, "step": 1214 }, { "epoch": 2.227106227106227, "grad_norm": 0.2778910994529724, "learning_rate": 1.2429130802739036e-05, "loss": 0.8012394905090332, "step": 1216 }, { "epoch": 2.230769230769231, "grad_norm": 0.17181673645973206, "learning_rate": 1.2363145513969887e-05, "loss": 1.0715208053588867, "step": 1218 }, { "epoch": 2.2344322344322345, "grad_norm": 0.25348830223083496, "learning_rate": 1.229739719544157e-05, "loss": 1.06959867477417, "step": 1220 }, { "epoch": 2.238095238095238, "grad_norm": 0.17098630964756012, "learning_rate": 1.2231886876456116e-05, "loss": 1.079147219657898, "step": 1222 }, { "epoch": 2.241758241758242, "grad_norm": 1.3435940742492676, "learning_rate": 1.2166615582589613e-05, "loss": 0.40509262681007385, "step": 1224 }, { "epoch": 2.2454212454212454, "grad_norm": 0.24130238592624664, "learning_rate": 1.210158433567616e-05, "loss": 1.0378178358078003, "step": 1226 }, { "epoch": 2.249084249084249, "grad_norm": 0.7028672695159912, "learning_rate": 1.2036794153791905e-05, "loss": 0.5614770650863647, "step": 1228 }, { "epoch": 2.2527472527472527, "grad_norm": 0.14420144259929657, "learning_rate": 1.1972246051239054e-05, "loss": 0.5607399344444275, "step": 1230 }, { "epoch": 2.2564102564102564, "grad_norm": 0.20063596963882446, "learning_rate": 1.1907941038530015e-05, "loss": 0.5380869507789612, "step": 1232 }, { "epoch": 2.26007326007326, "grad_norm": 0.3156454563140869, "learning_rate": 1.18438801223716e-05, "loss": 1.0096523761749268, "step": 1234 }, { "epoch": 2.2637362637362637, "grad_norm": 0.2204177975654602, "learning_rate": 1.1780064305649224e-05, "loss": 0.7427061796188354, "step": 1236 }, { "epoch": 2.2673992673992673, "grad_norm": 0.24353505671024323, "learning_rate": 1.1716494587411248e-05, "loss": 0.857605516910553, "step": 1238 }, { "epoch": 2.271062271062271, "grad_norm": 0.2163824439048767, "learning_rate": 1.1653171962853291e-05, "loss": 0.7742936015129089, "step": 1240 }, { "epoch": 2.2747252747252746, "grad_norm": 0.5982560515403748, "learning_rate": 1.1590097423302684e-05, "loss": 0.8472159504890442, "step": 1242 }, { "epoch": 2.2783882783882783, "grad_norm": 0.22811779379844666, "learning_rate": 1.1527271956202947e-05, "loss": 1.033808946609497, "step": 1244 }, { "epoch": 2.282051282051282, "grad_norm": 0.2341231256723404, "learning_rate": 1.1464696545098332e-05, "loss": 0.6939253807067871, "step": 1246 }, { "epoch": 2.2857142857142856, "grad_norm": 0.3489930033683777, "learning_rate": 1.1402372169618398e-05, "loss": 0.6756494641304016, "step": 1248 }, { "epoch": 2.2893772893772892, "grad_norm": 0.21145479381084442, "learning_rate": 1.1340299805462704e-05, "loss": 1.0505157709121704, "step": 1250 }, { "epoch": 2.293040293040293, "grad_norm": 0.31191250681877136, "learning_rate": 1.1278480424385534e-05, "loss": 0.7581150531768799, "step": 1252 }, { "epoch": 2.2967032967032965, "grad_norm": 0.1843535155057907, "learning_rate": 1.1216914994180659e-05, "loss": 1.10303795337677, "step": 1254 }, { "epoch": 2.3003663003663, "grad_norm": 0.4075881540775299, "learning_rate": 1.1155604478666223e-05, "loss": 1.1203564405441284, "step": 1256 }, { "epoch": 2.304029304029304, "grad_norm": 1.3023512363433838, "learning_rate": 1.1094549837669616e-05, "loss": 0.7518989443778992, "step": 1258 }, { "epoch": 2.3076923076923075, "grad_norm": 0.3842734396457672, "learning_rate": 1.1033752027012465e-05, "loss": 1.0648375749588013, "step": 1260 }, { "epoch": 2.311355311355311, "grad_norm": 0.9410233497619629, "learning_rate": 1.097321199849569e-05, "loss": 0.9340834617614746, "step": 1262 }, { "epoch": 2.315018315018315, "grad_norm": 0.2198442965745926, "learning_rate": 1.0912930699884563e-05, "loss": 0.83587646484375, "step": 1264 }, { "epoch": 2.3186813186813184, "grad_norm": 0.1930425763130188, "learning_rate": 1.08529090748939e-05, "loss": 0.7000831961631775, "step": 1266 }, { "epoch": 2.3223443223443225, "grad_norm": 0.6036158204078674, "learning_rate": 1.0793148063173284e-05, "loss": 0.7626188397407532, "step": 1268 }, { "epoch": 2.326007326007326, "grad_norm": 0.2609650492668152, "learning_rate": 1.073364860029234e-05, "loss": 1.1375476121902466, "step": 1270 }, { "epoch": 2.32967032967033, "grad_norm": 0.30271434783935547, "learning_rate": 1.0674411617726106e-05, "loss": 0.9527180194854736, "step": 1272 }, { "epoch": 2.3333333333333335, "grad_norm": 0.9938449859619141, "learning_rate": 1.0615438042840439e-05, "loss": 0.5007555484771729, "step": 1274 }, { "epoch": 2.336996336996337, "grad_norm": 0.821854293346405, "learning_rate": 1.0556728798877488e-05, "loss": 0.9555824398994446, "step": 1276 }, { "epoch": 2.340659340659341, "grad_norm": 0.24786677956581116, "learning_rate": 1.0498284804941277e-05, "loss": 0.7994169592857361, "step": 1278 }, { "epoch": 2.3443223443223444, "grad_norm": 0.3086180090904236, "learning_rate": 1.0440106975983283e-05, "loss": 0.7172934412956238, "step": 1280 }, { "epoch": 2.347985347985348, "grad_norm": 0.41890591382980347, "learning_rate": 1.0382196222788108e-05, "loss": 0.5843296051025391, "step": 1282 }, { "epoch": 2.3516483516483517, "grad_norm": 0.21397873759269714, "learning_rate": 1.0324553451959245e-05, "loss": 1.0417900085449219, "step": 1284 }, { "epoch": 2.3553113553113554, "grad_norm": 0.5100418329238892, "learning_rate": 1.0267179565904879e-05, "loss": 0.4865255355834961, "step": 1286 }, { "epoch": 2.358974358974359, "grad_norm": 0.6028478145599365, "learning_rate": 1.0210075462823738e-05, "loss": 0.8683855533599854, "step": 1288 }, { "epoch": 2.3626373626373627, "grad_norm": 0.18368980288505554, "learning_rate": 1.0153242036691071e-05, "loss": 0.8409366607666016, "step": 1290 }, { "epoch": 2.3663003663003663, "grad_norm": 0.13247643411159515, "learning_rate": 1.0096680177244609e-05, "loss": 0.7085995078086853, "step": 1292 }, { "epoch": 2.36996336996337, "grad_norm": 0.27349239587783813, "learning_rate": 1.0040390769970654e-05, "loss": 0.7937886714935303, "step": 1294 }, { "epoch": 2.3736263736263736, "grad_norm": 0.41278505325317383, "learning_rate": 9.98437469609025e-06, "loss": 1.1152591705322266, "step": 1296 }, { "epoch": 2.3772893772893773, "grad_norm": 0.4278549551963806, "learning_rate": 9.928632832545317e-06, "loss": 1.0227138996124268, "step": 1298 }, { "epoch": 2.380952380952381, "grad_norm": 0.21517953276634216, "learning_rate": 9.873166051984998e-06, "loss": 1.0946927070617676, "step": 1300 }, { "epoch": 2.3846153846153846, "grad_norm": 0.3315783143043518, "learning_rate": 9.817975222751931e-06, "loss": 0.7763844728469849, "step": 1302 }, { "epoch": 2.3882783882783882, "grad_norm": 0.44624730944633484, "learning_rate": 9.763061208868699e-06, "loss": 0.4395400285720825, "step": 1304 }, { "epoch": 2.391941391941392, "grad_norm": 0.20948028564453125, "learning_rate": 9.708424870024285e-06, "loss": 0.8480145335197449, "step": 1306 }, { "epoch": 2.3956043956043955, "grad_norm": 0.4898599088191986, "learning_rate": 9.654067061560645e-06, "loss": 1.0664393901824951, "step": 1308 }, { "epoch": 2.399267399267399, "grad_norm": 0.21064221858978271, "learning_rate": 9.599988634459236e-06, "loss": 0.474110871553421, "step": 1310 }, { "epoch": 2.402930402930403, "grad_norm": 0.3536030650138855, "learning_rate": 9.546190435327795e-06, "loss": 1.0670816898345947, "step": 1312 }, { "epoch": 2.4065934065934065, "grad_norm": 0.2895529866218567, "learning_rate": 9.492673306387029e-06, "loss": 0.7731264233589172, "step": 1314 }, { "epoch": 2.41025641025641, "grad_norm": 0.21338780224323273, "learning_rate": 9.43943808545743e-06, "loss": 1.0734295845031738, "step": 1316 }, { "epoch": 2.413919413919414, "grad_norm": 0.5540740489959717, "learning_rate": 9.386485605946164e-06, "loss": 0.7238420248031616, "step": 1318 }, { "epoch": 2.4175824175824174, "grad_norm": 0.1840064972639084, "learning_rate": 9.333816696834049e-06, "loss": 0.6843035221099854, "step": 1320 }, { "epoch": 2.421245421245421, "grad_norm": 0.1636444330215454, "learning_rate": 9.28143218266253e-06, "loss": 0.5108417272567749, "step": 1322 }, { "epoch": 2.4249084249084247, "grad_norm": 0.277536541223526, "learning_rate": 9.229332883520825e-06, "loss": 0.7295075058937073, "step": 1324 }, { "epoch": 2.4285714285714284, "grad_norm": 0.1264895349740982, "learning_rate": 9.177519615033034e-06, "loss": 0.7249910831451416, "step": 1326 }, { "epoch": 2.4322344322344325, "grad_norm": 0.19156897068023682, "learning_rate": 9.125993188345402e-06, "loss": 0.6318535208702087, "step": 1328 }, { "epoch": 2.435897435897436, "grad_norm": 3.024097204208374, "learning_rate": 9.074754410113628e-06, "loss": 0.7735837697982788, "step": 1330 }, { "epoch": 2.4395604395604398, "grad_norm": 0.14816512167453766, "learning_rate": 9.023804082490197e-06, "loss": 0.9631860256195068, "step": 1332 }, { "epoch": 2.4432234432234434, "grad_norm": 0.43298929929733276, "learning_rate": 8.973143003111863e-06, "loss": 0.6613461971282959, "step": 1334 }, { "epoch": 2.446886446886447, "grad_norm": 0.4770919382572174, "learning_rate": 8.922771965087144e-06, "loss": 0.5602841973304749, "step": 1336 }, { "epoch": 2.4505494505494507, "grad_norm": 0.7131723165512085, "learning_rate": 8.872691756983891e-06, "loss": 0.9735853672027588, "step": 1338 }, { "epoch": 2.4542124542124544, "grad_norm": 0.31672975420951843, "learning_rate": 8.822903162816986e-06, "loss": 0.7807232141494751, "step": 1340 }, { "epoch": 2.457875457875458, "grad_norm": 0.08743062615394592, "learning_rate": 8.773406962036031e-06, "loss": 0.5491883754730225, "step": 1342 }, { "epoch": 2.4615384615384617, "grad_norm": 0.16250762343406677, "learning_rate": 8.724203929513133e-06, "loss": 0.7840443253517151, "step": 1344 }, { "epoch": 2.4652014652014653, "grad_norm": 0.3087010383605957, "learning_rate": 8.675294835530828e-06, "loss": 1.0146785974502563, "step": 1346 }, { "epoch": 2.468864468864469, "grad_norm": 0.26062580943107605, "learning_rate": 8.626680445769981e-06, "loss": 1.0559192895889282, "step": 1348 }, { "epoch": 2.4725274725274726, "grad_norm": 0.25493213534355164, "learning_rate": 8.5783615212978e-06, "loss": 0.6265292167663574, "step": 1350 }, { "epoch": 2.4761904761904763, "grad_norm": 0.2113112509250641, "learning_rate": 8.530338818555931e-06, "loss": 0.711513340473175, "step": 1352 }, { "epoch": 2.47985347985348, "grad_norm": 0.5152426362037659, "learning_rate": 8.482613089348618e-06, "loss": 0.8448625802993774, "step": 1354 }, { "epoch": 2.4835164835164836, "grad_norm": 0.11193950474262238, "learning_rate": 8.435185080830927e-06, "loss": 0.8605793118476868, "step": 1356 }, { "epoch": 2.4871794871794872, "grad_norm": 0.18866127729415894, "learning_rate": 8.388055535497064e-06, "loss": 1.0280365943908691, "step": 1358 }, { "epoch": 2.490842490842491, "grad_norm": 0.4851335883140564, "learning_rate": 8.341225191168722e-06, "loss": 0.8356929421424866, "step": 1360 }, { "epoch": 2.4945054945054945, "grad_norm": 0.18892696499824524, "learning_rate": 8.29469478098355e-06, "loss": 0.7504462599754333, "step": 1362 }, { "epoch": 2.498168498168498, "grad_norm": 0.4648627042770386, "learning_rate": 8.24846503338369e-06, "loss": 0.7001456618309021, "step": 1364 }, { "epoch": 2.501831501831502, "grad_norm": 0.2734161615371704, "learning_rate": 8.202536672104326e-06, "loss": 1.046680212020874, "step": 1366 }, { "epoch": 2.5054945054945055, "grad_norm": 0.12593773007392883, "learning_rate": 8.156910416162417e-06, "loss": 0.6337849497795105, "step": 1368 }, { "epoch": 2.509157509157509, "grad_norm": 0.3318650722503662, "learning_rate": 8.111586979845383e-06, "loss": 0.8238204121589661, "step": 1370 }, { "epoch": 2.5128205128205128, "grad_norm": 0.1444663256406784, "learning_rate": 8.066567072699946e-06, "loss": 0.5835400819778442, "step": 1372 }, { "epoch": 2.5164835164835164, "grad_norm": 0.26651981472969055, "learning_rate": 8.021851399521048e-06, "loss": 0.6938576102256775, "step": 1374 }, { "epoch": 2.52014652014652, "grad_norm": 0.5255207419395447, "learning_rate": 7.97744066034077e-06, "loss": 0.9116069078445435, "step": 1376 }, { "epoch": 2.5238095238095237, "grad_norm": 0.20919840037822723, "learning_rate": 7.933335550417405e-06, "loss": 0.7189561724662781, "step": 1378 }, { "epoch": 2.5274725274725274, "grad_norm": 0.16163820028305054, "learning_rate": 7.889536760224557e-06, "loss": 0.7958462834358215, "step": 1380 }, { "epoch": 2.531135531135531, "grad_norm": 0.1546577364206314, "learning_rate": 7.846044975440334e-06, "loss": 0.7697736620903015, "step": 1382 }, { "epoch": 2.5347985347985347, "grad_norm": 0.1974683552980423, "learning_rate": 7.802860876936636e-06, "loss": 0.7680953741073608, "step": 1384 }, { "epoch": 2.5384615384615383, "grad_norm": 0.1976369023323059, "learning_rate": 7.759985140768474e-06, "loss": 1.0490553379058838, "step": 1386 }, { "epoch": 2.542124542124542, "grad_norm": 0.795116126537323, "learning_rate": 7.717418438163362e-06, "loss": 0.7246772050857544, "step": 1388 }, { "epoch": 2.5457875457875456, "grad_norm": 0.20595654845237732, "learning_rate": 7.675161435510869e-06, "loss": 0.7472343444824219, "step": 1390 }, { "epoch": 2.5494505494505493, "grad_norm": 0.3328467309474945, "learning_rate": 7.633214794352146e-06, "loss": 0.8970118761062622, "step": 1392 }, { "epoch": 2.553113553113553, "grad_norm": 0.2709287405014038, "learning_rate": 7.591579171369574e-06, "loss": 0.6892199516296387, "step": 1394 }, { "epoch": 2.5567765567765566, "grad_norm": 0.0892128273844719, "learning_rate": 7.5502552183764845e-06, "loss": 0.2729473412036896, "step": 1396 }, { "epoch": 2.5604395604395602, "grad_norm": 0.2575327455997467, "learning_rate": 7.5092435823069655e-06, "loss": 1.0643916130065918, "step": 1398 }, { "epoch": 2.564102564102564, "grad_norm": 0.42705675959587097, "learning_rate": 7.468544905205714e-06, "loss": 1.0583202838897705, "step": 1400 }, { "epoch": 2.5677655677655675, "grad_norm": 0.3415769040584564, "learning_rate": 7.428159824218017e-06, "loss": 0.7452787756919861, "step": 1402 }, { "epoch": 2.571428571428571, "grad_norm": 0.16148477792739868, "learning_rate": 7.388088971579742e-06, "loss": 1.0535763502120972, "step": 1404 }, { "epoch": 2.575091575091575, "grad_norm": 0.7299770712852478, "learning_rate": 7.348332974607445e-06, "loss": 0.47556331753730774, "step": 1406 }, { "epoch": 2.578754578754579, "grad_norm": 0.1957835853099823, "learning_rate": 7.308892455688579e-06, "loss": 1.0507322549819946, "step": 1408 }, { "epoch": 2.5824175824175826, "grad_norm": 0.8958855867385864, "learning_rate": 7.269768032271726e-06, "loss": 0.7786467671394348, "step": 1410 }, { "epoch": 2.586080586080586, "grad_norm": 0.5932020545005798, "learning_rate": 7.230960316856925e-06, "loss": 0.7563549876213074, "step": 1412 }, { "epoch": 2.58974358974359, "grad_norm": 1.3887028694152832, "learning_rate": 7.192469916986099e-06, "loss": 0.9135017395019531, "step": 1414 }, { "epoch": 2.5934065934065935, "grad_norm": 0.3214952051639557, "learning_rate": 7.154297435233528e-06, "loss": 1.0506607294082642, "step": 1416 }, { "epoch": 2.597069597069597, "grad_norm": 0.4495256841182709, "learning_rate": 7.116443469196446e-06, "loss": 0.6735981702804565, "step": 1418 }, { "epoch": 2.600732600732601, "grad_norm": 0.20636308193206787, "learning_rate": 7.078908611485656e-06, "loss": 1.0373022556304932, "step": 1420 }, { "epoch": 2.6043956043956045, "grad_norm": 0.753204345703125, "learning_rate": 7.041693449716244e-06, "loss": 0.9630070328712463, "step": 1422 }, { "epoch": 2.608058608058608, "grad_norm": 0.2278732806444168, "learning_rate": 7.00479856649842e-06, "loss": 1.1420621871948242, "step": 1424 }, { "epoch": 2.6117216117216118, "grad_norm": 0.28468847274780273, "learning_rate": 6.96822453942837e-06, "loss": 1.1209793090820312, "step": 1426 }, { "epoch": 2.6153846153846154, "grad_norm": 0.3516117036342621, "learning_rate": 6.931971941079208e-06, "loss": 1.1041990518569946, "step": 1428 }, { "epoch": 2.619047619047619, "grad_norm": 0.18154621124267578, "learning_rate": 6.896041338992029e-06, "loss": 1.0311168432235718, "step": 1430 }, { "epoch": 2.6227106227106227, "grad_norm": 0.5342852473258972, "learning_rate": 6.860433295667022e-06, "loss": 0.8854894042015076, "step": 1432 }, { "epoch": 2.6263736263736264, "grad_norm": 0.8895637392997742, "learning_rate": 6.825148368554646e-06, "loss": 0.8600127696990967, "step": 1434 }, { "epoch": 2.63003663003663, "grad_norm": 0.5177263021469116, "learning_rate": 6.790187110046933e-06, "loss": 1.2419568300247192, "step": 1436 }, { "epoch": 2.6336996336996337, "grad_norm": 0.1689794361591339, "learning_rate": 6.755550067468812e-06, "loss": 1.0835387706756592, "step": 1438 }, { "epoch": 2.6373626373626373, "grad_norm": 0.2473716288805008, "learning_rate": 6.721237783069546e-06, "loss": 1.0448336601257324, "step": 1440 }, { "epoch": 2.641025641025641, "grad_norm": 0.14038227498531342, "learning_rate": 6.687250794014273e-06, "loss": 1.0791858434677124, "step": 1442 }, { "epoch": 2.6446886446886446, "grad_norm": 0.30842867493629456, "learning_rate": 6.653589632375541e-06, "loss": 0.9658035635948181, "step": 1444 }, { "epoch": 2.6483516483516483, "grad_norm": 0.30728021264076233, "learning_rate": 6.6202548251250414e-06, "loss": 0.7608792185783386, "step": 1446 }, { "epoch": 2.652014652014652, "grad_norm": 0.19324815273284912, "learning_rate": 6.587246894125303e-06, "loss": 0.7707818150520325, "step": 1448 }, { "epoch": 2.6556776556776556, "grad_norm": 0.16655150055885315, "learning_rate": 6.554566356121558e-06, "loss": 1.038588285446167, "step": 1450 }, { "epoch": 2.659340659340659, "grad_norm": 0.10667542368173599, "learning_rate": 6.522213722733638e-06, "loss": 0.5798073410987854, "step": 1452 }, { "epoch": 2.663003663003663, "grad_norm": 0.23501524329185486, "learning_rate": 6.490189500447973e-06, "loss": 0.6129744648933411, "step": 1454 }, { "epoch": 2.6666666666666665, "grad_norm": 0.5228734612464905, "learning_rate": 6.4584941906096515e-06, "loss": 1.010016918182373, "step": 1456 }, { "epoch": 2.67032967032967, "grad_norm": 0.1614047735929489, "learning_rate": 6.427128289414573e-06, "loss": 0.7019752264022827, "step": 1458 }, { "epoch": 2.6739926739926743, "grad_norm": 0.5308529138565063, "learning_rate": 6.396092287901696e-06, "loss": 0.6532785296440125, "step": 1460 }, { "epoch": 2.677655677655678, "grad_norm": 0.22194698452949524, "learning_rate": 6.365386671945331e-06, "loss": 0.7371679544448853, "step": 1462 }, { "epoch": 2.6813186813186816, "grad_norm": 1.3231110572814941, "learning_rate": 6.335011922247535e-06, "loss": 0.9731379151344299, "step": 1464 }, { "epoch": 2.684981684981685, "grad_norm": 0.15819787979125977, "learning_rate": 6.304968514330613e-06, "loss": 0.8071764707565308, "step": 1466 }, { "epoch": 2.688644688644689, "grad_norm": 0.22951114177703857, "learning_rate": 6.275256918529631e-06, "loss": 0.95961993932724, "step": 1468 }, { "epoch": 2.6923076923076925, "grad_norm": 0.6898245215415955, "learning_rate": 6.245877599985094e-06, "loss": 0.5869124531745911, "step": 1470 }, { "epoch": 2.695970695970696, "grad_norm": 0.4145790934562683, "learning_rate": 6.216831018635631e-06, "loss": 0.7107551097869873, "step": 1472 }, { "epoch": 2.6996336996337, "grad_norm": 0.5302114486694336, "learning_rate": 6.188117629210814e-06, "loss": 0.4114135205745697, "step": 1474 }, { "epoch": 2.7032967032967035, "grad_norm": 0.254586398601532, "learning_rate": 6.159737881224042e-06, "loss": 1.0859794616699219, "step": 1476 }, { "epoch": 2.706959706959707, "grad_norm": 0.6441674828529358, "learning_rate": 6.131692218965484e-06, "loss": 0.5880909562110901, "step": 1478 }, { "epoch": 2.7106227106227108, "grad_norm": 0.1711389720439911, "learning_rate": 6.103981081495144e-06, "loss": 1.0421608686447144, "step": 1480 }, { "epoch": 2.7142857142857144, "grad_norm": 0.20255252718925476, "learning_rate": 6.076604902635971e-06, "loss": 1.0526020526885986, "step": 1482 }, { "epoch": 2.717948717948718, "grad_norm": 0.8872889280319214, "learning_rate": 6.049564110967082e-06, "loss": 0.9588233828544617, "step": 1484 }, { "epoch": 2.7216117216117217, "grad_norm": 0.2663383483886719, "learning_rate": 6.022859129817042e-06, "loss": 1.0862208604812622, "step": 1486 }, { "epoch": 2.7252747252747254, "grad_norm": 0.15567375719547272, "learning_rate": 5.996490377257248e-06, "loss": 1.091988444328308, "step": 1488 }, { "epoch": 2.728937728937729, "grad_norm": 0.3074291944503784, "learning_rate": 5.970458266095369e-06, "loss": 0.4964509606361389, "step": 1490 }, { "epoch": 2.7326007326007327, "grad_norm": 0.20127466320991516, "learning_rate": 5.944763203868888e-06, "loss": 1.0711864233016968, "step": 1492 }, { "epoch": 2.7362637362637363, "grad_norm": 0.22651104629039764, "learning_rate": 5.919405592838733e-06, "loss": 0.5613836050033569, "step": 1494 }, { "epoch": 2.73992673992674, "grad_norm": 0.20297077298164368, "learning_rate": 5.894385829982967e-06, "loss": 1.1413242816925049, "step": 1496 }, { "epoch": 2.7435897435897436, "grad_norm": 0.20519724488258362, "learning_rate": 5.869704306990585e-06, "loss": 1.0266319513320923, "step": 1498 }, { "epoch": 2.7472527472527473, "grad_norm": 3.1034557819366455, "learning_rate": 5.8453614102553605e-06, "loss": 0.6879111528396606, "step": 1500 }, { "epoch": 2.750915750915751, "grad_norm": 0.4873732030391693, "learning_rate": 5.821357520869821e-06, "loss": 0.9691627621650696, "step": 1502 }, { "epoch": 2.7545787545787546, "grad_norm": 0.13368584215641022, "learning_rate": 5.797693014619274e-06, "loss": 1.0586458444595337, "step": 1504 }, { "epoch": 2.758241758241758, "grad_norm": 0.18307749927043915, "learning_rate": 5.774368261975912e-06, "loss": 1.037876844406128, "step": 1506 }, { "epoch": 2.761904761904762, "grad_norm": 0.10759836435317993, "learning_rate": 5.751383628093026e-06, "loss": 0.8368395566940308, "step": 1508 }, { "epoch": 2.7655677655677655, "grad_norm": 0.2728974521160126, "learning_rate": 5.728739472799295e-06, "loss": 0.8790582418441772, "step": 1510 }, { "epoch": 2.769230769230769, "grad_norm": 0.2314990609884262, "learning_rate": 5.706436150593126e-06, "loss": 0.8743211627006531, "step": 1512 }, { "epoch": 2.772893772893773, "grad_norm": 0.15124575793743134, "learning_rate": 5.684474010637134e-06, "loss": 1.0424885749816895, "step": 1514 }, { "epoch": 2.7765567765567765, "grad_norm": 0.21101588010787964, "learning_rate": 5.662853396752659e-06, "loss": 0.943360447883606, "step": 1516 }, { "epoch": 2.78021978021978, "grad_norm": 0.10518907010555267, "learning_rate": 5.641574647414386e-06, "loss": 0.921418309211731, "step": 1518 }, { "epoch": 2.7838827838827838, "grad_norm": 0.6700722575187683, "learning_rate": 5.620638095745048e-06, "loss": 0.4822154641151428, "step": 1520 }, { "epoch": 2.7875457875457874, "grad_norm": 0.21197772026062012, "learning_rate": 5.600044069510221e-06, "loss": 0.708233118057251, "step": 1522 }, { "epoch": 2.791208791208791, "grad_norm": 0.5329016447067261, "learning_rate": 5.579792891113163e-06, "loss": 0.7894065976142883, "step": 1524 }, { "epoch": 2.7948717948717947, "grad_norm": 0.23669062554836273, "learning_rate": 5.5598848775897975e-06, "loss": 1.0895702838897705, "step": 1526 }, { "epoch": 2.7985347985347984, "grad_norm": 0.2975974977016449, "learning_rate": 5.540320340603742e-06, "loss": 1.0676382780075073, "step": 1528 }, { "epoch": 2.802197802197802, "grad_norm": 0.35699358582496643, "learning_rate": 5.52109958644142e-06, "loss": 1.047616958618164, "step": 1530 }, { "epoch": 2.8058608058608057, "grad_norm": 0.16987060010433197, "learning_rate": 5.50222291600727e-06, "loss": 0.9621225595474243, "step": 1532 }, { "epoch": 2.8095238095238093, "grad_norm": 0.34407544136047363, "learning_rate": 5.483690624819042e-06, "loss": 0.7081210613250732, "step": 1534 }, { "epoch": 2.813186813186813, "grad_norm": 0.1482367217540741, "learning_rate": 5.4655030030031616e-06, "loss": 1.1918277740478516, "step": 1536 }, { "epoch": 2.8168498168498166, "grad_norm": 0.34928014874458313, "learning_rate": 5.4476603352901945e-06, "loss": 0.8316318392753601, "step": 1538 }, { "epoch": 2.8205128205128203, "grad_norm": 0.3218369781970978, "learning_rate": 5.430162901010386e-06, "loss": 0.7342109084129333, "step": 1540 }, { "epoch": 2.824175824175824, "grad_norm": 0.21855826675891876, "learning_rate": 5.413010974089283e-06, "loss": 0.8212740421295166, "step": 1542 }, { "epoch": 2.8278388278388276, "grad_norm": 0.2800341248512268, "learning_rate": 5.39620482304346e-06, "loss": 1.0205451250076294, "step": 1544 }, { "epoch": 2.8315018315018317, "grad_norm": 0.21291717886924744, "learning_rate": 5.379744710976301e-06, "loss": 1.0645310878753662, "step": 1546 }, { "epoch": 2.8351648351648353, "grad_norm": 0.15496331453323364, "learning_rate": 5.363630895573892e-06, "loss": 1.1228570938110352, "step": 1548 }, { "epoch": 2.838827838827839, "grad_norm": 0.8466178178787231, "learning_rate": 5.347863629100969e-06, "loss": 0.737494945526123, "step": 1550 }, { "epoch": 2.8424908424908426, "grad_norm": 0.04996780306100845, "learning_rate": 5.332443158396993e-06, "loss": 0.5186063051223755, "step": 1552 }, { "epoch": 2.8461538461538463, "grad_norm": 0.34756842255592346, "learning_rate": 5.317369724872267e-06, "loss": 1.0735743045806885, "step": 1554 }, { "epoch": 2.84981684981685, "grad_norm": 0.25667431950569153, "learning_rate": 5.302643564504168e-06, "loss": 0.8242087364196777, "step": 1556 }, { "epoch": 2.8534798534798536, "grad_norm": 0.5141234397888184, "learning_rate": 5.288264907833445e-06, "loss": 0.9391310811042786, "step": 1558 }, { "epoch": 2.857142857142857, "grad_norm": 0.37790647149086, "learning_rate": 5.274233979960608e-06, "loss": 0.511182427406311, "step": 1560 }, { "epoch": 2.860805860805861, "grad_norm": 0.8588418960571289, "learning_rate": 5.260551000542418e-06, "loss": 0.6005702614784241, "step": 1562 }, { "epoch": 2.8644688644688645, "grad_norm": 0.20086157321929932, "learning_rate": 5.247216183788431e-06, "loss": 0.7859454154968262, "step": 1564 }, { "epoch": 2.868131868131868, "grad_norm": 0.5744203925132751, "learning_rate": 5.234229738457658e-06, "loss": 0.5249977111816406, "step": 1566 }, { "epoch": 2.871794871794872, "grad_norm": 0.588792085647583, "learning_rate": 5.221591867855286e-06, "loss": 0.677643895149231, "step": 1568 }, { "epoch": 2.8754578754578755, "grad_norm": 0.06420119106769562, "learning_rate": 5.209302769829507e-06, "loss": 0.5973821878433228, "step": 1570 }, { "epoch": 2.879120879120879, "grad_norm": 0.18718576431274414, "learning_rate": 5.197362636768409e-06, "loss": 0.613332986831665, "step": 1572 }, { "epoch": 2.8827838827838828, "grad_norm": 0.2197110801935196, "learning_rate": 5.185771655596972e-06, "loss": 0.9175146818161011, "step": 1574 }, { "epoch": 2.8864468864468864, "grad_norm": 5.933550834655762, "learning_rate": 5.174530007774135e-06, "loss": 0.8471065163612366, "step": 1576 }, { "epoch": 2.89010989010989, "grad_norm": 0.09670909494161606, "learning_rate": 5.1636378692899665e-06, "loss": 0.8234681487083435, "step": 1578 }, { "epoch": 2.8937728937728937, "grad_norm": 0.1601630300283432, "learning_rate": 5.153095410662896e-06, "loss": 1.1230218410491943, "step": 1580 }, { "epoch": 2.8974358974358974, "grad_norm": 2.9173190593719482, "learning_rate": 5.142902796937052e-06, "loss": 0.7799305319786072, "step": 1582 }, { "epoch": 2.901098901098901, "grad_norm": 0.03245487064123154, "learning_rate": 5.133060187679675e-06, "loss": 0.7026646733283997, "step": 1584 }, { "epoch": 2.9047619047619047, "grad_norm": 0.26367539167404175, "learning_rate": 5.1235677369786265e-06, "loss": 0.6960863471031189, "step": 1586 }, { "epoch": 2.9084249084249083, "grad_norm": 0.24362139403820038, "learning_rate": 5.1144255934399655e-06, "loss": 1.0824929475784302, "step": 1588 }, { "epoch": 2.912087912087912, "grad_norm": 0.23772361874580383, "learning_rate": 5.105633900185632e-06, "loss": 1.0874613523483276, "step": 1590 }, { "epoch": 2.9157509157509156, "grad_norm": 0.30294981598854065, "learning_rate": 5.0971927948512e-06, "loss": 0.4234909117221832, "step": 1592 }, { "epoch": 2.9194139194139193, "grad_norm": 0.24272647500038147, "learning_rate": 5.089102409583725e-06, "loss": 1.0570107698440552, "step": 1594 }, { "epoch": 2.9230769230769234, "grad_norm": 0.20444297790527344, "learning_rate": 5.081362871039677e-06, "loss": 0.6874979138374329, "step": 1596 }, { "epoch": 2.926739926739927, "grad_norm": 0.40901777148246765, "learning_rate": 5.073974300382959e-06, "loss": 1.0847806930541992, "step": 1598 }, { "epoch": 2.9304029304029307, "grad_norm": 0.06832870841026306, "learning_rate": 5.066936813282996e-06, "loss": 0.6706178784370422, "step": 1600 }, { "epoch": 2.9340659340659343, "grad_norm": 0.16809964179992676, "learning_rate": 5.060250519912951e-06, "loss": 1.0802940130233765, "step": 1602 }, { "epoch": 2.937728937728938, "grad_norm": 0.11709550023078918, "learning_rate": 5.053915524947969e-06, "loss": 0.7102103233337402, "step": 1604 }, { "epoch": 2.9413919413919416, "grad_norm": 0.18930549919605255, "learning_rate": 5.047931927563565e-06, "loss": 1.052394986152649, "step": 1606 }, { "epoch": 2.9450549450549453, "grad_norm": 0.17763479053974152, "learning_rate": 5.042299821434059e-06, "loss": 0.6530783772468567, "step": 1608 }, { "epoch": 2.948717948717949, "grad_norm": 0.35226595401763916, "learning_rate": 5.037019294731103e-06, "loss": 0.8992307186126709, "step": 1610 }, { "epoch": 2.9523809523809526, "grad_norm": 0.30645254254341125, "learning_rate": 5.032090430122316e-06, "loss": 0.7746174335479736, "step": 1612 }, { "epoch": 2.956043956043956, "grad_norm": 0.7316517233848572, "learning_rate": 5.0275133047699814e-06, "loss": 0.6159262657165527, "step": 1614 }, { "epoch": 2.95970695970696, "grad_norm": 0.20291663706302643, "learning_rate": 5.023287990329835e-06, "loss": 0.737842857837677, "step": 1616 }, { "epoch": 2.9633699633699635, "grad_norm": 0.3495129942893982, "learning_rate": 5.019414552949955e-06, "loss": 1.2598001956939697, "step": 1618 }, { "epoch": 2.967032967032967, "grad_norm": 0.15239816904067993, "learning_rate": 5.015893053269714e-06, "loss": 1.167555332183838, "step": 1620 }, { "epoch": 2.970695970695971, "grad_norm": 0.20208200812339783, "learning_rate": 5.012723546418838e-06, "loss": 0.8371485471725464, "step": 1622 }, { "epoch": 2.9743589743589745, "grad_norm": 0.15967847406864166, "learning_rate": 5.009906082016538e-06, "loss": 0.733574390411377, "step": 1624 }, { "epoch": 2.978021978021978, "grad_norm": 0.17362992465496063, "learning_rate": 5.007440704170741e-06, "loss": 0.7777770161628723, "step": 1626 }, { "epoch": 2.9816849816849818, "grad_norm": 0.15043112635612488, "learning_rate": 5.005327451477387e-06, "loss": 0.8784082531929016, "step": 1628 }, { "epoch": 2.9853479853479854, "grad_norm": 0.19129148125648499, "learning_rate": 5.003566357019837e-06, "loss": 1.2974438667297363, "step": 1630 }, { "epoch": 2.989010989010989, "grad_norm": 0.3431568741798401, "learning_rate": 5.002157448368347e-06, "loss": 0.9204556345939636, "step": 1632 }, { "epoch": 2.9926739926739927, "grad_norm": 0.16419149935245514, "learning_rate": 5.001100747579644e-06, "loss": 0.6911695003509521, "step": 1634 }, { "epoch": 2.9963369963369964, "grad_norm": 0.48860520124435425, "learning_rate": 5.000396271196573e-06, "loss": 1.1634691953659058, "step": 1636 }, { "epoch": 3.0, "grad_norm": 0.28968650102615356, "learning_rate": 5.000044030247836e-06, "loss": 1.0265119075775146, "step": 1638 }, { "epoch": 3.0, "step": 1638, "total_flos": 8.4482141520606e+18, "train_loss": 0.9791712072451618, "train_runtime": 55340.5169, "train_samples_per_second": 0.71, "train_steps_per_second": 0.03 } ], "logging_steps": 2, "max_steps": 1638, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 99999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.4482141520606e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }