diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13509 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.305175263849447, + "eval_steps": 100, + "global_step": 3850, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006781672529987708, + "grad_norm": 19.302452087402344, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.7636, + "step": 2 + }, + { + "epoch": 0.0013563345059975417, + "grad_norm": 17.56575584411621, + "learning_rate": 3e-06, + "loss": 0.6728, + "step": 4 + }, + { + "epoch": 0.0020345017589963126, + "grad_norm": 13.646787643432617, + "learning_rate": 5e-06, + "loss": 0.5993, + "step": 6 + }, + { + "epoch": 0.0027126690119950833, + "grad_norm": 4.150851726531982, + "learning_rate": 7.000000000000001e-06, + "loss": 0.3705, + "step": 8 + }, + { + "epoch": 0.003390836264993854, + "grad_norm": 3.0254945755004883, + "learning_rate": 9e-06, + "loss": 0.2513, + "step": 10 + }, + { + "epoch": 0.004069003517992625, + "grad_norm": 2.2863237857818604, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.189, + "step": 12 + }, + { + "epoch": 0.004747170770991396, + "grad_norm": 1.6792925596237183, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.0997, + "step": 14 + }, + { + "epoch": 0.005425338023990167, + "grad_norm": 1.6237634420394897, + "learning_rate": 1.5e-05, + "loss": 0.0595, + "step": 16 + }, + { + "epoch": 0.006103505276988937, + "grad_norm": 1.5864434242248535, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.0527, + "step": 18 + }, + { + "epoch": 0.006781672529987708, + "grad_norm": 1.0857024192810059, + "learning_rate": 1.9e-05, + "loss": 0.0437, + "step": 20 + }, + { + "epoch": 0.007459839782986479, + "grad_norm": 0.7278645634651184, + "learning_rate": 2.1e-05, + "loss": 0.0376, + "step": 22 + }, + { + "epoch": 0.00813800703598525, + "grad_norm": 1.2589967250823975, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.0301, + "step": 24 + }, + { + "epoch": 0.00881617428898402, + "grad_norm": 1.5093748569488525, + "learning_rate": 2.5e-05, + "loss": 0.0269, + "step": 26 + }, + { + "epoch": 0.009494341541982792, + "grad_norm": 1.0473034381866455, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.0284, + "step": 28 + }, + { + "epoch": 0.010172508794981562, + "grad_norm": 0.6267300844192505, + "learning_rate": 2.9e-05, + "loss": 0.0188, + "step": 30 + }, + { + "epoch": 0.010850676047980333, + "grad_norm": 0.6511037349700928, + "learning_rate": 3.1e-05, + "loss": 0.0195, + "step": 32 + }, + { + "epoch": 0.011528843300979103, + "grad_norm": 0.47719255089759827, + "learning_rate": 3.3e-05, + "loss": 0.0133, + "step": 34 + }, + { + "epoch": 0.012207010553977875, + "grad_norm": 1.834301233291626, + "learning_rate": 3.5e-05, + "loss": 0.0234, + "step": 36 + }, + { + "epoch": 0.012885177806976646, + "grad_norm": 0.5575568675994873, + "learning_rate": 3.7e-05, + "loss": 0.0178, + "step": 38 + }, + { + "epoch": 0.013563345059975416, + "grad_norm": 0.4530166983604431, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.0158, + "step": 40 + }, + { + "epoch": 0.014241512312974188, + "grad_norm": 0.41800159215927124, + "learning_rate": 4.1e-05, + "loss": 0.0168, + "step": 42 + }, + { + "epoch": 0.014919679565972958, + "grad_norm": 13.026510238647461, + "learning_rate": 4.3e-05, + "loss": 0.0292, + "step": 44 + }, + { + "epoch": 0.01559784681897173, + "grad_norm": 0.39133763313293457, + "learning_rate": 4.5e-05, + "loss": 0.0147, + "step": 46 + }, + { + "epoch": 0.0162760140719705, + "grad_norm": 0.3970816135406494, + "learning_rate": 4.7e-05, + "loss": 0.0121, + "step": 48 + }, + { + "epoch": 0.01695418132496927, + "grad_norm": 0.34943029284477234, + "learning_rate": 4.9e-05, + "loss": 0.0109, + "step": 50 + }, + { + "epoch": 0.01763234857796804, + "grad_norm": 0.46713560819625854, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.0111, + "step": 52 + }, + { + "epoch": 0.018310515830966812, + "grad_norm": 0.3805915415287018, + "learning_rate": 5.300000000000001e-05, + "loss": 0.0068, + "step": 54 + }, + { + "epoch": 0.018988683083965584, + "grad_norm": 0.6533093452453613, + "learning_rate": 5.500000000000001e-05, + "loss": 0.0113, + "step": 56 + }, + { + "epoch": 0.019666850336964355, + "grad_norm": 0.5974678993225098, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.0073, + "step": 58 + }, + { + "epoch": 0.020345017589963123, + "grad_norm": 0.44925084710121155, + "learning_rate": 5.9e-05, + "loss": 0.0082, + "step": 60 + }, + { + "epoch": 0.021023184842961895, + "grad_norm": 0.7732060551643372, + "learning_rate": 6.1e-05, + "loss": 0.0101, + "step": 62 + }, + { + "epoch": 0.021701352095960667, + "grad_norm": 0.9566466808319092, + "learning_rate": 6.3e-05, + "loss": 0.0128, + "step": 64 + }, + { + "epoch": 0.022379519348959438, + "grad_norm": 0.5865803360939026, + "learning_rate": 6.500000000000001e-05, + "loss": 0.0091, + "step": 66 + }, + { + "epoch": 0.023057686601958206, + "grad_norm": 0.3004082143306732, + "learning_rate": 6.7e-05, + "loss": 0.0097, + "step": 68 + }, + { + "epoch": 0.023735853854956978, + "grad_norm": 0.34375691413879395, + "learning_rate": 6.9e-05, + "loss": 0.0086, + "step": 70 + }, + { + "epoch": 0.02441402110795575, + "grad_norm": 0.5289464592933655, + "learning_rate": 7.1e-05, + "loss": 0.0554, + "step": 72 + }, + { + "epoch": 0.02509218836095452, + "grad_norm": 0.5276769399642944, + "learning_rate": 7.3e-05, + "loss": 0.0089, + "step": 74 + }, + { + "epoch": 0.025770355613953293, + "grad_norm": 0.3362698554992676, + "learning_rate": 7.500000000000001e-05, + "loss": 0.0075, + "step": 76 + }, + { + "epoch": 0.02644852286695206, + "grad_norm": 0.3724222779273987, + "learning_rate": 7.7e-05, + "loss": 0.0069, + "step": 78 + }, + { + "epoch": 0.027126690119950832, + "grad_norm": 0.33922865986824036, + "learning_rate": 7.900000000000001e-05, + "loss": 0.007, + "step": 80 + }, + { + "epoch": 0.027804857372949604, + "grad_norm": 0.25453755259513855, + "learning_rate": 8.1e-05, + "loss": 0.005, + "step": 82 + }, + { + "epoch": 0.028483024625948376, + "grad_norm": 0.21005849540233612, + "learning_rate": 8.3e-05, + "loss": 0.0043, + "step": 84 + }, + { + "epoch": 0.029161191878947144, + "grad_norm": 0.22220443189144135, + "learning_rate": 8.5e-05, + "loss": 0.0046, + "step": 86 + }, + { + "epoch": 0.029839359131945915, + "grad_norm": 0.2243790626525879, + "learning_rate": 8.7e-05, + "loss": 0.0036, + "step": 88 + }, + { + "epoch": 0.030517526384944687, + "grad_norm": 0.2276381105184555, + "learning_rate": 8.900000000000001e-05, + "loss": 0.0051, + "step": 90 + }, + { + "epoch": 0.03119569363794346, + "grad_norm": 0.19878089427947998, + "learning_rate": 9.1e-05, + "loss": 0.004, + "step": 92 + }, + { + "epoch": 0.03187386089094223, + "grad_norm": 0.20180144906044006, + "learning_rate": 9.300000000000001e-05, + "loss": 0.0055, + "step": 94 + }, + { + "epoch": 0.032552028143941, + "grad_norm": 0.22953402996063232, + "learning_rate": 9.5e-05, + "loss": 0.0036, + "step": 96 + }, + { + "epoch": 0.03323019539693977, + "grad_norm": 0.20858259499073029, + "learning_rate": 9.7e-05, + "loss": 0.0039, + "step": 98 + }, + { + "epoch": 0.03390836264993854, + "grad_norm": 0.19686263799667358, + "learning_rate": 9.900000000000001e-05, + "loss": 0.0042, + "step": 100 + }, + { + "epoch": 0.03458652990293731, + "grad_norm": 0.1803613305091858, + "learning_rate": 9.999999748250068e-05, + "loss": 0.0048, + "step": 102 + }, + { + "epoch": 0.03526469715593608, + "grad_norm": 0.28180426359176636, + "learning_rate": 9.999997734250768e-05, + "loss": 0.0051, + "step": 104 + }, + { + "epoch": 0.035942864408934856, + "grad_norm": 0.2615203559398651, + "learning_rate": 9.999993706252977e-05, + "loss": 0.0036, + "step": 106 + }, + { + "epoch": 0.036621031661933624, + "grad_norm": 0.200982928276062, + "learning_rate": 9.999987664258321e-05, + "loss": 0.0042, + "step": 108 + }, + { + "epoch": 0.03729919891493239, + "grad_norm": 0.21157991886138916, + "learning_rate": 9.999979608269231e-05, + "loss": 0.0038, + "step": 110 + }, + { + "epoch": 0.03797736616793117, + "grad_norm": 0.16271452605724335, + "learning_rate": 9.999969538288952e-05, + "loss": 0.0023, + "step": 112 + }, + { + "epoch": 0.038655533420929936, + "grad_norm": 0.15799306333065033, + "learning_rate": 9.999957454321543e-05, + "loss": 0.0032, + "step": 114 + }, + { + "epoch": 0.03933370067392871, + "grad_norm": 0.23333343863487244, + "learning_rate": 9.999943356371866e-05, + "loss": 0.0029, + "step": 116 + }, + { + "epoch": 0.04001186792692748, + "grad_norm": 0.2120216190814972, + "learning_rate": 9.999927244445606e-05, + "loss": 0.0032, + "step": 118 + }, + { + "epoch": 0.04069003517992625, + "grad_norm": 0.12919344007968903, + "learning_rate": 9.999909118549248e-05, + "loss": 0.0016, + "step": 120 + }, + { + "epoch": 0.04136820243292502, + "grad_norm": 0.12390448898077011, + "learning_rate": 9.999888978690095e-05, + "loss": 0.0025, + "step": 122 + }, + { + "epoch": 0.04204636968592379, + "grad_norm": 0.21758395433425903, + "learning_rate": 9.99986682487626e-05, + "loss": 0.0026, + "step": 124 + }, + { + "epoch": 0.042724536938922565, + "grad_norm": 0.21877099573612213, + "learning_rate": 9.999842657116665e-05, + "loss": 0.0017, + "step": 126 + }, + { + "epoch": 0.04340270419192133, + "grad_norm": 0.1546417772769928, + "learning_rate": 9.999816475421046e-05, + "loss": 0.0026, + "step": 128 + }, + { + "epoch": 0.0440808714449201, + "grad_norm": 0.1697741150856018, + "learning_rate": 9.999788279799951e-05, + "loss": 0.0021, + "step": 130 + }, + { + "epoch": 0.044759038697918876, + "grad_norm": 0.23591545224189758, + "learning_rate": 9.999758070264732e-05, + "loss": 0.0021, + "step": 132 + }, + { + "epoch": 0.045437205950917645, + "grad_norm": 0.23597721755504608, + "learning_rate": 9.999725846827562e-05, + "loss": 0.0022, + "step": 134 + }, + { + "epoch": 0.04611537320391641, + "grad_norm": 0.09806650131940842, + "learning_rate": 9.999691609501417e-05, + "loss": 0.0014, + "step": 136 + }, + { + "epoch": 0.04679354045691519, + "grad_norm": 0.17510582506656647, + "learning_rate": 9.99965535830009e-05, + "loss": 0.0011, + "step": 138 + }, + { + "epoch": 0.047471707709913956, + "grad_norm": 0.16027089953422546, + "learning_rate": 9.999617093238182e-05, + "loss": 0.002, + "step": 140 + }, + { + "epoch": 0.04814987496291273, + "grad_norm": 0.14597536623477936, + "learning_rate": 9.999576814331109e-05, + "loss": 0.0013, + "step": 142 + }, + { + "epoch": 0.0488280422159115, + "grad_norm": 0.2963673770427704, + "learning_rate": 9.999534521595093e-05, + "loss": 0.0019, + "step": 144 + }, + { + "epoch": 0.04950620946891027, + "grad_norm": 0.32802748680114746, + "learning_rate": 9.999490215047167e-05, + "loss": 0.0022, + "step": 146 + }, + { + "epoch": 0.05018437672190904, + "grad_norm": 0.17002712190151215, + "learning_rate": 9.999443894705182e-05, + "loss": 0.0016, + "step": 148 + }, + { + "epoch": 0.05086254397490781, + "grad_norm": 0.10025887191295624, + "learning_rate": 9.999395560587795e-05, + "loss": 0.0014, + "step": 150 + }, + { + "epoch": 0.051540711227906585, + "grad_norm": 0.24780897796154022, + "learning_rate": 9.999345212714471e-05, + "loss": 0.0022, + "step": 152 + }, + { + "epoch": 0.052218878480905354, + "grad_norm": 0.18174393475055695, + "learning_rate": 9.999292851105495e-05, + "loss": 0.0023, + "step": 154 + }, + { + "epoch": 0.05289704573390412, + "grad_norm": 0.34791404008865356, + "learning_rate": 9.999238475781957e-05, + "loss": 0.0033, + "step": 156 + }, + { + "epoch": 0.0535752129869029, + "grad_norm": 0.30141982436180115, + "learning_rate": 9.999182086765756e-05, + "loss": 0.0033, + "step": 158 + }, + { + "epoch": 0.054253380239901665, + "grad_norm": 0.26267531514167786, + "learning_rate": 9.999123684079612e-05, + "loss": 0.002, + "step": 160 + }, + { + "epoch": 0.05493154749290044, + "grad_norm": 0.08853629976511002, + "learning_rate": 9.999063267747044e-05, + "loss": 0.0022, + "step": 162 + }, + { + "epoch": 0.05560971474589921, + "grad_norm": 2.4076087474823, + "learning_rate": 9.999000837792391e-05, + "loss": 0.0052, + "step": 164 + }, + { + "epoch": 0.056287881998897976, + "grad_norm": 1.0381807088851929, + "learning_rate": 9.998936394240797e-05, + "loss": 0.0046, + "step": 166 + }, + { + "epoch": 0.05696604925189675, + "grad_norm": 0.7304306626319885, + "learning_rate": 9.998869937118222e-05, + "loss": 0.015, + "step": 168 + }, + { + "epoch": 0.05764421650489552, + "grad_norm": 1.530617594718933, + "learning_rate": 9.998801466451433e-05, + "loss": 0.0075, + "step": 170 + }, + { + "epoch": 0.05832238375789429, + "grad_norm": 11.509839057922363, + "learning_rate": 9.998730982268012e-05, + "loss": 0.0751, + "step": 172 + }, + { + "epoch": 0.05900055101089306, + "grad_norm": 2.877173662185669, + "learning_rate": 9.998658484596349e-05, + "loss": 0.0189, + "step": 174 + }, + { + "epoch": 0.05967871826389183, + "grad_norm": 0.45325320959091187, + "learning_rate": 9.998583973465646e-05, + "loss": 0.0055, + "step": 176 + }, + { + "epoch": 0.060356885516890606, + "grad_norm": 0.32094088196754456, + "learning_rate": 9.998507448905917e-05, + "loss": 0.0037, + "step": 178 + }, + { + "epoch": 0.061035052769889374, + "grad_norm": 0.36653316020965576, + "learning_rate": 9.998428910947983e-05, + "loss": 0.0098, + "step": 180 + }, + { + "epoch": 0.06171322002288814, + "grad_norm": 0.2647911608219147, + "learning_rate": 9.998348359623484e-05, + "loss": 0.0063, + "step": 182 + }, + { + "epoch": 0.06239138727588692, + "grad_norm": 1.8155499696731567, + "learning_rate": 9.998265794964863e-05, + "loss": 0.019, + "step": 184 + }, + { + "epoch": 0.06306955452888569, + "grad_norm": 0.6207036972045898, + "learning_rate": 9.998181217005376e-05, + "loss": 0.0094, + "step": 186 + }, + { + "epoch": 0.06374772178188445, + "grad_norm": 0.26838263869285583, + "learning_rate": 9.998094625779094e-05, + "loss": 0.0048, + "step": 188 + }, + { + "epoch": 0.06442588903488322, + "grad_norm": 0.21736088395118713, + "learning_rate": 9.998006021320893e-05, + "loss": 0.0046, + "step": 190 + }, + { + "epoch": 0.065104056287882, + "grad_norm": 0.22532005608081818, + "learning_rate": 9.997915403666466e-05, + "loss": 0.004, + "step": 192 + }, + { + "epoch": 0.06578222354088077, + "grad_norm": 0.35037961602211, + "learning_rate": 9.997822772852312e-05, + "loss": 0.0028, + "step": 194 + }, + { + "epoch": 0.06646039079387954, + "grad_norm": 0.5995946526527405, + "learning_rate": 9.997728128915742e-05, + "loss": 0.0031, + "step": 196 + }, + { + "epoch": 0.06713855804687831, + "grad_norm": 0.1885141283273697, + "learning_rate": 9.99763147189488e-05, + "loss": 0.0035, + "step": 198 + }, + { + "epoch": 0.06781672529987708, + "grad_norm": 0.6732299327850342, + "learning_rate": 9.997532801828658e-05, + "loss": 0.0037, + "step": 200 + }, + { + "epoch": 0.06849489255287586, + "grad_norm": 0.6802183389663696, + "learning_rate": 9.997432118756822e-05, + "loss": 0.0161, + "step": 202 + }, + { + "epoch": 0.06917305980587463, + "grad_norm": 0.1995115876197815, + "learning_rate": 9.997329422719926e-05, + "loss": 0.0036, + "step": 204 + }, + { + "epoch": 0.0698512270588734, + "grad_norm": 0.156790092587471, + "learning_rate": 9.997224713759335e-05, + "loss": 0.0053, + "step": 206 + }, + { + "epoch": 0.07052939431187216, + "grad_norm": 0.11744780093431473, + "learning_rate": 9.997117991917228e-05, + "loss": 0.0022, + "step": 208 + }, + { + "epoch": 0.07120756156487093, + "grad_norm": 0.16716186702251434, + "learning_rate": 9.997009257236592e-05, + "loss": 0.0026, + "step": 210 + }, + { + "epoch": 0.07188572881786971, + "grad_norm": 0.15055015683174133, + "learning_rate": 9.996898509761224e-05, + "loss": 0.0028, + "step": 212 + }, + { + "epoch": 0.07256389607086848, + "grad_norm": 0.5399858951568604, + "learning_rate": 9.996785749535733e-05, + "loss": 0.0031, + "step": 214 + }, + { + "epoch": 0.07324206332386725, + "grad_norm": 0.3503137528896332, + "learning_rate": 9.99667097660554e-05, + "loss": 0.0036, + "step": 216 + }, + { + "epoch": 0.07392023057686602, + "grad_norm": 0.09014566987752914, + "learning_rate": 9.996554191016875e-05, + "loss": 0.0022, + "step": 218 + }, + { + "epoch": 0.07459839782986478, + "grad_norm": 0.11608397960662842, + "learning_rate": 9.996435392816781e-05, + "loss": 0.0023, + "step": 220 + }, + { + "epoch": 0.07527656508286357, + "grad_norm": 0.08935557305812836, + "learning_rate": 9.996314582053106e-05, + "loss": 0.0021, + "step": 222 + }, + { + "epoch": 0.07595473233586233, + "grad_norm": 0.21059103310108185, + "learning_rate": 9.996191758774515e-05, + "loss": 0.0023, + "step": 224 + }, + { + "epoch": 0.0766328995888611, + "grad_norm": 0.09767665714025497, + "learning_rate": 9.996066923030484e-05, + "loss": 0.0009, + "step": 226 + }, + { + "epoch": 0.07731106684185987, + "grad_norm": 0.10100996494293213, + "learning_rate": 9.995940074871291e-05, + "loss": 0.0019, + "step": 228 + }, + { + "epoch": 0.07798923409485864, + "grad_norm": 0.12427397817373276, + "learning_rate": 9.995811214348034e-05, + "loss": 0.001, + "step": 230 + }, + { + "epoch": 0.07866740134785742, + "grad_norm": 0.16718646883964539, + "learning_rate": 9.995680341512619e-05, + "loss": 0.001, + "step": 232 + }, + { + "epoch": 0.07934556860085619, + "grad_norm": 0.09956853836774826, + "learning_rate": 9.995547456417757e-05, + "loss": 0.0012, + "step": 234 + }, + { + "epoch": 0.08002373585385496, + "grad_norm": 0.10336117446422577, + "learning_rate": 9.995412559116979e-05, + "loss": 0.0012, + "step": 236 + }, + { + "epoch": 0.08070190310685373, + "grad_norm": 0.08326272666454315, + "learning_rate": 9.995275649664619e-05, + "loss": 0.0009, + "step": 238 + }, + { + "epoch": 0.0813800703598525, + "grad_norm": 0.1553446352481842, + "learning_rate": 9.995136728115825e-05, + "loss": 0.0018, + "step": 240 + }, + { + "epoch": 0.08205823761285128, + "grad_norm": 0.19187629222869873, + "learning_rate": 9.994995794526554e-05, + "loss": 0.0009, + "step": 242 + }, + { + "epoch": 0.08273640486585004, + "grad_norm": 0.12646210193634033, + "learning_rate": 9.994852848953574e-05, + "loss": 0.0021, + "step": 244 + }, + { + "epoch": 0.08341457211884881, + "grad_norm": 0.0978604182600975, + "learning_rate": 9.994707891454464e-05, + "loss": 0.0011, + "step": 246 + }, + { + "epoch": 0.08409273937184758, + "grad_norm": 0.1135120764374733, + "learning_rate": 9.994560922087612e-05, + "loss": 0.001, + "step": 248 + }, + { + "epoch": 0.08477090662484635, + "grad_norm": 0.1019337996840477, + "learning_rate": 9.994411940912219e-05, + "loss": 0.0011, + "step": 250 + }, + { + "epoch": 0.08544907387784513, + "grad_norm": 0.08082269132137299, + "learning_rate": 9.994260947988293e-05, + "loss": 0.0008, + "step": 252 + }, + { + "epoch": 0.0861272411308439, + "grad_norm": 0.03959016129374504, + "learning_rate": 9.994107943376654e-05, + "loss": 0.0007, + "step": 254 + }, + { + "epoch": 0.08680540838384267, + "grad_norm": 0.06835722178220749, + "learning_rate": 9.993952927138934e-05, + "loss": 0.0007, + "step": 256 + }, + { + "epoch": 0.08748357563684143, + "grad_norm": 0.03966667875647545, + "learning_rate": 9.99379589933757e-05, + "loss": 0.0007, + "step": 258 + }, + { + "epoch": 0.0881617428898402, + "grad_norm": 0.050738293677568436, + "learning_rate": 9.993636860035816e-05, + "loss": 0.0012, + "step": 260 + }, + { + "epoch": 0.08883991014283897, + "grad_norm": 0.03119371458888054, + "learning_rate": 9.993475809297732e-05, + "loss": 0.001, + "step": 262 + }, + { + "epoch": 0.08951807739583775, + "grad_norm": 0.05491959676146507, + "learning_rate": 9.99331274718819e-05, + "loss": 0.0009, + "step": 264 + }, + { + "epoch": 0.09019624464883652, + "grad_norm": 0.07895030081272125, + "learning_rate": 9.99314767377287e-05, + "loss": 0.0009, + "step": 266 + }, + { + "epoch": 0.09087441190183529, + "grad_norm": 0.062128305435180664, + "learning_rate": 9.992980589118264e-05, + "loss": 0.0008, + "step": 268 + }, + { + "epoch": 0.09155257915483406, + "grad_norm": 0.08582911640405655, + "learning_rate": 9.992811493291674e-05, + "loss": 0.001, + "step": 270 + }, + { + "epoch": 0.09223074640783283, + "grad_norm": 0.055452585220336914, + "learning_rate": 9.992640386361211e-05, + "loss": 0.0006, + "step": 272 + }, + { + "epoch": 0.09290891366083161, + "grad_norm": 0.035648979246616364, + "learning_rate": 9.992467268395798e-05, + "loss": 0.0004, + "step": 274 + }, + { + "epoch": 0.09358708091383038, + "grad_norm": 0.09037120640277863, + "learning_rate": 9.992292139465165e-05, + "loss": 0.0008, + "step": 276 + }, + { + "epoch": 0.09426524816682914, + "grad_norm": 0.05466959998011589, + "learning_rate": 9.992114999639857e-05, + "loss": 0.0006, + "step": 278 + }, + { + "epoch": 0.09494341541982791, + "grad_norm": 0.048906926065683365, + "learning_rate": 9.991935848991222e-05, + "loss": 0.0011, + "step": 280 + }, + { + "epoch": 0.09562158267282668, + "grad_norm": 0.028933236375451088, + "learning_rate": 9.991754687591428e-05, + "loss": 0.0004, + "step": 282 + }, + { + "epoch": 0.09629974992582546, + "grad_norm": 0.2024703323841095, + "learning_rate": 9.991571515513439e-05, + "loss": 0.0012, + "step": 284 + }, + { + "epoch": 0.09697791717882423, + "grad_norm": 0.05048273503780365, + "learning_rate": 9.991386332831042e-05, + "loss": 0.0007, + "step": 286 + }, + { + "epoch": 0.097656084431823, + "grad_norm": 0.04887642711400986, + "learning_rate": 9.991199139618827e-05, + "loss": 0.0004, + "step": 288 + }, + { + "epoch": 0.09833425168482177, + "grad_norm": 0.13716323673725128, + "learning_rate": 9.991009935952196e-05, + "loss": 0.0014, + "step": 290 + }, + { + "epoch": 0.09901241893782053, + "grad_norm": 0.06719297915697098, + "learning_rate": 9.990818721907358e-05, + "loss": 0.0004, + "step": 292 + }, + { + "epoch": 0.09969058619081932, + "grad_norm": 0.024962617084383965, + "learning_rate": 9.990625497561337e-05, + "loss": 0.0005, + "step": 294 + }, + { + "epoch": 0.10036875344381808, + "grad_norm": 0.07603111863136292, + "learning_rate": 9.990430262991962e-05, + "loss": 0.0009, + "step": 296 + }, + { + "epoch": 0.10104692069681685, + "grad_norm": 0.06402367353439331, + "learning_rate": 9.990233018277872e-05, + "loss": 0.0004, + "step": 298 + }, + { + "epoch": 0.10172508794981562, + "grad_norm": 0.013269501738250256, + "learning_rate": 9.990033763498523e-05, + "loss": 0.0002, + "step": 300 + }, + { + "epoch": 0.10240325520281439, + "grad_norm": 0.04440806433558464, + "learning_rate": 9.989832498734168e-05, + "loss": 0.0006, + "step": 302 + }, + { + "epoch": 0.10308142245581317, + "grad_norm": 0.09003011137247086, + "learning_rate": 9.989629224065879e-05, + "loss": 0.0009, + "step": 304 + }, + { + "epoch": 0.10375958970881194, + "grad_norm": 0.055034276098012924, + "learning_rate": 9.989423939575538e-05, + "loss": 0.0005, + "step": 306 + }, + { + "epoch": 0.10443775696181071, + "grad_norm": 0.0314326211810112, + "learning_rate": 9.989216645345827e-05, + "loss": 0.0003, + "step": 308 + }, + { + "epoch": 0.10511592421480948, + "grad_norm": 0.08611449599266052, + "learning_rate": 9.98900734146025e-05, + "loss": 0.0003, + "step": 310 + }, + { + "epoch": 0.10579409146780824, + "grad_norm": 0.0826747715473175, + "learning_rate": 9.988796028003114e-05, + "loss": 0.0009, + "step": 312 + }, + { + "epoch": 0.10647225872080703, + "grad_norm": 0.056868743151426315, + "learning_rate": 9.988582705059531e-05, + "loss": 0.0003, + "step": 314 + }, + { + "epoch": 0.1071504259738058, + "grad_norm": 0.04921082779765129, + "learning_rate": 9.988367372715434e-05, + "loss": 0.0002, + "step": 316 + }, + { + "epoch": 0.10782859322680456, + "grad_norm": 0.11438657343387604, + "learning_rate": 9.988150031057555e-05, + "loss": 0.0004, + "step": 318 + }, + { + "epoch": 0.10850676047980333, + "grad_norm": 0.027646591886878014, + "learning_rate": 9.987930680173438e-05, + "loss": 0.0002, + "step": 320 + }, + { + "epoch": 0.1091849277328021, + "grad_norm": 0.07535082846879959, + "learning_rate": 9.987709320151441e-05, + "loss": 0.0005, + "step": 322 + }, + { + "epoch": 0.10986309498580088, + "grad_norm": 0.011354583315551281, + "learning_rate": 9.987485951080727e-05, + "loss": 0.0003, + "step": 324 + }, + { + "epoch": 0.11054126223879965, + "grad_norm": 0.10233359783887863, + "learning_rate": 9.987260573051269e-05, + "loss": 0.0011, + "step": 326 + }, + { + "epoch": 0.11121942949179842, + "grad_norm": 0.04623352363705635, + "learning_rate": 9.987033186153847e-05, + "loss": 0.0004, + "step": 328 + }, + { + "epoch": 0.11189759674479718, + "grad_norm": 0.09176378697156906, + "learning_rate": 9.986803790480054e-05, + "loss": 0.0005, + "step": 330 + }, + { + "epoch": 0.11257576399779595, + "grad_norm": 0.007960623130202293, + "learning_rate": 9.986572386122291e-05, + "loss": 0.0004, + "step": 332 + }, + { + "epoch": 0.11325393125079472, + "grad_norm": 0.07089328020811081, + "learning_rate": 9.986338973173768e-05, + "loss": 0.0006, + "step": 334 + }, + { + "epoch": 0.1139320985037935, + "grad_norm": 0.03843646124005318, + "learning_rate": 9.986103551728501e-05, + "loss": 0.0005, + "step": 336 + }, + { + "epoch": 0.11461026575679227, + "grad_norm": 0.05794384330511093, + "learning_rate": 9.985866121881321e-05, + "loss": 0.0004, + "step": 338 + }, + { + "epoch": 0.11528843300979104, + "grad_norm": 0.07073888182640076, + "learning_rate": 9.985626683727863e-05, + "loss": 0.0004, + "step": 340 + }, + { + "epoch": 0.1159666002627898, + "grad_norm": 0.04688240587711334, + "learning_rate": 9.985385237364573e-05, + "loss": 0.0004, + "step": 342 + }, + { + "epoch": 0.11664476751578857, + "grad_norm": 0.03950873389840126, + "learning_rate": 9.985141782888705e-05, + "loss": 0.0005, + "step": 344 + }, + { + "epoch": 0.11732293476878736, + "grad_norm": 0.08916309475898743, + "learning_rate": 9.984896320398324e-05, + "loss": 0.0004, + "step": 346 + }, + { + "epoch": 0.11800110202178613, + "grad_norm": 0.06798672676086426, + "learning_rate": 9.984648849992301e-05, + "loss": 0.0003, + "step": 348 + }, + { + "epoch": 0.1186792692747849, + "grad_norm": 0.034365102648735046, + "learning_rate": 9.984399371770318e-05, + "loss": 0.0003, + "step": 350 + }, + { + "epoch": 0.11935743652778366, + "grad_norm": 0.0313638299703598, + "learning_rate": 9.984147885832864e-05, + "loss": 0.0006, + "step": 352 + }, + { + "epoch": 0.12003560378078243, + "grad_norm": 0.031012045219540596, + "learning_rate": 9.983894392281237e-05, + "loss": 0.0005, + "step": 354 + }, + { + "epoch": 0.12071377103378121, + "grad_norm": 0.059786684811115265, + "learning_rate": 9.983638891217544e-05, + "loss": 0.0005, + "step": 356 + }, + { + "epoch": 0.12139193828677998, + "grad_norm": 0.12950845062732697, + "learning_rate": 9.983381382744703e-05, + "loss": 0.0007, + "step": 358 + }, + { + "epoch": 0.12207010553977875, + "grad_norm": 0.09296132624149323, + "learning_rate": 9.983121866966437e-05, + "loss": 0.0008, + "step": 360 + }, + { + "epoch": 0.12274827279277752, + "grad_norm": 0.024494891986250877, + "learning_rate": 9.982860343987277e-05, + "loss": 0.0002, + "step": 362 + }, + { + "epoch": 0.12342644004577628, + "grad_norm": 0.0354764498770237, + "learning_rate": 9.982596813912569e-05, + "loss": 0.0007, + "step": 364 + }, + { + "epoch": 0.12410460729877507, + "grad_norm": 0.06026894226670265, + "learning_rate": 9.982331276848458e-05, + "loss": 0.0008, + "step": 366 + }, + { + "epoch": 0.12478277455177383, + "grad_norm": 0.07126794010400772, + "learning_rate": 9.982063732901906e-05, + "loss": 0.0007, + "step": 368 + }, + { + "epoch": 0.1254609418047726, + "grad_norm": 0.04826202988624573, + "learning_rate": 9.981794182180677e-05, + "loss": 0.0005, + "step": 370 + }, + { + "epoch": 0.12613910905777137, + "grad_norm": 0.02070901356637478, + "learning_rate": 9.981522624793347e-05, + "loss": 0.0006, + "step": 372 + }, + { + "epoch": 0.12681727631077014, + "grad_norm": 0.05529174581170082, + "learning_rate": 9.9812490608493e-05, + "loss": 0.0005, + "step": 374 + }, + { + "epoch": 0.1274954435637689, + "grad_norm": 0.0771322250366211, + "learning_rate": 9.980973490458728e-05, + "loss": 0.0006, + "step": 376 + }, + { + "epoch": 0.12817361081676767, + "grad_norm": 0.06000866740942001, + "learning_rate": 9.98069591373263e-05, + "loss": 0.0004, + "step": 378 + }, + { + "epoch": 0.12885177806976644, + "grad_norm": 0.019623633474111557, + "learning_rate": 9.980416330782811e-05, + "loss": 0.0003, + "step": 380 + }, + { + "epoch": 0.12952994532276524, + "grad_norm": 0.02568269707262516, + "learning_rate": 9.980134741721891e-05, + "loss": 0.0003, + "step": 382 + }, + { + "epoch": 0.130208112575764, + "grad_norm": 0.02139587514102459, + "learning_rate": 9.979851146663293e-05, + "loss": 0.0002, + "step": 384 + }, + { + "epoch": 0.13088627982876277, + "grad_norm": 0.013906003907322884, + "learning_rate": 9.979565545721248e-05, + "loss": 0.0002, + "step": 386 + }, + { + "epoch": 0.13156444708176154, + "grad_norm": 0.09719321131706238, + "learning_rate": 9.979277939010798e-05, + "loss": 0.0003, + "step": 388 + }, + { + "epoch": 0.1322426143347603, + "grad_norm": 0.012353319674730301, + "learning_rate": 9.978988326647788e-05, + "loss": 0.0001, + "step": 390 + }, + { + "epoch": 0.13292078158775908, + "grad_norm": 0.09881331026554108, + "learning_rate": 9.978696708748875e-05, + "loss": 0.0002, + "step": 392 + }, + { + "epoch": 0.13359894884075785, + "grad_norm": 0.006447671912610531, + "learning_rate": 9.978403085431524e-05, + "loss": 0.0005, + "step": 394 + }, + { + "epoch": 0.13427711609375662, + "grad_norm": 0.02782581001520157, + "learning_rate": 9.978107456814007e-05, + "loss": 0.0004, + "step": 396 + }, + { + "epoch": 0.13495528334675538, + "grad_norm": 0.03197741508483887, + "learning_rate": 9.977809823015401e-05, + "loss": 0.0003, + "step": 398 + }, + { + "epoch": 0.13563345059975415, + "grad_norm": 0.121083565056324, + "learning_rate": 9.977510184155591e-05, + "loss": 0.0005, + "step": 400 + }, + { + "epoch": 0.13631161785275295, + "grad_norm": 0.014267323538661003, + "learning_rate": 9.977208540355278e-05, + "loss": 0.0003, + "step": 402 + }, + { + "epoch": 0.13698978510575172, + "grad_norm": 0.018701685592532158, + "learning_rate": 9.976904891735958e-05, + "loss": 0.0001, + "step": 404 + }, + { + "epoch": 0.13766795235875048, + "grad_norm": 0.042165644466876984, + "learning_rate": 9.976599238419944e-05, + "loss": 0.0002, + "step": 406 + }, + { + "epoch": 0.13834611961174925, + "grad_norm": 0.013971824198961258, + "learning_rate": 9.97629158053035e-05, + "loss": 0.0002, + "step": 408 + }, + { + "epoch": 0.13902428686474802, + "grad_norm": 0.035180144011974335, + "learning_rate": 9.975981918191103e-05, + "loss": 0.0001, + "step": 410 + }, + { + "epoch": 0.1397024541177468, + "grad_norm": 0.03857795521616936, + "learning_rate": 9.975670251526936e-05, + "loss": 0.0001, + "step": 412 + }, + { + "epoch": 0.14038062137074556, + "grad_norm": 0.0060835485346615314, + "learning_rate": 9.975356580663384e-05, + "loss": 0.0, + "step": 414 + }, + { + "epoch": 0.14105878862374432, + "grad_norm": 0.07395796477794647, + "learning_rate": 9.975040905726798e-05, + "loss": 0.0004, + "step": 416 + }, + { + "epoch": 0.1417369558767431, + "grad_norm": 0.018834367394447327, + "learning_rate": 9.974723226844331e-05, + "loss": 0.0001, + "step": 418 + }, + { + "epoch": 0.14241512312974186, + "grad_norm": 0.0059880041517317295, + "learning_rate": 9.974403544143941e-05, + "loss": 0.0004, + "step": 420 + }, + { + "epoch": 0.14309329038274066, + "grad_norm": 0.019147314131259918, + "learning_rate": 9.974081857754401e-05, + "loss": 0.0001, + "step": 422 + }, + { + "epoch": 0.14377145763573942, + "grad_norm": 0.0023199408315122128, + "learning_rate": 9.973758167805281e-05, + "loss": 0.0001, + "step": 424 + }, + { + "epoch": 0.1444496248887382, + "grad_norm": 0.013452797196805477, + "learning_rate": 9.973432474426967e-05, + "loss": 0.0, + "step": 426 + }, + { + "epoch": 0.14512779214173696, + "grad_norm": 0.06036541238427162, + "learning_rate": 9.973104777750646e-05, + "loss": 0.0007, + "step": 428 + }, + { + "epoch": 0.14580595939473573, + "grad_norm": 0.03896476328372955, + "learning_rate": 9.972775077908317e-05, + "loss": 0.0003, + "step": 430 + }, + { + "epoch": 0.1464841266477345, + "grad_norm": 0.017291929572820663, + "learning_rate": 9.972443375032779e-05, + "loss": 0.0, + "step": 432 + }, + { + "epoch": 0.14716229390073327, + "grad_norm": 0.0299264844506979, + "learning_rate": 9.972109669257646e-05, + "loss": 0.0001, + "step": 434 + }, + { + "epoch": 0.14784046115373203, + "grad_norm": 0.002126836683601141, + "learning_rate": 9.971773960717332e-05, + "loss": 0.0001, + "step": 436 + }, + { + "epoch": 0.1485186284067308, + "grad_norm": 0.004080125596374273, + "learning_rate": 9.971436249547061e-05, + "loss": 0.0, + "step": 438 + }, + { + "epoch": 0.14919679565972957, + "grad_norm": 0.021544527262449265, + "learning_rate": 9.971096535882865e-05, + "loss": 0.0, + "step": 440 + }, + { + "epoch": 0.14987496291272834, + "grad_norm": 0.01083934772759676, + "learning_rate": 9.970754819861577e-05, + "loss": 0.0, + "step": 442 + }, + { + "epoch": 0.15055313016572713, + "grad_norm": 0.005995896179229021, + "learning_rate": 9.970411101620843e-05, + "loss": 0.0001, + "step": 444 + }, + { + "epoch": 0.1512312974187259, + "grad_norm": 0.03495006263256073, + "learning_rate": 9.970065381299112e-05, + "loss": 0.0002, + "step": 446 + }, + { + "epoch": 0.15190946467172467, + "grad_norm": 0.012965913861989975, + "learning_rate": 9.969717659035638e-05, + "loss": 0.0003, + "step": 448 + }, + { + "epoch": 0.15258763192472344, + "grad_norm": 0.04326948896050453, + "learning_rate": 9.969367934970486e-05, + "loss": 0.0001, + "step": 450 + }, + { + "epoch": 0.1532657991777222, + "grad_norm": 0.03606222942471504, + "learning_rate": 9.969016209244525e-05, + "loss": 0.0002, + "step": 452 + }, + { + "epoch": 0.15394396643072097, + "grad_norm": 0.017364950850605965, + "learning_rate": 9.96866248199943e-05, + "loss": 0.0001, + "step": 454 + }, + { + "epoch": 0.15462213368371974, + "grad_norm": 0.029245370998978615, + "learning_rate": 9.968306753377679e-05, + "loss": 0.0002, + "step": 456 + }, + { + "epoch": 0.1553003009367185, + "grad_norm": 0.011929360218346119, + "learning_rate": 9.967949023522563e-05, + "loss": 0.0001, + "step": 458 + }, + { + "epoch": 0.15597846818971728, + "grad_norm": 0.02571127377450466, + "learning_rate": 9.967589292578173e-05, + "loss": 0.0001, + "step": 460 + }, + { + "epoch": 0.15665663544271605, + "grad_norm": 0.0448257140815258, + "learning_rate": 9.967227560689412e-05, + "loss": 0.0002, + "step": 462 + }, + { + "epoch": 0.15733480269571484, + "grad_norm": 0.014019777067005634, + "learning_rate": 9.966863828001982e-05, + "loss": 0.0002, + "step": 464 + }, + { + "epoch": 0.1580129699487136, + "grad_norm": 0.009223355911672115, + "learning_rate": 9.966498094662398e-05, + "loss": 0.0001, + "step": 466 + }, + { + "epoch": 0.15869113720171238, + "grad_norm": 0.010595602914690971, + "learning_rate": 9.966130360817973e-05, + "loss": 0.0, + "step": 468 + }, + { + "epoch": 0.15936930445471115, + "grad_norm": 0.015226882882416248, + "learning_rate": 9.965760626616834e-05, + "loss": 0.0001, + "step": 470 + }, + { + "epoch": 0.16004747170770992, + "grad_norm": 0.039237625896930695, + "learning_rate": 9.96538889220791e-05, + "loss": 0.0003, + "step": 472 + }, + { + "epoch": 0.16072563896070868, + "grad_norm": 0.006564146839082241, + "learning_rate": 9.965015157740931e-05, + "loss": 0.0003, + "step": 474 + }, + { + "epoch": 0.16140380621370745, + "grad_norm": 0.04217905178666115, + "learning_rate": 9.964639423366442e-05, + "loss": 0.0009, + "step": 476 + }, + { + "epoch": 0.16208197346670622, + "grad_norm": 0.0012876184191554785, + "learning_rate": 9.964261689235787e-05, + "loss": 0.0, + "step": 478 + }, + { + "epoch": 0.162760140719705, + "grad_norm": 0.004951702430844307, + "learning_rate": 9.963881955501116e-05, + "loss": 0.0001, + "step": 480 + }, + { + "epoch": 0.16343830797270376, + "grad_norm": 0.02183827944099903, + "learning_rate": 9.963500222315388e-05, + "loss": 0.0003, + "step": 482 + }, + { + "epoch": 0.16411647522570255, + "grad_norm": 0.020162342116236687, + "learning_rate": 9.963116489832364e-05, + "loss": 0.0004, + "step": 484 + }, + { + "epoch": 0.16479464247870132, + "grad_norm": 0.002864987123757601, + "learning_rate": 9.962730758206611e-05, + "loss": 0.0, + "step": 486 + }, + { + "epoch": 0.1654728097317001, + "grad_norm": 0.02149767428636551, + "learning_rate": 9.962343027593502e-05, + "loss": 0.0001, + "step": 488 + }, + { + "epoch": 0.16615097698469886, + "grad_norm": 0.06681331247091293, + "learning_rate": 9.961953298149215e-05, + "loss": 0.0005, + "step": 490 + }, + { + "epoch": 0.16682914423769762, + "grad_norm": 0.035218510776758194, + "learning_rate": 9.961561570030734e-05, + "loss": 0.0002, + "step": 492 + }, + { + "epoch": 0.1675073114906964, + "grad_norm": 0.08965438604354858, + "learning_rate": 9.961167843395845e-05, + "loss": 0.0008, + "step": 494 + }, + { + "epoch": 0.16818547874369516, + "grad_norm": 0.04331008344888687, + "learning_rate": 9.960772118403141e-05, + "loss": 0.0001, + "step": 496 + }, + { + "epoch": 0.16886364599669393, + "grad_norm": 0.07946818321943283, + "learning_rate": 9.960374395212022e-05, + "loss": 0.0003, + "step": 498 + }, + { + "epoch": 0.1695418132496927, + "grad_norm": 0.018603485077619553, + "learning_rate": 9.95997467398269e-05, + "loss": 0.0002, + "step": 500 + }, + { + "epoch": 0.17021998050269146, + "grad_norm": 0.06275351345539093, + "learning_rate": 9.959572954876151e-05, + "loss": 0.0009, + "step": 502 + }, + { + "epoch": 0.17089814775569026, + "grad_norm": 0.03865795210003853, + "learning_rate": 9.95916923805422e-05, + "loss": 0.0003, + "step": 504 + }, + { + "epoch": 0.17157631500868903, + "grad_norm": 0.015474680811166763, + "learning_rate": 9.958763523679514e-05, + "loss": 0.0002, + "step": 506 + }, + { + "epoch": 0.1722544822616878, + "grad_norm": 0.14019793272018433, + "learning_rate": 9.958355811915451e-05, + "loss": 0.0002, + "step": 508 + }, + { + "epoch": 0.17293264951468656, + "grad_norm": 0.05070113018155098, + "learning_rate": 9.957946102926263e-05, + "loss": 0.0003, + "step": 510 + }, + { + "epoch": 0.17361081676768533, + "grad_norm": 0.013154142536222935, + "learning_rate": 9.957534396876976e-05, + "loss": 0.0002, + "step": 512 + }, + { + "epoch": 0.1742889840206841, + "grad_norm": 0.028667040169239044, + "learning_rate": 9.957120693933428e-05, + "loss": 0.0007, + "step": 514 + }, + { + "epoch": 0.17496715127368287, + "grad_norm": 0.027827998623251915, + "learning_rate": 9.956704994262256e-05, + "loss": 0.0003, + "step": 516 + }, + { + "epoch": 0.17564531852668164, + "grad_norm": 0.0170214232057333, + "learning_rate": 9.956287298030905e-05, + "loss": 0.0003, + "step": 518 + }, + { + "epoch": 0.1763234857796804, + "grad_norm": 0.022373339161276817, + "learning_rate": 9.955867605407624e-05, + "loss": 0.0001, + "step": 520 + }, + { + "epoch": 0.17700165303267917, + "grad_norm": 0.044036850333213806, + "learning_rate": 9.955445916561463e-05, + "loss": 0.0002, + "step": 522 + }, + { + "epoch": 0.17767982028567794, + "grad_norm": 0.018346000462770462, + "learning_rate": 9.955022231662281e-05, + "loss": 0.0001, + "step": 524 + }, + { + "epoch": 0.17835798753867674, + "grad_norm": 0.007154988124966621, + "learning_rate": 9.954596550880735e-05, + "loss": 0.0001, + "step": 526 + }, + { + "epoch": 0.1790361547916755, + "grad_norm": 0.0529172420501709, + "learning_rate": 9.954168874388292e-05, + "loss": 0.0002, + "step": 528 + }, + { + "epoch": 0.17971432204467427, + "grad_norm": 0.032745905220508575, + "learning_rate": 9.953739202357218e-05, + "loss": 0.0001, + "step": 530 + }, + { + "epoch": 0.18039248929767304, + "grad_norm": 0.006617931183427572, + "learning_rate": 9.953307534960587e-05, + "loss": 0.0, + "step": 532 + }, + { + "epoch": 0.1810706565506718, + "grad_norm": 0.028485778719186783, + "learning_rate": 9.952873872372272e-05, + "loss": 0.0001, + "step": 534 + }, + { + "epoch": 0.18174882380367058, + "grad_norm": 0.005062984302639961, + "learning_rate": 9.952438214766955e-05, + "loss": 0.0, + "step": 536 + }, + { + "epoch": 0.18242699105666935, + "grad_norm": 0.04665204510092735, + "learning_rate": 9.952000562320115e-05, + "loss": 0.0001, + "step": 538 + }, + { + "epoch": 0.18310515830966811, + "grad_norm": 0.0015840464038774371, + "learning_rate": 9.951560915208042e-05, + "loss": 0.0, + "step": 540 + }, + { + "epoch": 0.18378332556266688, + "grad_norm": 0.0015154321445152164, + "learning_rate": 9.951119273607825e-05, + "loss": 0.0, + "step": 542 + }, + { + "epoch": 0.18446149281566565, + "grad_norm": 0.04031079262495041, + "learning_rate": 9.950675637697355e-05, + "loss": 0.0001, + "step": 544 + }, + { + "epoch": 0.18513966006866445, + "grad_norm": 0.039752230048179626, + "learning_rate": 9.950230007655332e-05, + "loss": 0.0007, + "step": 546 + }, + { + "epoch": 0.18581782732166321, + "grad_norm": 0.07987188547849655, + "learning_rate": 9.949782383661254e-05, + "loss": 0.0007, + "step": 548 + }, + { + "epoch": 0.18649599457466198, + "grad_norm": 0.004725519567728043, + "learning_rate": 9.949332765895423e-05, + "loss": 0.0, + "step": 550 + }, + { + "epoch": 0.18717416182766075, + "grad_norm": 0.014836054295301437, + "learning_rate": 9.948881154538945e-05, + "loss": 0.0001, + "step": 552 + }, + { + "epoch": 0.18785232908065952, + "grad_norm": 0.0027612450066953897, + "learning_rate": 9.94842754977373e-05, + "loss": 0.0001, + "step": 554 + }, + { + "epoch": 0.1885304963336583, + "grad_norm": 0.009966784156858921, + "learning_rate": 9.94797195178249e-05, + "loss": 0.0001, + "step": 556 + }, + { + "epoch": 0.18920866358665706, + "grad_norm": 0.02951398678123951, + "learning_rate": 9.947514360748741e-05, + "loss": 0.0001, + "step": 558 + }, + { + "epoch": 0.18988683083965582, + "grad_norm": 0.02646208181977272, + "learning_rate": 9.947054776856798e-05, + "loss": 0.0002, + "step": 560 + }, + { + "epoch": 0.1905649980926546, + "grad_norm": 0.0137828653678298, + "learning_rate": 9.946593200291783e-05, + "loss": 0.0, + "step": 562 + }, + { + "epoch": 0.19124316534565336, + "grad_norm": 0.06634095311164856, + "learning_rate": 9.946129631239618e-05, + "loss": 0.0002, + "step": 564 + }, + { + "epoch": 0.19192133259865216, + "grad_norm": 0.03192392364144325, + "learning_rate": 9.945664069887028e-05, + "loss": 0.0001, + "step": 566 + }, + { + "epoch": 0.19259949985165092, + "grad_norm": 0.009010384790599346, + "learning_rate": 9.945196516421543e-05, + "loss": 0.0, + "step": 568 + }, + { + "epoch": 0.1932776671046497, + "grad_norm": 0.03663842752575874, + "learning_rate": 9.944726971031493e-05, + "loss": 0.0002, + "step": 570 + }, + { + "epoch": 0.19395583435764846, + "grad_norm": 0.12181317061185837, + "learning_rate": 9.94425543390601e-05, + "loss": 0.0002, + "step": 572 + }, + { + "epoch": 0.19463400161064723, + "grad_norm": 0.01086985133588314, + "learning_rate": 9.94378190523503e-05, + "loss": 0.0, + "step": 574 + }, + { + "epoch": 0.195312168863646, + "grad_norm": 0.030524946749210358, + "learning_rate": 9.943306385209289e-05, + "loss": 0.0, + "step": 576 + }, + { + "epoch": 0.19599033611664476, + "grad_norm": 0.007708962541073561, + "learning_rate": 9.942828874020327e-05, + "loss": 0.0, + "step": 578 + }, + { + "epoch": 0.19666850336964353, + "grad_norm": 0.007101174909621477, + "learning_rate": 9.942349371860487e-05, + "loss": 0.0001, + "step": 580 + }, + { + "epoch": 0.1973466706226423, + "grad_norm": 0.0024575525894761086, + "learning_rate": 9.94186787892291e-05, + "loss": 0.0001, + "step": 582 + }, + { + "epoch": 0.19802483787564107, + "grad_norm": 0.008838510140776634, + "learning_rate": 9.941384395401543e-05, + "loss": 0.0, + "step": 584 + }, + { + "epoch": 0.19870300512863986, + "grad_norm": 0.003978129476308823, + "learning_rate": 9.940898921491131e-05, + "loss": 0.0007, + "step": 586 + }, + { + "epoch": 0.19938117238163863, + "grad_norm": 0.026870913803577423, + "learning_rate": 9.940411457387227e-05, + "loss": 0.0002, + "step": 588 + }, + { + "epoch": 0.2000593396346374, + "grad_norm": 0.009151192381978035, + "learning_rate": 9.939922003286176e-05, + "loss": 0.0, + "step": 590 + }, + { + "epoch": 0.20073750688763617, + "grad_norm": 0.051844075322151184, + "learning_rate": 9.939430559385135e-05, + "loss": 0.0002, + "step": 592 + }, + { + "epoch": 0.20141567414063494, + "grad_norm": 0.06585478037595749, + "learning_rate": 9.938937125882053e-05, + "loss": 0.0002, + "step": 594 + }, + { + "epoch": 0.2020938413936337, + "grad_norm": 0.05280110239982605, + "learning_rate": 9.938441702975689e-05, + "loss": 0.0002, + "step": 596 + }, + { + "epoch": 0.20277200864663247, + "grad_norm": 0.03558439016342163, + "learning_rate": 9.937944290865597e-05, + "loss": 0.0001, + "step": 598 + }, + { + "epoch": 0.20345017589963124, + "grad_norm": 0.005566301755607128, + "learning_rate": 9.937444889752137e-05, + "loss": 0.0, + "step": 600 + }, + { + "epoch": 0.20412834315263, + "grad_norm": 0.004138810560107231, + "learning_rate": 9.936943499836463e-05, + "loss": 0.0, + "step": 602 + }, + { + "epoch": 0.20480651040562878, + "grad_norm": 0.00421471381559968, + "learning_rate": 9.936440121320539e-05, + "loss": 0.0, + "step": 604 + }, + { + "epoch": 0.20548467765862755, + "grad_norm": 0.0489053912460804, + "learning_rate": 9.935934754407125e-05, + "loss": 0.0002, + "step": 606 + }, + { + "epoch": 0.20616284491162634, + "grad_norm": 0.050261691212654114, + "learning_rate": 9.935427399299782e-05, + "loss": 0.0003, + "step": 608 + }, + { + "epoch": 0.2068410121646251, + "grad_norm": 0.01245457585901022, + "learning_rate": 9.934918056202871e-05, + "loss": 0.0001, + "step": 610 + }, + { + "epoch": 0.20751917941762388, + "grad_norm": 0.031247129663825035, + "learning_rate": 9.934406725321559e-05, + "loss": 0.0002, + "step": 612 + }, + { + "epoch": 0.20819734667062265, + "grad_norm": 0.06600417196750641, + "learning_rate": 9.933893406861807e-05, + "loss": 0.0002, + "step": 614 + }, + { + "epoch": 0.20887551392362141, + "grad_norm": 0.023129012435674667, + "learning_rate": 9.933378101030381e-05, + "loss": 0.0007, + "step": 616 + }, + { + "epoch": 0.20955368117662018, + "grad_norm": 0.018992137163877487, + "learning_rate": 9.932860808034848e-05, + "loss": 0.0001, + "step": 618 + }, + { + "epoch": 0.21023184842961895, + "grad_norm": 0.01329959649592638, + "learning_rate": 9.932341528083569e-05, + "loss": 0.0, + "step": 620 + }, + { + "epoch": 0.21091001568261772, + "grad_norm": 0.07119770348072052, + "learning_rate": 9.931820261385714e-05, + "loss": 0.0005, + "step": 622 + }, + { + "epoch": 0.2115881829356165, + "grad_norm": 0.08495577424764633, + "learning_rate": 9.931297008151246e-05, + "loss": 0.0001, + "step": 624 + }, + { + "epoch": 0.21226635018861525, + "grad_norm": 0.0634210929274559, + "learning_rate": 9.930771768590933e-05, + "loss": 0.0001, + "step": 626 + }, + { + "epoch": 0.21294451744161405, + "grad_norm": 0.10992208123207092, + "learning_rate": 9.930244542916342e-05, + "loss": 0.0006, + "step": 628 + }, + { + "epoch": 0.21362268469461282, + "grad_norm": 0.058085739612579346, + "learning_rate": 9.929715331339839e-05, + "loss": 0.0003, + "step": 630 + }, + { + "epoch": 0.2143008519476116, + "grad_norm": 0.0160099845379591, + "learning_rate": 9.92918413407459e-05, + "loss": 0.0003, + "step": 632 + }, + { + "epoch": 0.21497901920061036, + "grad_norm": 0.46785369515419006, + "learning_rate": 9.928650951334559e-05, + "loss": 0.0008, + "step": 634 + }, + { + "epoch": 0.21565718645360912, + "grad_norm": 0.059186507016420364, + "learning_rate": 9.928115783334518e-05, + "loss": 0.0003, + "step": 636 + }, + { + "epoch": 0.2163353537066079, + "grad_norm": 0.07586536556482315, + "learning_rate": 9.927578630290025e-05, + "loss": 0.0016, + "step": 638 + }, + { + "epoch": 0.21701352095960666, + "grad_norm": 0.10785187780857086, + "learning_rate": 9.927039492417452e-05, + "loss": 0.0014, + "step": 640 + }, + { + "epoch": 0.21769168821260543, + "grad_norm": 0.11385278403759003, + "learning_rate": 9.926498369933959e-05, + "loss": 0.0014, + "step": 642 + }, + { + "epoch": 0.2183698554656042, + "grad_norm": 0.08722253888845444, + "learning_rate": 9.925955263057511e-05, + "loss": 0.0012, + "step": 644 + }, + { + "epoch": 0.21904802271860296, + "grad_norm": 0.09422720223665237, + "learning_rate": 9.925410172006874e-05, + "loss": 0.0012, + "step": 646 + }, + { + "epoch": 0.21972618997160176, + "grad_norm": 0.08135107904672623, + "learning_rate": 9.924863097001607e-05, + "loss": 0.0009, + "step": 648 + }, + { + "epoch": 0.22040435722460053, + "grad_norm": 0.06382672488689423, + "learning_rate": 9.924314038262074e-05, + "loss": 0.0007, + "step": 650 + }, + { + "epoch": 0.2210825244775993, + "grad_norm": 0.04187219962477684, + "learning_rate": 9.923762996009438e-05, + "loss": 0.0004, + "step": 652 + }, + { + "epoch": 0.22176069173059806, + "grad_norm": 0.11694247275590897, + "learning_rate": 9.923209970465652e-05, + "loss": 0.0006, + "step": 654 + }, + { + "epoch": 0.22243885898359683, + "grad_norm": 0.15844054520130157, + "learning_rate": 9.922654961853481e-05, + "loss": 0.0007, + "step": 656 + }, + { + "epoch": 0.2231170262365956, + "grad_norm": 0.055115025490522385, + "learning_rate": 9.92209797039648e-05, + "loss": 0.0005, + "step": 658 + }, + { + "epoch": 0.22379519348959437, + "grad_norm": 0.05294685438275337, + "learning_rate": 9.921538996319004e-05, + "loss": 0.0005, + "step": 660 + }, + { + "epoch": 0.22447336074259314, + "grad_norm": 0.05556558817625046, + "learning_rate": 9.92097803984621e-05, + "loss": 0.0005, + "step": 662 + }, + { + "epoch": 0.2251515279955919, + "grad_norm": 0.02577158622443676, + "learning_rate": 9.92041510120405e-05, + "loss": 0.0005, + "step": 664 + }, + { + "epoch": 0.22582969524859067, + "grad_norm": 0.06684298813343048, + "learning_rate": 9.919850180619274e-05, + "loss": 0.0005, + "step": 666 + }, + { + "epoch": 0.22650786250158944, + "grad_norm": 0.08713594824075699, + "learning_rate": 9.919283278319436e-05, + "loss": 0.0011, + "step": 668 + }, + { + "epoch": 0.22718602975458824, + "grad_norm": 0.06460582464933395, + "learning_rate": 9.91871439453288e-05, + "loss": 0.0009, + "step": 670 + }, + { + "epoch": 0.227864197007587, + "grad_norm": 0.06616802513599396, + "learning_rate": 9.918143529488755e-05, + "loss": 0.0007, + "step": 672 + }, + { + "epoch": 0.22854236426058577, + "grad_norm": 0.026611236855387688, + "learning_rate": 9.917570683417004e-05, + "loss": 0.0011, + "step": 674 + }, + { + "epoch": 0.22922053151358454, + "grad_norm": 0.07026272267103195, + "learning_rate": 9.91699585654837e-05, + "loss": 0.0006, + "step": 676 + }, + { + "epoch": 0.2298986987665833, + "grad_norm": 0.11559715121984482, + "learning_rate": 9.916419049114392e-05, + "loss": 0.0009, + "step": 678 + }, + { + "epoch": 0.23057686601958208, + "grad_norm": 0.0518777072429657, + "learning_rate": 9.915840261347411e-05, + "loss": 0.0003, + "step": 680 + }, + { + "epoch": 0.23125503327258085, + "grad_norm": 0.060435306280851364, + "learning_rate": 9.915259493480559e-05, + "loss": 0.0007, + "step": 682 + }, + { + "epoch": 0.2319332005255796, + "grad_norm": 0.038414761424064636, + "learning_rate": 9.914676745747772e-05, + "loss": 0.0004, + "step": 684 + }, + { + "epoch": 0.23261136777857838, + "grad_norm": 0.031407903879880905, + "learning_rate": 9.914092018383778e-05, + "loss": 0.0004, + "step": 686 + }, + { + "epoch": 0.23328953503157715, + "grad_norm": 0.023152906447649002, + "learning_rate": 9.913505311624108e-05, + "loss": 0.0005, + "step": 688 + }, + { + "epoch": 0.23396770228457595, + "grad_norm": 0.0736289694905281, + "learning_rate": 9.912916625705087e-05, + "loss": 0.0006, + "step": 690 + }, + { + "epoch": 0.23464586953757471, + "grad_norm": 0.03230787813663483, + "learning_rate": 9.912325960863834e-05, + "loss": 0.0002, + "step": 692 + }, + { + "epoch": 0.23532403679057348, + "grad_norm": 0.06092187389731407, + "learning_rate": 9.911733317338272e-05, + "loss": 0.0003, + "step": 694 + }, + { + "epoch": 0.23600220404357225, + "grad_norm": 0.02147577330470085, + "learning_rate": 9.911138695367119e-05, + "loss": 0.0003, + "step": 696 + }, + { + "epoch": 0.23668037129657102, + "grad_norm": 0.10465848445892334, + "learning_rate": 9.910542095189885e-05, + "loss": 0.0011, + "step": 698 + }, + { + "epoch": 0.2373585385495698, + "grad_norm": 0.051002729684114456, + "learning_rate": 9.909943517046884e-05, + "loss": 0.0003, + "step": 700 + }, + { + "epoch": 0.23803670580256855, + "grad_norm": 0.04811201989650726, + "learning_rate": 9.90934296117922e-05, + "loss": 0.0003, + "step": 702 + }, + { + "epoch": 0.23871487305556732, + "grad_norm": 0.024624748155474663, + "learning_rate": 9.908740427828799e-05, + "loss": 0.0004, + "step": 704 + }, + { + "epoch": 0.2393930403085661, + "grad_norm": 0.05176735296845436, + "learning_rate": 9.908135917238321e-05, + "loss": 0.0002, + "step": 706 + }, + { + "epoch": 0.24007120756156486, + "grad_norm": 0.04927726835012436, + "learning_rate": 9.907529429651281e-05, + "loss": 0.0009, + "step": 708 + }, + { + "epoch": 0.24074937481456365, + "grad_norm": 0.031045028939843178, + "learning_rate": 9.906920965311974e-05, + "loss": 0.0001, + "step": 710 + }, + { + "epoch": 0.24142754206756242, + "grad_norm": 0.10256350040435791, + "learning_rate": 9.90631052446549e-05, + "loss": 0.0006, + "step": 712 + }, + { + "epoch": 0.2421057093205612, + "grad_norm": 0.03954285383224487, + "learning_rate": 9.905698107357714e-05, + "loss": 0.0006, + "step": 714 + }, + { + "epoch": 0.24278387657355996, + "grad_norm": 0.05060918629169464, + "learning_rate": 9.905083714235326e-05, + "loss": 0.0007, + "step": 716 + }, + { + "epoch": 0.24346204382655873, + "grad_norm": 0.019770212471485138, + "learning_rate": 9.904467345345805e-05, + "loss": 0.0008, + "step": 718 + }, + { + "epoch": 0.2441402110795575, + "grad_norm": 0.0872698500752449, + "learning_rate": 9.903849000937423e-05, + "loss": 0.0006, + "step": 720 + }, + { + "epoch": 0.24481837833255626, + "grad_norm": 0.08249643445014954, + "learning_rate": 9.90322868125925e-05, + "loss": 0.0006, + "step": 722 + }, + { + "epoch": 0.24549654558555503, + "grad_norm": 0.1213611587882042, + "learning_rate": 9.902606386561151e-05, + "loss": 0.0009, + "step": 724 + }, + { + "epoch": 0.2461747128385538, + "grad_norm": 0.12369897216558456, + "learning_rate": 9.901982117093786e-05, + "loss": 0.0009, + "step": 726 + }, + { + "epoch": 0.24685288009155257, + "grad_norm": 0.04433707520365715, + "learning_rate": 9.901355873108609e-05, + "loss": 0.0003, + "step": 728 + }, + { + "epoch": 0.24753104734455136, + "grad_norm": 0.03901137784123421, + "learning_rate": 9.900727654857874e-05, + "loss": 0.0005, + "step": 730 + }, + { + "epoch": 0.24820921459755013, + "grad_norm": 0.060861486941576004, + "learning_rate": 9.900097462594625e-05, + "loss": 0.0008, + "step": 732 + }, + { + "epoch": 0.2488873818505489, + "grad_norm": 0.0498674139380455, + "learning_rate": 9.899465296572706e-05, + "loss": 0.0005, + "step": 734 + }, + { + "epoch": 0.24956554910354767, + "grad_norm": 0.10359224677085876, + "learning_rate": 9.898831157046749e-05, + "loss": 0.0006, + "step": 736 + }, + { + "epoch": 0.2502437163565464, + "grad_norm": 0.06794043630361557, + "learning_rate": 9.898195044272189e-05, + "loss": 0.0005, + "step": 738 + }, + { + "epoch": 0.2509218836095452, + "grad_norm": 0.02869260124862194, + "learning_rate": 9.89755695850525e-05, + "loss": 0.0002, + "step": 740 + }, + { + "epoch": 0.251600050862544, + "grad_norm": 0.04206472635269165, + "learning_rate": 9.896916900002954e-05, + "loss": 0.0005, + "step": 742 + }, + { + "epoch": 0.25227821811554274, + "grad_norm": 0.12347942590713501, + "learning_rate": 9.896274869023117e-05, + "loss": 0.0003, + "step": 744 + }, + { + "epoch": 0.25295638536854154, + "grad_norm": 0.01147325336933136, + "learning_rate": 9.895630865824347e-05, + "loss": 0.0001, + "step": 746 + }, + { + "epoch": 0.2536345526215403, + "grad_norm": 0.10761839151382446, + "learning_rate": 9.89498489066605e-05, + "loss": 0.0002, + "step": 748 + }, + { + "epoch": 0.2543127198745391, + "grad_norm": 0.06379670649766922, + "learning_rate": 9.894336943808426e-05, + "loss": 0.0005, + "step": 750 + }, + { + "epoch": 0.2549908871275378, + "grad_norm": 0.19594375789165497, + "learning_rate": 9.893687025512465e-05, + "loss": 0.001, + "step": 752 + }, + { + "epoch": 0.2556690543805366, + "grad_norm": 0.04399771988391876, + "learning_rate": 9.893035136039956e-05, + "loss": 0.0002, + "step": 754 + }, + { + "epoch": 0.25634722163353535, + "grad_norm": 0.09866727888584137, + "learning_rate": 9.892381275653478e-05, + "loss": 0.0006, + "step": 756 + }, + { + "epoch": 0.25702538888653415, + "grad_norm": 0.03635840862989426, + "learning_rate": 9.891725444616409e-05, + "loss": 0.0011, + "step": 758 + }, + { + "epoch": 0.2577035561395329, + "grad_norm": 0.026622148230671883, + "learning_rate": 9.891067643192915e-05, + "loss": 0.0003, + "step": 760 + }, + { + "epoch": 0.2583817233925317, + "grad_norm": 0.047546133399009705, + "learning_rate": 9.89040787164796e-05, + "loss": 0.0004, + "step": 762 + }, + { + "epoch": 0.2590598906455305, + "grad_norm": 0.17135459184646606, + "learning_rate": 9.889746130247297e-05, + "loss": 0.0005, + "step": 764 + }, + { + "epoch": 0.2597380578985292, + "grad_norm": 0.05973830074071884, + "learning_rate": 9.889082419257481e-05, + "loss": 0.0003, + "step": 766 + }, + { + "epoch": 0.260416225151528, + "grad_norm": 0.08419614285230637, + "learning_rate": 9.888416738945848e-05, + "loss": 0.0009, + "step": 768 + }, + { + "epoch": 0.26109439240452675, + "grad_norm": 0.08141511678695679, + "learning_rate": 9.887749089580538e-05, + "loss": 0.0002, + "step": 770 + }, + { + "epoch": 0.26177255965752555, + "grad_norm": 0.40493646264076233, + "learning_rate": 9.88707947143048e-05, + "loss": 0.0007, + "step": 772 + }, + { + "epoch": 0.2624507269105243, + "grad_norm": 0.24903036653995514, + "learning_rate": 9.886407884765395e-05, + "loss": 0.0005, + "step": 774 + }, + { + "epoch": 0.2631288941635231, + "grad_norm": 0.7945914268493652, + "learning_rate": 9.885734329855798e-05, + "loss": 0.0008, + "step": 776 + }, + { + "epoch": 0.2638070614165218, + "grad_norm": 5.618687152862549, + "learning_rate": 9.885058806972997e-05, + "loss": 0.0147, + "step": 778 + }, + { + "epoch": 0.2644852286695206, + "grad_norm": 0.3053636848926544, + "learning_rate": 9.884381316389093e-05, + "loss": 0.0051, + "step": 780 + }, + { + "epoch": 0.2651633959225194, + "grad_norm": 0.8200685977935791, + "learning_rate": 9.883701858376979e-05, + "loss": 0.0078, + "step": 782 + }, + { + "epoch": 0.26584156317551816, + "grad_norm": 0.3827979564666748, + "learning_rate": 9.883020433210342e-05, + "loss": 0.0048, + "step": 784 + }, + { + "epoch": 0.26651973042851695, + "grad_norm": 49.57893753051758, + "learning_rate": 9.882337041163656e-05, + "loss": 0.3228, + "step": 786 + }, + { + "epoch": 0.2671978976815157, + "grad_norm": 82.7600326538086, + "learning_rate": 9.881651682512195e-05, + "loss": 1.323, + "step": 788 + }, + { + "epoch": 0.2678760649345145, + "grad_norm": 13.147876739501953, + "learning_rate": 9.88096435753202e-05, + "loss": 0.2319, + "step": 790 + }, + { + "epoch": 0.26855423218751323, + "grad_norm": 4.195157527923584, + "learning_rate": 9.880275066499985e-05, + "loss": 0.1171, + "step": 792 + }, + { + "epoch": 0.269232399440512, + "grad_norm": 0.9070939421653748, + "learning_rate": 9.879583809693738e-05, + "loss": 0.0663, + "step": 794 + }, + { + "epoch": 0.26991056669351077, + "grad_norm": 0.3190385401248932, + "learning_rate": 9.878890587391714e-05, + "loss": 0.0182, + "step": 796 + }, + { + "epoch": 0.27058873394650956, + "grad_norm": 0.15086790919303894, + "learning_rate": 9.878195399873147e-05, + "loss": 0.006, + "step": 798 + }, + { + "epoch": 0.2712669011995083, + "grad_norm": 0.148417666554451, + "learning_rate": 9.877498247418056e-05, + "loss": 0.0021, + "step": 800 + }, + { + "epoch": 0.2719450684525071, + "grad_norm": 0.1928234100341797, + "learning_rate": 9.876799130307253e-05, + "loss": 0.0024, + "step": 802 + }, + { + "epoch": 0.2726232357055059, + "grad_norm": 0.4669875204563141, + "learning_rate": 9.876098048822343e-05, + "loss": 0.0257, + "step": 804 + }, + { + "epoch": 0.27330140295850464, + "grad_norm": 1.2202422618865967, + "learning_rate": 9.875395003245724e-05, + "loss": 0.0405, + "step": 806 + }, + { + "epoch": 0.27397957021150343, + "grad_norm": 0.32797545194625854, + "learning_rate": 9.874689993860579e-05, + "loss": 0.0319, + "step": 808 + }, + { + "epoch": 0.27465773746450217, + "grad_norm": 0.9610121846199036, + "learning_rate": 9.873983020950888e-05, + "loss": 0.0205, + "step": 810 + }, + { + "epoch": 0.27533590471750097, + "grad_norm": 0.3528335690498352, + "learning_rate": 9.87327408480142e-05, + "loss": 0.0073, + "step": 812 + }, + { + "epoch": 0.2760140719704997, + "grad_norm": 0.41824105381965637, + "learning_rate": 9.872563185697732e-05, + "loss": 0.0054, + "step": 814 + }, + { + "epoch": 0.2766922392234985, + "grad_norm": 0.2852812111377716, + "learning_rate": 9.871850323926177e-05, + "loss": 0.004, + "step": 816 + }, + { + "epoch": 0.27737040647649724, + "grad_norm": 1.4140238761901855, + "learning_rate": 9.871135499773894e-05, + "loss": 0.0105, + "step": 818 + }, + { + "epoch": 0.27804857372949604, + "grad_norm": 0.8318770527839661, + "learning_rate": 9.870418713528812e-05, + "loss": 0.006, + "step": 820 + }, + { + "epoch": 0.2787267409824948, + "grad_norm": 0.7967255711555481, + "learning_rate": 9.869699965479657e-05, + "loss": 0.0044, + "step": 822 + }, + { + "epoch": 0.2794049082354936, + "grad_norm": 0.5289302468299866, + "learning_rate": 9.868979255915938e-05, + "loss": 0.0131, + "step": 824 + }, + { + "epoch": 0.2800830754884924, + "grad_norm": 3.1792924404144287, + "learning_rate": 9.868256585127956e-05, + "loss": 0.0473, + "step": 826 + }, + { + "epoch": 0.2807612427414911, + "grad_norm": 0.2872530221939087, + "learning_rate": 9.867531953406803e-05, + "loss": 0.0073, + "step": 828 + }, + { + "epoch": 0.2814394099944899, + "grad_norm": 0.14570827782154083, + "learning_rate": 9.866805361044362e-05, + "loss": 0.01, + "step": 830 + }, + { + "epoch": 0.28211757724748865, + "grad_norm": 0.2351459115743637, + "learning_rate": 9.866076808333304e-05, + "loss": 0.0053, + "step": 832 + }, + { + "epoch": 0.28279574450048744, + "grad_norm": 0.14194971323013306, + "learning_rate": 9.86534629556709e-05, + "loss": 0.006, + "step": 834 + }, + { + "epoch": 0.2834739117534862, + "grad_norm": 0.21115270256996155, + "learning_rate": 9.864613823039969e-05, + "loss": 0.0041, + "step": 836 + }, + { + "epoch": 0.284152079006485, + "grad_norm": 0.17921510338783264, + "learning_rate": 9.863879391046984e-05, + "loss": 0.0044, + "step": 838 + }, + { + "epoch": 0.2848302462594837, + "grad_norm": 0.1330479234457016, + "learning_rate": 9.863142999883959e-05, + "loss": 0.003, + "step": 840 + }, + { + "epoch": 0.2855084135124825, + "grad_norm": 0.09766431152820587, + "learning_rate": 9.862404649847516e-05, + "loss": 0.0021, + "step": 842 + }, + { + "epoch": 0.2861865807654813, + "grad_norm": 0.12152712047100067, + "learning_rate": 9.861664341235063e-05, + "loss": 0.0042, + "step": 844 + }, + { + "epoch": 0.28686474801848005, + "grad_norm": 0.1460103988647461, + "learning_rate": 9.860922074344794e-05, + "loss": 0.0022, + "step": 846 + }, + { + "epoch": 0.28754291527147885, + "grad_norm": 0.16154733300209045, + "learning_rate": 9.860177849475694e-05, + "loss": 0.0029, + "step": 848 + }, + { + "epoch": 0.2882210825244776, + "grad_norm": 0.08406810462474823, + "learning_rate": 9.85943166692754e-05, + "loss": 0.0012, + "step": 850 + }, + { + "epoch": 0.2888992497774764, + "grad_norm": 0.15870791673660278, + "learning_rate": 9.858683527000887e-05, + "loss": 0.0018, + "step": 852 + }, + { + "epoch": 0.2895774170304751, + "grad_norm": 0.07115360349416733, + "learning_rate": 9.857933429997094e-05, + "loss": 0.0011, + "step": 854 + }, + { + "epoch": 0.2902555842834739, + "grad_norm": 0.052113912999629974, + "learning_rate": 9.857181376218295e-05, + "loss": 0.0008, + "step": 856 + }, + { + "epoch": 0.29093375153647266, + "grad_norm": 0.059886470437049866, + "learning_rate": 9.856427365967419e-05, + "loss": 0.0009, + "step": 858 + }, + { + "epoch": 0.29161191878947146, + "grad_norm": 0.0380706861615181, + "learning_rate": 9.855671399548181e-05, + "loss": 0.0009, + "step": 860 + }, + { + "epoch": 0.2922900860424702, + "grad_norm": 0.017071694135665894, + "learning_rate": 9.854913477265084e-05, + "loss": 0.0005, + "step": 862 + }, + { + "epoch": 0.292968253295469, + "grad_norm": 0.09737870842218399, + "learning_rate": 9.854153599423418e-05, + "loss": 0.0006, + "step": 864 + }, + { + "epoch": 0.2936464205484678, + "grad_norm": 0.07049310207366943, + "learning_rate": 9.853391766329263e-05, + "loss": 0.0008, + "step": 866 + }, + { + "epoch": 0.29432458780146653, + "grad_norm": 0.07267790287733078, + "learning_rate": 9.852627978289486e-05, + "loss": 0.0009, + "step": 868 + }, + { + "epoch": 0.2950027550544653, + "grad_norm": 0.03373875841498375, + "learning_rate": 9.851862235611737e-05, + "loss": 0.0006, + "step": 870 + }, + { + "epoch": 0.29568092230746407, + "grad_norm": 0.11455586552619934, + "learning_rate": 9.851094538604461e-05, + "loss": 0.0008, + "step": 872 + }, + { + "epoch": 0.29635908956046286, + "grad_norm": 0.028519853949546814, + "learning_rate": 9.850324887576887e-05, + "loss": 0.0002, + "step": 874 + }, + { + "epoch": 0.2970372568134616, + "grad_norm": 0.07312014698982239, + "learning_rate": 9.849553282839025e-05, + "loss": 0.0013, + "step": 876 + }, + { + "epoch": 0.2977154240664604, + "grad_norm": 0.1544421762228012, + "learning_rate": 9.848779724701684e-05, + "loss": 0.0008, + "step": 878 + }, + { + "epoch": 0.29839359131945914, + "grad_norm": 0.049141231924295425, + "learning_rate": 9.848004213476449e-05, + "loss": 0.0006, + "step": 880 + }, + { + "epoch": 0.29907175857245794, + "grad_norm": 0.08961957693099976, + "learning_rate": 9.847226749475695e-05, + "loss": 0.0006, + "step": 882 + }, + { + "epoch": 0.2997499258254567, + "grad_norm": 0.05696597695350647, + "learning_rate": 9.846447333012588e-05, + "loss": 0.0005, + "step": 884 + }, + { + "epoch": 0.30042809307845547, + "grad_norm": 0.08603312820196152, + "learning_rate": 9.845665964401071e-05, + "loss": 0.0012, + "step": 886 + }, + { + "epoch": 0.30110626033145427, + "grad_norm": 0.02400689572095871, + "learning_rate": 9.844882643955887e-05, + "loss": 0.0003, + "step": 888 + }, + { + "epoch": 0.301784427584453, + "grad_norm": 0.028485383838415146, + "learning_rate": 9.844097371992552e-05, + "loss": 0.0003, + "step": 890 + }, + { + "epoch": 0.3024625948374518, + "grad_norm": 0.042584579437971115, + "learning_rate": 9.843310148827374e-05, + "loss": 0.0004, + "step": 892 + }, + { + "epoch": 0.30314076209045054, + "grad_norm": 0.051422711461782455, + "learning_rate": 9.842520974777448e-05, + "loss": 0.0003, + "step": 894 + }, + { + "epoch": 0.30381892934344934, + "grad_norm": 0.04666488617658615, + "learning_rate": 9.841729850160652e-05, + "loss": 0.0006, + "step": 896 + }, + { + "epoch": 0.3044970965964481, + "grad_norm": 0.030913488939404488, + "learning_rate": 9.840936775295653e-05, + "loss": 0.0005, + "step": 898 + }, + { + "epoch": 0.3051752638494469, + "grad_norm": 0.07295496016740799, + "learning_rate": 9.840141750501898e-05, + "loss": 0.0003, + "step": 900 + }, + { + "epoch": 0.3058534311024456, + "grad_norm": 0.04738324508070946, + "learning_rate": 9.839344776099625e-05, + "loss": 0.0006, + "step": 902 + }, + { + "epoch": 0.3065315983554444, + "grad_norm": 0.021162323653697968, + "learning_rate": 9.838545852409857e-05, + "loss": 0.0002, + "step": 904 + }, + { + "epoch": 0.3072097656084432, + "grad_norm": 0.013712971471250057, + "learning_rate": 9.837744979754396e-05, + "loss": 0.0001, + "step": 906 + }, + { + "epoch": 0.30788793286144195, + "grad_norm": 0.04329269379377365, + "learning_rate": 9.836942158455837e-05, + "loss": 0.0004, + "step": 908 + }, + { + "epoch": 0.30856610011444074, + "grad_norm": 0.011373177170753479, + "learning_rate": 9.836137388837554e-05, + "loss": 0.0002, + "step": 910 + }, + { + "epoch": 0.3092442673674395, + "grad_norm": 0.1146940067410469, + "learning_rate": 9.83533067122371e-05, + "loss": 0.001, + "step": 912 + }, + { + "epoch": 0.3099224346204383, + "grad_norm": 0.01689288392663002, + "learning_rate": 9.834522005939248e-05, + "loss": 0.0002, + "step": 914 + }, + { + "epoch": 0.310600601873437, + "grad_norm": 0.03612351417541504, + "learning_rate": 9.833711393309903e-05, + "loss": 0.0004, + "step": 916 + }, + { + "epoch": 0.3112787691264358, + "grad_norm": 0.05595879629254341, + "learning_rate": 9.832898833662185e-05, + "loss": 0.0002, + "step": 918 + }, + { + "epoch": 0.31195693637943456, + "grad_norm": 0.03937789425253868, + "learning_rate": 9.832084327323394e-05, + "loss": 0.0002, + "step": 920 + }, + { + "epoch": 0.31263510363243335, + "grad_norm": 0.00396231422200799, + "learning_rate": 9.831267874621616e-05, + "loss": 0.0, + "step": 922 + }, + { + "epoch": 0.3133132708854321, + "grad_norm": 0.02203402668237686, + "learning_rate": 9.830449475885715e-05, + "loss": 0.0002, + "step": 924 + }, + { + "epoch": 0.3139914381384309, + "grad_norm": 0.016906969249248505, + "learning_rate": 9.829629131445342e-05, + "loss": 0.0001, + "step": 926 + }, + { + "epoch": 0.3146696053914297, + "grad_norm": 0.028574733063578606, + "learning_rate": 9.828806841630933e-05, + "loss": 0.0004, + "step": 928 + }, + { + "epoch": 0.3153477726444284, + "grad_norm": 0.03310710936784744, + "learning_rate": 9.827982606773705e-05, + "loss": 0.0002, + "step": 930 + }, + { + "epoch": 0.3160259398974272, + "grad_norm": 0.04199539124965668, + "learning_rate": 9.827156427205662e-05, + "loss": 0.0002, + "step": 932 + }, + { + "epoch": 0.31670410715042596, + "grad_norm": 0.04112956300377846, + "learning_rate": 9.826328303259586e-05, + "loss": 0.0002, + "step": 934 + }, + { + "epoch": 0.31738227440342476, + "grad_norm": 0.04984778165817261, + "learning_rate": 9.825498235269046e-05, + "loss": 0.0001, + "step": 936 + }, + { + "epoch": 0.3180604416564235, + "grad_norm": 0.079465851187706, + "learning_rate": 9.824666223568395e-05, + "loss": 0.0004, + "step": 938 + }, + { + "epoch": 0.3187386089094223, + "grad_norm": 0.007913575507700443, + "learning_rate": 9.823832268492768e-05, + "loss": 0.0001, + "step": 940 + }, + { + "epoch": 0.31941677616242103, + "grad_norm": 0.027005091309547424, + "learning_rate": 9.822996370378077e-05, + "loss": 0.0002, + "step": 942 + }, + { + "epoch": 0.32009494341541983, + "grad_norm": 0.03642921522259712, + "learning_rate": 9.822158529561026e-05, + "loss": 0.0002, + "step": 944 + }, + { + "epoch": 0.3207731106684186, + "grad_norm": 0.006631182506680489, + "learning_rate": 9.821318746379095e-05, + "loss": 0.0004, + "step": 946 + }, + { + "epoch": 0.32145127792141737, + "grad_norm": 0.028765078634023666, + "learning_rate": 9.820477021170551e-05, + "loss": 0.0002, + "step": 948 + }, + { + "epoch": 0.32212944517441616, + "grad_norm": 0.03270328417420387, + "learning_rate": 9.819633354274439e-05, + "loss": 0.0002, + "step": 950 + }, + { + "epoch": 0.3228076124274149, + "grad_norm": 0.00990514736622572, + "learning_rate": 9.818787746030587e-05, + "loss": 0.0001, + "step": 952 + }, + { + "epoch": 0.3234857796804137, + "grad_norm": 0.028807222843170166, + "learning_rate": 9.81794019677961e-05, + "loss": 0.0001, + "step": 954 + }, + { + "epoch": 0.32416394693341244, + "grad_norm": 0.02939584106206894, + "learning_rate": 9.817090706862895e-05, + "loss": 0.0003, + "step": 956 + }, + { + "epoch": 0.32484211418641123, + "grad_norm": 0.01710331439971924, + "learning_rate": 9.816239276622621e-05, + "loss": 0.0001, + "step": 958 + }, + { + "epoch": 0.32552028143941, + "grad_norm": 0.013261375017464161, + "learning_rate": 9.815385906401741e-05, + "loss": 0.0, + "step": 960 + }, + { + "epoch": 0.32619844869240877, + "grad_norm": 0.025372454896569252, + "learning_rate": 9.814530596543994e-05, + "loss": 0.0, + "step": 962 + }, + { + "epoch": 0.3268766159454075, + "grad_norm": 0.005917869508266449, + "learning_rate": 9.813673347393899e-05, + "loss": 0.0, + "step": 964 + }, + { + "epoch": 0.3275547831984063, + "grad_norm": 0.006878651212900877, + "learning_rate": 9.812814159296757e-05, + "loss": 0.0, + "step": 966 + }, + { + "epoch": 0.3282329504514051, + "grad_norm": 0.04819082096219063, + "learning_rate": 9.811953032598644e-05, + "loss": 0.0001, + "step": 968 + }, + { + "epoch": 0.32891111770440384, + "grad_norm": 0.019986020401120186, + "learning_rate": 9.811089967646428e-05, + "loss": 0.0003, + "step": 970 + }, + { + "epoch": 0.32958928495740264, + "grad_norm": 0.005363111849874258, + "learning_rate": 9.810224964787746e-05, + "loss": 0.0, + "step": 972 + }, + { + "epoch": 0.3302674522104014, + "grad_norm": 0.023500891402363777, + "learning_rate": 9.809358024371024e-05, + "loss": 0.0, + "step": 974 + }, + { + "epoch": 0.3309456194634002, + "grad_norm": 0.006279476452618837, + "learning_rate": 9.808489146745466e-05, + "loss": 0.0, + "step": 976 + }, + { + "epoch": 0.3316237867163989, + "grad_norm": 0.007938874885439873, + "learning_rate": 9.807618332261052e-05, + "loss": 0.0, + "step": 978 + }, + { + "epoch": 0.3323019539693977, + "grad_norm": 0.0037944272626191378, + "learning_rate": 9.806745581268552e-05, + "loss": 0.0, + "step": 980 + }, + { + "epoch": 0.33298012122239645, + "grad_norm": 0.031090207397937775, + "learning_rate": 9.805870894119506e-05, + "loss": 0.0002, + "step": 982 + }, + { + "epoch": 0.33365828847539525, + "grad_norm": 0.04043909162282944, + "learning_rate": 9.804994271166236e-05, + "loss": 0.0001, + "step": 984 + }, + { + "epoch": 0.334336455728394, + "grad_norm": 0.00045781582593917847, + "learning_rate": 9.804115712761851e-05, + "loss": 0.0, + "step": 986 + }, + { + "epoch": 0.3350146229813928, + "grad_norm": 0.011888192035257816, + "learning_rate": 9.803235219260231e-05, + "loss": 0.0, + "step": 988 + }, + { + "epoch": 0.3356927902343916, + "grad_norm": 0.021957555785775185, + "learning_rate": 9.802352791016039e-05, + "loss": 0.0, + "step": 990 + }, + { + "epoch": 0.3363709574873903, + "grad_norm": 0.022019004449248314, + "learning_rate": 9.801468428384716e-05, + "loss": 0.0004, + "step": 992 + }, + { + "epoch": 0.3370491247403891, + "grad_norm": 0.001489504473283887, + "learning_rate": 9.800582131722485e-05, + "loss": 0.0001, + "step": 994 + }, + { + "epoch": 0.33772729199338786, + "grad_norm": 0.002710692584514618, + "learning_rate": 9.799693901386345e-05, + "loss": 0.0001, + "step": 996 + }, + { + "epoch": 0.33840545924638665, + "grad_norm": 0.0030198118183761835, + "learning_rate": 9.798803737734076e-05, + "loss": 0.0001, + "step": 998 + }, + { + "epoch": 0.3390836264993854, + "grad_norm": 0.021767836064100266, + "learning_rate": 9.797911641124236e-05, + "loss": 0.0001, + "step": 1000 + }, + { + "epoch": 0.3397617937523842, + "grad_norm": 0.0020249721128493547, + "learning_rate": 9.79701761191616e-05, + "loss": 0.0003, + "step": 1002 + }, + { + "epoch": 0.34043996100538293, + "grad_norm": 0.01206112653017044, + "learning_rate": 9.796121650469963e-05, + "loss": 0.0001, + "step": 1004 + }, + { + "epoch": 0.3411181282583817, + "grad_norm": 0.0016083617229014635, + "learning_rate": 9.795223757146539e-05, + "loss": 0.0, + "step": 1006 + }, + { + "epoch": 0.3417962955113805, + "grad_norm": 0.004822399001568556, + "learning_rate": 9.794323932307559e-05, + "loss": 0.0, + "step": 1008 + }, + { + "epoch": 0.34247446276437926, + "grad_norm": 0.008584674447774887, + "learning_rate": 9.793422176315472e-05, + "loss": 0.0, + "step": 1010 + }, + { + "epoch": 0.34315263001737806, + "grad_norm": 0.003843455109745264, + "learning_rate": 9.792518489533507e-05, + "loss": 0.0, + "step": 1012 + }, + { + "epoch": 0.3438307972703768, + "grad_norm": 0.002502762945368886, + "learning_rate": 9.791612872325667e-05, + "loss": 0.0, + "step": 1014 + }, + { + "epoch": 0.3445089645233756, + "grad_norm": 0.0043897624127566814, + "learning_rate": 9.790705325056735e-05, + "loss": 0.0, + "step": 1016 + }, + { + "epoch": 0.34518713177637433, + "grad_norm": 0.013913934119045734, + "learning_rate": 9.78979584809227e-05, + "loss": 0.0, + "step": 1018 + }, + { + "epoch": 0.34586529902937313, + "grad_norm": 0.0008682821062393486, + "learning_rate": 9.788884441798612e-05, + "loss": 0.0, + "step": 1020 + }, + { + "epoch": 0.34654346628237187, + "grad_norm": 0.0015347907319664955, + "learning_rate": 9.787971106542872e-05, + "loss": 0.0, + "step": 1022 + }, + { + "epoch": 0.34722163353537067, + "grad_norm": 0.0006304876878857613, + "learning_rate": 9.787055842692944e-05, + "loss": 0.0, + "step": 1024 + }, + { + "epoch": 0.3478998007883694, + "grad_norm": 0.015095069073140621, + "learning_rate": 9.786138650617493e-05, + "loss": 0.0, + "step": 1026 + }, + { + "epoch": 0.3485779680413682, + "grad_norm": 0.0007357313297688961, + "learning_rate": 9.785219530685969e-05, + "loss": 0.0, + "step": 1028 + }, + { + "epoch": 0.349256135294367, + "grad_norm": 0.006811681669205427, + "learning_rate": 9.784298483268588e-05, + "loss": 0.0001, + "step": 1030 + }, + { + "epoch": 0.34993430254736574, + "grad_norm": 0.004541031084954739, + "learning_rate": 9.783375508736351e-05, + "loss": 0.0, + "step": 1032 + }, + { + "epoch": 0.35061246980036453, + "grad_norm": 0.002640953753143549, + "learning_rate": 9.782450607461031e-05, + "loss": 0.0, + "step": 1034 + }, + { + "epoch": 0.3512906370533633, + "grad_norm": 0.014266207814216614, + "learning_rate": 9.781523779815179e-05, + "loss": 0.0, + "step": 1036 + }, + { + "epoch": 0.35196880430636207, + "grad_norm": 0.0035020592622458935, + "learning_rate": 9.780595026172119e-05, + "loss": 0.0, + "step": 1038 + }, + { + "epoch": 0.3526469715593608, + "grad_norm": 0.0020277295261621475, + "learning_rate": 9.779664346905954e-05, + "loss": 0.0, + "step": 1040 + }, + { + "epoch": 0.3533251388123596, + "grad_norm": 0.0008716668817214668, + "learning_rate": 9.778731742391562e-05, + "loss": 0.0, + "step": 1042 + }, + { + "epoch": 0.35400330606535835, + "grad_norm": 0.004051413387060165, + "learning_rate": 9.777797213004596e-05, + "loss": 0.0, + "step": 1044 + }, + { + "epoch": 0.35468147331835714, + "grad_norm": 0.0006017841515131295, + "learning_rate": 9.776860759121484e-05, + "loss": 0.0, + "step": 1046 + }, + { + "epoch": 0.3553596405713559, + "grad_norm": 0.0020267630461603403, + "learning_rate": 9.775922381119429e-05, + "loss": 0.0, + "step": 1048 + }, + { + "epoch": 0.3560378078243547, + "grad_norm": 0.001963792135939002, + "learning_rate": 9.774982079376411e-05, + "loss": 0.0, + "step": 1050 + }, + { + "epoch": 0.3567159750773535, + "grad_norm": 0.039067551493644714, + "learning_rate": 9.774039854271182e-05, + "loss": 0.0001, + "step": 1052 + }, + { + "epoch": 0.3573941423303522, + "grad_norm": 0.009917604736983776, + "learning_rate": 9.773095706183271e-05, + "loss": 0.0, + "step": 1054 + }, + { + "epoch": 0.358072309583351, + "grad_norm": 0.03943745791912079, + "learning_rate": 9.772149635492978e-05, + "loss": 0.0002, + "step": 1056 + }, + { + "epoch": 0.35875047683634975, + "grad_norm": 0.03156045079231262, + "learning_rate": 9.771201642581385e-05, + "loss": 0.0001, + "step": 1058 + }, + { + "epoch": 0.35942864408934855, + "grad_norm": 0.0029956672806292772, + "learning_rate": 9.77025172783034e-05, + "loss": 0.0, + "step": 1060 + }, + { + "epoch": 0.3601068113423473, + "grad_norm": 0.009155509062111378, + "learning_rate": 9.769299891622469e-05, + "loss": 0.0, + "step": 1062 + }, + { + "epoch": 0.3607849785953461, + "grad_norm": 0.001078491797670722, + "learning_rate": 9.768346134341174e-05, + "loss": 0.0, + "step": 1064 + }, + { + "epoch": 0.3614631458483448, + "grad_norm": 0.006173829548060894, + "learning_rate": 9.767390456370624e-05, + "loss": 0.0, + "step": 1066 + }, + { + "epoch": 0.3621413131013436, + "grad_norm": 0.0022076182067394257, + "learning_rate": 9.76643285809577e-05, + "loss": 0.0001, + "step": 1068 + }, + { + "epoch": 0.3628194803543424, + "grad_norm": 0.0006827347096987069, + "learning_rate": 9.76547333990233e-05, + "loss": 0.0, + "step": 1070 + }, + { + "epoch": 0.36349764760734116, + "grad_norm": 0.02418726310133934, + "learning_rate": 9.764511902176798e-05, + "loss": 0.0001, + "step": 1072 + }, + { + "epoch": 0.36417581486033995, + "grad_norm": 0.002262687310576439, + "learning_rate": 9.763548545306443e-05, + "loss": 0.0, + "step": 1074 + }, + { + "epoch": 0.3648539821133387, + "grad_norm": 0.0012373102363198996, + "learning_rate": 9.762583269679303e-05, + "loss": 0.0, + "step": 1076 + }, + { + "epoch": 0.3655321493663375, + "grad_norm": 0.2534419596195221, + "learning_rate": 9.761616075684192e-05, + "loss": 0.0002, + "step": 1078 + }, + { + "epoch": 0.36621031661933623, + "grad_norm": 0.0008780054631642997, + "learning_rate": 9.760646963710694e-05, + "loss": 0.0, + "step": 1080 + }, + { + "epoch": 0.366888483872335, + "grad_norm": 0.04172241687774658, + "learning_rate": 9.759675934149168e-05, + "loss": 0.0004, + "step": 1082 + }, + { + "epoch": 0.36756665112533377, + "grad_norm": 0.04473108425736427, + "learning_rate": 9.758702987390746e-05, + "loss": 0.0004, + "step": 1084 + }, + { + "epoch": 0.36824481837833256, + "grad_norm": 0.11230382323265076, + "learning_rate": 9.75772812382733e-05, + "loss": 0.0006, + "step": 1086 + }, + { + "epoch": 0.3689229856313313, + "grad_norm": 0.08816549926996231, + "learning_rate": 9.756751343851594e-05, + "loss": 0.0007, + "step": 1088 + }, + { + "epoch": 0.3696011528843301, + "grad_norm": 0.06734654307365417, + "learning_rate": 9.755772647856986e-05, + "loss": 0.0003, + "step": 1090 + }, + { + "epoch": 0.3702793201373289, + "grad_norm": 0.02505115605890751, + "learning_rate": 9.754792036237725e-05, + "loss": 0.0004, + "step": 1092 + }, + { + "epoch": 0.37095748739032763, + "grad_norm": 0.02396882139146328, + "learning_rate": 9.753809509388798e-05, + "loss": 0.0007, + "step": 1094 + }, + { + "epoch": 0.37163565464332643, + "grad_norm": 0.023111695423722267, + "learning_rate": 9.752825067705971e-05, + "loss": 0.0002, + "step": 1096 + }, + { + "epoch": 0.37231382189632517, + "grad_norm": 0.045954182744026184, + "learning_rate": 9.751838711585774e-05, + "loss": 0.0003, + "step": 1098 + }, + { + "epoch": 0.37299198914932397, + "grad_norm": 0.029057525098323822, + "learning_rate": 9.750850441425515e-05, + "loss": 0.0004, + "step": 1100 + }, + { + "epoch": 0.3736701564023227, + "grad_norm": 0.030954748392105103, + "learning_rate": 9.749860257623263e-05, + "loss": 0.0002, + "step": 1102 + }, + { + "epoch": 0.3743483236553215, + "grad_norm": 0.0924738347530365, + "learning_rate": 9.748868160577868e-05, + "loss": 0.0003, + "step": 1104 + }, + { + "epoch": 0.37502649090832024, + "grad_norm": 0.05257641151547432, + "learning_rate": 9.747874150688948e-05, + "loss": 0.0001, + "step": 1106 + }, + { + "epoch": 0.37570465816131904, + "grad_norm": 0.05264095962047577, + "learning_rate": 9.746878228356885e-05, + "loss": 0.0007, + "step": 1108 + }, + { + "epoch": 0.3763828254143178, + "grad_norm": 0.04698754847049713, + "learning_rate": 9.74588039398284e-05, + "loss": 0.0002, + "step": 1110 + }, + { + "epoch": 0.3770609926673166, + "grad_norm": 0.031078685075044632, + "learning_rate": 9.744880647968741e-05, + "loss": 0.0003, + "step": 1112 + }, + { + "epoch": 0.37773915992031537, + "grad_norm": 0.011770748533308506, + "learning_rate": 9.743878990717283e-05, + "loss": 0.0002, + "step": 1114 + }, + { + "epoch": 0.3784173271733141, + "grad_norm": 0.016282442957162857, + "learning_rate": 9.742875422631937e-05, + "loss": 0.0001, + "step": 1116 + }, + { + "epoch": 0.3790954944263129, + "grad_norm": 0.03300836682319641, + "learning_rate": 9.741869944116937e-05, + "loss": 0.0001, + "step": 1118 + }, + { + "epoch": 0.37977366167931165, + "grad_norm": 0.046346571296453476, + "learning_rate": 9.740862555577289e-05, + "loss": 0.0009, + "step": 1120 + }, + { + "epoch": 0.38045182893231044, + "grad_norm": 0.020293988287448883, + "learning_rate": 9.739853257418772e-05, + "loss": 0.0001, + "step": 1122 + }, + { + "epoch": 0.3811299961853092, + "grad_norm": 0.006188335362821817, + "learning_rate": 9.73884205004793e-05, + "loss": 0.0, + "step": 1124 + }, + { + "epoch": 0.381808163438308, + "grad_norm": 0.04997680336236954, + "learning_rate": 9.737828933872075e-05, + "loss": 0.0003, + "step": 1126 + }, + { + "epoch": 0.3824863306913067, + "grad_norm": 0.013155604712665081, + "learning_rate": 9.736813909299294e-05, + "loss": 0.0001, + "step": 1128 + }, + { + "epoch": 0.3831644979443055, + "grad_norm": 0.0035465641412883997, + "learning_rate": 9.735796976738437e-05, + "loss": 0.0002, + "step": 1130 + }, + { + "epoch": 0.3838426651973043, + "grad_norm": 0.006658546626567841, + "learning_rate": 9.734778136599123e-05, + "loss": 0.0, + "step": 1132 + }, + { + "epoch": 0.38452083245030305, + "grad_norm": 0.053657446056604385, + "learning_rate": 9.733757389291742e-05, + "loss": 0.0008, + "step": 1134 + }, + { + "epoch": 0.38519899970330185, + "grad_norm": 0.08685652166604996, + "learning_rate": 9.73273473522745e-05, + "loss": 0.0005, + "step": 1136 + }, + { + "epoch": 0.3858771669563006, + "grad_norm": 0.009623444639146328, + "learning_rate": 9.731710174818174e-05, + "loss": 0.0001, + "step": 1138 + }, + { + "epoch": 0.3865553342092994, + "grad_norm": 0.03243015334010124, + "learning_rate": 9.730683708476604e-05, + "loss": 0.0004, + "step": 1140 + }, + { + "epoch": 0.3872335014622981, + "grad_norm": 0.0013256813399493694, + "learning_rate": 9.729655336616204e-05, + "loss": 0.0004, + "step": 1142 + }, + { + "epoch": 0.3879116687152969, + "grad_norm": 0.006923831533640623, + "learning_rate": 9.728625059651197e-05, + "loss": 0.0001, + "step": 1144 + }, + { + "epoch": 0.38858983596829566, + "grad_norm": 0.004441696684807539, + "learning_rate": 9.727592877996585e-05, + "loss": 0.0001, + "step": 1146 + }, + { + "epoch": 0.38926800322129446, + "grad_norm": 0.004240158013999462, + "learning_rate": 9.726558792068125e-05, + "loss": 0.0, + "step": 1148 + }, + { + "epoch": 0.3899461704742932, + "grad_norm": 0.014222053810954094, + "learning_rate": 9.72552280228235e-05, + "loss": 0.0001, + "step": 1150 + }, + { + "epoch": 0.390624337727292, + "grad_norm": 0.03526894748210907, + "learning_rate": 9.724484909056554e-05, + "loss": 0.0004, + "step": 1152 + }, + { + "epoch": 0.3913025049802908, + "grad_norm": 0.005613984540104866, + "learning_rate": 9.723445112808802e-05, + "loss": 0.0, + "step": 1154 + }, + { + "epoch": 0.39198067223328953, + "grad_norm": 0.01902814209461212, + "learning_rate": 9.722403413957923e-05, + "loss": 0.0001, + "step": 1156 + }, + { + "epoch": 0.3926588394862883, + "grad_norm": 0.07783958315849304, + "learning_rate": 9.721359812923515e-05, + "loss": 0.0003, + "step": 1158 + }, + { + "epoch": 0.39333700673928707, + "grad_norm": 0.008552520535886288, + "learning_rate": 9.720314310125937e-05, + "loss": 0.0, + "step": 1160 + }, + { + "epoch": 0.39401517399228586, + "grad_norm": 0.00928240455687046, + "learning_rate": 9.719266905986322e-05, + "loss": 0.0, + "step": 1162 + }, + { + "epoch": 0.3946933412452846, + "grad_norm": 0.04865438863635063, + "learning_rate": 9.71821760092656e-05, + "loss": 0.0001, + "step": 1164 + }, + { + "epoch": 0.3953715084982834, + "grad_norm": 0.005656507331877947, + "learning_rate": 9.717166395369313e-05, + "loss": 0.0, + "step": 1166 + }, + { + "epoch": 0.39604967575128214, + "grad_norm": 0.00930027011781931, + "learning_rate": 9.716113289738004e-05, + "loss": 0.0001, + "step": 1168 + }, + { + "epoch": 0.39672784300428093, + "grad_norm": 0.023045826703310013, + "learning_rate": 9.715058284456828e-05, + "loss": 0.0001, + "step": 1170 + }, + { + "epoch": 0.39740601025727973, + "grad_norm": 0.00920875370502472, + "learning_rate": 9.714001379950739e-05, + "loss": 0.0001, + "step": 1172 + }, + { + "epoch": 0.39808417751027847, + "grad_norm": 0.002126706065610051, + "learning_rate": 9.712942576645459e-05, + "loss": 0.0001, + "step": 1174 + }, + { + "epoch": 0.39876234476327727, + "grad_norm": 0.010585418902337551, + "learning_rate": 9.71188187496747e-05, + "loss": 0.0, + "step": 1176 + }, + { + "epoch": 0.399440512016276, + "grad_norm": 0.007169020362198353, + "learning_rate": 9.710819275344028e-05, + "loss": 0.0, + "step": 1178 + }, + { + "epoch": 0.4001186792692748, + "grad_norm": 0.008359720930457115, + "learning_rate": 9.709754778203143e-05, + "loss": 0.0001, + "step": 1180 + }, + { + "epoch": 0.40079684652227354, + "grad_norm": 0.017095040529966354, + "learning_rate": 9.708688383973596e-05, + "loss": 0.0002, + "step": 1182 + }, + { + "epoch": 0.40147501377527234, + "grad_norm": 0.003711053868755698, + "learning_rate": 9.707620093084933e-05, + "loss": 0.0, + "step": 1184 + }, + { + "epoch": 0.4021531810282711, + "grad_norm": 0.003627863246947527, + "learning_rate": 9.706549905967459e-05, + "loss": 0.0, + "step": 1186 + }, + { + "epoch": 0.4028313482812699, + "grad_norm": 0.004584900569170713, + "learning_rate": 9.705477823052246e-05, + "loss": 0.0, + "step": 1188 + }, + { + "epoch": 0.4035095155342686, + "grad_norm": 0.012822436168789864, + "learning_rate": 9.704403844771128e-05, + "loss": 0.0, + "step": 1190 + }, + { + "epoch": 0.4041876827872674, + "grad_norm": 0.02418431267142296, + "learning_rate": 9.703327971556704e-05, + "loss": 0.0, + "step": 1192 + }, + { + "epoch": 0.4048658500402662, + "grad_norm": 0.011410797946155071, + "learning_rate": 9.702250203842336e-05, + "loss": 0.0, + "step": 1194 + }, + { + "epoch": 0.40554401729326495, + "grad_norm": 0.0004434087313711643, + "learning_rate": 9.701170542062148e-05, + "loss": 0.0, + "step": 1196 + }, + { + "epoch": 0.40622218454626374, + "grad_norm": 0.0011917415540665388, + "learning_rate": 9.700088986651027e-05, + "loss": 0.0, + "step": 1198 + }, + { + "epoch": 0.4069003517992625, + "grad_norm": 0.1085277646780014, + "learning_rate": 9.699005538044626e-05, + "loss": 0.0002, + "step": 1200 + }, + { + "epoch": 0.4075785190522613, + "grad_norm": 0.001079617883078754, + "learning_rate": 9.697920196679353e-05, + "loss": 0.0, + "step": 1202 + }, + { + "epoch": 0.40825668630526, + "grad_norm": 0.0006843967130407691, + "learning_rate": 9.69683296299239e-05, + "loss": 0.0001, + "step": 1204 + }, + { + "epoch": 0.4089348535582588, + "grad_norm": 0.0017491994658485055, + "learning_rate": 9.695743837421669e-05, + "loss": 0.0, + "step": 1206 + }, + { + "epoch": 0.40961302081125756, + "grad_norm": 0.050194092094898224, + "learning_rate": 9.694652820405895e-05, + "loss": 0.0001, + "step": 1208 + }, + { + "epoch": 0.41029118806425635, + "grad_norm": 0.0033219442702829838, + "learning_rate": 9.693559912384523e-05, + "loss": 0.0001, + "step": 1210 + }, + { + "epoch": 0.4109693553172551, + "grad_norm": 0.021614020690321922, + "learning_rate": 9.69246511379778e-05, + "loss": 0.0, + "step": 1212 + }, + { + "epoch": 0.4116475225702539, + "grad_norm": 0.0711006447672844, + "learning_rate": 9.691368425086651e-05, + "loss": 0.0007, + "step": 1214 + }, + { + "epoch": 0.4123256898232527, + "grad_norm": 0.0004094350733794272, + "learning_rate": 9.690269846692881e-05, + "loss": 0.0, + "step": 1216 + }, + { + "epoch": 0.4130038570762514, + "grad_norm": 0.008974644355475903, + "learning_rate": 9.689169379058979e-05, + "loss": 0.0006, + "step": 1218 + }, + { + "epoch": 0.4136820243292502, + "grad_norm": 0.00673671206459403, + "learning_rate": 9.688067022628211e-05, + "loss": 0.0004, + "step": 1220 + }, + { + "epoch": 0.41436019158224896, + "grad_norm": 0.014988470822572708, + "learning_rate": 9.686962777844606e-05, + "loss": 0.0002, + "step": 1222 + }, + { + "epoch": 0.41503835883524776, + "grad_norm": 0.05447763949632645, + "learning_rate": 9.685856645152955e-05, + "loss": 0.0002, + "step": 1224 + }, + { + "epoch": 0.4157165260882465, + "grad_norm": 0.009435374289751053, + "learning_rate": 9.68474862499881e-05, + "loss": 0.0001, + "step": 1226 + }, + { + "epoch": 0.4163946933412453, + "grad_norm": 0.022904202342033386, + "learning_rate": 9.683638717828476e-05, + "loss": 0.0002, + "step": 1228 + }, + { + "epoch": 0.41707286059424403, + "grad_norm": 0.002160982694476843, + "learning_rate": 9.682526924089028e-05, + "loss": 0.0, + "step": 1230 + }, + { + "epoch": 0.41775102784724283, + "grad_norm": 0.005374177824705839, + "learning_rate": 9.681413244228294e-05, + "loss": 0.0001, + "step": 1232 + }, + { + "epoch": 0.4184291951002416, + "grad_norm": 0.0011474484344944358, + "learning_rate": 9.680297678694867e-05, + "loss": 0.0001, + "step": 1234 + }, + { + "epoch": 0.41910736235324036, + "grad_norm": 0.03147886320948601, + "learning_rate": 9.679180227938094e-05, + "loss": 0.0002, + "step": 1236 + }, + { + "epoch": 0.41978552960623916, + "grad_norm": 0.04308544844388962, + "learning_rate": 9.678060892408083e-05, + "loss": 0.0001, + "step": 1238 + }, + { + "epoch": 0.4204636968592379, + "grad_norm": 0.00260750949382782, + "learning_rate": 9.676939672555706e-05, + "loss": 0.0, + "step": 1240 + }, + { + "epoch": 0.4211418641122367, + "grad_norm": 0.049146685749292374, + "learning_rate": 9.675816568832588e-05, + "loss": 0.0001, + "step": 1242 + }, + { + "epoch": 0.42182003136523544, + "grad_norm": 0.05711871013045311, + "learning_rate": 9.674691581691114e-05, + "loss": 0.0, + "step": 1244 + }, + { + "epoch": 0.42249819861823423, + "grad_norm": 0.008233152329921722, + "learning_rate": 9.673564711584431e-05, + "loss": 0.0, + "step": 1246 + }, + { + "epoch": 0.423176365871233, + "grad_norm": 0.001224467996507883, + "learning_rate": 9.672435958966442e-05, + "loss": 0.0, + "step": 1248 + }, + { + "epoch": 0.42385453312423177, + "grad_norm": 0.0015520035522058606, + "learning_rate": 9.671305324291806e-05, + "loss": 0.0, + "step": 1250 + }, + { + "epoch": 0.4245327003772305, + "grad_norm": 0.09785683453083038, + "learning_rate": 9.670172808015943e-05, + "loss": 0.0003, + "step": 1252 + }, + { + "epoch": 0.4252108676302293, + "grad_norm": 0.011007199063897133, + "learning_rate": 9.669038410595033e-05, + "loss": 0.0, + "step": 1254 + }, + { + "epoch": 0.4258890348832281, + "grad_norm": 0.05002971738576889, + "learning_rate": 9.667902132486009e-05, + "loss": 0.0002, + "step": 1256 + }, + { + "epoch": 0.42656720213622684, + "grad_norm": 0.027726514264941216, + "learning_rate": 9.666763974146564e-05, + "loss": 0.0002, + "step": 1258 + }, + { + "epoch": 0.42724536938922564, + "grad_norm": 0.034664928913116455, + "learning_rate": 9.66562393603515e-05, + "loss": 0.0002, + "step": 1260 + }, + { + "epoch": 0.4279235366422244, + "grad_norm": 0.0023947851732373238, + "learning_rate": 9.664482018610971e-05, + "loss": 0.0001, + "step": 1262 + }, + { + "epoch": 0.4286017038952232, + "grad_norm": 0.027528831735253334, + "learning_rate": 9.663338222333993e-05, + "loss": 0.0002, + "step": 1264 + }, + { + "epoch": 0.4292798711482219, + "grad_norm": 0.006727095227688551, + "learning_rate": 9.662192547664937e-05, + "loss": 0.0, + "step": 1266 + }, + { + "epoch": 0.4299580384012207, + "grad_norm": 0.0011739626061171293, + "learning_rate": 9.66104499506528e-05, + "loss": 0.0, + "step": 1268 + }, + { + "epoch": 0.43063620565421945, + "grad_norm": 0.002890555886551738, + "learning_rate": 9.659895564997257e-05, + "loss": 0.0, + "step": 1270 + }, + { + "epoch": 0.43131437290721825, + "grad_norm": 0.001660423120483756, + "learning_rate": 9.658744257923857e-05, + "loss": 0.0, + "step": 1272 + }, + { + "epoch": 0.431992540160217, + "grad_norm": 0.030299240723252296, + "learning_rate": 9.657591074308827e-05, + "loss": 0.0002, + "step": 1274 + }, + { + "epoch": 0.4326707074132158, + "grad_norm": 0.0587908960878849, + "learning_rate": 9.656436014616669e-05, + "loss": 0.0002, + "step": 1276 + }, + { + "epoch": 0.4333488746662146, + "grad_norm": 0.004164823330938816, + "learning_rate": 9.655279079312642e-05, + "loss": 0.0001, + "step": 1278 + }, + { + "epoch": 0.4340270419192133, + "grad_norm": 0.0012816853122785687, + "learning_rate": 9.654120268862758e-05, + "loss": 0.0, + "step": 1280 + }, + { + "epoch": 0.4347052091722121, + "grad_norm": 0.026919906958937645, + "learning_rate": 9.652959583733787e-05, + "loss": 0.0001, + "step": 1282 + }, + { + "epoch": 0.43538337642521086, + "grad_norm": 0.0006462790770456195, + "learning_rate": 9.651797024393252e-05, + "loss": 0.0, + "step": 1284 + }, + { + "epoch": 0.43606154367820965, + "grad_norm": 0.0024133336264640093, + "learning_rate": 9.650632591309431e-05, + "loss": 0.0002, + "step": 1286 + }, + { + "epoch": 0.4367397109312084, + "grad_norm": 0.031166810542345047, + "learning_rate": 9.649466284951359e-05, + "loss": 0.0003, + "step": 1288 + }, + { + "epoch": 0.4374178781842072, + "grad_norm": 0.004173577763140202, + "learning_rate": 9.648298105788822e-05, + "loss": 0.0, + "step": 1290 + }, + { + "epoch": 0.4380960454372059, + "grad_norm": 0.012892540544271469, + "learning_rate": 9.647128054292365e-05, + "loss": 0.0001, + "step": 1292 + }, + { + "epoch": 0.4387742126902047, + "grad_norm": 0.04061705246567726, + "learning_rate": 9.645956130933284e-05, + "loss": 0.0002, + "step": 1294 + }, + { + "epoch": 0.4394523799432035, + "grad_norm": 0.0026674168184399605, + "learning_rate": 9.644782336183629e-05, + "loss": 0.0001, + "step": 1296 + }, + { + "epoch": 0.44013054719620226, + "grad_norm": 0.006499381735920906, + "learning_rate": 9.643606670516205e-05, + "loss": 0.0, + "step": 1298 + }, + { + "epoch": 0.44080871444920106, + "grad_norm": 0.00404358608648181, + "learning_rate": 9.642429134404569e-05, + "loss": 0.0004, + "step": 1300 + }, + { + "epoch": 0.4414868817021998, + "grad_norm": 0.03190307691693306, + "learning_rate": 9.641249728323033e-05, + "loss": 0.0001, + "step": 1302 + }, + { + "epoch": 0.4421650489551986, + "grad_norm": 0.004501507617533207, + "learning_rate": 9.64006845274666e-05, + "loss": 0.0, + "step": 1304 + }, + { + "epoch": 0.44284321620819733, + "grad_norm": 0.0022560632787644863, + "learning_rate": 9.63888530815127e-05, + "loss": 0.0, + "step": 1306 + }, + { + "epoch": 0.44352138346119613, + "grad_norm": 0.007302314043045044, + "learning_rate": 9.637700295013432e-05, + "loss": 0.0, + "step": 1308 + }, + { + "epoch": 0.44419955071419487, + "grad_norm": 0.014958181418478489, + "learning_rate": 9.636513413810471e-05, + "loss": 0.0, + "step": 1310 + }, + { + "epoch": 0.44487771796719366, + "grad_norm": 0.001923035946674645, + "learning_rate": 9.63532466502046e-05, + "loss": 0.0, + "step": 1312 + }, + { + "epoch": 0.4455558852201924, + "grad_norm": 0.0023916817735880613, + "learning_rate": 9.63413404912223e-05, + "loss": 0.0, + "step": 1314 + }, + { + "epoch": 0.4462340524731912, + "grad_norm": 0.03837057203054428, + "learning_rate": 9.632941566595357e-05, + "loss": 0.0001, + "step": 1316 + }, + { + "epoch": 0.44691221972619, + "grad_norm": 0.0005067187012173235, + "learning_rate": 9.631747217920178e-05, + "loss": 0.0, + "step": 1318 + }, + { + "epoch": 0.44759038697918874, + "grad_norm": 0.002008441835641861, + "learning_rate": 9.63055100357777e-05, + "loss": 0.0, + "step": 1320 + }, + { + "epoch": 0.44826855423218753, + "grad_norm": 0.004499231465160847, + "learning_rate": 9.629352924049975e-05, + "loss": 0.0, + "step": 1322 + }, + { + "epoch": 0.4489467214851863, + "grad_norm": 0.014846640639007092, + "learning_rate": 9.628152979819374e-05, + "loss": 0.0003, + "step": 1324 + }, + { + "epoch": 0.44962488873818507, + "grad_norm": 0.022211197763681412, + "learning_rate": 9.626951171369305e-05, + "loss": 0.0002, + "step": 1326 + }, + { + "epoch": 0.4503030559911838, + "grad_norm": 0.0004746819904539734, + "learning_rate": 9.62574749918386e-05, + "loss": 0.0, + "step": 1328 + }, + { + "epoch": 0.4509812232441826, + "grad_norm": 0.0026894102338701487, + "learning_rate": 9.624541963747873e-05, + "loss": 0.0001, + "step": 1330 + }, + { + "epoch": 0.45165939049718135, + "grad_norm": 0.0006651193252764642, + "learning_rate": 9.623334565546937e-05, + "loss": 0.0, + "step": 1332 + }, + { + "epoch": 0.45233755775018014, + "grad_norm": 0.009189899079501629, + "learning_rate": 9.622125305067393e-05, + "loss": 0.0, + "step": 1334 + }, + { + "epoch": 0.4530157250031789, + "grad_norm": 0.0018004291923716664, + "learning_rate": 9.620914182796327e-05, + "loss": 0.0, + "step": 1336 + }, + { + "epoch": 0.4536938922561777, + "grad_norm": 0.0035469504073262215, + "learning_rate": 9.61970119922158e-05, + "loss": 0.0, + "step": 1338 + }, + { + "epoch": 0.4543720595091765, + "grad_norm": 0.001930896774865687, + "learning_rate": 9.618486354831743e-05, + "loss": 0.0, + "step": 1340 + }, + { + "epoch": 0.4550502267621752, + "grad_norm": 0.0019152465974912047, + "learning_rate": 9.617269650116157e-05, + "loss": 0.0002, + "step": 1342 + }, + { + "epoch": 0.455728394015174, + "grad_norm": 0.0019315932877361774, + "learning_rate": 9.616051085564906e-05, + "loss": 0.0, + "step": 1344 + }, + { + "epoch": 0.45640656126817275, + "grad_norm": 0.0008209678344428539, + "learning_rate": 9.614830661668829e-05, + "loss": 0.0, + "step": 1346 + }, + { + "epoch": 0.45708472852117155, + "grad_norm": 0.002465085359290242, + "learning_rate": 9.613608378919513e-05, + "loss": 0.0001, + "step": 1348 + }, + { + "epoch": 0.4577628957741703, + "grad_norm": 0.001089555793441832, + "learning_rate": 9.612384237809295e-05, + "loss": 0.0, + "step": 1350 + }, + { + "epoch": 0.4584410630271691, + "grad_norm": 0.0003401720605324954, + "learning_rate": 9.611158238831259e-05, + "loss": 0.0, + "step": 1352 + }, + { + "epoch": 0.4591192302801678, + "grad_norm": 0.0004586543655022979, + "learning_rate": 9.609930382479233e-05, + "loss": 0.0, + "step": 1354 + }, + { + "epoch": 0.4597973975331666, + "grad_norm": 0.0005209209630265832, + "learning_rate": 9.608700669247802e-05, + "loss": 0.0, + "step": 1356 + }, + { + "epoch": 0.4604755647861654, + "grad_norm": 0.0011509821051731706, + "learning_rate": 9.607469099632291e-05, + "loss": 0.0, + "step": 1358 + }, + { + "epoch": 0.46115373203916415, + "grad_norm": 0.0004960742662660778, + "learning_rate": 9.606235674128781e-05, + "loss": 0.0, + "step": 1360 + }, + { + "epoch": 0.46183189929216295, + "grad_norm": 0.002033142140135169, + "learning_rate": 9.605000393234089e-05, + "loss": 0.0, + "step": 1362 + }, + { + "epoch": 0.4625100665451617, + "grad_norm": 0.0006835223757661879, + "learning_rate": 9.60376325744579e-05, + "loss": 0.0, + "step": 1364 + }, + { + "epoch": 0.4631882337981605, + "grad_norm": 0.0004336755082476884, + "learning_rate": 9.602524267262203e-05, + "loss": 0.0, + "step": 1366 + }, + { + "epoch": 0.4638664010511592, + "grad_norm": 0.0009202666115015745, + "learning_rate": 9.60128342318239e-05, + "loss": 0.0, + "step": 1368 + }, + { + "epoch": 0.464544568304158, + "grad_norm": 0.00034619055804796517, + "learning_rate": 9.600040725706164e-05, + "loss": 0.0, + "step": 1370 + }, + { + "epoch": 0.46522273555715676, + "grad_norm": 0.00034607655834406614, + "learning_rate": 9.598796175334085e-05, + "loss": 0.0, + "step": 1372 + }, + { + "epoch": 0.46590090281015556, + "grad_norm": 0.0022296837996691465, + "learning_rate": 9.597549772567456e-05, + "loss": 0.0, + "step": 1374 + }, + { + "epoch": 0.4665790700631543, + "grad_norm": 0.0003772445779759437, + "learning_rate": 9.596301517908328e-05, + "loss": 0.0, + "step": 1376 + }, + { + "epoch": 0.4672572373161531, + "grad_norm": 0.0872640609741211, + "learning_rate": 9.595051411859499e-05, + "loss": 0.0001, + "step": 1378 + }, + { + "epoch": 0.4679354045691519, + "grad_norm": 0.00028464055503718555, + "learning_rate": 9.59379945492451e-05, + "loss": 0.0, + "step": 1380 + }, + { + "epoch": 0.46861357182215063, + "grad_norm": 0.0007707217009738088, + "learning_rate": 9.592545647607652e-05, + "loss": 0.0, + "step": 1382 + }, + { + "epoch": 0.46929173907514943, + "grad_norm": 0.0003467558417469263, + "learning_rate": 9.591289990413954e-05, + "loss": 0.0, + "step": 1384 + }, + { + "epoch": 0.46996990632814817, + "grad_norm": 0.017712855711579323, + "learning_rate": 9.590032483849199e-05, + "loss": 0.0001, + "step": 1386 + }, + { + "epoch": 0.47064807358114696, + "grad_norm": 0.0008023407426662743, + "learning_rate": 9.588773128419906e-05, + "loss": 0.0, + "step": 1388 + }, + { + "epoch": 0.4713262408341457, + "grad_norm": 0.011850233189761639, + "learning_rate": 9.587511924633348e-05, + "loss": 0.0, + "step": 1390 + }, + { + "epoch": 0.4720044080871445, + "grad_norm": 0.0005710996338166296, + "learning_rate": 9.586248872997532e-05, + "loss": 0.0, + "step": 1392 + }, + { + "epoch": 0.47268257534014324, + "grad_norm": 0.009720748290419579, + "learning_rate": 9.584983974021222e-05, + "loss": 0.0, + "step": 1394 + }, + { + "epoch": 0.47336074259314204, + "grad_norm": 0.0015520682791247964, + "learning_rate": 9.583717228213914e-05, + "loss": 0.0002, + "step": 1396 + }, + { + "epoch": 0.4740389098461408, + "grad_norm": 0.06051940098404884, + "learning_rate": 9.582448636085855e-05, + "loss": 0.0002, + "step": 1398 + }, + { + "epoch": 0.4747170770991396, + "grad_norm": 0.036541007459163666, + "learning_rate": 9.581178198148034e-05, + "loss": 0.0001, + "step": 1400 + }, + { + "epoch": 0.47539524435213837, + "grad_norm": 0.0005350433639250696, + "learning_rate": 9.579905914912182e-05, + "loss": 0.0, + "step": 1402 + }, + { + "epoch": 0.4760734116051371, + "grad_norm": 0.008719049394130707, + "learning_rate": 9.578631786890774e-05, + "loss": 0.0, + "step": 1404 + }, + { + "epoch": 0.4767515788581359, + "grad_norm": 0.052884291857481, + "learning_rate": 9.577355814597031e-05, + "loss": 0.0002, + "step": 1406 + }, + { + "epoch": 0.47742974611113465, + "grad_norm": 0.002605754416435957, + "learning_rate": 9.576077998544912e-05, + "loss": 0.0, + "step": 1408 + }, + { + "epoch": 0.47810791336413344, + "grad_norm": 0.0009654919849708676, + "learning_rate": 9.574798339249125e-05, + "loss": 0.0, + "step": 1410 + }, + { + "epoch": 0.4787860806171322, + "grad_norm": 0.014030814170837402, + "learning_rate": 9.573516837225112e-05, + "loss": 0.0, + "step": 1412 + }, + { + "epoch": 0.479464247870131, + "grad_norm": 0.003000308061018586, + "learning_rate": 9.572233492989064e-05, + "loss": 0.0, + "step": 1414 + }, + { + "epoch": 0.4801424151231297, + "grad_norm": 0.03821394965052605, + "learning_rate": 9.570948307057912e-05, + "loss": 0.0001, + "step": 1416 + }, + { + "epoch": 0.4808205823761285, + "grad_norm": 0.0002816450723912567, + "learning_rate": 9.569661279949329e-05, + "loss": 0.0, + "step": 1418 + }, + { + "epoch": 0.4814987496291273, + "grad_norm": 0.005087055265903473, + "learning_rate": 9.56837241218173e-05, + "loss": 0.0, + "step": 1420 + }, + { + "epoch": 0.48217691688212605, + "grad_norm": 0.017613375559449196, + "learning_rate": 9.567081704274268e-05, + "loss": 0.0, + "step": 1422 + }, + { + "epoch": 0.48285508413512485, + "grad_norm": 0.00882010255008936, + "learning_rate": 9.565789156746842e-05, + "loss": 0.0003, + "step": 1424 + }, + { + "epoch": 0.4835332513881236, + "grad_norm": 0.0006112809642218053, + "learning_rate": 9.564494770120089e-05, + "loss": 0.0, + "step": 1426 + }, + { + "epoch": 0.4842114186411224, + "grad_norm": 0.0022903706412762403, + "learning_rate": 9.56319854491539e-05, + "loss": 0.0, + "step": 1428 + }, + { + "epoch": 0.4848895858941211, + "grad_norm": 0.0009983257623389363, + "learning_rate": 9.561900481654862e-05, + "loss": 0.0, + "step": 1430 + }, + { + "epoch": 0.4855677531471199, + "grad_norm": 0.0007166972500272095, + "learning_rate": 9.560600580861365e-05, + "loss": 0.0, + "step": 1432 + }, + { + "epoch": 0.48624592040011866, + "grad_norm": 0.04111838713288307, + "learning_rate": 9.559298843058501e-05, + "loss": 0.0001, + "step": 1434 + }, + { + "epoch": 0.48692408765311745, + "grad_norm": 0.001161652966402471, + "learning_rate": 9.557995268770608e-05, + "loss": 0.0, + "step": 1436 + }, + { + "epoch": 0.4876022549061162, + "grad_norm": 0.01842758245766163, + "learning_rate": 9.556689858522764e-05, + "loss": 0.0, + "step": 1438 + }, + { + "epoch": 0.488280422159115, + "grad_norm": 0.0006185704842209816, + "learning_rate": 9.555382612840791e-05, + "loss": 0.0, + "step": 1440 + }, + { + "epoch": 0.4889585894121138, + "grad_norm": 0.012515629641711712, + "learning_rate": 9.554073532251247e-05, + "loss": 0.0, + "step": 1442 + }, + { + "epoch": 0.4896367566651125, + "grad_norm": 0.0011411434970796108, + "learning_rate": 9.552762617281428e-05, + "loss": 0.0, + "step": 1444 + }, + { + "epoch": 0.4903149239181113, + "grad_norm": 0.0034499443136155605, + "learning_rate": 9.55144986845937e-05, + "loss": 0.0, + "step": 1446 + }, + { + "epoch": 0.49099309117111006, + "grad_norm": 0.019210342317819595, + "learning_rate": 9.550135286313852e-05, + "loss": 0.0001, + "step": 1448 + }, + { + "epoch": 0.49167125842410886, + "grad_norm": 0.005309837404638529, + "learning_rate": 9.548818871374383e-05, + "loss": 0.0, + "step": 1450 + }, + { + "epoch": 0.4923494256771076, + "grad_norm": 0.0003773440548684448, + "learning_rate": 9.547500624171217e-05, + "loss": 0.0, + "step": 1452 + }, + { + "epoch": 0.4930275929301064, + "grad_norm": 0.00025181396631523967, + "learning_rate": 9.546180545235344e-05, + "loss": 0.0, + "step": 1454 + }, + { + "epoch": 0.49370576018310514, + "grad_norm": 0.00040584264206700027, + "learning_rate": 9.544858635098491e-05, + "loss": 0.0, + "step": 1456 + }, + { + "epoch": 0.49438392743610393, + "grad_norm": 0.05193353816866875, + "learning_rate": 9.543534894293121e-05, + "loss": 0.0001, + "step": 1458 + }, + { + "epoch": 0.4950620946891027, + "grad_norm": 0.0005504893488250673, + "learning_rate": 9.542209323352441e-05, + "loss": 0.0, + "step": 1460 + }, + { + "epoch": 0.49574026194210147, + "grad_norm": 0.0023802071809768677, + "learning_rate": 9.540881922810388e-05, + "loss": 0.0, + "step": 1462 + }, + { + "epoch": 0.49641842919510026, + "grad_norm": 0.0005835008923895657, + "learning_rate": 9.539552693201642e-05, + "loss": 0.0, + "step": 1464 + }, + { + "epoch": 0.497096596448099, + "grad_norm": 0.0014621730661019683, + "learning_rate": 9.538221635061611e-05, + "loss": 0.0, + "step": 1466 + }, + { + "epoch": 0.4977747637010978, + "grad_norm": 0.0009525296627543867, + "learning_rate": 9.536888748926449e-05, + "loss": 0.0, + "step": 1468 + }, + { + "epoch": 0.49845293095409654, + "grad_norm": 0.08590487390756607, + "learning_rate": 9.535554035333041e-05, + "loss": 0.0002, + "step": 1470 + }, + { + "epoch": 0.49913109820709534, + "grad_norm": 0.001831568544730544, + "learning_rate": 9.534217494819011e-05, + "loss": 0.0, + "step": 1472 + }, + { + "epoch": 0.4998092654600941, + "grad_norm": 0.0029671755619347095, + "learning_rate": 9.532879127922717e-05, + "loss": 0.0003, + "step": 1474 + }, + { + "epoch": 0.5004874327130928, + "grad_norm": 0.024554090574383736, + "learning_rate": 9.53153893518325e-05, + "loss": 0.0002, + "step": 1476 + }, + { + "epoch": 0.5011655999660917, + "grad_norm": 0.0011774948798120022, + "learning_rate": 9.530196917140444e-05, + "loss": 0.0, + "step": 1478 + }, + { + "epoch": 0.5018437672190904, + "grad_norm": 0.02411930449306965, + "learning_rate": 9.52885307433486e-05, + "loss": 0.0, + "step": 1480 + }, + { + "epoch": 0.5025219344720891, + "grad_norm": 0.052455827593803406, + "learning_rate": 9.527507407307799e-05, + "loss": 0.0002, + "step": 1482 + }, + { + "epoch": 0.503200101725088, + "grad_norm": 0.03569287061691284, + "learning_rate": 9.526159916601298e-05, + "loss": 0.0002, + "step": 1484 + }, + { + "epoch": 0.5038782689780867, + "grad_norm": 0.0015953992260619998, + "learning_rate": 9.524810602758121e-05, + "loss": 0.0, + "step": 1486 + }, + { + "epoch": 0.5045564362310855, + "grad_norm": 0.04384199529886246, + "learning_rate": 9.523459466321777e-05, + "loss": 0.0, + "step": 1488 + }, + { + "epoch": 0.5052346034840842, + "grad_norm": 0.022246044129133224, + "learning_rate": 9.5221065078365e-05, + "loss": 0.0001, + "step": 1490 + }, + { + "epoch": 0.5059127707370831, + "grad_norm": 0.0010375117417424917, + "learning_rate": 9.520751727847262e-05, + "loss": 0.0, + "step": 1492 + }, + { + "epoch": 0.5065909379900818, + "grad_norm": 0.0021145131904631853, + "learning_rate": 9.519395126899768e-05, + "loss": 0.0002, + "step": 1494 + }, + { + "epoch": 0.5072691052430806, + "grad_norm": 0.012676035054028034, + "learning_rate": 9.518036705540458e-05, + "loss": 0.0001, + "step": 1496 + }, + { + "epoch": 0.5079472724960793, + "grad_norm": 0.006452012341469526, + "learning_rate": 9.516676464316505e-05, + "loss": 0.0, + "step": 1498 + }, + { + "epoch": 0.5086254397490781, + "grad_norm": 0.0071767643094062805, + "learning_rate": 9.51531440377581e-05, + "loss": 0.0, + "step": 1500 + }, + { + "epoch": 0.5093036070020769, + "grad_norm": 0.08541678637266159, + "learning_rate": 9.513950524467014e-05, + "loss": 0.0003, + "step": 1502 + }, + { + "epoch": 0.5099817742550756, + "grad_norm": 0.01230217982083559, + "learning_rate": 9.512584826939487e-05, + "loss": 0.0001, + "step": 1504 + }, + { + "epoch": 0.5106599415080745, + "grad_norm": 0.000837941886857152, + "learning_rate": 9.51121731174333e-05, + "loss": 0.0, + "step": 1506 + }, + { + "epoch": 0.5113381087610732, + "grad_norm": 0.000993087887763977, + "learning_rate": 9.50984797942938e-05, + "loss": 0.0, + "step": 1508 + }, + { + "epoch": 0.512016276014072, + "grad_norm": 0.07351294904947281, + "learning_rate": 9.508476830549205e-05, + "loss": 0.0001, + "step": 1510 + }, + { + "epoch": 0.5126944432670707, + "grad_norm": 0.0013747677439823747, + "learning_rate": 9.507103865655101e-05, + "loss": 0.0, + "step": 1512 + }, + { + "epoch": 0.5133726105200695, + "grad_norm": 0.005486196838319302, + "learning_rate": 9.505729085300097e-05, + "loss": 0.0, + "step": 1514 + }, + { + "epoch": 0.5140507777730683, + "grad_norm": 0.00730984378606081, + "learning_rate": 9.504352490037958e-05, + "loss": 0.0, + "step": 1516 + }, + { + "epoch": 0.514728945026067, + "grad_norm": 0.02579108066856861, + "learning_rate": 9.502974080423172e-05, + "loss": 0.0, + "step": 1518 + }, + { + "epoch": 0.5154071122790658, + "grad_norm": 0.0014567964244633913, + "learning_rate": 9.501593857010969e-05, + "loss": 0.0, + "step": 1520 + }, + { + "epoch": 0.5160852795320646, + "grad_norm": 0.006894987542182207, + "learning_rate": 9.500211820357297e-05, + "loss": 0.0003, + "step": 1522 + }, + { + "epoch": 0.5167634467850634, + "grad_norm": 0.004342679399996996, + "learning_rate": 9.49882797101884e-05, + "loss": 0.0, + "step": 1524 + }, + { + "epoch": 0.5174416140380621, + "grad_norm": 0.00029608941986225545, + "learning_rate": 9.497442309553016e-05, + "loss": 0.0, + "step": 1526 + }, + { + "epoch": 0.518119781291061, + "grad_norm": 0.020566822960972786, + "learning_rate": 9.496054836517968e-05, + "loss": 0.0, + "step": 1528 + }, + { + "epoch": 0.5187979485440597, + "grad_norm": 0.01303982175886631, + "learning_rate": 9.494665552472568e-05, + "loss": 0.0, + "step": 1530 + }, + { + "epoch": 0.5194761157970584, + "grad_norm": 0.0011589553905650973, + "learning_rate": 9.493274457976422e-05, + "loss": 0.0, + "step": 1532 + }, + { + "epoch": 0.5201542830500572, + "grad_norm": 0.02673693560063839, + "learning_rate": 9.491881553589863e-05, + "loss": 0.0, + "step": 1534 + }, + { + "epoch": 0.520832450303056, + "grad_norm": 0.009233599528670311, + "learning_rate": 9.490486839873949e-05, + "loss": 0.0005, + "step": 1536 + }, + { + "epoch": 0.5215106175560548, + "grad_norm": 0.006066435482352972, + "learning_rate": 9.489090317390475e-05, + "loss": 0.0, + "step": 1538 + }, + { + "epoch": 0.5221887848090535, + "grad_norm": 0.002637030789628625, + "learning_rate": 9.487691986701957e-05, + "loss": 0.0, + "step": 1540 + }, + { + "epoch": 0.5228669520620522, + "grad_norm": 0.04053063690662384, + "learning_rate": 9.486291848371643e-05, + "loss": 0.0001, + "step": 1542 + }, + { + "epoch": 0.5235451193150511, + "grad_norm": 0.0009022873127833009, + "learning_rate": 9.484889902963509e-05, + "loss": 0.0, + "step": 1544 + }, + { + "epoch": 0.5242232865680498, + "grad_norm": 0.007944276556372643, + "learning_rate": 9.483486151042258e-05, + "loss": 0.0, + "step": 1546 + }, + { + "epoch": 0.5249014538210486, + "grad_norm": 0.010381053201854229, + "learning_rate": 9.482080593173323e-05, + "loss": 0.0, + "step": 1548 + }, + { + "epoch": 0.5255796210740474, + "grad_norm": 0.0014247434446588159, + "learning_rate": 9.480673229922861e-05, + "loss": 0.0001, + "step": 1550 + }, + { + "epoch": 0.5262577883270462, + "grad_norm": 0.00017903864500112832, + "learning_rate": 9.479264061857756e-05, + "loss": 0.0, + "step": 1552 + }, + { + "epoch": 0.5269359555800449, + "grad_norm": 0.0005723198410123587, + "learning_rate": 9.477853089545624e-05, + "loss": 0.0, + "step": 1554 + }, + { + "epoch": 0.5276141228330437, + "grad_norm": 0.022112039849162102, + "learning_rate": 9.476440313554803e-05, + "loss": 0.0001, + "step": 1556 + }, + { + "epoch": 0.5282922900860425, + "grad_norm": 0.04562903195619583, + "learning_rate": 9.47502573445436e-05, + "loss": 0.0002, + "step": 1558 + }, + { + "epoch": 0.5289704573390412, + "grad_norm": 0.000275386351859197, + "learning_rate": 9.473609352814083e-05, + "loss": 0.0, + "step": 1560 + }, + { + "epoch": 0.52964862459204, + "grad_norm": 0.0007450791308656335, + "learning_rate": 9.472191169204497e-05, + "loss": 0.0, + "step": 1562 + }, + { + "epoch": 0.5303267918450388, + "grad_norm": 0.0013386242790147662, + "learning_rate": 9.47077118419684e-05, + "loss": 0.0, + "step": 1564 + }, + { + "epoch": 0.5310049590980376, + "grad_norm": 0.0014541690470650792, + "learning_rate": 9.469349398363088e-05, + "loss": 0.0, + "step": 1566 + }, + { + "epoch": 0.5316831263510363, + "grad_norm": 0.00032288787770085037, + "learning_rate": 9.467925812275931e-05, + "loss": 0.0, + "step": 1568 + }, + { + "epoch": 0.5323612936040351, + "grad_norm": 0.00030021867132745683, + "learning_rate": 9.46650042650879e-05, + "loss": 0.0, + "step": 1570 + }, + { + "epoch": 0.5330394608570339, + "grad_norm": 0.007078014779835939, + "learning_rate": 9.465073241635814e-05, + "loss": 0.0, + "step": 1572 + }, + { + "epoch": 0.5337176281100326, + "grad_norm": 0.00022079533664509654, + "learning_rate": 9.463644258231869e-05, + "loss": 0.0, + "step": 1574 + }, + { + "epoch": 0.5343957953630314, + "grad_norm": 0.003117764601483941, + "learning_rate": 9.46221347687255e-05, + "loss": 0.0, + "step": 1576 + }, + { + "epoch": 0.5350739626160301, + "grad_norm": 0.0010085367830470204, + "learning_rate": 9.460780898134177e-05, + "loss": 0.0, + "step": 1578 + }, + { + "epoch": 0.535752129869029, + "grad_norm": 0.0002581927983555943, + "learning_rate": 9.45934652259379e-05, + "loss": 0.0, + "step": 1580 + }, + { + "epoch": 0.5364302971220277, + "grad_norm": 0.03966759517788887, + "learning_rate": 9.45791035082916e-05, + "loss": 0.0001, + "step": 1582 + }, + { + "epoch": 0.5371084643750265, + "grad_norm": 0.00011890224413946271, + "learning_rate": 9.456472383418771e-05, + "loss": 0.0, + "step": 1584 + }, + { + "epoch": 0.5377866316280253, + "grad_norm": 9.437848348170519e-05, + "learning_rate": 9.45503262094184e-05, + "loss": 0.0, + "step": 1586 + }, + { + "epoch": 0.538464798881024, + "grad_norm": 0.0005711221019737422, + "learning_rate": 9.453591063978302e-05, + "loss": 0.0, + "step": 1588 + }, + { + "epoch": 0.5391429661340228, + "grad_norm": 0.00018182714120484889, + "learning_rate": 9.452147713108816e-05, + "loss": 0.0, + "step": 1590 + }, + { + "epoch": 0.5398211333870215, + "grad_norm": 0.0001235974341398105, + "learning_rate": 9.450702568914764e-05, + "loss": 0.0, + "step": 1592 + }, + { + "epoch": 0.5404993006400204, + "grad_norm": 0.0001750366936903447, + "learning_rate": 9.44925563197825e-05, + "loss": 0.0, + "step": 1594 + }, + { + "epoch": 0.5411774678930191, + "grad_norm": 0.022940626367926598, + "learning_rate": 9.447806902882097e-05, + "loss": 0.0001, + "step": 1596 + }, + { + "epoch": 0.5418556351460179, + "grad_norm": 0.0008206462953239679, + "learning_rate": 9.446356382209858e-05, + "loss": 0.0, + "step": 1598 + }, + { + "epoch": 0.5425338023990166, + "grad_norm": 0.13076429069042206, + "learning_rate": 9.444904070545798e-05, + "loss": 0.0003, + "step": 1600 + }, + { + "epoch": 0.5432119696520155, + "grad_norm": 0.02494717203080654, + "learning_rate": 9.443449968474911e-05, + "loss": 0.0, + "step": 1602 + }, + { + "epoch": 0.5438901369050142, + "grad_norm": 0.22823643684387207, + "learning_rate": 9.441994076582907e-05, + "loss": 0.0002, + "step": 1604 + }, + { + "epoch": 0.5445683041580129, + "grad_norm": 0.03987814113497734, + "learning_rate": 9.440536395456222e-05, + "loss": 0.0001, + "step": 1606 + }, + { + "epoch": 0.5452464714110118, + "grad_norm": 0.2199239283800125, + "learning_rate": 9.439076925682006e-05, + "loss": 0.001, + "step": 1608 + }, + { + "epoch": 0.5459246386640105, + "grad_norm": 0.06272803992033005, + "learning_rate": 9.437615667848138e-05, + "loss": 0.0004, + "step": 1610 + }, + { + "epoch": 0.5466028059170093, + "grad_norm": 0.1594870239496231, + "learning_rate": 9.436152622543207e-05, + "loss": 0.0012, + "step": 1612 + }, + { + "epoch": 0.547280973170008, + "grad_norm": 0.007507851347327232, + "learning_rate": 9.434687790356531e-05, + "loss": 0.0005, + "step": 1614 + }, + { + "epoch": 0.5479591404230069, + "grad_norm": 0.03764336183667183, + "learning_rate": 9.433221171878144e-05, + "loss": 0.0011, + "step": 1616 + }, + { + "epoch": 0.5486373076760056, + "grad_norm": 0.1107349619269371, + "learning_rate": 9.431752767698799e-05, + "loss": 0.001, + "step": 1618 + }, + { + "epoch": 0.5493154749290043, + "grad_norm": 0.027478542178869247, + "learning_rate": 9.430282578409968e-05, + "loss": 0.0002, + "step": 1620 + }, + { + "epoch": 0.5499936421820031, + "grad_norm": 0.05804652348160744, + "learning_rate": 9.428810604603846e-05, + "loss": 0.0006, + "step": 1622 + }, + { + "epoch": 0.5506718094350019, + "grad_norm": 0.024435337632894516, + "learning_rate": 9.42733684687334e-05, + "loss": 0.0002, + "step": 1624 + }, + { + "epoch": 0.5513499766880007, + "grad_norm": 0.0044030421413481236, + "learning_rate": 9.425861305812082e-05, + "loss": 0.0002, + "step": 1626 + }, + { + "epoch": 0.5520281439409994, + "grad_norm": 0.06567414849996567, + "learning_rate": 9.424383982014423e-05, + "loss": 0.0002, + "step": 1628 + }, + { + "epoch": 0.5527063111939983, + "grad_norm": 0.10958150774240494, + "learning_rate": 9.42290487607542e-05, + "loss": 0.0006, + "step": 1630 + }, + { + "epoch": 0.553384478446997, + "grad_norm": 0.036298271268606186, + "learning_rate": 9.421423988590865e-05, + "loss": 0.0003, + "step": 1632 + }, + { + "epoch": 0.5540626456999957, + "grad_norm": 0.09867826849222183, + "learning_rate": 9.419941320157255e-05, + "loss": 0.0009, + "step": 1634 + }, + { + "epoch": 0.5547408129529945, + "grad_norm": 0.04817799851298332, + "learning_rate": 9.418456871371809e-05, + "loss": 0.0006, + "step": 1636 + }, + { + "epoch": 0.5554189802059933, + "grad_norm": 0.05213902145624161, + "learning_rate": 9.416970642832465e-05, + "loss": 0.0002, + "step": 1638 + }, + { + "epoch": 0.5560971474589921, + "grad_norm": 0.035610131919384, + "learning_rate": 9.415482635137873e-05, + "loss": 0.0004, + "step": 1640 + }, + { + "epoch": 0.5567753147119908, + "grad_norm": 0.05330129340291023, + "learning_rate": 9.413992848887406e-05, + "loss": 0.0006, + "step": 1642 + }, + { + "epoch": 0.5574534819649896, + "grad_norm": 0.0890207514166832, + "learning_rate": 9.412501284681145e-05, + "loss": 0.001, + "step": 1644 + }, + { + "epoch": 0.5581316492179884, + "grad_norm": 0.03352257236838341, + "learning_rate": 9.411007943119894e-05, + "loss": 0.0003, + "step": 1646 + }, + { + "epoch": 0.5588098164709872, + "grad_norm": 0.023783110082149506, + "learning_rate": 9.409512824805172e-05, + "loss": 0.0006, + "step": 1648 + }, + { + "epoch": 0.5594879837239859, + "grad_norm": 0.050553686916828156, + "learning_rate": 9.408015930339211e-05, + "loss": 0.0006, + "step": 1650 + }, + { + "epoch": 0.5601661509769847, + "grad_norm": 0.03647039085626602, + "learning_rate": 9.40651726032496e-05, + "loss": 0.0004, + "step": 1652 + }, + { + "epoch": 0.5608443182299835, + "grad_norm": 0.06705080717802048, + "learning_rate": 9.405016815366086e-05, + "loss": 0.0004, + "step": 1654 + }, + { + "epoch": 0.5615224854829822, + "grad_norm": 0.033958956599235535, + "learning_rate": 9.403514596066963e-05, + "loss": 0.0004, + "step": 1656 + }, + { + "epoch": 0.562200652735981, + "grad_norm": 0.04107276350259781, + "learning_rate": 9.402010603032689e-05, + "loss": 0.0004, + "step": 1658 + }, + { + "epoch": 0.5628788199889798, + "grad_norm": 0.03651130571961403, + "learning_rate": 9.400504836869069e-05, + "loss": 0.0002, + "step": 1660 + }, + { + "epoch": 0.5635569872419786, + "grad_norm": 0.08204538375139236, + "learning_rate": 9.398997298182628e-05, + "loss": 0.0007, + "step": 1662 + }, + { + "epoch": 0.5642351544949773, + "grad_norm": 0.052677396684885025, + "learning_rate": 9.397487987580602e-05, + "loss": 0.0005, + "step": 1664 + }, + { + "epoch": 0.5649133217479761, + "grad_norm": 0.05886976420879364, + "learning_rate": 9.395976905670939e-05, + "loss": 0.0003, + "step": 1666 + }, + { + "epoch": 0.5655914890009749, + "grad_norm": 0.01760573871433735, + "learning_rate": 9.394464053062304e-05, + "loss": 0.0001, + "step": 1668 + }, + { + "epoch": 0.5662696562539736, + "grad_norm": 0.038459379225969315, + "learning_rate": 9.392949430364076e-05, + "loss": 0.0003, + "step": 1670 + }, + { + "epoch": 0.5669478235069724, + "grad_norm": 0.020486151799559593, + "learning_rate": 9.391433038186341e-05, + "loss": 0.0002, + "step": 1672 + }, + { + "epoch": 0.5676259907599712, + "grad_norm": 0.037114061415195465, + "learning_rate": 9.389914877139903e-05, + "loss": 0.0002, + "step": 1674 + }, + { + "epoch": 0.56830415801297, + "grad_norm": 0.022536030039191246, + "learning_rate": 9.388394947836279e-05, + "loss": 0.0002, + "step": 1676 + }, + { + "epoch": 0.5689823252659687, + "grad_norm": 0.020146558061242104, + "learning_rate": 9.386873250887694e-05, + "loss": 0.0002, + "step": 1678 + }, + { + "epoch": 0.5696604925189674, + "grad_norm": 0.011131091974675655, + "learning_rate": 9.385349786907088e-05, + "loss": 0.0001, + "step": 1680 + }, + { + "epoch": 0.5703386597719663, + "grad_norm": 0.020283561199903488, + "learning_rate": 9.383824556508112e-05, + "loss": 0.0002, + "step": 1682 + }, + { + "epoch": 0.571016827024965, + "grad_norm": 0.026053188368678093, + "learning_rate": 9.382297560305129e-05, + "loss": 0.0002, + "step": 1684 + }, + { + "epoch": 0.5716949942779638, + "grad_norm": 0.0034120355267077684, + "learning_rate": 9.380768798913213e-05, + "loss": 0.0003, + "step": 1686 + }, + { + "epoch": 0.5723731615309626, + "grad_norm": 0.0031566445250064135, + "learning_rate": 9.37923827294815e-05, + "loss": 0.0, + "step": 1688 + }, + { + "epoch": 0.5730513287839614, + "grad_norm": 0.04603234678506851, + "learning_rate": 9.377705983026433e-05, + "loss": 0.0008, + "step": 1690 + }, + { + "epoch": 0.5737294960369601, + "grad_norm": 0.024317435920238495, + "learning_rate": 9.376171929765269e-05, + "loss": 0.0005, + "step": 1692 + }, + { + "epoch": 0.5744076632899588, + "grad_norm": 0.024809902533888817, + "learning_rate": 9.374636113782576e-05, + "loss": 0.0001, + "step": 1694 + }, + { + "epoch": 0.5750858305429577, + "grad_norm": 0.06629851460456848, + "learning_rate": 9.373098535696979e-05, + "loss": 0.0003, + "step": 1696 + }, + { + "epoch": 0.5757639977959564, + "grad_norm": 0.07917294651269913, + "learning_rate": 9.371559196127815e-05, + "loss": 0.0006, + "step": 1698 + }, + { + "epoch": 0.5764421650489552, + "grad_norm": 0.029546458274126053, + "learning_rate": 9.37001809569513e-05, + "loss": 0.0005, + "step": 1700 + }, + { + "epoch": 0.5771203323019539, + "grad_norm": 0.02987642213702202, + "learning_rate": 9.368475235019678e-05, + "loss": 0.0004, + "step": 1702 + }, + { + "epoch": 0.5777984995549528, + "grad_norm": 0.05608579143881798, + "learning_rate": 9.366930614722925e-05, + "loss": 0.0009, + "step": 1704 + }, + { + "epoch": 0.5784766668079515, + "grad_norm": 0.053664229810237885, + "learning_rate": 9.365384235427042e-05, + "loss": 0.0002, + "step": 1706 + }, + { + "epoch": 0.5791548340609503, + "grad_norm": 0.04168921709060669, + "learning_rate": 9.36383609775491e-05, + "loss": 0.0003, + "step": 1708 + }, + { + "epoch": 0.5798330013139491, + "grad_norm": 0.04824904724955559, + "learning_rate": 9.36228620233012e-05, + "loss": 0.0002, + "step": 1710 + }, + { + "epoch": 0.5805111685669478, + "grad_norm": 0.0651102215051651, + "learning_rate": 9.36073454977697e-05, + "loss": 0.0004, + "step": 1712 + }, + { + "epoch": 0.5811893358199466, + "grad_norm": 0.03015986643731594, + "learning_rate": 9.359181140720464e-05, + "loss": 0.0003, + "step": 1714 + }, + { + "epoch": 0.5818675030729453, + "grad_norm": 0.031174469739198685, + "learning_rate": 9.357625975786317e-05, + "loss": 0.0002, + "step": 1716 + }, + { + "epoch": 0.5825456703259442, + "grad_norm": 0.09150509536266327, + "learning_rate": 9.356069055600948e-05, + "loss": 0.0005, + "step": 1718 + }, + { + "epoch": 0.5832238375789429, + "grad_norm": 0.048985328525304794, + "learning_rate": 9.354510380791483e-05, + "loss": 0.0002, + "step": 1720 + }, + { + "epoch": 0.5839020048319417, + "grad_norm": 0.010641923174262047, + "learning_rate": 9.352949951985758e-05, + "loss": 0.0001, + "step": 1722 + }, + { + "epoch": 0.5845801720849404, + "grad_norm": 0.011535330675542355, + "learning_rate": 9.351387769812315e-05, + "loss": 0.0003, + "step": 1724 + }, + { + "epoch": 0.5852583393379392, + "grad_norm": 0.01625724509358406, + "learning_rate": 9.349823834900395e-05, + "loss": 0.0001, + "step": 1726 + }, + { + "epoch": 0.585936506590938, + "grad_norm": 0.03464433178305626, + "learning_rate": 9.348258147879957e-05, + "loss": 0.0003, + "step": 1728 + }, + { + "epoch": 0.5866146738439367, + "grad_norm": 0.056171514093875885, + "learning_rate": 9.346690709381655e-05, + "loss": 0.0005, + "step": 1730 + }, + { + "epoch": 0.5872928410969356, + "grad_norm": 0.04725542664527893, + "learning_rate": 9.345121520036856e-05, + "loss": 0.0003, + "step": 1732 + }, + { + "epoch": 0.5879710083499343, + "grad_norm": 0.04443192109465599, + "learning_rate": 9.343550580477631e-05, + "loss": 0.0003, + "step": 1734 + }, + { + "epoch": 0.5886491756029331, + "grad_norm": 0.004577595740556717, + "learning_rate": 9.341977891336749e-05, + "loss": 0.0002, + "step": 1736 + }, + { + "epoch": 0.5893273428559318, + "grad_norm": 0.005344774108380079, + "learning_rate": 9.340403453247691e-05, + "loss": 0.0, + "step": 1738 + }, + { + "epoch": 0.5900055101089307, + "grad_norm": 0.03278961032629013, + "learning_rate": 9.338827266844644e-05, + "loss": 0.0004, + "step": 1740 + }, + { + "epoch": 0.5906836773619294, + "grad_norm": 0.07316702604293823, + "learning_rate": 9.33724933276249e-05, + "loss": 0.0003, + "step": 1742 + }, + { + "epoch": 0.5913618446149281, + "grad_norm": 0.027365239337086678, + "learning_rate": 9.335669651636824e-05, + "loss": 0.0001, + "step": 1744 + }, + { + "epoch": 0.5920400118679269, + "grad_norm": 0.021680912002921104, + "learning_rate": 9.33408822410394e-05, + "loss": 0.0001, + "step": 1746 + }, + { + "epoch": 0.5927181791209257, + "grad_norm": 0.07199620455503464, + "learning_rate": 9.33250505080084e-05, + "loss": 0.0004, + "step": 1748 + }, + { + "epoch": 0.5933963463739245, + "grad_norm": 0.0033168045338243246, + "learning_rate": 9.330920132365222e-05, + "loss": 0.0, + "step": 1750 + }, + { + "epoch": 0.5940745136269232, + "grad_norm": 0.02242601104080677, + "learning_rate": 9.329333469435493e-05, + "loss": 0.0001, + "step": 1752 + }, + { + "epoch": 0.5947526808799221, + "grad_norm": 0.01347749400883913, + "learning_rate": 9.327745062650761e-05, + "loss": 0.0001, + "step": 1754 + }, + { + "epoch": 0.5954308481329208, + "grad_norm": 0.004134488757699728, + "learning_rate": 9.326154912650834e-05, + "loss": 0.0001, + "step": 1756 + }, + { + "epoch": 0.5961090153859195, + "grad_norm": 0.04600505530834198, + "learning_rate": 9.324563020076227e-05, + "loss": 0.0003, + "step": 1758 + }, + { + "epoch": 0.5967871826389183, + "grad_norm": 0.005666503217071295, + "learning_rate": 9.322969385568151e-05, + "loss": 0.0001, + "step": 1760 + }, + { + "epoch": 0.5974653498919171, + "grad_norm": 0.02874704822897911, + "learning_rate": 9.321374009768525e-05, + "loss": 0.0001, + "step": 1762 + }, + { + "epoch": 0.5981435171449159, + "grad_norm": 0.0032893160823732615, + "learning_rate": 9.319776893319962e-05, + "loss": 0.0, + "step": 1764 + }, + { + "epoch": 0.5988216843979146, + "grad_norm": 0.004591864533722401, + "learning_rate": 9.318178036865785e-05, + "loss": 0.0001, + "step": 1766 + }, + { + "epoch": 0.5994998516509134, + "grad_norm": 0.06738582253456116, + "learning_rate": 9.316577441050012e-05, + "loss": 0.0004, + "step": 1768 + }, + { + "epoch": 0.6001780189039122, + "grad_norm": 0.0225976575165987, + "learning_rate": 9.314975106517361e-05, + "loss": 0.0002, + "step": 1770 + }, + { + "epoch": 0.6008561861569109, + "grad_norm": 0.09164045006036758, + "learning_rate": 9.313371033913253e-05, + "loss": 0.0006, + "step": 1772 + }, + { + "epoch": 0.6015343534099097, + "grad_norm": 0.018893009051680565, + "learning_rate": 9.311765223883808e-05, + "loss": 0.0002, + "step": 1774 + }, + { + "epoch": 0.6022125206629085, + "grad_norm": 0.05388549342751503, + "learning_rate": 9.310157677075847e-05, + "loss": 0.0005, + "step": 1776 + }, + { + "epoch": 0.6028906879159073, + "grad_norm": 0.08900446444749832, + "learning_rate": 9.308548394136889e-05, + "loss": 0.0006, + "step": 1778 + }, + { + "epoch": 0.603568855168906, + "grad_norm": 0.14662937819957733, + "learning_rate": 9.306937375715153e-05, + "loss": 0.0008, + "step": 1780 + }, + { + "epoch": 0.6042470224219048, + "grad_norm": 0.0050133345648646355, + "learning_rate": 9.305324622459557e-05, + "loss": 0.0, + "step": 1782 + }, + { + "epoch": 0.6049251896749036, + "grad_norm": 0.05219025909900665, + "learning_rate": 9.30371013501972e-05, + "loss": 0.0004, + "step": 1784 + }, + { + "epoch": 0.6056033569279023, + "grad_norm": 0.04286068305373192, + "learning_rate": 9.302093914045953e-05, + "loss": 0.0006, + "step": 1786 + }, + { + "epoch": 0.6062815241809011, + "grad_norm": 0.053802043199539185, + "learning_rate": 9.300475960189272e-05, + "loss": 0.0008, + "step": 1788 + }, + { + "epoch": 0.6069596914338999, + "grad_norm": 0.004111258313059807, + "learning_rate": 9.298856274101389e-05, + "loss": 0.0001, + "step": 1790 + }, + { + "epoch": 0.6076378586868987, + "grad_norm": 0.020110275596380234, + "learning_rate": 9.297234856434712e-05, + "loss": 0.0005, + "step": 1792 + }, + { + "epoch": 0.6083160259398974, + "grad_norm": 0.043072108179330826, + "learning_rate": 9.29561170784235e-05, + "loss": 0.0004, + "step": 1794 + }, + { + "epoch": 0.6089941931928962, + "grad_norm": 0.010845893993973732, + "learning_rate": 9.293986828978106e-05, + "loss": 0.0004, + "step": 1796 + }, + { + "epoch": 0.609672360445895, + "grad_norm": 0.017575867474079132, + "learning_rate": 9.292360220496479e-05, + "loss": 0.0001, + "step": 1798 + }, + { + "epoch": 0.6103505276988938, + "grad_norm": 0.05960315838456154, + "learning_rate": 9.29073188305267e-05, + "loss": 0.0002, + "step": 1800 + }, + { + "epoch": 0.6110286949518925, + "grad_norm": 0.01850164122879505, + "learning_rate": 9.28910181730257e-05, + "loss": 0.0003, + "step": 1802 + }, + { + "epoch": 0.6117068622048912, + "grad_norm": 0.059937555342912674, + "learning_rate": 9.287470023902774e-05, + "loss": 0.0004, + "step": 1804 + }, + { + "epoch": 0.6123850294578901, + "grad_norm": 0.0020369836129248142, + "learning_rate": 9.285836503510562e-05, + "loss": 0.0001, + "step": 1806 + }, + { + "epoch": 0.6130631967108888, + "grad_norm": 0.00964849442243576, + "learning_rate": 9.28420125678392e-05, + "loss": 0.0, + "step": 1808 + }, + { + "epoch": 0.6137413639638876, + "grad_norm": 0.04994570463895798, + "learning_rate": 9.282564284381524e-05, + "loss": 0.0002, + "step": 1810 + }, + { + "epoch": 0.6144195312168864, + "grad_norm": 0.02114264853298664, + "learning_rate": 9.280925586962747e-05, + "loss": 0.0, + "step": 1812 + }, + { + "epoch": 0.6150976984698852, + "grad_norm": 0.0017973040230572224, + "learning_rate": 9.279285165187655e-05, + "loss": 0.0002, + "step": 1814 + }, + { + "epoch": 0.6157758657228839, + "grad_norm": 0.015073217451572418, + "learning_rate": 9.27764301971701e-05, + "loss": 0.0, + "step": 1816 + }, + { + "epoch": 0.6164540329758826, + "grad_norm": 0.010442632250487804, + "learning_rate": 9.275999151212269e-05, + "loss": 0.0, + "step": 1818 + }, + { + "epoch": 0.6171322002288815, + "grad_norm": 0.00925542414188385, + "learning_rate": 9.27435356033558e-05, + "loss": 0.0001, + "step": 1820 + }, + { + "epoch": 0.6178103674818802, + "grad_norm": 0.0010579138761386275, + "learning_rate": 9.272706247749789e-05, + "loss": 0.0, + "step": 1822 + }, + { + "epoch": 0.618488534734879, + "grad_norm": 0.0006910067750141025, + "learning_rate": 9.271057214118432e-05, + "loss": 0.0004, + "step": 1824 + }, + { + "epoch": 0.6191667019878777, + "grad_norm": 0.0045022424310445786, + "learning_rate": 9.26940646010574e-05, + "loss": 0.0, + "step": 1826 + }, + { + "epoch": 0.6198448692408766, + "grad_norm": 0.0032699264120310545, + "learning_rate": 9.267753986376637e-05, + "loss": 0.0, + "step": 1828 + }, + { + "epoch": 0.6205230364938753, + "grad_norm": 0.0015567560913041234, + "learning_rate": 9.266099793596739e-05, + "loss": 0.0003, + "step": 1830 + }, + { + "epoch": 0.621201203746874, + "grad_norm": 0.019803684204816818, + "learning_rate": 9.264443882432355e-05, + "loss": 0.0001, + "step": 1832 + }, + { + "epoch": 0.6218793709998729, + "grad_norm": 0.03902900218963623, + "learning_rate": 9.262786253550482e-05, + "loss": 0.0001, + "step": 1834 + }, + { + "epoch": 0.6225575382528716, + "grad_norm": 0.0033882891293615103, + "learning_rate": 9.261126907618818e-05, + "loss": 0.0, + "step": 1836 + }, + { + "epoch": 0.6232357055058704, + "grad_norm": 0.004297104198485613, + "learning_rate": 9.259465845305745e-05, + "loss": 0.0, + "step": 1838 + }, + { + "epoch": 0.6239138727588691, + "grad_norm": 0.04403658211231232, + "learning_rate": 9.257803067280337e-05, + "loss": 0.0002, + "step": 1840 + }, + { + "epoch": 0.624592040011868, + "grad_norm": 0.0007246221066452563, + "learning_rate": 9.256138574212365e-05, + "loss": 0.0, + "step": 1842 + }, + { + "epoch": 0.6252702072648667, + "grad_norm": 0.003953785169869661, + "learning_rate": 9.254472366772285e-05, + "loss": 0.0, + "step": 1844 + }, + { + "epoch": 0.6259483745178654, + "grad_norm": 0.003318328410387039, + "learning_rate": 9.252804445631243e-05, + "loss": 0.0, + "step": 1846 + }, + { + "epoch": 0.6266265417708642, + "grad_norm": 0.021481771022081375, + "learning_rate": 9.251134811461077e-05, + "loss": 0.0001, + "step": 1848 + }, + { + "epoch": 0.627304709023863, + "grad_norm": 0.0016347782220691442, + "learning_rate": 9.249463464934321e-05, + "loss": 0.0, + "step": 1850 + }, + { + "epoch": 0.6279828762768618, + "grad_norm": 0.002228502184152603, + "learning_rate": 9.247790406724186e-05, + "loss": 0.0, + "step": 1852 + }, + { + "epoch": 0.6286610435298605, + "grad_norm": 0.015562807209789753, + "learning_rate": 9.246115637504587e-05, + "loss": 0.0001, + "step": 1854 + }, + { + "epoch": 0.6293392107828594, + "grad_norm": 0.016600174829363823, + "learning_rate": 9.244439157950114e-05, + "loss": 0.0001, + "step": 1856 + }, + { + "epoch": 0.6300173780358581, + "grad_norm": 0.014609369449317455, + "learning_rate": 9.242760968736056e-05, + "loss": 0.0, + "step": 1858 + }, + { + "epoch": 0.6306955452888569, + "grad_norm": 0.0012606787495315075, + "learning_rate": 9.241081070538389e-05, + "loss": 0.0, + "step": 1860 + }, + { + "epoch": 0.6313737125418556, + "grad_norm": 0.01701480522751808, + "learning_rate": 9.239399464033771e-05, + "loss": 0.0001, + "step": 1862 + }, + { + "epoch": 0.6320518797948544, + "grad_norm": 0.009432896971702576, + "learning_rate": 9.237716149899559e-05, + "loss": 0.0, + "step": 1864 + }, + { + "epoch": 0.6327300470478532, + "grad_norm": 0.0005134916864335537, + "learning_rate": 9.236031128813788e-05, + "loss": 0.0, + "step": 1866 + }, + { + "epoch": 0.6334082143008519, + "grad_norm": 0.0026470404118299484, + "learning_rate": 9.234344401455184e-05, + "loss": 0.0, + "step": 1868 + }, + { + "epoch": 0.6340863815538507, + "grad_norm": 0.0005988986231386662, + "learning_rate": 9.232655968503161e-05, + "loss": 0.0, + "step": 1870 + }, + { + "epoch": 0.6347645488068495, + "grad_norm": 0.00032376497983932495, + "learning_rate": 9.230965830637821e-05, + "loss": 0.0, + "step": 1872 + }, + { + "epoch": 0.6354427160598483, + "grad_norm": 0.0007756618433631957, + "learning_rate": 9.22927398853995e-05, + "loss": 0.0, + "step": 1874 + }, + { + "epoch": 0.636120883312847, + "grad_norm": 0.0016515598399564624, + "learning_rate": 9.227580442891022e-05, + "loss": 0.0, + "step": 1876 + }, + { + "epoch": 0.6367990505658458, + "grad_norm": 0.03476359322667122, + "learning_rate": 9.225885194373199e-05, + "loss": 0.0001, + "step": 1878 + }, + { + "epoch": 0.6374772178188446, + "grad_norm": 0.001096145948395133, + "learning_rate": 9.224188243669323e-05, + "loss": 0.0, + "step": 1880 + }, + { + "epoch": 0.6381553850718433, + "grad_norm": 0.0007735094404779375, + "learning_rate": 9.222489591462927e-05, + "loss": 0.0, + "step": 1882 + }, + { + "epoch": 0.6388335523248421, + "grad_norm": 0.118960902094841, + "learning_rate": 9.22078923843823e-05, + "loss": 0.0002, + "step": 1884 + }, + { + "epoch": 0.6395117195778409, + "grad_norm": 0.00038147129816934466, + "learning_rate": 9.219087185280132e-05, + "loss": 0.0, + "step": 1886 + }, + { + "epoch": 0.6401898868308397, + "grad_norm": 0.0548308789730072, + "learning_rate": 9.21738343267422e-05, + "loss": 0.0002, + "step": 1888 + }, + { + "epoch": 0.6408680540838384, + "grad_norm": 0.002948600333184004, + "learning_rate": 9.215677981306768e-05, + "loss": 0.0, + "step": 1890 + }, + { + "epoch": 0.6415462213368373, + "grad_norm": 0.00647724187001586, + "learning_rate": 9.213970831864727e-05, + "loss": 0.0, + "step": 1892 + }, + { + "epoch": 0.642224388589836, + "grad_norm": 0.03968558833003044, + "learning_rate": 9.212261985035739e-05, + "loss": 0.0003, + "step": 1894 + }, + { + "epoch": 0.6429025558428347, + "grad_norm": 0.0006300415843725204, + "learning_rate": 9.210551441508126e-05, + "loss": 0.0, + "step": 1896 + }, + { + "epoch": 0.6435807230958335, + "grad_norm": 0.0007496281177736819, + "learning_rate": 9.208839201970897e-05, + "loss": 0.0, + "step": 1898 + }, + { + "epoch": 0.6442588903488323, + "grad_norm": 0.029067767783999443, + "learning_rate": 9.207125267113742e-05, + "loss": 0.0002, + "step": 1900 + }, + { + "epoch": 0.6449370576018311, + "grad_norm": 0.0023949281312525272, + "learning_rate": 9.205409637627031e-05, + "loss": 0.0001, + "step": 1902 + }, + { + "epoch": 0.6456152248548298, + "grad_norm": 0.048111461102962494, + "learning_rate": 9.203692314201822e-05, + "loss": 0.0002, + "step": 1904 + }, + { + "epoch": 0.6462933921078285, + "grad_norm": 0.002165262820199132, + "learning_rate": 9.201973297529851e-05, + "loss": 0.0, + "step": 1906 + }, + { + "epoch": 0.6469715593608274, + "grad_norm": 0.0018261608202010393, + "learning_rate": 9.200252588303539e-05, + "loss": 0.0, + "step": 1908 + }, + { + "epoch": 0.6476497266138261, + "grad_norm": 0.00589523371309042, + "learning_rate": 9.198530187215986e-05, + "loss": 0.0, + "step": 1910 + }, + { + "epoch": 0.6483278938668249, + "grad_norm": 0.00899149477481842, + "learning_rate": 9.196806094960976e-05, + "loss": 0.0, + "step": 1912 + }, + { + "epoch": 0.6490060611198237, + "grad_norm": 0.00829945970326662, + "learning_rate": 9.195080312232972e-05, + "loss": 0.0, + "step": 1914 + }, + { + "epoch": 0.6496842283728225, + "grad_norm": 0.01966802030801773, + "learning_rate": 9.193352839727121e-05, + "loss": 0.0001, + "step": 1916 + }, + { + "epoch": 0.6503623956258212, + "grad_norm": 0.003154600504785776, + "learning_rate": 9.191623678139247e-05, + "loss": 0.0, + "step": 1918 + }, + { + "epoch": 0.65104056287882, + "grad_norm": 0.024086041375994682, + "learning_rate": 9.189892828165857e-05, + "loss": 0.0007, + "step": 1920 + }, + { + "epoch": 0.6517187301318188, + "grad_norm": 0.021366657689213753, + "learning_rate": 9.188160290504136e-05, + "loss": 0.0, + "step": 1922 + }, + { + "epoch": 0.6523968973848175, + "grad_norm": 0.003147814190015197, + "learning_rate": 9.186426065851952e-05, + "loss": 0.0, + "step": 1924 + }, + { + "epoch": 0.6530750646378163, + "grad_norm": 0.018301868811249733, + "learning_rate": 9.184690154907849e-05, + "loss": 0.0001, + "step": 1926 + }, + { + "epoch": 0.653753231890815, + "grad_norm": 0.0010956176556646824, + "learning_rate": 9.182952558371052e-05, + "loss": 0.0004, + "step": 1928 + }, + { + "epoch": 0.6544313991438139, + "grad_norm": 0.011941217817366123, + "learning_rate": 9.181213276941465e-05, + "loss": 0.0001, + "step": 1930 + }, + { + "epoch": 0.6551095663968126, + "grad_norm": 0.04679281637072563, + "learning_rate": 9.179472311319669e-05, + "loss": 0.0001, + "step": 1932 + }, + { + "epoch": 0.6557877336498114, + "grad_norm": 0.005270330235362053, + "learning_rate": 9.177729662206926e-05, + "loss": 0.0005, + "step": 1934 + }, + { + "epoch": 0.6564659009028102, + "grad_norm": 0.059503424912691116, + "learning_rate": 9.175985330305175e-05, + "loss": 0.0006, + "step": 1936 + }, + { + "epoch": 0.657144068155809, + "grad_norm": 0.05685205012559891, + "learning_rate": 9.174239316317033e-05, + "loss": 0.0003, + "step": 1938 + }, + { + "epoch": 0.6578222354088077, + "grad_norm": 0.02518923580646515, + "learning_rate": 9.172491620945793e-05, + "loss": 0.0001, + "step": 1940 + }, + { + "epoch": 0.6585004026618064, + "grad_norm": 0.0023309921380132437, + "learning_rate": 9.170742244895427e-05, + "loss": 0.0002, + "step": 1942 + }, + { + "epoch": 0.6591785699148053, + "grad_norm": 0.010663488879799843, + "learning_rate": 9.168991188870583e-05, + "loss": 0.0001, + "step": 1944 + }, + { + "epoch": 0.659856737167804, + "grad_norm": 0.004960994701832533, + "learning_rate": 9.167238453576589e-05, + "loss": 0.0, + "step": 1946 + }, + { + "epoch": 0.6605349044208028, + "grad_norm": 0.010904761031270027, + "learning_rate": 9.165484039719444e-05, + "loss": 0.0002, + "step": 1948 + }, + { + "epoch": 0.6612130716738015, + "grad_norm": 0.02706189453601837, + "learning_rate": 9.163727948005823e-05, + "loss": 0.0002, + "step": 1950 + }, + { + "epoch": 0.6618912389268004, + "grad_norm": 0.030097760260105133, + "learning_rate": 9.161970179143084e-05, + "loss": 0.0003, + "step": 1952 + }, + { + "epoch": 0.6625694061797991, + "grad_norm": 0.04335637018084526, + "learning_rate": 9.160210733839254e-05, + "loss": 0.0003, + "step": 1954 + }, + { + "epoch": 0.6632475734327978, + "grad_norm": 0.07014850527048111, + "learning_rate": 9.158449612803039e-05, + "loss": 0.0005, + "step": 1956 + }, + { + "epoch": 0.6639257406857967, + "grad_norm": 0.004154474474489689, + "learning_rate": 9.156686816743816e-05, + "loss": 0.0002, + "step": 1958 + }, + { + "epoch": 0.6646039079387954, + "grad_norm": 0.07189586758613586, + "learning_rate": 9.154922346371642e-05, + "loss": 0.0003, + "step": 1960 + }, + { + "epoch": 0.6652820751917942, + "grad_norm": 0.022829728201031685, + "learning_rate": 9.153156202397243e-05, + "loss": 0.0001, + "step": 1962 + }, + { + "epoch": 0.6659602424447929, + "grad_norm": 0.015623291954398155, + "learning_rate": 9.151388385532022e-05, + "loss": 0.0001, + "step": 1964 + }, + { + "epoch": 0.6666384096977918, + "grad_norm": 0.04405735060572624, + "learning_rate": 9.149618896488056e-05, + "loss": 0.0001, + "step": 1966 + }, + { + "epoch": 0.6673165769507905, + "grad_norm": 0.007028218358755112, + "learning_rate": 9.147847735978094e-05, + "loss": 0.0002, + "step": 1968 + }, + { + "epoch": 0.6679947442037892, + "grad_norm": 0.06891366094350815, + "learning_rate": 9.146074904715561e-05, + "loss": 0.0002, + "step": 1970 + }, + { + "epoch": 0.668672911456788, + "grad_norm": 0.006903359200805426, + "learning_rate": 9.144300403414552e-05, + "loss": 0.0001, + "step": 1972 + }, + { + "epoch": 0.6693510787097868, + "grad_norm": 0.02753445878624916, + "learning_rate": 9.142524232789836e-05, + "loss": 0.0001, + "step": 1974 + }, + { + "epoch": 0.6700292459627856, + "grad_norm": 0.002083703875541687, + "learning_rate": 9.140746393556854e-05, + "loss": 0.0001, + "step": 1976 + }, + { + "epoch": 0.6707074132157843, + "grad_norm": 0.03971313685178757, + "learning_rate": 9.13896688643172e-05, + "loss": 0.0001, + "step": 1978 + }, + { + "epoch": 0.6713855804687832, + "grad_norm": 0.01733364723622799, + "learning_rate": 9.13718571213122e-05, + "loss": 0.0001, + "step": 1980 + }, + { + "epoch": 0.6720637477217819, + "grad_norm": 0.0006641743239015341, + "learning_rate": 9.135402871372808e-05, + "loss": 0.0001, + "step": 1982 + }, + { + "epoch": 0.6727419149747806, + "grad_norm": 0.009665962308645248, + "learning_rate": 9.133618364874616e-05, + "loss": 0.0001, + "step": 1984 + }, + { + "epoch": 0.6734200822277794, + "grad_norm": 0.025363771244883537, + "learning_rate": 9.131832193355441e-05, + "loss": 0.0001, + "step": 1986 + }, + { + "epoch": 0.6740982494807782, + "grad_norm": 0.020307032391428947, + "learning_rate": 9.130044357534752e-05, + "loss": 0.0001, + "step": 1988 + }, + { + "epoch": 0.674776416733777, + "grad_norm": 0.004317408427596092, + "learning_rate": 9.12825485813269e-05, + "loss": 0.0001, + "step": 1990 + }, + { + "epoch": 0.6754545839867757, + "grad_norm": 0.007612558081746101, + "learning_rate": 9.126463695870064e-05, + "loss": 0.0, + "step": 1992 + }, + { + "epoch": 0.6761327512397745, + "grad_norm": 0.00039802887476980686, + "learning_rate": 9.124670871468355e-05, + "loss": 0.0, + "step": 1994 + }, + { + "epoch": 0.6768109184927733, + "grad_norm": 0.06519202142953873, + "learning_rate": 9.122876385649713e-05, + "loss": 0.0002, + "step": 1996 + }, + { + "epoch": 0.677489085745772, + "grad_norm": 0.0010920091299340129, + "learning_rate": 9.121080239136954e-05, + "loss": 0.0002, + "step": 1998 + }, + { + "epoch": 0.6781672529987708, + "grad_norm": 0.0016080245841294527, + "learning_rate": 9.119282432653569e-05, + "loss": 0.0, + "step": 2000 + }, + { + "epoch": 0.6788454202517696, + "grad_norm": 0.007306835614144802, + "learning_rate": 9.117482966923714e-05, + "loss": 0.0, + "step": 2002 + }, + { + "epoch": 0.6795235875047684, + "grad_norm": 0.011635251343250275, + "learning_rate": 9.11568184267221e-05, + "loss": 0.0, + "step": 2004 + }, + { + "epoch": 0.6802017547577671, + "grad_norm": 0.001634229440242052, + "learning_rate": 9.113879060624553e-05, + "loss": 0.0, + "step": 2006 + }, + { + "epoch": 0.6808799220107659, + "grad_norm": 0.0009552877745591104, + "learning_rate": 9.112074621506902e-05, + "loss": 0.0, + "step": 2008 + }, + { + "epoch": 0.6815580892637647, + "grad_norm": 0.004326884634792805, + "learning_rate": 9.110268526046085e-05, + "loss": 0.0, + "step": 2010 + }, + { + "epoch": 0.6822362565167635, + "grad_norm": 0.0170154869556427, + "learning_rate": 9.108460774969598e-05, + "loss": 0.0001, + "step": 2012 + }, + { + "epoch": 0.6829144237697622, + "grad_norm": 0.023070022463798523, + "learning_rate": 9.106651369005601e-05, + "loss": 0.0001, + "step": 2014 + }, + { + "epoch": 0.683592591022761, + "grad_norm": 0.0006951871910132468, + "learning_rate": 9.104840308882924e-05, + "loss": 0.0, + "step": 2016 + }, + { + "epoch": 0.6842707582757598, + "grad_norm": 0.030992897227406502, + "learning_rate": 9.103027595331063e-05, + "loss": 0.0002, + "step": 2018 + }, + { + "epoch": 0.6849489255287585, + "grad_norm": 0.0006043879548087716, + "learning_rate": 9.101213229080177e-05, + "loss": 0.0002, + "step": 2020 + }, + { + "epoch": 0.6856270927817573, + "grad_norm": 0.0035766216460615396, + "learning_rate": 9.099397210861091e-05, + "loss": 0.0, + "step": 2022 + }, + { + "epoch": 0.6863052600347561, + "grad_norm": 0.020493974909186363, + "learning_rate": 9.0975795414053e-05, + "loss": 0.0002, + "step": 2024 + }, + { + "epoch": 0.6869834272877549, + "grad_norm": 0.0067712995223701, + "learning_rate": 9.09576022144496e-05, + "loss": 0.0, + "step": 2026 + }, + { + "epoch": 0.6876615945407536, + "grad_norm": 0.007755874190479517, + "learning_rate": 9.093939251712891e-05, + "loss": 0.0, + "step": 2028 + }, + { + "epoch": 0.6883397617937523, + "grad_norm": 0.01566697284579277, + "learning_rate": 9.092116632942582e-05, + "loss": 0.0001, + "step": 2030 + }, + { + "epoch": 0.6890179290467512, + "grad_norm": 0.0011003854451701045, + "learning_rate": 9.090292365868183e-05, + "loss": 0.0, + "step": 2032 + }, + { + "epoch": 0.6896960962997499, + "grad_norm": 0.0002819379442371428, + "learning_rate": 9.088466451224508e-05, + "loss": 0.0001, + "step": 2034 + }, + { + "epoch": 0.6903742635527487, + "grad_norm": 0.0012433732626959682, + "learning_rate": 9.086638889747035e-05, + "loss": 0.0, + "step": 2036 + }, + { + "epoch": 0.6910524308057475, + "grad_norm": 0.00959568377584219, + "learning_rate": 9.084809682171907e-05, + "loss": 0.0, + "step": 2038 + }, + { + "epoch": 0.6917305980587463, + "grad_norm": 0.029413780197501183, + "learning_rate": 9.082978829235927e-05, + "loss": 0.0002, + "step": 2040 + }, + { + "epoch": 0.692408765311745, + "grad_norm": 0.001475983764976263, + "learning_rate": 9.081146331676562e-05, + "loss": 0.0, + "step": 2042 + }, + { + "epoch": 0.6930869325647437, + "grad_norm": 0.00169148959685117, + "learning_rate": 9.079312190231944e-05, + "loss": 0.0, + "step": 2044 + }, + { + "epoch": 0.6937650998177426, + "grad_norm": 0.0006321507389657199, + "learning_rate": 9.077476405640862e-05, + "loss": 0.0, + "step": 2046 + }, + { + "epoch": 0.6944432670707413, + "grad_norm": 0.0008575081010349095, + "learning_rate": 9.075638978642771e-05, + "loss": 0.0, + "step": 2048 + }, + { + "epoch": 0.6951214343237401, + "grad_norm": 0.0017931024776771665, + "learning_rate": 9.07379990997779e-05, + "loss": 0.0, + "step": 2050 + }, + { + "epoch": 0.6957996015767388, + "grad_norm": 0.013845414854586124, + "learning_rate": 9.071959200386688e-05, + "loss": 0.0001, + "step": 2052 + }, + { + "epoch": 0.6964777688297377, + "grad_norm": 0.0004693720256909728, + "learning_rate": 9.07011685061091e-05, + "loss": 0.0, + "step": 2054 + }, + { + "epoch": 0.6971559360827364, + "grad_norm": 0.002094951691105962, + "learning_rate": 9.068272861392551e-05, + "loss": 0.0, + "step": 2056 + }, + { + "epoch": 0.6978341033357351, + "grad_norm": 0.08430146425962448, + "learning_rate": 9.06642723347437e-05, + "loss": 0.0002, + "step": 2058 + }, + { + "epoch": 0.698512270588734, + "grad_norm": 0.0010323359165340662, + "learning_rate": 9.064579967599786e-05, + "loss": 0.0001, + "step": 2060 + }, + { + "epoch": 0.6991904378417327, + "grad_norm": 0.009902028366923332, + "learning_rate": 9.062731064512876e-05, + "loss": 0.0, + "step": 2062 + }, + { + "epoch": 0.6998686050947315, + "grad_norm": 0.0009311030153185129, + "learning_rate": 9.06088052495838e-05, + "loss": 0.0002, + "step": 2064 + }, + { + "epoch": 0.7005467723477302, + "grad_norm": 0.00923315342515707, + "learning_rate": 9.059028349681694e-05, + "loss": 0.0, + "step": 2066 + }, + { + "epoch": 0.7012249396007291, + "grad_norm": 0.06685227155685425, + "learning_rate": 9.057174539428873e-05, + "loss": 0.0002, + "step": 2068 + }, + { + "epoch": 0.7019031068537278, + "grad_norm": 0.01807691715657711, + "learning_rate": 9.055319094946633e-05, + "loss": 0.0002, + "step": 2070 + }, + { + "epoch": 0.7025812741067265, + "grad_norm": 0.00733609776943922, + "learning_rate": 9.053462016982348e-05, + "loss": 0.0, + "step": 2072 + }, + { + "epoch": 0.7032594413597253, + "grad_norm": 0.02869594469666481, + "learning_rate": 9.051603306284047e-05, + "loss": 0.0002, + "step": 2074 + }, + { + "epoch": 0.7039376086127241, + "grad_norm": 0.028494779020547867, + "learning_rate": 9.049742963600418e-05, + "loss": 0.0001, + "step": 2076 + }, + { + "epoch": 0.7046157758657229, + "grad_norm": 0.011124657467007637, + "learning_rate": 9.047880989680808e-05, + "loss": 0.0001, + "step": 2078 + }, + { + "epoch": 0.7052939431187216, + "grad_norm": 0.001125624985434115, + "learning_rate": 9.046017385275218e-05, + "loss": 0.0001, + "step": 2080 + }, + { + "epoch": 0.7059721103717205, + "grad_norm": 0.006963169202208519, + "learning_rate": 9.044152151134311e-05, + "loss": 0.0001, + "step": 2082 + }, + { + "epoch": 0.7066502776247192, + "grad_norm": 0.07182872295379639, + "learning_rate": 9.0422852880094e-05, + "loss": 0.0002, + "step": 2084 + }, + { + "epoch": 0.707328444877718, + "grad_norm": 0.006542180199176073, + "learning_rate": 9.040416796652458e-05, + "loss": 0.0, + "step": 2086 + }, + { + "epoch": 0.7080066121307167, + "grad_norm": 0.0014154227683320642, + "learning_rate": 9.038546677816115e-05, + "loss": 0.0001, + "step": 2088 + }, + { + "epoch": 0.7086847793837155, + "grad_norm": 0.005531368311494589, + "learning_rate": 9.036674932253652e-05, + "loss": 0.0002, + "step": 2090 + }, + { + "epoch": 0.7093629466367143, + "grad_norm": 0.035860493779182434, + "learning_rate": 9.034801560719011e-05, + "loss": 0.0001, + "step": 2092 + }, + { + "epoch": 0.710041113889713, + "grad_norm": 0.02187497168779373, + "learning_rate": 9.032926563966781e-05, + "loss": 0.0001, + "step": 2094 + }, + { + "epoch": 0.7107192811427118, + "grad_norm": 0.003647587960585952, + "learning_rate": 9.031049942752215e-05, + "loss": 0.0, + "step": 2096 + }, + { + "epoch": 0.7113974483957106, + "grad_norm": 0.03892146795988083, + "learning_rate": 9.029171697831214e-05, + "loss": 0.0002, + "step": 2098 + }, + { + "epoch": 0.7120756156487094, + "grad_norm": 0.00161711813416332, + "learning_rate": 9.027291829960334e-05, + "loss": 0.0, + "step": 2100 + }, + { + "epoch": 0.7127537829017081, + "grad_norm": 0.06189465895295143, + "learning_rate": 9.025410339896788e-05, + "loss": 0.0008, + "step": 2102 + }, + { + "epoch": 0.713431950154707, + "grad_norm": 0.001967528834939003, + "learning_rate": 9.023527228398439e-05, + "loss": 0.0005, + "step": 2104 + }, + { + "epoch": 0.7141101174077057, + "grad_norm": 0.04755755886435509, + "learning_rate": 9.021642496223801e-05, + "loss": 0.0005, + "step": 2106 + }, + { + "epoch": 0.7147882846607044, + "grad_norm": 0.04014548659324646, + "learning_rate": 9.019756144132048e-05, + "loss": 0.0001, + "step": 2108 + }, + { + "epoch": 0.7154664519137032, + "grad_norm": 0.03437534719705582, + "learning_rate": 9.017868172882999e-05, + "loss": 0.0001, + "step": 2110 + }, + { + "epoch": 0.716144619166702, + "grad_norm": 0.06222149357199669, + "learning_rate": 9.015978583237132e-05, + "loss": 0.0004, + "step": 2112 + }, + { + "epoch": 0.7168227864197008, + "grad_norm": 0.007575817871838808, + "learning_rate": 9.014087375955573e-05, + "loss": 0.0001, + "step": 2114 + }, + { + "epoch": 0.7175009536726995, + "grad_norm": 0.03959363326430321, + "learning_rate": 9.012194551800098e-05, + "loss": 0.0001, + "step": 2116 + }, + { + "epoch": 0.7181791209256984, + "grad_norm": 0.024278750643134117, + "learning_rate": 9.010300111533138e-05, + "loss": 0.0001, + "step": 2118 + }, + { + "epoch": 0.7188572881786971, + "grad_norm": 0.00355353276245296, + "learning_rate": 9.008404055917772e-05, + "loss": 0.0001, + "step": 2120 + }, + { + "epoch": 0.7195354554316958, + "grad_norm": 0.06160525605082512, + "learning_rate": 9.00650638571773e-05, + "loss": 0.0001, + "step": 2122 + }, + { + "epoch": 0.7202136226846946, + "grad_norm": 0.006701144389808178, + "learning_rate": 9.004607101697397e-05, + "loss": 0.0, + "step": 2124 + }, + { + "epoch": 0.7208917899376934, + "grad_norm": 0.004692121408879757, + "learning_rate": 9.002706204621803e-05, + "loss": 0.0001, + "step": 2126 + }, + { + "epoch": 0.7215699571906922, + "grad_norm": 0.02521413005888462, + "learning_rate": 9.000803695256627e-05, + "loss": 0.0003, + "step": 2128 + }, + { + "epoch": 0.7222481244436909, + "grad_norm": 0.021716909483075142, + "learning_rate": 8.998899574368201e-05, + "loss": 0.0001, + "step": 2130 + }, + { + "epoch": 0.7229262916966896, + "grad_norm": 0.03598916158080101, + "learning_rate": 8.996993842723504e-05, + "loss": 0.0001, + "step": 2132 + }, + { + "epoch": 0.7236044589496885, + "grad_norm": 0.003856200259178877, + "learning_rate": 8.995086501090167e-05, + "loss": 0.0, + "step": 2134 + }, + { + "epoch": 0.7242826262026872, + "grad_norm": 0.003834143513813615, + "learning_rate": 8.993177550236464e-05, + "loss": 0.0, + "step": 2136 + }, + { + "epoch": 0.724960793455686, + "grad_norm": 0.008992083370685577, + "learning_rate": 8.991266990931322e-05, + "loss": 0.0, + "step": 2138 + }, + { + "epoch": 0.7256389607086848, + "grad_norm": 0.01066268514841795, + "learning_rate": 8.989354823944314e-05, + "loss": 0.0001, + "step": 2140 + }, + { + "epoch": 0.7263171279616836, + "grad_norm": 0.0006079597515054047, + "learning_rate": 8.987441050045658e-05, + "loss": 0.0, + "step": 2142 + }, + { + "epoch": 0.7269952952146823, + "grad_norm": 0.007435145787894726, + "learning_rate": 8.985525670006225e-05, + "loss": 0.0, + "step": 2144 + }, + { + "epoch": 0.727673462467681, + "grad_norm": 0.002100519370287657, + "learning_rate": 8.983608684597529e-05, + "loss": 0.0, + "step": 2146 + }, + { + "epoch": 0.7283516297206799, + "grad_norm": 0.02002492919564247, + "learning_rate": 8.98169009459173e-05, + "loss": 0.0001, + "step": 2148 + }, + { + "epoch": 0.7290297969736786, + "grad_norm": 0.0023629760835319757, + "learning_rate": 8.979769900761639e-05, + "loss": 0.0, + "step": 2150 + }, + { + "epoch": 0.7297079642266774, + "grad_norm": 0.0003582996141631156, + "learning_rate": 8.977848103880706e-05, + "loss": 0.0, + "step": 2152 + }, + { + "epoch": 0.7303861314796761, + "grad_norm": 0.0012692782329395413, + "learning_rate": 8.975924704723033e-05, + "loss": 0.0, + "step": 2154 + }, + { + "epoch": 0.731064298732675, + "grad_norm": 0.0007864875951781869, + "learning_rate": 8.973999704063365e-05, + "loss": 0.0, + "step": 2156 + }, + { + "epoch": 0.7317424659856737, + "grad_norm": 0.0011207296047359705, + "learning_rate": 8.972073102677091e-05, + "loss": 0.0, + "step": 2158 + }, + { + "epoch": 0.7324206332386725, + "grad_norm": 0.002925334731116891, + "learning_rate": 8.970144901340246e-05, + "loss": 0.0, + "step": 2160 + }, + { + "epoch": 0.7330988004916713, + "grad_norm": 0.0008847307763062418, + "learning_rate": 8.96821510082951e-05, + "loss": 0.0, + "step": 2162 + }, + { + "epoch": 0.73377696774467, + "grad_norm": 0.023506417870521545, + "learning_rate": 8.966283701922205e-05, + "loss": 0.0, + "step": 2164 + }, + { + "epoch": 0.7344551349976688, + "grad_norm": 0.004152826499193907, + "learning_rate": 8.964350705396301e-05, + "loss": 0.0001, + "step": 2166 + }, + { + "epoch": 0.7351333022506675, + "grad_norm": 0.009032066911458969, + "learning_rate": 8.962416112030405e-05, + "loss": 0.0, + "step": 2168 + }, + { + "epoch": 0.7358114695036664, + "grad_norm": 0.024347808212041855, + "learning_rate": 8.960479922603775e-05, + "loss": 0.0001, + "step": 2170 + }, + { + "epoch": 0.7364896367566651, + "grad_norm": 0.012557287700474262, + "learning_rate": 8.958542137896304e-05, + "loss": 0.0001, + "step": 2172 + }, + { + "epoch": 0.7371678040096639, + "grad_norm": 0.02936052344739437, + "learning_rate": 8.956602758688535e-05, + "loss": 0.0002, + "step": 2174 + }, + { + "epoch": 0.7378459712626626, + "grad_norm": 0.0004298368585295975, + "learning_rate": 8.954661785761647e-05, + "loss": 0.0, + "step": 2176 + }, + { + "epoch": 0.7385241385156615, + "grad_norm": 0.001203262945637107, + "learning_rate": 8.952719219897464e-05, + "loss": 0.0, + "step": 2178 + }, + { + "epoch": 0.7392023057686602, + "grad_norm": 0.027022022753953934, + "learning_rate": 8.950775061878453e-05, + "loss": 0.0, + "step": 2180 + }, + { + "epoch": 0.7398804730216589, + "grad_norm": 0.0010193714406341314, + "learning_rate": 8.948829312487719e-05, + "loss": 0.0, + "step": 2182 + }, + { + "epoch": 0.7405586402746578, + "grad_norm": 0.006130346562713385, + "learning_rate": 8.94688197250901e-05, + "loss": 0.0, + "step": 2184 + }, + { + "epoch": 0.7412368075276565, + "grad_norm": 0.0006499038427136838, + "learning_rate": 8.944933042726714e-05, + "loss": 0.0, + "step": 2186 + }, + { + "epoch": 0.7419149747806553, + "grad_norm": 0.0354667603969574, + "learning_rate": 8.94298252392586e-05, + "loss": 0.0001, + "step": 2188 + }, + { + "epoch": 0.742593142033654, + "grad_norm": 0.000631691247690469, + "learning_rate": 8.941030416892118e-05, + "loss": 0.0, + "step": 2190 + }, + { + "epoch": 0.7432713092866529, + "grad_norm": 0.0023066874127835035, + "learning_rate": 8.939076722411796e-05, + "loss": 0.0, + "step": 2192 + }, + { + "epoch": 0.7439494765396516, + "grad_norm": 0.011757615953683853, + "learning_rate": 8.93712144127184e-05, + "loss": 0.0, + "step": 2194 + }, + { + "epoch": 0.7446276437926503, + "grad_norm": 0.03690662980079651, + "learning_rate": 8.935164574259838e-05, + "loss": 0.0002, + "step": 2196 + }, + { + "epoch": 0.7453058110456491, + "grad_norm": 0.002006297232583165, + "learning_rate": 8.933206122164018e-05, + "loss": 0.0, + "step": 2198 + }, + { + "epoch": 0.7459839782986479, + "grad_norm": 0.055943265557289124, + "learning_rate": 8.931246085773239e-05, + "loss": 0.0003, + "step": 2200 + }, + { + "epoch": 0.7466621455516467, + "grad_norm": 0.00032859868952073157, + "learning_rate": 8.92928446587701e-05, + "loss": 0.0, + "step": 2202 + }, + { + "epoch": 0.7473403128046454, + "grad_norm": 0.0006503578624688089, + "learning_rate": 8.927321263265467e-05, + "loss": 0.0, + "step": 2204 + }, + { + "epoch": 0.7480184800576443, + "grad_norm": 0.01603405550122261, + "learning_rate": 8.925356478729387e-05, + "loss": 0.0001, + "step": 2206 + }, + { + "epoch": 0.748696647310643, + "grad_norm": 0.000402370496885851, + "learning_rate": 8.92339011306019e-05, + "loss": 0.0, + "step": 2208 + }, + { + "epoch": 0.7493748145636417, + "grad_norm": 0.0431690588593483, + "learning_rate": 8.921422167049923e-05, + "loss": 0.0003, + "step": 2210 + }, + { + "epoch": 0.7500529818166405, + "grad_norm": 0.04571768641471863, + "learning_rate": 8.919452641491276e-05, + "loss": 0.0001, + "step": 2212 + }, + { + "epoch": 0.7507311490696393, + "grad_norm": 0.0007486413232982159, + "learning_rate": 8.917481537177575e-05, + "loss": 0.0, + "step": 2214 + }, + { + "epoch": 0.7514093163226381, + "grad_norm": 0.0011588893830776215, + "learning_rate": 8.915508854902778e-05, + "loss": 0.0, + "step": 2216 + }, + { + "epoch": 0.7520874835756368, + "grad_norm": 0.04039638116955757, + "learning_rate": 8.913534595461483e-05, + "loss": 0.0002, + "step": 2218 + }, + { + "epoch": 0.7527656508286356, + "grad_norm": 0.0011450482998043299, + "learning_rate": 8.91155875964892e-05, + "loss": 0.0, + "step": 2220 + }, + { + "epoch": 0.7534438180816344, + "grad_norm": 0.02565939724445343, + "learning_rate": 8.909581348260958e-05, + "loss": 0.0001, + "step": 2222 + }, + { + "epoch": 0.7541219853346331, + "grad_norm": 0.05398557707667351, + "learning_rate": 8.907602362094094e-05, + "loss": 0.0002, + "step": 2224 + }, + { + "epoch": 0.7548001525876319, + "grad_norm": 0.01135649997740984, + "learning_rate": 8.905621801945467e-05, + "loss": 0.0001, + "step": 2226 + }, + { + "epoch": 0.7554783198406307, + "grad_norm": 0.007088663522154093, + "learning_rate": 8.903639668612846e-05, + "loss": 0.0, + "step": 2228 + }, + { + "epoch": 0.7561564870936295, + "grad_norm": 0.009923601523041725, + "learning_rate": 8.901655962894632e-05, + "loss": 0.0, + "step": 2230 + }, + { + "epoch": 0.7568346543466282, + "grad_norm": 0.02772349677979946, + "learning_rate": 8.899670685589864e-05, + "loss": 0.0002, + "step": 2232 + }, + { + "epoch": 0.757512821599627, + "grad_norm": 0.04577437788248062, + "learning_rate": 8.89768383749821e-05, + "loss": 0.0001, + "step": 2234 + }, + { + "epoch": 0.7581909888526258, + "grad_norm": 0.0013328574132174253, + "learning_rate": 8.895695419419972e-05, + "loss": 0.0, + "step": 2236 + }, + { + "epoch": 0.7588691561056246, + "grad_norm": 0.017390921711921692, + "learning_rate": 8.893705432156086e-05, + "loss": 0.0, + "step": 2238 + }, + { + "epoch": 0.7595473233586233, + "grad_norm": 0.035108234733343124, + "learning_rate": 8.891713876508116e-05, + "loss": 0.0001, + "step": 2240 + }, + { + "epoch": 0.7602254906116221, + "grad_norm": 0.016167139634490013, + "learning_rate": 8.889720753278264e-05, + "loss": 0.0, + "step": 2242 + }, + { + "epoch": 0.7609036578646209, + "grad_norm": 0.03242606669664383, + "learning_rate": 8.887726063269355e-05, + "loss": 0.0001, + "step": 2244 + }, + { + "epoch": 0.7615818251176196, + "grad_norm": 0.002253993647173047, + "learning_rate": 8.885729807284856e-05, + "loss": 0.0, + "step": 2246 + }, + { + "epoch": 0.7622599923706184, + "grad_norm": 0.05065949261188507, + "learning_rate": 8.883731986128853e-05, + "loss": 0.0003, + "step": 2248 + }, + { + "epoch": 0.7629381596236172, + "grad_norm": 0.0061168549582362175, + "learning_rate": 8.88173260060607e-05, + "loss": 0.0, + "step": 2250 + }, + { + "epoch": 0.763616326876616, + "grad_norm": 0.00484465854242444, + "learning_rate": 8.879731651521861e-05, + "loss": 0.0, + "step": 2252 + }, + { + "epoch": 0.7642944941296147, + "grad_norm": 0.01573886163532734, + "learning_rate": 8.877729139682206e-05, + "loss": 0.0002, + "step": 2254 + }, + { + "epoch": 0.7649726613826134, + "grad_norm": 0.0024972499813884497, + "learning_rate": 8.875725065893717e-05, + "loss": 0.0003, + "step": 2256 + }, + { + "epoch": 0.7656508286356123, + "grad_norm": 0.007368649821728468, + "learning_rate": 8.873719430963636e-05, + "loss": 0.0, + "step": 2258 + }, + { + "epoch": 0.766328995888611, + "grad_norm": 0.05810406804084778, + "learning_rate": 8.871712235699831e-05, + "loss": 0.0002, + "step": 2260 + }, + { + "epoch": 0.7670071631416098, + "grad_norm": 0.001753704040311277, + "learning_rate": 8.869703480910801e-05, + "loss": 0.0, + "step": 2262 + }, + { + "epoch": 0.7676853303946086, + "grad_norm": 0.015127058140933514, + "learning_rate": 8.867693167405671e-05, + "loss": 0.0001, + "step": 2264 + }, + { + "epoch": 0.7683634976476074, + "grad_norm": 0.007170891854912043, + "learning_rate": 8.865681295994195e-05, + "loss": 0.0, + "step": 2266 + }, + { + "epoch": 0.7690416649006061, + "grad_norm": 0.0003231594746466726, + "learning_rate": 8.863667867486756e-05, + "loss": 0.0001, + "step": 2268 + }, + { + "epoch": 0.7697198321536048, + "grad_norm": 0.03522728011012077, + "learning_rate": 8.861652882694363e-05, + "loss": 0.0001, + "step": 2270 + }, + { + "epoch": 0.7703979994066037, + "grad_norm": 0.0019485610537230968, + "learning_rate": 8.859636342428648e-05, + "loss": 0.0, + "step": 2272 + }, + { + "epoch": 0.7710761666596024, + "grad_norm": 0.003564967541024089, + "learning_rate": 8.857618247501878e-05, + "loss": 0.0, + "step": 2274 + }, + { + "epoch": 0.7717543339126012, + "grad_norm": 0.016705702990293503, + "learning_rate": 8.855598598726939e-05, + "loss": 0.0, + "step": 2276 + }, + { + "epoch": 0.7724325011655999, + "grad_norm": 0.00029819709016010165, + "learning_rate": 8.853577396917345e-05, + "loss": 0.0, + "step": 2278 + }, + { + "epoch": 0.7731106684185988, + "grad_norm": 0.01925487443804741, + "learning_rate": 8.851554642887237e-05, + "loss": 0.0, + "step": 2280 + }, + { + "epoch": 0.7737888356715975, + "grad_norm": 0.002167096361517906, + "learning_rate": 8.849530337451378e-05, + "loss": 0.0, + "step": 2282 + }, + { + "epoch": 0.7744670029245962, + "grad_norm": 0.00036691053537651896, + "learning_rate": 8.84750448142516e-05, + "loss": 0.0, + "step": 2284 + }, + { + "epoch": 0.7751451701775951, + "grad_norm": 0.0002451124310027808, + "learning_rate": 8.845477075624598e-05, + "loss": 0.0, + "step": 2286 + }, + { + "epoch": 0.7758233374305938, + "grad_norm": 0.007234338205307722, + "learning_rate": 8.843448120866329e-05, + "loss": 0.0, + "step": 2288 + }, + { + "epoch": 0.7765015046835926, + "grad_norm": 0.004028369206935167, + "learning_rate": 8.841417617967618e-05, + "loss": 0.0, + "step": 2290 + }, + { + "epoch": 0.7771796719365913, + "grad_norm": 0.0018081561429426074, + "learning_rate": 8.839385567746347e-05, + "loss": 0.0, + "step": 2292 + }, + { + "epoch": 0.7778578391895902, + "grad_norm": 0.0005776785546913743, + "learning_rate": 8.837351971021031e-05, + "loss": 0.0, + "step": 2294 + }, + { + "epoch": 0.7785360064425889, + "grad_norm": 0.0026705998461693525, + "learning_rate": 8.8353168286108e-05, + "loss": 0.0, + "step": 2296 + }, + { + "epoch": 0.7792141736955877, + "grad_norm": 0.003311643609777093, + "learning_rate": 8.833280141335408e-05, + "loss": 0.0001, + "step": 2298 + }, + { + "epoch": 0.7798923409485864, + "grad_norm": 0.00011252472177147865, + "learning_rate": 8.831241910015233e-05, + "loss": 0.0, + "step": 2300 + }, + { + "epoch": 0.7805705082015852, + "grad_norm": 0.0006050001829862595, + "learning_rate": 8.829202135471276e-05, + "loss": 0.0, + "step": 2302 + }, + { + "epoch": 0.781248675454584, + "grad_norm": 0.0012319076340645552, + "learning_rate": 8.827160818525156e-05, + "loss": 0.0, + "step": 2304 + }, + { + "epoch": 0.7819268427075827, + "grad_norm": 0.0023186788894236088, + "learning_rate": 8.825117959999116e-05, + "loss": 0.0, + "step": 2306 + }, + { + "epoch": 0.7826050099605816, + "grad_norm": 0.028587566688656807, + "learning_rate": 8.82307356071602e-05, + "loss": 0.0001, + "step": 2308 + }, + { + "epoch": 0.7832831772135803, + "grad_norm": 0.002290781820192933, + "learning_rate": 8.82102762149935e-05, + "loss": 0.0, + "step": 2310 + }, + { + "epoch": 0.7839613444665791, + "grad_norm": 0.00015715346671640873, + "learning_rate": 8.818980143173213e-05, + "loss": 0.0, + "step": 2312 + }, + { + "epoch": 0.7846395117195778, + "grad_norm": 0.001944783260114491, + "learning_rate": 8.816931126562328e-05, + "loss": 0.0, + "step": 2314 + }, + { + "epoch": 0.7853176789725766, + "grad_norm": 0.000705886457581073, + "learning_rate": 8.814880572492044e-05, + "loss": 0.0, + "step": 2316 + }, + { + "epoch": 0.7859958462255754, + "grad_norm": 0.0004583608824759722, + "learning_rate": 8.81282848178832e-05, + "loss": 0.0, + "step": 2318 + }, + { + "epoch": 0.7866740134785741, + "grad_norm": 0.0001509250869276002, + "learning_rate": 8.81077485527774e-05, + "loss": 0.0, + "step": 2320 + }, + { + "epoch": 0.7873521807315729, + "grad_norm": 0.0009164375951513648, + "learning_rate": 8.808719693787504e-05, + "loss": 0.0, + "step": 2322 + }, + { + "epoch": 0.7880303479845717, + "grad_norm": 0.0007774062687531114, + "learning_rate": 8.806662998145431e-05, + "loss": 0.0, + "step": 2324 + }, + { + "epoch": 0.7887085152375705, + "grad_norm": 0.004715454764664173, + "learning_rate": 8.804604769179958e-05, + "loss": 0.0, + "step": 2326 + }, + { + "epoch": 0.7893866824905692, + "grad_norm": 0.00013452318671625108, + "learning_rate": 8.802545007720138e-05, + "loss": 0.0, + "step": 2328 + }, + { + "epoch": 0.790064849743568, + "grad_norm": 0.00014649407239630818, + "learning_rate": 8.800483714595644e-05, + "loss": 0.0, + "step": 2330 + }, + { + "epoch": 0.7907430169965668, + "grad_norm": 0.00020664343901444227, + "learning_rate": 8.798420890636764e-05, + "loss": 0.0, + "step": 2332 + }, + { + "epoch": 0.7914211842495655, + "grad_norm": 0.010160190984606743, + "learning_rate": 8.796356536674403e-05, + "loss": 0.0, + "step": 2334 + }, + { + "epoch": 0.7920993515025643, + "grad_norm": 9.14070478756912e-05, + "learning_rate": 8.794290653540084e-05, + "loss": 0.0, + "step": 2336 + }, + { + "epoch": 0.7927775187555631, + "grad_norm": 9.601074270904064e-05, + "learning_rate": 8.792223242065945e-05, + "loss": 0.0, + "step": 2338 + }, + { + "epoch": 0.7934556860085619, + "grad_norm": 0.0001925271499203518, + "learning_rate": 8.790154303084736e-05, + "loss": 0.0, + "step": 2340 + }, + { + "epoch": 0.7941338532615606, + "grad_norm": 0.00016138785576913506, + "learning_rate": 8.788083837429828e-05, + "loss": 0.0, + "step": 2342 + }, + { + "epoch": 0.7948120205145595, + "grad_norm": 0.0004923270898871124, + "learning_rate": 8.786011845935202e-05, + "loss": 0.0, + "step": 2344 + }, + { + "epoch": 0.7954901877675582, + "grad_norm": 0.00026215388788841665, + "learning_rate": 8.783938329435457e-05, + "loss": 0.0, + "step": 2346 + }, + { + "epoch": 0.7961683550205569, + "grad_norm": 0.0034639956429600716, + "learning_rate": 8.781863288765806e-05, + "loss": 0.0, + "step": 2348 + }, + { + "epoch": 0.7968465222735557, + "grad_norm": 0.0001678073313087225, + "learning_rate": 8.779786724762075e-05, + "loss": 0.0, + "step": 2350 + }, + { + "epoch": 0.7975246895265545, + "grad_norm": 0.00024357331858482212, + "learning_rate": 8.777708638260702e-05, + "loss": 0.0, + "step": 2352 + }, + { + "epoch": 0.7982028567795533, + "grad_norm": 0.00016859491006471217, + "learning_rate": 8.775629030098742e-05, + "loss": 0.0, + "step": 2354 + }, + { + "epoch": 0.798881024032552, + "grad_norm": 0.0007309844950214028, + "learning_rate": 8.773547901113862e-05, + "loss": 0.0, + "step": 2356 + }, + { + "epoch": 0.7995591912855508, + "grad_norm": 0.00010276456305291504, + "learning_rate": 8.771465252144335e-05, + "loss": 0.0, + "step": 2358 + }, + { + "epoch": 0.8002373585385496, + "grad_norm": 5.053769200458191e-05, + "learning_rate": 8.769381084029058e-05, + "loss": 0.0, + "step": 2360 + }, + { + "epoch": 0.8009155257915483, + "grad_norm": 0.0001550625602249056, + "learning_rate": 8.76729539760753e-05, + "loss": 0.0, + "step": 2362 + }, + { + "epoch": 0.8015936930445471, + "grad_norm": 0.00012546095240395516, + "learning_rate": 8.765208193719866e-05, + "loss": 0.0, + "step": 2364 + }, + { + "epoch": 0.8022718602975459, + "grad_norm": 6.550251418957487e-05, + "learning_rate": 8.763119473206794e-05, + "loss": 0.0, + "step": 2366 + }, + { + "epoch": 0.8029500275505447, + "grad_norm": 0.00018826685845851898, + "learning_rate": 8.761029236909647e-05, + "loss": 0.0, + "step": 2368 + }, + { + "epoch": 0.8036281948035434, + "grad_norm": 0.0002414361370028928, + "learning_rate": 8.758937485670373e-05, + "loss": 0.0, + "step": 2370 + }, + { + "epoch": 0.8043063620565422, + "grad_norm": 0.00019206578144803643, + "learning_rate": 8.75684422033153e-05, + "loss": 0.0, + "step": 2372 + }, + { + "epoch": 0.804984529309541, + "grad_norm": 6.82855534250848e-05, + "learning_rate": 8.754749441736283e-05, + "loss": 0.0, + "step": 2374 + }, + { + "epoch": 0.8056626965625397, + "grad_norm": 0.00027947468333877623, + "learning_rate": 8.752653150728411e-05, + "loss": 0.0, + "step": 2376 + }, + { + "epoch": 0.8063408638155385, + "grad_norm": 0.0001643617288209498, + "learning_rate": 8.750555348152298e-05, + "loss": 0.0, + "step": 2378 + }, + { + "epoch": 0.8070190310685372, + "grad_norm": 8.241987961810082e-05, + "learning_rate": 8.74845603485294e-05, + "loss": 0.0, + "step": 2380 + }, + { + "epoch": 0.8076971983215361, + "grad_norm": 0.00038982118712738156, + "learning_rate": 8.746355211675939e-05, + "loss": 0.0, + "step": 2382 + }, + { + "epoch": 0.8083753655745348, + "grad_norm": 0.02610670030117035, + "learning_rate": 8.744252879467507e-05, + "loss": 0.0001, + "step": 2384 + }, + { + "epoch": 0.8090535328275336, + "grad_norm": 0.00022798529244028032, + "learning_rate": 8.742149039074463e-05, + "loss": 0.0, + "step": 2386 + }, + { + "epoch": 0.8097317000805324, + "grad_norm": 5.434233753476292e-05, + "learning_rate": 8.740043691344233e-05, + "loss": 0.0, + "step": 2388 + }, + { + "epoch": 0.8104098673335312, + "grad_norm": 8.579740824643523e-05, + "learning_rate": 8.737936837124852e-05, + "loss": 0.0, + "step": 2390 + }, + { + "epoch": 0.8110880345865299, + "grad_norm": 0.0001528613647678867, + "learning_rate": 8.73582847726496e-05, + "loss": 0.0, + "step": 2392 + }, + { + "epoch": 0.8117662018395286, + "grad_norm": 0.0012288985308259726, + "learning_rate": 8.733718612613803e-05, + "loss": 0.0, + "step": 2394 + }, + { + "epoch": 0.8124443690925275, + "grad_norm": 4.78740876133088e-05, + "learning_rate": 8.731607244021236e-05, + "loss": 0.0, + "step": 2396 + }, + { + "epoch": 0.8131225363455262, + "grad_norm": 0.021069234237074852, + "learning_rate": 8.729494372337716e-05, + "loss": 0.0001, + "step": 2398 + }, + { + "epoch": 0.813800703598525, + "grad_norm": 0.00011475202336441725, + "learning_rate": 8.727379998414311e-05, + "loss": 0.0, + "step": 2400 + }, + { + "epoch": 0.8144788708515237, + "grad_norm": 0.00019884259381797165, + "learning_rate": 8.725264123102688e-05, + "loss": 0.0, + "step": 2402 + }, + { + "epoch": 0.8151570381045226, + "grad_norm": 0.00013354167458601296, + "learning_rate": 8.723146747255122e-05, + "loss": 0.0, + "step": 2404 + }, + { + "epoch": 0.8158352053575213, + "grad_norm": 0.000227173266466707, + "learning_rate": 8.72102787172449e-05, + "loss": 0.0, + "step": 2406 + }, + { + "epoch": 0.81651337261052, + "grad_norm": 7.385117351077497e-05, + "learning_rate": 8.718907497364277e-05, + "loss": 0.0, + "step": 2408 + }, + { + "epoch": 0.8171915398635189, + "grad_norm": 0.002906026551499963, + "learning_rate": 8.71678562502857e-05, + "loss": 0.0, + "step": 2410 + }, + { + "epoch": 0.8178697071165176, + "grad_norm": 4.0036276914179325e-05, + "learning_rate": 8.714662255572055e-05, + "loss": 0.0, + "step": 2412 + }, + { + "epoch": 0.8185478743695164, + "grad_norm": 7.089857535902411e-05, + "learning_rate": 8.712537389850031e-05, + "loss": 0.0, + "step": 2414 + }, + { + "epoch": 0.8192260416225151, + "grad_norm": 0.0007380903698503971, + "learning_rate": 8.710411028718388e-05, + "loss": 0.0, + "step": 2416 + }, + { + "epoch": 0.819904208875514, + "grad_norm": 0.00014535282389260828, + "learning_rate": 8.708283173033627e-05, + "loss": 0.0, + "step": 2418 + }, + { + "epoch": 0.8205823761285127, + "grad_norm": 0.0002633386757224798, + "learning_rate": 8.706153823652848e-05, + "loss": 0.0, + "step": 2420 + }, + { + "epoch": 0.8212605433815114, + "grad_norm": 8.741030615055934e-05, + "learning_rate": 8.70402298143375e-05, + "loss": 0.0, + "step": 2422 + }, + { + "epoch": 0.8219387106345102, + "grad_norm": 0.0004991906462237239, + "learning_rate": 8.70189064723464e-05, + "loss": 0.0, + "step": 2424 + }, + { + "epoch": 0.822616877887509, + "grad_norm": 0.0008120696293190122, + "learning_rate": 8.69975682191442e-05, + "loss": 0.0, + "step": 2426 + }, + { + "epoch": 0.8232950451405078, + "grad_norm": 8.671852992847562e-05, + "learning_rate": 8.697621506332594e-05, + "loss": 0.0, + "step": 2428 + }, + { + "epoch": 0.8239732123935065, + "grad_norm": 0.0048635476268827915, + "learning_rate": 8.695484701349268e-05, + "loss": 0.0, + "step": 2430 + }, + { + "epoch": 0.8246513796465054, + "grad_norm": 0.0019546092953532934, + "learning_rate": 8.693346407825144e-05, + "loss": 0.0, + "step": 2432 + }, + { + "epoch": 0.8253295468995041, + "grad_norm": 0.0003734403580892831, + "learning_rate": 8.69120662662153e-05, + "loss": 0.0, + "step": 2434 + }, + { + "epoch": 0.8260077141525028, + "grad_norm": 0.0005670825485140085, + "learning_rate": 8.68906535860033e-05, + "loss": 0.0, + "step": 2436 + }, + { + "epoch": 0.8266858814055016, + "grad_norm": 0.00019534399325493723, + "learning_rate": 8.686922604624044e-05, + "loss": 0.0, + "step": 2438 + }, + { + "epoch": 0.8273640486585004, + "grad_norm": 9.16475837584585e-05, + "learning_rate": 8.684778365555773e-05, + "loss": 0.0, + "step": 2440 + }, + { + "epoch": 0.8280422159114992, + "grad_norm": 0.0014319182373583317, + "learning_rate": 8.682632642259216e-05, + "loss": 0.0, + "step": 2442 + }, + { + "epoch": 0.8287203831644979, + "grad_norm": 6.968220259295776e-05, + "learning_rate": 8.680485435598673e-05, + "loss": 0.0, + "step": 2444 + }, + { + "epoch": 0.8293985504174967, + "grad_norm": 0.00036705288221128285, + "learning_rate": 8.678336746439035e-05, + "loss": 0.0, + "step": 2446 + }, + { + "epoch": 0.8300767176704955, + "grad_norm": 0.0001591446198290214, + "learning_rate": 8.676186575645796e-05, + "loss": 0.0, + "step": 2448 + }, + { + "epoch": 0.8307548849234943, + "grad_norm": 7.104194810381159e-05, + "learning_rate": 8.674034924085043e-05, + "loss": 0.0, + "step": 2450 + }, + { + "epoch": 0.831433052176493, + "grad_norm": 0.00011930055188713595, + "learning_rate": 8.671881792623464e-05, + "loss": 0.0, + "step": 2452 + }, + { + "epoch": 0.8321112194294918, + "grad_norm": 0.0007769940420985222, + "learning_rate": 8.669727182128338e-05, + "loss": 0.0, + "step": 2454 + }, + { + "epoch": 0.8327893866824906, + "grad_norm": 0.0024935437832027674, + "learning_rate": 8.667571093467541e-05, + "loss": 0.0, + "step": 2456 + }, + { + "epoch": 0.8334675539354893, + "grad_norm": 4.6994613512651995e-05, + "learning_rate": 8.665413527509546e-05, + "loss": 0.0, + "step": 2458 + }, + { + "epoch": 0.8341457211884881, + "grad_norm": 7.935589383123443e-05, + "learning_rate": 8.66325448512342e-05, + "loss": 0.0, + "step": 2460 + }, + { + "epoch": 0.8348238884414869, + "grad_norm": 6.114977441029623e-05, + "learning_rate": 8.661093967178826e-05, + "loss": 0.0, + "step": 2462 + }, + { + "epoch": 0.8355020556944857, + "grad_norm": 3.911663225153461e-05, + "learning_rate": 8.65893197454602e-05, + "loss": 0.0, + "step": 2464 + }, + { + "epoch": 0.8361802229474844, + "grad_norm": 0.005718694068491459, + "learning_rate": 8.656768508095853e-05, + "loss": 0.0, + "step": 2466 + }, + { + "epoch": 0.8368583902004832, + "grad_norm": 5.9984056861139834e-05, + "learning_rate": 8.654603568699768e-05, + "loss": 0.0, + "step": 2468 + }, + { + "epoch": 0.837536557453482, + "grad_norm": 6.293792830547318e-05, + "learning_rate": 8.652437157229801e-05, + "loss": 0.0, + "step": 2470 + }, + { + "epoch": 0.8382147247064807, + "grad_norm": 4.67793652205728e-05, + "learning_rate": 8.650269274558585e-05, + "loss": 0.0, + "step": 2472 + }, + { + "epoch": 0.8388928919594795, + "grad_norm": 0.000718626833986491, + "learning_rate": 8.648099921559342e-05, + "loss": 0.0, + "step": 2474 + }, + { + "epoch": 0.8395710592124783, + "grad_norm": 0.00555463507771492, + "learning_rate": 8.645929099105887e-05, + "loss": 0.0, + "step": 2476 + }, + { + "epoch": 0.8402492264654771, + "grad_norm": 0.00010523942182771862, + "learning_rate": 8.643756808072624e-05, + "loss": 0.0, + "step": 2478 + }, + { + "epoch": 0.8409273937184758, + "grad_norm": 4.4146174332126975e-05, + "learning_rate": 8.641583049334558e-05, + "loss": 0.0, + "step": 2480 + }, + { + "epoch": 0.8416055609714745, + "grad_norm": 2.2396032363758422e-05, + "learning_rate": 8.639407823767274e-05, + "loss": 0.0, + "step": 2482 + }, + { + "epoch": 0.8422837282244734, + "grad_norm": 5.793202217319049e-05, + "learning_rate": 8.637231132246955e-05, + "loss": 0.0, + "step": 2484 + }, + { + "epoch": 0.8429618954774721, + "grad_norm": 0.0030191531404852867, + "learning_rate": 8.635052975650369e-05, + "loss": 0.0, + "step": 2486 + }, + { + "epoch": 0.8436400627304709, + "grad_norm": 5.942771167610772e-05, + "learning_rate": 8.63287335485488e-05, + "loss": 0.0, + "step": 2488 + }, + { + "epoch": 0.8443182299834697, + "grad_norm": 5.2272378525231034e-05, + "learning_rate": 8.630692270738439e-05, + "loss": 0.0, + "step": 2490 + }, + { + "epoch": 0.8449963972364685, + "grad_norm": 3.476697384030558e-05, + "learning_rate": 8.628509724179584e-05, + "loss": 0.0, + "step": 2492 + }, + { + "epoch": 0.8456745644894672, + "grad_norm": 8.026020805118605e-05, + "learning_rate": 8.626325716057446e-05, + "loss": 0.0, + "step": 2494 + }, + { + "epoch": 0.846352731742466, + "grad_norm": 7.056519825709984e-05, + "learning_rate": 8.624140247251743e-05, + "loss": 0.0, + "step": 2496 + }, + { + "epoch": 0.8470308989954648, + "grad_norm": 7.995586202014238e-05, + "learning_rate": 8.621953318642784e-05, + "loss": 0.0, + "step": 2498 + }, + { + "epoch": 0.8477090662484635, + "grad_norm": 4.878836261923425e-05, + "learning_rate": 8.619764931111459e-05, + "loss": 0.0, + "step": 2500 + }, + { + "epoch": 0.8483872335014623, + "grad_norm": 0.0002059117250610143, + "learning_rate": 8.617575085539253e-05, + "loss": 0.0, + "step": 2502 + }, + { + "epoch": 0.849065400754461, + "grad_norm": 4.21931836172007e-05, + "learning_rate": 8.615383782808237e-05, + "loss": 0.0, + "step": 2504 + }, + { + "epoch": 0.8497435680074599, + "grad_norm": 0.0018902408191934228, + "learning_rate": 8.613191023801064e-05, + "loss": 0.0, + "step": 2506 + }, + { + "epoch": 0.8504217352604586, + "grad_norm": 5.137163680046797e-05, + "learning_rate": 8.61099680940098e-05, + "loss": 0.0, + "step": 2508 + }, + { + "epoch": 0.8510999025134574, + "grad_norm": 0.0010492595611140132, + "learning_rate": 8.608801140491811e-05, + "loss": 0.0, + "step": 2510 + }, + { + "epoch": 0.8517780697664562, + "grad_norm": 6.547579687321559e-05, + "learning_rate": 8.606604017957976e-05, + "loss": 0.0, + "step": 2512 + }, + { + "epoch": 0.8524562370194549, + "grad_norm": 2.5358762286487035e-05, + "learning_rate": 8.604405442684474e-05, + "loss": 0.0, + "step": 2514 + }, + { + "epoch": 0.8531344042724537, + "grad_norm": 5.517467434401624e-05, + "learning_rate": 8.602205415556889e-05, + "loss": 0.0, + "step": 2516 + }, + { + "epoch": 0.8538125715254524, + "grad_norm": 4.9549442337593064e-05, + "learning_rate": 8.600003937461394e-05, + "loss": 0.0, + "step": 2518 + }, + { + "epoch": 0.8544907387784513, + "grad_norm": 3.567721068975516e-05, + "learning_rate": 8.597801009284744e-05, + "loss": 0.0, + "step": 2520 + }, + { + "epoch": 0.85516890603145, + "grad_norm": 3.9313017623499036e-05, + "learning_rate": 8.595596631914277e-05, + "loss": 0.0, + "step": 2522 + }, + { + "epoch": 0.8558470732844488, + "grad_norm": 0.0009759738459251821, + "learning_rate": 8.593390806237917e-05, + "loss": 0.0, + "step": 2524 + }, + { + "epoch": 0.8565252405374475, + "grad_norm": 0.00014349607226904482, + "learning_rate": 8.591183533144171e-05, + "loss": 0.0, + "step": 2526 + }, + { + "epoch": 0.8572034077904463, + "grad_norm": 5.6128526921384037e-05, + "learning_rate": 8.588974813522126e-05, + "loss": 0.0, + "step": 2528 + }, + { + "epoch": 0.8578815750434451, + "grad_norm": 5.782934749731794e-05, + "learning_rate": 8.586764648261456e-05, + "loss": 0.0, + "step": 2530 + }, + { + "epoch": 0.8585597422964438, + "grad_norm": 0.00024813110940158367, + "learning_rate": 8.584553038252414e-05, + "loss": 0.0, + "step": 2532 + }, + { + "epoch": 0.8592379095494427, + "grad_norm": 2.7292395316180773e-05, + "learning_rate": 8.582339984385838e-05, + "loss": 0.0, + "step": 2534 + }, + { + "epoch": 0.8599160768024414, + "grad_norm": 3.258465585531667e-05, + "learning_rate": 8.580125487553143e-05, + "loss": 0.0, + "step": 2536 + }, + { + "epoch": 0.8605942440554402, + "grad_norm": 1.458320184610784e-05, + "learning_rate": 8.57790954864633e-05, + "loss": 0.0, + "step": 2538 + }, + { + "epoch": 0.8612724113084389, + "grad_norm": 0.00010439179459353909, + "learning_rate": 8.575692168557981e-05, + "loss": 0.0, + "step": 2540 + }, + { + "epoch": 0.8619505785614378, + "grad_norm": 4.440178599907085e-05, + "learning_rate": 8.573473348181251e-05, + "loss": 0.0, + "step": 2542 + }, + { + "epoch": 0.8626287458144365, + "grad_norm": 3.7284295103745535e-05, + "learning_rate": 8.571253088409886e-05, + "loss": 0.0, + "step": 2544 + }, + { + "epoch": 0.8633069130674352, + "grad_norm": 3.342417403473519e-05, + "learning_rate": 8.569031390138202e-05, + "loss": 0.0, + "step": 2546 + }, + { + "epoch": 0.863985080320434, + "grad_norm": 2.9887109121773392e-05, + "learning_rate": 8.566808254261103e-05, + "loss": 0.0, + "step": 2548 + }, + { + "epoch": 0.8646632475734328, + "grad_norm": 4.787669240613468e-05, + "learning_rate": 8.564583681674065e-05, + "loss": 0.0, + "step": 2550 + }, + { + "epoch": 0.8653414148264316, + "grad_norm": 5.066965240985155e-05, + "learning_rate": 8.562357673273147e-05, + "loss": 0.0, + "step": 2552 + }, + { + "epoch": 0.8660195820794303, + "grad_norm": 0.000141487194923684, + "learning_rate": 8.560130229954984e-05, + "loss": 0.0, + "step": 2554 + }, + { + "epoch": 0.8666977493324292, + "grad_norm": 4.295802136766724e-05, + "learning_rate": 8.557901352616789e-05, + "loss": 0.0, + "step": 2556 + }, + { + "epoch": 0.8673759165854279, + "grad_norm": 2.949769805127289e-05, + "learning_rate": 8.555671042156354e-05, + "loss": 0.0, + "step": 2558 + }, + { + "epoch": 0.8680540838384266, + "grad_norm": 3.286828359705396e-05, + "learning_rate": 8.55343929947205e-05, + "loss": 0.0, + "step": 2560 + }, + { + "epoch": 0.8687322510914254, + "grad_norm": 0.00010639905667630956, + "learning_rate": 8.55120612546282e-05, + "loss": 0.0, + "step": 2562 + }, + { + "epoch": 0.8694104183444242, + "grad_norm": 2.4564649720559828e-05, + "learning_rate": 8.548971521028189e-05, + "loss": 0.0, + "step": 2564 + }, + { + "epoch": 0.870088585597423, + "grad_norm": 0.00031212970498017967, + "learning_rate": 8.546735487068252e-05, + "loss": 0.0, + "step": 2566 + }, + { + "epoch": 0.8707667528504217, + "grad_norm": 0.00017252239922527224, + "learning_rate": 8.544498024483687e-05, + "loss": 0.0, + "step": 2568 + }, + { + "epoch": 0.8714449201034205, + "grad_norm": 4.669900954468176e-05, + "learning_rate": 8.542259134175739e-05, + "loss": 0.0, + "step": 2570 + }, + { + "epoch": 0.8721230873564193, + "grad_norm": 2.540825335017871e-05, + "learning_rate": 8.540018817046238e-05, + "loss": 0.0, + "step": 2572 + }, + { + "epoch": 0.872801254609418, + "grad_norm": 3.5121825931128114e-05, + "learning_rate": 8.53777707399758e-05, + "loss": 0.0, + "step": 2574 + }, + { + "epoch": 0.8734794218624168, + "grad_norm": 2.1930207367404364e-05, + "learning_rate": 8.535533905932738e-05, + "loss": 0.0, + "step": 2576 + }, + { + "epoch": 0.8741575891154156, + "grad_norm": 3.102342816418968e-05, + "learning_rate": 8.533289313755263e-05, + "loss": 0.0, + "step": 2578 + }, + { + "epoch": 0.8748357563684144, + "grad_norm": 9.593948198016733e-05, + "learning_rate": 8.531043298369274e-05, + "loss": 0.0, + "step": 2580 + }, + { + "epoch": 0.8755139236214131, + "grad_norm": 0.0001077059205272235, + "learning_rate": 8.528795860679468e-05, + "loss": 0.0, + "step": 2582 + }, + { + "epoch": 0.8761920908744119, + "grad_norm": 5.816249904455617e-05, + "learning_rate": 8.52654700159111e-05, + "loss": 0.0, + "step": 2584 + }, + { + "epoch": 0.8768702581274107, + "grad_norm": 2.9580603950307705e-05, + "learning_rate": 8.524296722010042e-05, + "loss": 0.0, + "step": 2586 + }, + { + "epoch": 0.8775484253804094, + "grad_norm": 0.001621534931473434, + "learning_rate": 8.522045022842677e-05, + "loss": 0.0, + "step": 2588 + }, + { + "epoch": 0.8782265926334082, + "grad_norm": 2.7219555704505183e-05, + "learning_rate": 8.519791904995996e-05, + "loss": 0.0, + "step": 2590 + }, + { + "epoch": 0.878904759886407, + "grad_norm": 3.0587609217036515e-05, + "learning_rate": 8.517537369377557e-05, + "loss": 0.0, + "step": 2592 + }, + { + "epoch": 0.8795829271394058, + "grad_norm": 3.34189462591894e-05, + "learning_rate": 8.515281416895488e-05, + "loss": 0.0, + "step": 2594 + }, + { + "epoch": 0.8802610943924045, + "grad_norm": 3.841116631519981e-05, + "learning_rate": 8.513024048458485e-05, + "loss": 0.0, + "step": 2596 + }, + { + "epoch": 0.8809392616454033, + "grad_norm": 4.351099778432399e-05, + "learning_rate": 8.510765264975813e-05, + "loss": 0.0, + "step": 2598 + }, + { + "epoch": 0.8816174288984021, + "grad_norm": 6.0994632804067805e-05, + "learning_rate": 8.508505067357313e-05, + "loss": 0.0, + "step": 2600 + }, + { + "epoch": 0.8822955961514009, + "grad_norm": 2.2559877834282815e-05, + "learning_rate": 8.506243456513391e-05, + "loss": 0.0, + "step": 2602 + }, + { + "epoch": 0.8829737634043996, + "grad_norm": 3.4261829569004476e-05, + "learning_rate": 8.503980433355024e-05, + "loss": 0.0, + "step": 2604 + }, + { + "epoch": 0.8836519306573983, + "grad_norm": 2.5804607503232546e-05, + "learning_rate": 8.501715998793757e-05, + "loss": 0.0, + "step": 2606 + }, + { + "epoch": 0.8843300979103972, + "grad_norm": 3.149559779558331e-05, + "learning_rate": 8.499450153741706e-05, + "loss": 0.0, + "step": 2608 + }, + { + "epoch": 0.8850082651633959, + "grad_norm": 5.7846591516863555e-05, + "learning_rate": 8.497182899111551e-05, + "loss": 0.0, + "step": 2610 + }, + { + "epoch": 0.8856864324163947, + "grad_norm": 9.873088129097596e-05, + "learning_rate": 8.494914235816542e-05, + "loss": 0.0, + "step": 2612 + }, + { + "epoch": 0.8863645996693935, + "grad_norm": 3.39953439834062e-05, + "learning_rate": 8.492644164770499e-05, + "loss": 0.0, + "step": 2614 + }, + { + "epoch": 0.8870427669223923, + "grad_norm": 3.4924494684673846e-05, + "learning_rate": 8.490372686887802e-05, + "loss": 0.0, + "step": 2616 + }, + { + "epoch": 0.887720934175391, + "grad_norm": 5.127754411660135e-05, + "learning_rate": 8.488099803083404e-05, + "loss": 0.0, + "step": 2618 + }, + { + "epoch": 0.8883991014283897, + "grad_norm": 0.00016556790797039866, + "learning_rate": 8.485825514272824e-05, + "loss": 0.0, + "step": 2620 + }, + { + "epoch": 0.8890772686813886, + "grad_norm": 3.2666204788256437e-05, + "learning_rate": 8.483549821372141e-05, + "loss": 0.0, + "step": 2622 + }, + { + "epoch": 0.8897554359343873, + "grad_norm": 3.055952402064577e-05, + "learning_rate": 8.481272725298009e-05, + "loss": 0.0, + "step": 2624 + }, + { + "epoch": 0.8904336031873861, + "grad_norm": 0.0005399963119998574, + "learning_rate": 8.478994226967638e-05, + "loss": 0.0, + "step": 2626 + }, + { + "epoch": 0.8911117704403848, + "grad_norm": 0.0007298051496036351, + "learning_rate": 8.476714327298809e-05, + "loss": 0.0, + "step": 2628 + }, + { + "epoch": 0.8917899376933837, + "grad_norm": 6.035435217199847e-05, + "learning_rate": 8.474433027209866e-05, + "loss": 0.0, + "step": 2630 + }, + { + "epoch": 0.8924681049463824, + "grad_norm": 0.00013174138439353555, + "learning_rate": 8.472150327619714e-05, + "loss": 0.0, + "step": 2632 + }, + { + "epoch": 0.8931462721993811, + "grad_norm": 2.576379120000638e-05, + "learning_rate": 8.469866229447825e-05, + "loss": 0.0, + "step": 2634 + }, + { + "epoch": 0.89382443945238, + "grad_norm": 0.007593007758259773, + "learning_rate": 8.467580733614233e-05, + "loss": 0.0, + "step": 2636 + }, + { + "epoch": 0.8945026067053787, + "grad_norm": 3.41287195624318e-05, + "learning_rate": 8.465293841039539e-05, + "loss": 0.0, + "step": 2638 + }, + { + "epoch": 0.8951807739583775, + "grad_norm": 5.675890133716166e-05, + "learning_rate": 8.463005552644898e-05, + "loss": 0.0, + "step": 2640 + }, + { + "epoch": 0.8958589412113762, + "grad_norm": 0.00010869366087717935, + "learning_rate": 8.460715869352035e-05, + "loss": 0.0, + "step": 2642 + }, + { + "epoch": 0.8965371084643751, + "grad_norm": 0.00029541790718212724, + "learning_rate": 8.458424792083233e-05, + "loss": 0.0, + "step": 2644 + }, + { + "epoch": 0.8972152757173738, + "grad_norm": 0.00016577077622059733, + "learning_rate": 8.456132321761339e-05, + "loss": 0.0, + "step": 2646 + }, + { + "epoch": 0.8978934429703725, + "grad_norm": 7.052655564621091e-05, + "learning_rate": 8.45383845930976e-05, + "loss": 0.0, + "step": 2648 + }, + { + "epoch": 0.8985716102233713, + "grad_norm": 8.335147867910564e-05, + "learning_rate": 8.451543205652462e-05, + "loss": 0.0, + "step": 2650 + }, + { + "epoch": 0.8992497774763701, + "grad_norm": 4.06303079216741e-05, + "learning_rate": 8.449246561713972e-05, + "loss": 0.0, + "step": 2652 + }, + { + "epoch": 0.8999279447293689, + "grad_norm": 3.0850900657242164e-05, + "learning_rate": 8.44694852841938e-05, + "loss": 0.0, + "step": 2654 + }, + { + "epoch": 0.9006061119823676, + "grad_norm": 9.577847959008068e-05, + "learning_rate": 8.444649106694335e-05, + "loss": 0.0, + "step": 2656 + }, + { + "epoch": 0.9012842792353665, + "grad_norm": 0.0001490665745222941, + "learning_rate": 8.442348297465039e-05, + "loss": 0.0, + "step": 2658 + }, + { + "epoch": 0.9019624464883652, + "grad_norm": 3.8870639400556684e-05, + "learning_rate": 8.440046101658263e-05, + "loss": 0.0, + "step": 2660 + }, + { + "epoch": 0.902640613741364, + "grad_norm": 2.3244661861099303e-05, + "learning_rate": 8.437742520201327e-05, + "loss": 0.0, + "step": 2662 + }, + { + "epoch": 0.9033187809943627, + "grad_norm": 3.095623105764389e-05, + "learning_rate": 8.435437554022115e-05, + "loss": 0.0, + "step": 2664 + }, + { + "epoch": 0.9039969482473615, + "grad_norm": 2.8265672881389037e-05, + "learning_rate": 8.433131204049067e-05, + "loss": 0.0, + "step": 2666 + }, + { + "epoch": 0.9046751155003603, + "grad_norm": 0.000637600664049387, + "learning_rate": 8.430823471211183e-05, + "loss": 0.0, + "step": 2668 + }, + { + "epoch": 0.905353282753359, + "grad_norm": 3.2973053748719394e-05, + "learning_rate": 8.428514356438012e-05, + "loss": 0.0, + "step": 2670 + }, + { + "epoch": 0.9060314500063578, + "grad_norm": 3.9376176573568955e-05, + "learning_rate": 8.42620386065967e-05, + "loss": 0.0, + "step": 2672 + }, + { + "epoch": 0.9067096172593566, + "grad_norm": 1.6717647667974234e-05, + "learning_rate": 8.423891984806821e-05, + "loss": 0.0, + "step": 2674 + }, + { + "epoch": 0.9073877845123554, + "grad_norm": 0.0008120017591863871, + "learning_rate": 8.421578729810692e-05, + "loss": 0.0, + "step": 2676 + }, + { + "epoch": 0.9080659517653541, + "grad_norm": 3.816479875240475e-05, + "learning_rate": 8.419264096603059e-05, + "loss": 0.0, + "step": 2678 + }, + { + "epoch": 0.908744119018353, + "grad_norm": 0.023744838312268257, + "learning_rate": 8.416948086116256e-05, + "loss": 0.0, + "step": 2680 + }, + { + "epoch": 0.9094222862713517, + "grad_norm": 2.676597432582639e-05, + "learning_rate": 8.414630699283174e-05, + "loss": 0.0, + "step": 2682 + }, + { + "epoch": 0.9101004535243504, + "grad_norm": 0.00024049430794548243, + "learning_rate": 8.412311937037254e-05, + "loss": 0.0, + "step": 2684 + }, + { + "epoch": 0.9107786207773492, + "grad_norm": 3.337972520967014e-05, + "learning_rate": 8.409991800312493e-05, + "loss": 0.0, + "step": 2686 + }, + { + "epoch": 0.911456788030348, + "grad_norm": 3.9082591683836654e-05, + "learning_rate": 8.407670290043443e-05, + "loss": 0.0, + "step": 2688 + }, + { + "epoch": 0.9121349552833468, + "grad_norm": 0.03246762230992317, + "learning_rate": 8.405347407165208e-05, + "loss": 0.0, + "step": 2690 + }, + { + "epoch": 0.9128131225363455, + "grad_norm": 7.276963879121467e-05, + "learning_rate": 8.403023152613445e-05, + "loss": 0.0, + "step": 2692 + }, + { + "epoch": 0.9134912897893444, + "grad_norm": 0.00017264233611058444, + "learning_rate": 8.400697527324362e-05, + "loss": 0.0, + "step": 2694 + }, + { + "epoch": 0.9141694570423431, + "grad_norm": 2.2921736672287807e-05, + "learning_rate": 8.398370532234722e-05, + "loss": 0.0, + "step": 2696 + }, + { + "epoch": 0.9148476242953418, + "grad_norm": 2.179672992497217e-05, + "learning_rate": 8.396042168281839e-05, + "loss": 0.0, + "step": 2698 + }, + { + "epoch": 0.9155257915483406, + "grad_norm": 2.780408067337703e-05, + "learning_rate": 8.393712436403576e-05, + "loss": 0.0, + "step": 2700 + }, + { + "epoch": 0.9162039588013394, + "grad_norm": 0.0007328210631385446, + "learning_rate": 8.391381337538349e-05, + "loss": 0.0, + "step": 2702 + }, + { + "epoch": 0.9168821260543382, + "grad_norm": 0.0016520414501428604, + "learning_rate": 8.389048872625127e-05, + "loss": 0.0, + "step": 2704 + }, + { + "epoch": 0.9175602933073369, + "grad_norm": 0.07671560347080231, + "learning_rate": 8.386715042603423e-05, + "loss": 0.0001, + "step": 2706 + }, + { + "epoch": 0.9182384605603356, + "grad_norm": 0.04780285805463791, + "learning_rate": 8.384379848413304e-05, + "loss": 0.0003, + "step": 2708 + }, + { + "epoch": 0.9189166278133345, + "grad_norm": 0.000507183896843344, + "learning_rate": 8.382043290995388e-05, + "loss": 0.0, + "step": 2710 + }, + { + "epoch": 0.9195947950663332, + "grad_norm": 0.004272800404578447, + "learning_rate": 8.379705371290839e-05, + "loss": 0.0, + "step": 2712 + }, + { + "epoch": 0.920272962319332, + "grad_norm": 0.032198842614889145, + "learning_rate": 8.37736609024137e-05, + "loss": 0.0, + "step": 2714 + }, + { + "epoch": 0.9209511295723308, + "grad_norm": 0.0008604843751527369, + "learning_rate": 8.375025448789242e-05, + "loss": 0.0001, + "step": 2716 + }, + { + "epoch": 0.9216292968253296, + "grad_norm": 0.00031635569757781923, + "learning_rate": 8.37268344787727e-05, + "loss": 0.0001, + "step": 2718 + }, + { + "epoch": 0.9223074640783283, + "grad_norm": 0.08091209083795547, + "learning_rate": 8.370340088448808e-05, + "loss": 0.0001, + "step": 2720 + }, + { + "epoch": 0.922985631331327, + "grad_norm": 0.0015913655515760183, + "learning_rate": 8.367995371447759e-05, + "loss": 0.0, + "step": 2722 + }, + { + "epoch": 0.9236637985843259, + "grad_norm": 0.056560616940259933, + "learning_rate": 8.365649297818578e-05, + "loss": 0.0001, + "step": 2724 + }, + { + "epoch": 0.9243419658373246, + "grad_norm": 0.049934688955545425, + "learning_rate": 8.363301868506264e-05, + "loss": 0.0002, + "step": 2726 + }, + { + "epoch": 0.9250201330903234, + "grad_norm": 0.011096746660768986, + "learning_rate": 8.360953084456358e-05, + "loss": 0.0, + "step": 2728 + }, + { + "epoch": 0.9256983003433221, + "grad_norm": 0.0012205281527712941, + "learning_rate": 8.358602946614951e-05, + "loss": 0.0, + "step": 2730 + }, + { + "epoch": 0.926376467596321, + "grad_norm": 0.0005476016085594893, + "learning_rate": 8.35625145592868e-05, + "loss": 0.0, + "step": 2732 + }, + { + "epoch": 0.9270546348493197, + "grad_norm": 0.06708763539791107, + "learning_rate": 8.353898613344724e-05, + "loss": 0.0004, + "step": 2734 + }, + { + "epoch": 0.9277328021023185, + "grad_norm": 0.000514314800966531, + "learning_rate": 8.351544419810807e-05, + "loss": 0.0, + "step": 2736 + }, + { + "epoch": 0.9284109693553173, + "grad_norm": 0.00016134425823111087, + "learning_rate": 8.3491888762752e-05, + "loss": 0.0002, + "step": 2738 + }, + { + "epoch": 0.929089136608316, + "grad_norm": 0.0014166885521262884, + "learning_rate": 8.346831983686711e-05, + "loss": 0.0003, + "step": 2740 + }, + { + "epoch": 0.9297673038613148, + "grad_norm": 0.018221652135252953, + "learning_rate": 8.344473742994703e-05, + "loss": 0.0, + "step": 2742 + }, + { + "epoch": 0.9304454711143135, + "grad_norm": 0.016289401799440384, + "learning_rate": 8.342114155149069e-05, + "loss": 0.0002, + "step": 2744 + }, + { + "epoch": 0.9311236383673124, + "grad_norm": 0.0034765347372740507, + "learning_rate": 8.339753221100252e-05, + "loss": 0.0, + "step": 2746 + }, + { + "epoch": 0.9318018056203111, + "grad_norm": 0.004045094829052687, + "learning_rate": 8.337390941799239e-05, + "loss": 0.0002, + "step": 2748 + }, + { + "epoch": 0.9324799728733099, + "grad_norm": 0.00048109699855558574, + "learning_rate": 8.335027318197554e-05, + "loss": 0.0, + "step": 2750 + }, + { + "epoch": 0.9331581401263086, + "grad_norm": 0.026088694110512733, + "learning_rate": 8.332662351247262e-05, + "loss": 0.0, + "step": 2752 + }, + { + "epoch": 0.9338363073793075, + "grad_norm": 0.08691231906414032, + "learning_rate": 8.330296041900975e-05, + "loss": 0.0003, + "step": 2754 + }, + { + "epoch": 0.9345144746323062, + "grad_norm": 0.0608268566429615, + "learning_rate": 8.327928391111841e-05, + "loss": 0.0002, + "step": 2756 + }, + { + "epoch": 0.9351926418853049, + "grad_norm": 0.015854302793741226, + "learning_rate": 8.325559399833548e-05, + "loss": 0.0002, + "step": 2758 + }, + { + "epoch": 0.9358708091383038, + "grad_norm": 0.022921759635210037, + "learning_rate": 8.323189069020328e-05, + "loss": 0.0003, + "step": 2760 + }, + { + "epoch": 0.9365489763913025, + "grad_norm": 0.02427630126476288, + "learning_rate": 8.320817399626945e-05, + "loss": 0.0001, + "step": 2762 + }, + { + "epoch": 0.9372271436443013, + "grad_norm": 0.01896664686501026, + "learning_rate": 8.318444392608713e-05, + "loss": 0.0001, + "step": 2764 + }, + { + "epoch": 0.9379053108973, + "grad_norm": 0.036994654685258865, + "learning_rate": 8.316070048921477e-05, + "loss": 0.0004, + "step": 2766 + }, + { + "epoch": 0.9385834781502989, + "grad_norm": 0.02746070735156536, + "learning_rate": 8.313694369521621e-05, + "loss": 0.0002, + "step": 2768 + }, + { + "epoch": 0.9392616454032976, + "grad_norm": 0.050837792456150055, + "learning_rate": 8.311317355366069e-05, + "loss": 0.0003, + "step": 2770 + }, + { + "epoch": 0.9399398126562963, + "grad_norm": 0.004767420701682568, + "learning_rate": 8.308939007412281e-05, + "loss": 0.0001, + "step": 2772 + }, + { + "epoch": 0.9406179799092951, + "grad_norm": 0.030202515423297882, + "learning_rate": 8.306559326618259e-05, + "loss": 0.0001, + "step": 2774 + }, + { + "epoch": 0.9412961471622939, + "grad_norm": 0.0155405905097723, + "learning_rate": 8.304178313942536e-05, + "loss": 0.0, + "step": 2776 + }, + { + "epoch": 0.9419743144152927, + "grad_norm": 0.06447181850671768, + "learning_rate": 8.301795970344182e-05, + "loss": 0.0001, + "step": 2778 + }, + { + "epoch": 0.9426524816682914, + "grad_norm": 0.0006567398668266833, + "learning_rate": 8.299412296782804e-05, + "loss": 0.0, + "step": 2780 + }, + { + "epoch": 0.9433306489212903, + "grad_norm": 0.0004741934244520962, + "learning_rate": 8.297027294218551e-05, + "loss": 0.0001, + "step": 2782 + }, + { + "epoch": 0.944008816174289, + "grad_norm": 0.044876668602228165, + "learning_rate": 8.294640963612096e-05, + "loss": 0.0004, + "step": 2784 + }, + { + "epoch": 0.9446869834272877, + "grad_norm": 0.03857933729887009, + "learning_rate": 8.292253305924655e-05, + "loss": 0.0008, + "step": 2786 + }, + { + "epoch": 0.9453651506802865, + "grad_norm": 0.0046510398387908936, + "learning_rate": 8.289864322117977e-05, + "loss": 0.0, + "step": 2788 + }, + { + "epoch": 0.9460433179332853, + "grad_norm": 0.09152370691299438, + "learning_rate": 8.287474013154343e-05, + "loss": 0.0002, + "step": 2790 + }, + { + "epoch": 0.9467214851862841, + "grad_norm": 0.009585980325937271, + "learning_rate": 8.285082379996571e-05, + "loss": 0.0001, + "step": 2792 + }, + { + "epoch": 0.9473996524392828, + "grad_norm": 0.008876519277691841, + "learning_rate": 8.282689423608008e-05, + "loss": 0.0001, + "step": 2794 + }, + { + "epoch": 0.9480778196922816, + "grad_norm": 0.008391699753701687, + "learning_rate": 8.280295144952536e-05, + "loss": 0.0, + "step": 2796 + }, + { + "epoch": 0.9487559869452804, + "grad_norm": 0.0072625367902219296, + "learning_rate": 8.277899544994575e-05, + "loss": 0.0, + "step": 2798 + }, + { + "epoch": 0.9494341541982791, + "grad_norm": 0.09617011994123459, + "learning_rate": 8.275502624699067e-05, + "loss": 0.0003, + "step": 2800 + }, + { + "epoch": 0.9501123214512779, + "grad_norm": 0.0077019380405545235, + "learning_rate": 8.273104385031494e-05, + "loss": 0.0, + "step": 2802 + }, + { + "epoch": 0.9507904887042767, + "grad_norm": 0.012218840420246124, + "learning_rate": 8.270704826957868e-05, + "loss": 0.0005, + "step": 2804 + }, + { + "epoch": 0.9514686559572755, + "grad_norm": 0.03991170600056648, + "learning_rate": 8.268303951444726e-05, + "loss": 0.0005, + "step": 2806 + }, + { + "epoch": 0.9521468232102742, + "grad_norm": 0.038647279143333435, + "learning_rate": 8.265901759459144e-05, + "loss": 0.0002, + "step": 2808 + }, + { + "epoch": 0.952824990463273, + "grad_norm": 0.040331657975912094, + "learning_rate": 8.263498251968724e-05, + "loss": 0.0008, + "step": 2810 + }, + { + "epoch": 0.9535031577162718, + "grad_norm": 0.07936710119247437, + "learning_rate": 8.2610934299416e-05, + "loss": 0.0003, + "step": 2812 + }, + { + "epoch": 0.9541813249692706, + "grad_norm": 0.010408652015030384, + "learning_rate": 8.25868729434643e-05, + "loss": 0.0002, + "step": 2814 + }, + { + "epoch": 0.9548594922222693, + "grad_norm": 0.088149294257164, + "learning_rate": 8.25627984615241e-05, + "loss": 0.0005, + "step": 2816 + }, + { + "epoch": 0.9555376594752681, + "grad_norm": 0.09806723147630692, + "learning_rate": 8.253871086329255e-05, + "loss": 0.0006, + "step": 2818 + }, + { + "epoch": 0.9562158267282669, + "grad_norm": 0.11084140837192535, + "learning_rate": 8.251461015847217e-05, + "loss": 0.0013, + "step": 2820 + }, + { + "epoch": 0.9568939939812656, + "grad_norm": 0.28381043672561646, + "learning_rate": 8.24904963567707e-05, + "loss": 0.0013, + "step": 2822 + }, + { + "epoch": 0.9575721612342644, + "grad_norm": 0.9102690815925598, + "learning_rate": 8.246636946790119e-05, + "loss": 0.0133, + "step": 2824 + }, + { + "epoch": 0.9582503284872632, + "grad_norm": 0.6345599293708801, + "learning_rate": 8.244222950158193e-05, + "loss": 0.0126, + "step": 2826 + }, + { + "epoch": 0.958928495740262, + "grad_norm": 0.11776126176118851, + "learning_rate": 8.241807646753652e-05, + "loss": 0.0032, + "step": 2828 + }, + { + "epoch": 0.9596066629932607, + "grad_norm": 0.11438094079494476, + "learning_rate": 8.239391037549378e-05, + "loss": 0.0039, + "step": 2830 + }, + { + "epoch": 0.9602848302462594, + "grad_norm": 0.11632762104272842, + "learning_rate": 8.236973123518781e-05, + "loss": 0.0018, + "step": 2832 + }, + { + "epoch": 0.9609629974992583, + "grad_norm": 0.04498480632901192, + "learning_rate": 8.234553905635797e-05, + "loss": 0.0015, + "step": 2834 + }, + { + "epoch": 0.961641164752257, + "grad_norm": 0.09208396822214127, + "learning_rate": 8.232133384874887e-05, + "loss": 0.0022, + "step": 2836 + }, + { + "epoch": 0.9623193320052558, + "grad_norm": 0.08108418434858322, + "learning_rate": 8.229711562211036e-05, + "loss": 0.002, + "step": 2838 + }, + { + "epoch": 0.9629974992582546, + "grad_norm": 0.1276485174894333, + "learning_rate": 8.227288438619754e-05, + "loss": 0.0015, + "step": 2840 + }, + { + "epoch": 0.9636756665112534, + "grad_norm": 0.058672089129686356, + "learning_rate": 8.224864015077073e-05, + "loss": 0.0018, + "step": 2842 + }, + { + "epoch": 0.9643538337642521, + "grad_norm": 0.0685403048992157, + "learning_rate": 8.222438292559556e-05, + "loss": 0.0016, + "step": 2844 + }, + { + "epoch": 0.9650320010172508, + "grad_norm": 0.06782464683055878, + "learning_rate": 8.220011272044277e-05, + "loss": 0.0009, + "step": 2846 + }, + { + "epoch": 0.9657101682702497, + "grad_norm": 0.038573555648326874, + "learning_rate": 8.217582954508844e-05, + "loss": 0.0008, + "step": 2848 + }, + { + "epoch": 0.9663883355232484, + "grad_norm": 0.04300495982170105, + "learning_rate": 8.215153340931382e-05, + "loss": 0.0012, + "step": 2850 + }, + { + "epoch": 0.9670665027762472, + "grad_norm": 0.03777950629591942, + "learning_rate": 8.212722432290538e-05, + "loss": 0.0016, + "step": 2852 + }, + { + "epoch": 0.9677446700292459, + "grad_norm": 0.2890669107437134, + "learning_rate": 8.210290229565483e-05, + "loss": 0.0028, + "step": 2854 + }, + { + "epoch": 0.9684228372822448, + "grad_norm": 0.21739627420902252, + "learning_rate": 8.207856733735908e-05, + "loss": 0.0035, + "step": 2856 + }, + { + "epoch": 0.9691010045352435, + "grad_norm": 0.14371730387210846, + "learning_rate": 8.205421945782024e-05, + "loss": 0.0018, + "step": 2858 + }, + { + "epoch": 0.9697791717882422, + "grad_norm": 0.12269674241542816, + "learning_rate": 8.202985866684563e-05, + "loss": 0.0027, + "step": 2860 + }, + { + "epoch": 0.9704573390412411, + "grad_norm": 0.06122859939932823, + "learning_rate": 8.200548497424778e-05, + "loss": 0.0011, + "step": 2862 + }, + { + "epoch": 0.9711355062942398, + "grad_norm": 0.09474431723356247, + "learning_rate": 8.198109838984441e-05, + "loss": 0.0014, + "step": 2864 + }, + { + "epoch": 0.9718136735472386, + "grad_norm": 0.1314152330160141, + "learning_rate": 8.195669892345843e-05, + "loss": 0.001, + "step": 2866 + }, + { + "epoch": 0.9724918408002373, + "grad_norm": 0.22767437994480133, + "learning_rate": 8.193228658491795e-05, + "loss": 0.001, + "step": 2868 + }, + { + "epoch": 0.9731700080532362, + "grad_norm": 0.07902339100837708, + "learning_rate": 8.190786138405626e-05, + "loss": 0.0009, + "step": 2870 + }, + { + "epoch": 0.9738481753062349, + "grad_norm": 0.10243401676416397, + "learning_rate": 8.188342333071181e-05, + "loss": 0.0023, + "step": 2872 + }, + { + "epoch": 0.9745263425592336, + "grad_norm": 0.0833156555891037, + "learning_rate": 8.185897243472826e-05, + "loss": 0.0013, + "step": 2874 + }, + { + "epoch": 0.9752045098122324, + "grad_norm": 0.02530244179069996, + "learning_rate": 8.183450870595441e-05, + "loss": 0.0008, + "step": 2876 + }, + { + "epoch": 0.9758826770652312, + "grad_norm": 0.06331485509872437, + "learning_rate": 8.181003215424428e-05, + "loss": 0.0011, + "step": 2878 + }, + { + "epoch": 0.97656084431823, + "grad_norm": 0.05141822248697281, + "learning_rate": 8.178554278945701e-05, + "loss": 0.0004, + "step": 2880 + }, + { + "epoch": 0.9772390115712287, + "grad_norm": 0.04588576778769493, + "learning_rate": 8.176104062145687e-05, + "loss": 0.0004, + "step": 2882 + }, + { + "epoch": 0.9779171788242276, + "grad_norm": 0.06578666716814041, + "learning_rate": 8.173652566011338e-05, + "loss": 0.0015, + "step": 2884 + }, + { + "epoch": 0.9785953460772263, + "grad_norm": 0.08554302155971527, + "learning_rate": 8.171199791530116e-05, + "loss": 0.0011, + "step": 2886 + }, + { + "epoch": 0.979273513330225, + "grad_norm": 0.07025856524705887, + "learning_rate": 8.168745739689997e-05, + "loss": 0.0016, + "step": 2888 + }, + { + "epoch": 0.9799516805832238, + "grad_norm": 0.03825186938047409, + "learning_rate": 8.166290411479473e-05, + "loss": 0.0005, + "step": 2890 + }, + { + "epoch": 0.9806298478362226, + "grad_norm": 0.08433175086975098, + "learning_rate": 8.163833807887549e-05, + "loss": 0.0018, + "step": 2892 + }, + { + "epoch": 0.9813080150892214, + "grad_norm": 0.026424286887049675, + "learning_rate": 8.161375929903746e-05, + "loss": 0.0007, + "step": 2894 + }, + { + "epoch": 0.9819861823422201, + "grad_norm": 0.06956103444099426, + "learning_rate": 8.158916778518097e-05, + "loss": 0.0003, + "step": 2896 + }, + { + "epoch": 0.9826643495952189, + "grad_norm": 0.040197454392910004, + "learning_rate": 8.156456354721147e-05, + "loss": 0.0006, + "step": 2898 + }, + { + "epoch": 0.9833425168482177, + "grad_norm": 0.02220018580555916, + "learning_rate": 8.153994659503954e-05, + "loss": 0.0002, + "step": 2900 + }, + { + "epoch": 0.9840206841012165, + "grad_norm": 0.06474003195762634, + "learning_rate": 8.15153169385809e-05, + "loss": 0.0008, + "step": 2902 + }, + { + "epoch": 0.9846988513542152, + "grad_norm": 0.06129273399710655, + "learning_rate": 8.149067458775636e-05, + "loss": 0.0009, + "step": 2904 + }, + { + "epoch": 0.985377018607214, + "grad_norm": 0.034316860139369965, + "learning_rate": 8.146601955249188e-05, + "loss": 0.0006, + "step": 2906 + }, + { + "epoch": 0.9860551858602128, + "grad_norm": 0.0359802171587944, + "learning_rate": 8.144135184271849e-05, + "loss": 0.0005, + "step": 2908 + }, + { + "epoch": 0.9867333531132115, + "grad_norm": 0.05747028440237045, + "learning_rate": 8.141667146837231e-05, + "loss": 0.0004, + "step": 2910 + }, + { + "epoch": 0.9874115203662103, + "grad_norm": 0.02614564448595047, + "learning_rate": 8.139197843939463e-05, + "loss": 0.0003, + "step": 2912 + }, + { + "epoch": 0.9880896876192091, + "grad_norm": 0.1071239560842514, + "learning_rate": 8.136727276573179e-05, + "loss": 0.0004, + "step": 2914 + }, + { + "epoch": 0.9887678548722079, + "grad_norm": 0.0214134119451046, + "learning_rate": 8.134255445733522e-05, + "loss": 0.0006, + "step": 2916 + }, + { + "epoch": 0.9894460221252066, + "grad_norm": 0.08182989060878754, + "learning_rate": 8.131782352416148e-05, + "loss": 0.0007, + "step": 2918 + }, + { + "epoch": 0.9901241893782055, + "grad_norm": 0.038342587649822235, + "learning_rate": 8.129307997617216e-05, + "loss": 0.0003, + "step": 2920 + }, + { + "epoch": 0.9908023566312042, + "grad_norm": 0.08064837753772736, + "learning_rate": 8.126832382333399e-05, + "loss": 0.0007, + "step": 2922 + }, + { + "epoch": 0.9914805238842029, + "grad_norm": 0.05218582600355148, + "learning_rate": 8.124355507561871e-05, + "loss": 0.0004, + "step": 2924 + }, + { + "epoch": 0.9921586911372017, + "grad_norm": 0.03252217173576355, + "learning_rate": 8.121877374300317e-05, + "loss": 0.0002, + "step": 2926 + }, + { + "epoch": 0.9928368583902005, + "grad_norm": 0.04875943437218666, + "learning_rate": 8.119397983546932e-05, + "loss": 0.0003, + "step": 2928 + }, + { + "epoch": 0.9935150256431993, + "grad_norm": 0.019884569570422173, + "learning_rate": 8.116917336300411e-05, + "loss": 0.0003, + "step": 2930 + }, + { + "epoch": 0.994193192896198, + "grad_norm": 0.06939960271120071, + "learning_rate": 8.114435433559959e-05, + "loss": 0.0005, + "step": 2932 + }, + { + "epoch": 0.9948713601491967, + "grad_norm": 0.052569735795259476, + "learning_rate": 8.111952276325287e-05, + "loss": 0.0004, + "step": 2934 + }, + { + "epoch": 0.9955495274021956, + "grad_norm": 0.01638111099600792, + "learning_rate": 8.109467865596612e-05, + "loss": 0.0001, + "step": 2936 + }, + { + "epoch": 0.9962276946551943, + "grad_norm": 0.05986509472131729, + "learning_rate": 8.106982202374651e-05, + "loss": 0.0005, + "step": 2938 + }, + { + "epoch": 0.9969058619081931, + "grad_norm": 0.03333326429128647, + "learning_rate": 8.104495287660632e-05, + "loss": 0.0006, + "step": 2940 + }, + { + "epoch": 0.9975840291611919, + "grad_norm": 0.08217639476060867, + "learning_rate": 8.102007122456281e-05, + "loss": 0.0004, + "step": 2942 + }, + { + "epoch": 0.9982621964141907, + "grad_norm": 0.03954688087105751, + "learning_rate": 8.099517707763831e-05, + "loss": 0.0002, + "step": 2944 + }, + { + "epoch": 0.9989403636671894, + "grad_norm": 0.06906257569789886, + "learning_rate": 8.097027044586021e-05, + "loss": 0.0007, + "step": 2946 + }, + { + "epoch": 0.9996185309201882, + "grad_norm": 0.0183778777718544, + "learning_rate": 8.094535133926087e-05, + "loss": 0.0001, + "step": 2948 + }, + { + "epoch": 1.0, + "grad_norm": 0.13468892872333527, + "learning_rate": 8.09204197678777e-05, + "loss": 0.0014, + "step": 2950 + }, + { + "epoch": 1.0006781672529987, + "grad_norm": 0.09359505027532578, + "learning_rate": 8.089547574175317e-05, + "loss": 0.0008, + "step": 2952 + }, + { + "epoch": 1.0013563345059975, + "grad_norm": 0.05758538097143173, + "learning_rate": 8.08705192709347e-05, + "loss": 0.0011, + "step": 2954 + }, + { + "epoch": 1.0020345017589962, + "grad_norm": 0.027957310900092125, + "learning_rate": 8.084555036547474e-05, + "loss": 0.0004, + "step": 2956 + }, + { + "epoch": 1.0027126690119952, + "grad_norm": 0.035399600863456726, + "learning_rate": 8.08205690354308e-05, + "loss": 0.0006, + "step": 2958 + }, + { + "epoch": 1.003390836264994, + "grad_norm": 0.049815621227025986, + "learning_rate": 8.079557529086532e-05, + "loss": 0.0011, + "step": 2960 + }, + { + "epoch": 1.0040690035179927, + "grad_norm": 0.05087139829993248, + "learning_rate": 8.077056914184582e-05, + "loss": 0.0008, + "step": 2962 + }, + { + "epoch": 1.0047471707709914, + "grad_norm": 0.03460082411766052, + "learning_rate": 8.074555059844474e-05, + "loss": 0.0009, + "step": 2964 + }, + { + "epoch": 1.0054253380239901, + "grad_norm": 0.016356630250811577, + "learning_rate": 8.072051967073955e-05, + "loss": 0.0003, + "step": 2966 + }, + { + "epoch": 1.0061035052769889, + "grad_norm": 0.023417463526129723, + "learning_rate": 8.069547636881272e-05, + "loss": 0.0007, + "step": 2968 + }, + { + "epoch": 1.0067816725299876, + "grad_norm": 0.0395391583442688, + "learning_rate": 8.067042070275167e-05, + "loss": 0.0003, + "step": 2970 + }, + { + "epoch": 1.0074598397829866, + "grad_norm": 0.01494118757545948, + "learning_rate": 8.064535268264883e-05, + "loss": 0.0003, + "step": 2972 + }, + { + "epoch": 1.0081380070359853, + "grad_norm": 0.02469460852444172, + "learning_rate": 8.06202723186016e-05, + "loss": 0.0003, + "step": 2974 + }, + { + "epoch": 1.008816174288984, + "grad_norm": 0.22398723661899567, + "learning_rate": 8.059517962071233e-05, + "loss": 0.0006, + "step": 2976 + }, + { + "epoch": 1.0094943415419828, + "grad_norm": 0.03068060800433159, + "learning_rate": 8.057007459908839e-05, + "loss": 0.0004, + "step": 2978 + }, + { + "epoch": 1.0101725087949815, + "grad_norm": 0.16560781002044678, + "learning_rate": 8.054495726384204e-05, + "loss": 0.0017, + "step": 2980 + }, + { + "epoch": 1.0108506760479803, + "grad_norm": 0.06457192450761795, + "learning_rate": 8.051982762509056e-05, + "loss": 0.001, + "step": 2982 + }, + { + "epoch": 1.011528843300979, + "grad_norm": 0.014625279232859612, + "learning_rate": 8.049468569295616e-05, + "loss": 0.0004, + "step": 2984 + }, + { + "epoch": 1.0122070105539778, + "grad_norm": 0.12633752822875977, + "learning_rate": 8.046953147756601e-05, + "loss": 0.0005, + "step": 2986 + }, + { + "epoch": 1.0128851778069767, + "grad_norm": 0.02757919207215309, + "learning_rate": 8.044436498905222e-05, + "loss": 0.0003, + "step": 2988 + }, + { + "epoch": 1.0135633450599755, + "grad_norm": 0.030746174976229668, + "learning_rate": 8.041918623755187e-05, + "loss": 0.0004, + "step": 2990 + }, + { + "epoch": 1.0142415123129742, + "grad_norm": 0.06324061751365662, + "learning_rate": 8.039399523320692e-05, + "loss": 0.0005, + "step": 2992 + }, + { + "epoch": 1.014919679565973, + "grad_norm": 0.03737432509660721, + "learning_rate": 8.036879198616434e-05, + "loss": 0.0008, + "step": 2994 + }, + { + "epoch": 1.0155978468189717, + "grad_norm": 0.04197308048605919, + "learning_rate": 8.034357650657598e-05, + "loss": 0.0005, + "step": 2996 + }, + { + "epoch": 1.0162760140719704, + "grad_norm": 0.021742910146713257, + "learning_rate": 8.031834880459863e-05, + "loss": 0.0001, + "step": 2998 + }, + { + "epoch": 1.0169541813249692, + "grad_norm": 0.02395406737923622, + "learning_rate": 8.0293108890394e-05, + "loss": 0.0003, + "step": 3000 + }, + { + "epoch": 1.0176323485779681, + "grad_norm": 0.013543182983994484, + "learning_rate": 8.026785677412873e-05, + "loss": 0.0001, + "step": 3002 + }, + { + "epoch": 1.0183105158309669, + "grad_norm": 0.2532791495323181, + "learning_rate": 8.024259246597439e-05, + "loss": 0.0007, + "step": 3004 + }, + { + "epoch": 1.0189886830839656, + "grad_norm": 0.023439206182956696, + "learning_rate": 8.02173159761074e-05, + "loss": 0.0001, + "step": 3006 + }, + { + "epoch": 1.0196668503369644, + "grad_norm": 0.1361963301897049, + "learning_rate": 8.019202731470916e-05, + "loss": 0.0017, + "step": 3008 + }, + { + "epoch": 1.020345017589963, + "grad_norm": 0.05129358544945717, + "learning_rate": 8.016672649196593e-05, + "loss": 0.0008, + "step": 3010 + }, + { + "epoch": 1.0210231848429618, + "grad_norm": 0.10559432208538055, + "learning_rate": 8.014141351806886e-05, + "loss": 0.0017, + "step": 3012 + }, + { + "epoch": 1.0217013520959606, + "grad_norm": 0.05691803991794586, + "learning_rate": 8.011608840321405e-05, + "loss": 0.0009, + "step": 3014 + }, + { + "epoch": 1.0223795193489595, + "grad_norm": 0.03549787402153015, + "learning_rate": 8.009075115760243e-05, + "loss": 0.0007, + "step": 3016 + }, + { + "epoch": 1.0230576866019583, + "grad_norm": 0.038408488035202026, + "learning_rate": 8.006540179143981e-05, + "loss": 0.0002, + "step": 3018 + }, + { + "epoch": 1.023735853854957, + "grad_norm": 0.02867027185857296, + "learning_rate": 8.004004031493699e-05, + "loss": 0.0005, + "step": 3020 + }, + { + "epoch": 1.0244140211079558, + "grad_norm": 0.044582683593034744, + "learning_rate": 8.001466673830951e-05, + "loss": 0.0009, + "step": 3022 + }, + { + "epoch": 1.0250921883609545, + "grad_norm": 0.05415906384587288, + "learning_rate": 7.998928107177783e-05, + "loss": 0.0003, + "step": 3024 + }, + { + "epoch": 1.0257703556139532, + "grad_norm": 0.032624825835227966, + "learning_rate": 7.996388332556735e-05, + "loss": 0.0007, + "step": 3026 + }, + { + "epoch": 1.026448522866952, + "grad_norm": 0.04468724876642227, + "learning_rate": 7.993847350990824e-05, + "loss": 0.0012, + "step": 3028 + }, + { + "epoch": 1.0271266901199507, + "grad_norm": 0.04837553948163986, + "learning_rate": 7.991305163503558e-05, + "loss": 0.0008, + "step": 3030 + }, + { + "epoch": 1.0278048573729497, + "grad_norm": 0.03291107714176178, + "learning_rate": 7.988761771118932e-05, + "loss": 0.0004, + "step": 3032 + }, + { + "epoch": 1.0284830246259484, + "grad_norm": 0.042646851390600204, + "learning_rate": 7.98621717486142e-05, + "loss": 0.0004, + "step": 3034 + }, + { + "epoch": 1.0291611918789472, + "grad_norm": 0.0066047427244484425, + "learning_rate": 7.983671375755985e-05, + "loss": 0.0001, + "step": 3036 + }, + { + "epoch": 1.029839359131946, + "grad_norm": 0.0016902305651456118, + "learning_rate": 7.98112437482808e-05, + "loss": 0.0002, + "step": 3038 + }, + { + "epoch": 1.0305175263849446, + "grad_norm": 0.016091756522655487, + "learning_rate": 7.978576173103632e-05, + "loss": 0.0001, + "step": 3040 + }, + { + "epoch": 1.0311956936379434, + "grad_norm": 0.020465312525629997, + "learning_rate": 7.976026771609057e-05, + "loss": 0.0001, + "step": 3042 + }, + { + "epoch": 1.0318738608909421, + "grad_norm": 0.012479097582399845, + "learning_rate": 7.973476171371255e-05, + "loss": 0.0002, + "step": 3044 + }, + { + "epoch": 1.032552028143941, + "grad_norm": 0.032463181763887405, + "learning_rate": 7.970924373417608e-05, + "loss": 0.0002, + "step": 3046 + }, + { + "epoch": 1.0332301953969398, + "grad_norm": 0.016862597316503525, + "learning_rate": 7.968371378775978e-05, + "loss": 0.0, + "step": 3048 + }, + { + "epoch": 1.0339083626499386, + "grad_norm": 0.023158496245741844, + "learning_rate": 7.96581718847471e-05, + "loss": 0.0001, + "step": 3050 + }, + { + "epoch": 1.0345865299029373, + "grad_norm": 0.009301434271037579, + "learning_rate": 7.963261803542634e-05, + "loss": 0.0, + "step": 3052 + }, + { + "epoch": 1.035264697155936, + "grad_norm": 0.038685571402311325, + "learning_rate": 7.960705225009059e-05, + "loss": 0.0001, + "step": 3054 + }, + { + "epoch": 1.0359428644089348, + "grad_norm": 0.002880866639316082, + "learning_rate": 7.958147453903773e-05, + "loss": 0.0, + "step": 3056 + }, + { + "epoch": 1.0366210316619335, + "grad_norm": 0.01774897798895836, + "learning_rate": 7.955588491257045e-05, + "loss": 0.0, + "step": 3058 + }, + { + "epoch": 1.0372991989149325, + "grad_norm": 0.009110460057854652, + "learning_rate": 7.953028338099627e-05, + "loss": 0.0001, + "step": 3060 + }, + { + "epoch": 1.0379773661679312, + "grad_norm": 0.007355507463216782, + "learning_rate": 7.950466995462749e-05, + "loss": 0.0, + "step": 3062 + }, + { + "epoch": 1.03865553342093, + "grad_norm": 0.03832750394940376, + "learning_rate": 7.947904464378115e-05, + "loss": 0.0003, + "step": 3064 + }, + { + "epoch": 1.0393337006739287, + "grad_norm": 0.0005527051980607212, + "learning_rate": 7.945340745877919e-05, + "loss": 0.0, + "step": 3066 + }, + { + "epoch": 1.0400118679269275, + "grad_norm": 0.009945861995220184, + "learning_rate": 7.94277584099482e-05, + "loss": 0.0, + "step": 3068 + }, + { + "epoch": 1.0406900351799262, + "grad_norm": 0.013605031184852123, + "learning_rate": 7.940209750761964e-05, + "loss": 0.0, + "step": 3070 + }, + { + "epoch": 1.041368202432925, + "grad_norm": 0.0027010184712707996, + "learning_rate": 7.937642476212975e-05, + "loss": 0.0, + "step": 3072 + }, + { + "epoch": 1.0420463696859237, + "grad_norm": 0.018177801743149757, + "learning_rate": 7.935074018381945e-05, + "loss": 0.0, + "step": 3074 + }, + { + "epoch": 1.0427245369389226, + "grad_norm": 0.0041480534709990025, + "learning_rate": 7.932504378303452e-05, + "loss": 0.0, + "step": 3076 + }, + { + "epoch": 1.0434027041919214, + "grad_norm": 0.00467638298869133, + "learning_rate": 7.929933557012545e-05, + "loss": 0.0, + "step": 3078 + }, + { + "epoch": 1.0440808714449201, + "grad_norm": 0.00816649291664362, + "learning_rate": 7.927361555544754e-05, + "loss": 0.0, + "step": 3080 + }, + { + "epoch": 1.0447590386979189, + "grad_norm": 0.0006185176898725331, + "learning_rate": 7.924788374936078e-05, + "loss": 0.0, + "step": 3082 + }, + { + "epoch": 1.0454372059509176, + "grad_norm": 0.000762543233577162, + "learning_rate": 7.922214016222993e-05, + "loss": 0.0, + "step": 3084 + }, + { + "epoch": 1.0461153732039163, + "grad_norm": 0.002773831132799387, + "learning_rate": 7.919638480442452e-05, + "loss": 0.0, + "step": 3086 + }, + { + "epoch": 1.046793540456915, + "grad_norm": 0.0035561330150812864, + "learning_rate": 7.917061768631882e-05, + "loss": 0.0001, + "step": 3088 + }, + { + "epoch": 1.047471707709914, + "grad_norm": 0.001374011393636465, + "learning_rate": 7.914483881829179e-05, + "loss": 0.0, + "step": 3090 + }, + { + "epoch": 1.0481498749629128, + "grad_norm": 0.0007063335506245494, + "learning_rate": 7.911904821072717e-05, + "loss": 0.0, + "step": 3092 + }, + { + "epoch": 1.0488280422159115, + "grad_norm": 0.008668769150972366, + "learning_rate": 7.909324587401342e-05, + "loss": 0.0, + "step": 3094 + }, + { + "epoch": 1.0495062094689103, + "grad_norm": 0.003817323362454772, + "learning_rate": 7.90674318185437e-05, + "loss": 0.0, + "step": 3096 + }, + { + "epoch": 1.050184376721909, + "grad_norm": 0.005347683094441891, + "learning_rate": 7.904160605471594e-05, + "loss": 0.0, + "step": 3098 + }, + { + "epoch": 1.0508625439749077, + "grad_norm": 0.0007073975284583867, + "learning_rate": 7.901576859293272e-05, + "loss": 0.0, + "step": 3100 + }, + { + "epoch": 1.0515407112279065, + "grad_norm": 0.002511057071387768, + "learning_rate": 7.898991944360137e-05, + "loss": 0.0, + "step": 3102 + }, + { + "epoch": 1.0522188784809055, + "grad_norm": 0.00014569952327292413, + "learning_rate": 7.896405861713394e-05, + "loss": 0.0, + "step": 3104 + }, + { + "epoch": 1.0528970457339042, + "grad_norm": 0.013416592963039875, + "learning_rate": 7.893818612394717e-05, + "loss": 0.0, + "step": 3106 + }, + { + "epoch": 1.053575212986903, + "grad_norm": 0.024336976930499077, + "learning_rate": 7.891230197446249e-05, + "loss": 0.0002, + "step": 3108 + }, + { + "epoch": 1.0542533802399017, + "grad_norm": 0.00041308748768642545, + "learning_rate": 7.888640617910603e-05, + "loss": 0.0, + "step": 3110 + }, + { + "epoch": 1.0549315474929004, + "grad_norm": 0.00036542725865729153, + "learning_rate": 7.886049874830862e-05, + "loss": 0.0, + "step": 3112 + }, + { + "epoch": 1.0556097147458992, + "grad_norm": 0.002658010693266988, + "learning_rate": 7.883457969250576e-05, + "loss": 0.0001, + "step": 3114 + }, + { + "epoch": 1.056287881998898, + "grad_norm": 0.0011226636124774814, + "learning_rate": 7.880864902213765e-05, + "loss": 0.0, + "step": 3116 + }, + { + "epoch": 1.0569660492518969, + "grad_norm": 0.0011952995555475354, + "learning_rate": 7.878270674764917e-05, + "loss": 0.0001, + "step": 3118 + }, + { + "epoch": 1.0576442165048956, + "grad_norm": 0.0014733074931427836, + "learning_rate": 7.875675287948982e-05, + "loss": 0.0001, + "step": 3120 + }, + { + "epoch": 1.0583223837578943, + "grad_norm": 0.00036149792140349746, + "learning_rate": 7.873078742811388e-05, + "loss": 0.0, + "step": 3122 + }, + { + "epoch": 1.059000551010893, + "grad_norm": 0.002111283363774419, + "learning_rate": 7.87048104039802e-05, + "loss": 0.0, + "step": 3124 + }, + { + "epoch": 1.0596787182638918, + "grad_norm": 0.0004321536107454449, + "learning_rate": 7.86788218175523e-05, + "loss": 0.0003, + "step": 3126 + }, + { + "epoch": 1.0603568855168906, + "grad_norm": 0.002552892081439495, + "learning_rate": 7.865282167929842e-05, + "loss": 0.0002, + "step": 3128 + }, + { + "epoch": 1.0610350527698893, + "grad_norm": 0.0008854964398778975, + "learning_rate": 7.862680999969139e-05, + "loss": 0.0, + "step": 3130 + }, + { + "epoch": 1.061713220022888, + "grad_norm": 0.0017830159049481153, + "learning_rate": 7.860078678920872e-05, + "loss": 0.0, + "step": 3132 + }, + { + "epoch": 1.062391387275887, + "grad_norm": 0.03791436180472374, + "learning_rate": 7.857475205833254e-05, + "loss": 0.0003, + "step": 3134 + }, + { + "epoch": 1.0630695545288857, + "grad_norm": 0.00176812254358083, + "learning_rate": 7.854870581754966e-05, + "loss": 0.0, + "step": 3136 + }, + { + "epoch": 1.0637477217818845, + "grad_norm": 0.00034444493940100074, + "learning_rate": 7.852264807735147e-05, + "loss": 0.0, + "step": 3138 + }, + { + "epoch": 1.0644258890348832, + "grad_norm": 0.047313202172517776, + "learning_rate": 7.849657884823408e-05, + "loss": 0.0002, + "step": 3140 + }, + { + "epoch": 1.065104056287882, + "grad_norm": 0.003105980111286044, + "learning_rate": 7.847049814069811e-05, + "loss": 0.0001, + "step": 3142 + }, + { + "epoch": 1.0657822235408807, + "grad_norm": 0.04377178102731705, + "learning_rate": 7.844440596524891e-05, + "loss": 0.0001, + "step": 3144 + }, + { + "epoch": 1.0664603907938794, + "grad_norm": 0.014682923443615437, + "learning_rate": 7.841830233239638e-05, + "loss": 0.0001, + "step": 3146 + }, + { + "epoch": 1.0671385580468784, + "grad_norm": 0.030811438336968422, + "learning_rate": 7.839218725265506e-05, + "loss": 0.0002, + "step": 3148 + }, + { + "epoch": 1.0678167252998771, + "grad_norm": 0.024048537015914917, + "learning_rate": 7.836606073654413e-05, + "loss": 0.0001, + "step": 3150 + }, + { + "epoch": 1.0684948925528759, + "grad_norm": 0.05794329568743706, + "learning_rate": 7.833992279458732e-05, + "loss": 0.0001, + "step": 3152 + }, + { + "epoch": 1.0691730598058746, + "grad_norm": 0.037551164627075195, + "learning_rate": 7.831377343731298e-05, + "loss": 0.0002, + "step": 3154 + }, + { + "epoch": 1.0698512270588734, + "grad_norm": 0.01598585955798626, + "learning_rate": 7.828761267525411e-05, + "loss": 0.0006, + "step": 3156 + }, + { + "epoch": 1.070529394311872, + "grad_norm": 0.024345286190509796, + "learning_rate": 7.826144051894824e-05, + "loss": 0.0001, + "step": 3158 + }, + { + "epoch": 1.0712075615648708, + "grad_norm": 0.009724217467010021, + "learning_rate": 7.823525697893749e-05, + "loss": 0.0001, + "step": 3160 + }, + { + "epoch": 1.0718857288178698, + "grad_norm": 0.0019536882173269987, + "learning_rate": 7.820906206576862e-05, + "loss": 0.0002, + "step": 3162 + }, + { + "epoch": 1.0725638960708685, + "grad_norm": 0.0014726163353770971, + "learning_rate": 7.81828557899929e-05, + "loss": 0.0, + "step": 3164 + }, + { + "epoch": 1.0732420633238673, + "grad_norm": 0.022503558546304703, + "learning_rate": 7.815663816216625e-05, + "loss": 0.0001, + "step": 3166 + }, + { + "epoch": 1.073920230576866, + "grad_norm": 0.011805574409663677, + "learning_rate": 7.813040919284913e-05, + "loss": 0.0, + "step": 3168 + }, + { + "epoch": 1.0745983978298648, + "grad_norm": 0.00046225800178945065, + "learning_rate": 7.810416889260653e-05, + "loss": 0.0, + "step": 3170 + }, + { + "epoch": 1.0752765650828635, + "grad_norm": 0.0012093356344848871, + "learning_rate": 7.807791727200808e-05, + "loss": 0.0001, + "step": 3172 + }, + { + "epoch": 1.0759547323358623, + "grad_norm": 0.0019421727629378438, + "learning_rate": 7.80516543416279e-05, + "loss": 0.0001, + "step": 3174 + }, + { + "epoch": 1.0766328995888612, + "grad_norm": 0.004075727425515652, + "learning_rate": 7.80253801120447e-05, + "loss": 0.0, + "step": 3176 + }, + { + "epoch": 1.07731106684186, + "grad_norm": 0.0018418336985632777, + "learning_rate": 7.799909459384176e-05, + "loss": 0.0, + "step": 3178 + }, + { + "epoch": 1.0779892340948587, + "grad_norm": 0.009764020331203938, + "learning_rate": 7.797279779760685e-05, + "loss": 0.0, + "step": 3180 + }, + { + "epoch": 1.0786674013478574, + "grad_norm": 0.0008932505734264851, + "learning_rate": 7.794648973393234e-05, + "loss": 0.0, + "step": 3182 + }, + { + "epoch": 1.0793455686008562, + "grad_norm": 0.015295356512069702, + "learning_rate": 7.792017041341511e-05, + "loss": 0.0001, + "step": 3184 + }, + { + "epoch": 1.080023735853855, + "grad_norm": 0.001601558760739863, + "learning_rate": 7.789383984665657e-05, + "loss": 0.0, + "step": 3186 + }, + { + "epoch": 1.0807019031068537, + "grad_norm": 0.03218120336532593, + "learning_rate": 7.786749804426268e-05, + "loss": 0.0, + "step": 3188 + }, + { + "epoch": 1.0813800703598524, + "grad_norm": 0.002657031174749136, + "learning_rate": 7.78411450168439e-05, + "loss": 0.0, + "step": 3190 + }, + { + "epoch": 1.0820582376128514, + "grad_norm": 0.001861520460806787, + "learning_rate": 7.781478077501525e-05, + "loss": 0.0, + "step": 3192 + }, + { + "epoch": 1.08273640486585, + "grad_norm": 0.12163111567497253, + "learning_rate": 7.778840532939622e-05, + "loss": 0.0001, + "step": 3194 + }, + { + "epoch": 1.0834145721188488, + "grad_norm": 0.02530871331691742, + "learning_rate": 7.776201869061085e-05, + "loss": 0.0001, + "step": 3196 + }, + { + "epoch": 1.0840927393718476, + "grad_norm": 0.04777170345187187, + "learning_rate": 7.773562086928768e-05, + "loss": 0.0002, + "step": 3198 + }, + { + "epoch": 1.0847709066248463, + "grad_norm": 0.03361360356211662, + "learning_rate": 7.770921187605973e-05, + "loss": 0.0001, + "step": 3200 + }, + { + "epoch": 1.085449073877845, + "grad_norm": 0.0027512232773005962, + "learning_rate": 7.768279172156453e-05, + "loss": 0.0001, + "step": 3202 + }, + { + "epoch": 1.0861272411308438, + "grad_norm": 0.015211574733257294, + "learning_rate": 7.765636041644417e-05, + "loss": 0.0002, + "step": 3204 + }, + { + "epoch": 1.0868054083838428, + "grad_norm": 0.011098697781562805, + "learning_rate": 7.762991797134514e-05, + "loss": 0.0003, + "step": 3206 + }, + { + "epoch": 1.0874835756368415, + "grad_norm": 0.017032770439982414, + "learning_rate": 7.760346439691843e-05, + "loss": 0.0001, + "step": 3208 + }, + { + "epoch": 1.0881617428898402, + "grad_norm": 0.010514136403799057, + "learning_rate": 7.757699970381958e-05, + "loss": 0.0001, + "step": 3210 + }, + { + "epoch": 1.088839910142839, + "grad_norm": 0.016751520335674286, + "learning_rate": 7.755052390270854e-05, + "loss": 0.0001, + "step": 3212 + }, + { + "epoch": 1.0895180773958377, + "grad_norm": 0.003923425450921059, + "learning_rate": 7.752403700424979e-05, + "loss": 0.0001, + "step": 3214 + }, + { + "epoch": 1.0901962446488365, + "grad_norm": 0.0033663345966488123, + "learning_rate": 7.74975390191122e-05, + "loss": 0.0001, + "step": 3216 + }, + { + "epoch": 1.0908744119018352, + "grad_norm": 0.016712361946702003, + "learning_rate": 7.74710299579692e-05, + "loss": 0.0, + "step": 3218 + }, + { + "epoch": 1.091552579154834, + "grad_norm": 0.005410924553871155, + "learning_rate": 7.74445098314986e-05, + "loss": 0.0001, + "step": 3220 + }, + { + "epoch": 1.092230746407833, + "grad_norm": 0.005119035951793194, + "learning_rate": 7.741797865038273e-05, + "loss": 0.0001, + "step": 3222 + }, + { + "epoch": 1.0929089136608316, + "grad_norm": 0.0694790631532669, + "learning_rate": 7.739143642530833e-05, + "loss": 0.0001, + "step": 3224 + }, + { + "epoch": 1.0935870809138304, + "grad_norm": 0.037400029599666595, + "learning_rate": 7.736488316696663e-05, + "loss": 0.0001, + "step": 3226 + }, + { + "epoch": 1.0942652481668291, + "grad_norm": 0.003814551280811429, + "learning_rate": 7.733831888605325e-05, + "loss": 0.0, + "step": 3228 + }, + { + "epoch": 1.0949434154198279, + "grad_norm": 0.0038876463659107685, + "learning_rate": 7.731174359326829e-05, + "loss": 0.0001, + "step": 3230 + }, + { + "epoch": 1.0956215826728266, + "grad_norm": 0.05716302618384361, + "learning_rate": 7.728515729931629e-05, + "loss": 0.0003, + "step": 3232 + }, + { + "epoch": 1.0962997499258256, + "grad_norm": 0.007123335264623165, + "learning_rate": 7.725856001490618e-05, + "loss": 0.0002, + "step": 3234 + }, + { + "epoch": 1.0969779171788243, + "grad_norm": 0.016324380412697792, + "learning_rate": 7.723195175075136e-05, + "loss": 0.0003, + "step": 3236 + }, + { + "epoch": 1.097656084431823, + "grad_norm": 0.008962440304458141, + "learning_rate": 7.720533251756963e-05, + "loss": 0.0004, + "step": 3238 + }, + { + "epoch": 1.0983342516848218, + "grad_norm": 0.004593605175614357, + "learning_rate": 7.717870232608322e-05, + "loss": 0.0001, + "step": 3240 + }, + { + "epoch": 1.0990124189378205, + "grad_norm": 0.008814834989607334, + "learning_rate": 7.715206118701876e-05, + "loss": 0.0005, + "step": 3242 + }, + { + "epoch": 1.0996905861908193, + "grad_norm": 0.022388102486729622, + "learning_rate": 7.71254091111073e-05, + "loss": 0.0002, + "step": 3244 + }, + { + "epoch": 1.100368753443818, + "grad_norm": 0.02076655626296997, + "learning_rate": 7.709874610908429e-05, + "loss": 0.0001, + "step": 3246 + }, + { + "epoch": 1.1010469206968168, + "grad_norm": 0.0011612613452598453, + "learning_rate": 7.70720721916896e-05, + "loss": 0.0001, + "step": 3248 + }, + { + "epoch": 1.1017250879498157, + "grad_norm": 0.00931808352470398, + "learning_rate": 7.704538736966746e-05, + "loss": 0.0, + "step": 3250 + }, + { + "epoch": 1.1024032552028145, + "grad_norm": 0.0014653477119281888, + "learning_rate": 7.701869165376653e-05, + "loss": 0.0, + "step": 3252 + }, + { + "epoch": 1.1030814224558132, + "grad_norm": 0.0131069365888834, + "learning_rate": 7.699198505473983e-05, + "loss": 0.0, + "step": 3254 + }, + { + "epoch": 1.103759589708812, + "grad_norm": 0.004241168964654207, + "learning_rate": 7.696526758334477e-05, + "loss": 0.0003, + "step": 3256 + }, + { + "epoch": 1.1044377569618107, + "grad_norm": 0.019079118967056274, + "learning_rate": 7.693853925034315e-05, + "loss": 0.0003, + "step": 3258 + }, + { + "epoch": 1.1051159242148094, + "grad_norm": 0.029437581077218056, + "learning_rate": 7.691180006650116e-05, + "loss": 0.0002, + "step": 3260 + }, + { + "epoch": 1.1057940914678082, + "grad_norm": 0.010389408096671104, + "learning_rate": 7.68850500425893e-05, + "loss": 0.0001, + "step": 3262 + }, + { + "epoch": 1.1064722587208071, + "grad_norm": 0.0028994681779295206, + "learning_rate": 7.685828918938251e-05, + "loss": 0.0, + "step": 3264 + }, + { + "epoch": 1.1071504259738059, + "grad_norm": 0.005432248581200838, + "learning_rate": 7.683151751766004e-05, + "loss": 0.0, + "step": 3266 + }, + { + "epoch": 1.1078285932268046, + "grad_norm": 0.012759734876453876, + "learning_rate": 7.680473503820553e-05, + "loss": 0.0003, + "step": 3268 + }, + { + "epoch": 1.1085067604798033, + "grad_norm": 0.0006062522297725081, + "learning_rate": 7.677794176180695e-05, + "loss": 0.0, + "step": 3270 + }, + { + "epoch": 1.109184927732802, + "grad_norm": 0.001002602744847536, + "learning_rate": 7.675113769925663e-05, + "loss": 0.0, + "step": 3272 + }, + { + "epoch": 1.1098630949858008, + "grad_norm": 0.0004955371841788292, + "learning_rate": 7.672432286135125e-05, + "loss": 0.0, + "step": 3274 + }, + { + "epoch": 1.1105412622387996, + "grad_norm": 0.0018315280321985483, + "learning_rate": 7.669749725889182e-05, + "loss": 0.0, + "step": 3276 + }, + { + "epoch": 1.1112194294917983, + "grad_norm": 0.0031102353241294622, + "learning_rate": 7.667066090268366e-05, + "loss": 0.0, + "step": 3278 + }, + { + "epoch": 1.1118975967447973, + "grad_norm": 0.0009504572371952236, + "learning_rate": 7.66438138035365e-05, + "loss": 0.0, + "step": 3280 + }, + { + "epoch": 1.112575763997796, + "grad_norm": 0.000422391458414495, + "learning_rate": 7.661695597226432e-05, + "loss": 0.0, + "step": 3282 + }, + { + "epoch": 1.1132539312507947, + "grad_norm": 0.001483218977227807, + "learning_rate": 7.659008741968546e-05, + "loss": 0.0, + "step": 3284 + }, + { + "epoch": 1.1139320985037935, + "grad_norm": 0.004129413980990648, + "learning_rate": 7.656320815662257e-05, + "loss": 0.0, + "step": 3286 + }, + { + "epoch": 1.1146102657567922, + "grad_norm": 0.00036125979386270046, + "learning_rate": 7.653631819390261e-05, + "loss": 0.0, + "step": 3288 + }, + { + "epoch": 1.115288433009791, + "grad_norm": 0.0027813103515654802, + "learning_rate": 7.650941754235686e-05, + "loss": 0.0, + "step": 3290 + }, + { + "epoch": 1.1159666002627897, + "grad_norm": 0.0009888537460938096, + "learning_rate": 7.648250621282086e-05, + "loss": 0.0, + "step": 3292 + }, + { + "epoch": 1.1166447675157887, + "grad_norm": 0.00042577870772220194, + "learning_rate": 7.645558421613457e-05, + "loss": 0.0, + "step": 3294 + }, + { + "epoch": 1.1173229347687874, + "grad_norm": 0.0008866444695740938, + "learning_rate": 7.64286515631421e-05, + "loss": 0.0, + "step": 3296 + }, + { + "epoch": 1.1180011020217862, + "grad_norm": 0.00012718561629299074, + "learning_rate": 7.640170826469195e-05, + "loss": 0.0, + "step": 3298 + }, + { + "epoch": 1.118679269274785, + "grad_norm": 0.002095594769343734, + "learning_rate": 7.637475433163685e-05, + "loss": 0.0, + "step": 3300 + }, + { + "epoch": 1.1193574365277836, + "grad_norm": 0.02821212261915207, + "learning_rate": 7.634778977483389e-05, + "loss": 0.0001, + "step": 3302 + }, + { + "epoch": 1.1200356037807824, + "grad_norm": 9.883302845992148e-05, + "learning_rate": 7.632081460514433e-05, + "loss": 0.0, + "step": 3304 + }, + { + "epoch": 1.1207137710337811, + "grad_norm": 0.00022422336041927338, + "learning_rate": 7.629382883343381e-05, + "loss": 0.0, + "step": 3306 + }, + { + "epoch": 1.12139193828678, + "grad_norm": 0.00010571469465503469, + "learning_rate": 7.626683247057218e-05, + "loss": 0.0, + "step": 3308 + }, + { + "epoch": 1.1220701055397788, + "grad_norm": 0.00012248066195752472, + "learning_rate": 7.623982552743356e-05, + "loss": 0.0, + "step": 3310 + }, + { + "epoch": 1.1227482727927776, + "grad_norm": 0.0009538671583868563, + "learning_rate": 7.621280801489637e-05, + "loss": 0.0, + "step": 3312 + }, + { + "epoch": 1.1234264400457763, + "grad_norm": 0.00031743402360007167, + "learning_rate": 7.618577994384324e-05, + "loss": 0.0, + "step": 3314 + }, + { + "epoch": 1.124104607298775, + "grad_norm": 0.00019657450320664793, + "learning_rate": 7.615874132516107e-05, + "loss": 0.0, + "step": 3316 + }, + { + "epoch": 1.1247827745517738, + "grad_norm": 0.0009447080083191395, + "learning_rate": 7.613169216974105e-05, + "loss": 0.0, + "step": 3318 + }, + { + "epoch": 1.1254609418047725, + "grad_norm": 0.00016561873781029135, + "learning_rate": 7.610463248847852e-05, + "loss": 0.0, + "step": 3320 + }, + { + "epoch": 1.1261391090577715, + "grad_norm": 0.00016318686539307237, + "learning_rate": 7.607756229227316e-05, + "loss": 0.0, + "step": 3322 + }, + { + "epoch": 1.1268172763107702, + "grad_norm": 0.0007923968369141221, + "learning_rate": 7.605048159202883e-05, + "loss": 0.0, + "step": 3324 + }, + { + "epoch": 1.127495443563769, + "grad_norm": 0.001106180832721293, + "learning_rate": 7.602339039865362e-05, + "loss": 0.0, + "step": 3326 + }, + { + "epoch": 1.1281736108167677, + "grad_norm": 0.00042214279528707266, + "learning_rate": 7.599628872305989e-05, + "loss": 0.0, + "step": 3328 + }, + { + "epoch": 1.1288517780697664, + "grad_norm": 0.00034455794957466424, + "learning_rate": 7.596917657616414e-05, + "loss": 0.0, + "step": 3330 + }, + { + "epoch": 1.1295299453227652, + "grad_norm": 0.0002697907912079245, + "learning_rate": 7.594205396888718e-05, + "loss": 0.0, + "step": 3332 + }, + { + "epoch": 1.130208112575764, + "grad_norm": 0.00040953329880721867, + "learning_rate": 7.5914920912154e-05, + "loss": 0.0, + "step": 3334 + }, + { + "epoch": 1.1308862798287627, + "grad_norm": 0.00025164970429614186, + "learning_rate": 7.588777741689376e-05, + "loss": 0.0, + "step": 3336 + }, + { + "epoch": 1.1315644470817616, + "grad_norm": 0.0008031830657273531, + "learning_rate": 7.586062349403987e-05, + "loss": 0.0, + "step": 3338 + }, + { + "epoch": 1.1322426143347604, + "grad_norm": 8.426888234680519e-05, + "learning_rate": 7.583345915452994e-05, + "loss": 0.0, + "step": 3340 + }, + { + "epoch": 1.132920781587759, + "grad_norm": 0.0001466549001634121, + "learning_rate": 7.580628440930574e-05, + "loss": 0.0, + "step": 3342 + }, + { + "epoch": 1.1335989488407578, + "grad_norm": 0.0003892593667842448, + "learning_rate": 7.577909926931328e-05, + "loss": 0.0, + "step": 3344 + }, + { + "epoch": 1.1342771160937566, + "grad_norm": 9.798775863600895e-05, + "learning_rate": 7.575190374550272e-05, + "loss": 0.0, + "step": 3346 + }, + { + "epoch": 1.1349552833467553, + "grad_norm": 0.011068711057305336, + "learning_rate": 7.57246978488284e-05, + "loss": 0.0, + "step": 3348 + }, + { + "epoch": 1.135633450599754, + "grad_norm": 0.00015634531155228615, + "learning_rate": 7.569748159024887e-05, + "loss": 0.0, + "step": 3350 + }, + { + "epoch": 1.136311617852753, + "grad_norm": 0.00030616766889579594, + "learning_rate": 7.567025498072681e-05, + "loss": 0.0, + "step": 3352 + }, + { + "epoch": 1.1369897851057518, + "grad_norm": 0.00010516907786950469, + "learning_rate": 7.564301803122915e-05, + "loss": 0.0, + "step": 3354 + }, + { + "epoch": 1.1376679523587505, + "grad_norm": 0.0002856908831745386, + "learning_rate": 7.561577075272686e-05, + "loss": 0.0, + "step": 3356 + }, + { + "epoch": 1.1383461196117493, + "grad_norm": 0.0011014806805178523, + "learning_rate": 7.558851315619518e-05, + "loss": 0.0, + "step": 3358 + }, + { + "epoch": 1.139024286864748, + "grad_norm": 0.00206442060880363, + "learning_rate": 7.556124525261347e-05, + "loss": 0.0, + "step": 3360 + }, + { + "epoch": 1.1397024541177467, + "grad_norm": 0.002881616121158004, + "learning_rate": 7.553396705296522e-05, + "loss": 0.0, + "step": 3362 + }, + { + "epoch": 1.1403806213707455, + "grad_norm": 0.018271418288350105, + "learning_rate": 7.55066785682381e-05, + "loss": 0.0, + "step": 3364 + }, + { + "epoch": 1.1410587886237442, + "grad_norm": 0.0005267587839625776, + "learning_rate": 7.54793798094239e-05, + "loss": 0.0, + "step": 3366 + }, + { + "epoch": 1.1417369558767432, + "grad_norm": 0.00033838770468719304, + "learning_rate": 7.545207078751857e-05, + "loss": 0.0, + "step": 3368 + }, + { + "epoch": 1.142415123129742, + "grad_norm": 0.0003050903615076095, + "learning_rate": 7.542475151352216e-05, + "loss": 0.0, + "step": 3370 + }, + { + "epoch": 1.1430932903827407, + "grad_norm": 0.022571176290512085, + "learning_rate": 7.539742199843889e-05, + "loss": 0.0001, + "step": 3372 + }, + { + "epoch": 1.1437714576357394, + "grad_norm": 0.00010642271081451327, + "learning_rate": 7.537008225327706e-05, + "loss": 0.0, + "step": 3374 + }, + { + "epoch": 1.1444496248887381, + "grad_norm": 0.00012813869398087263, + "learning_rate": 7.534273228904915e-05, + "loss": 0.0, + "step": 3376 + }, + { + "epoch": 1.1451277921417369, + "grad_norm": 0.0001594992500031367, + "learning_rate": 7.531537211677169e-05, + "loss": 0.0, + "step": 3378 + }, + { + "epoch": 1.1458059593947358, + "grad_norm": 0.0005135987303219736, + "learning_rate": 7.528800174746539e-05, + "loss": 0.0, + "step": 3380 + }, + { + "epoch": 1.1464841266477346, + "grad_norm": 0.001825116341933608, + "learning_rate": 7.526062119215497e-05, + "loss": 0.0, + "step": 3382 + }, + { + "epoch": 1.1471622939007333, + "grad_norm": 0.02989931032061577, + "learning_rate": 7.523323046186941e-05, + "loss": 0.0001, + "step": 3384 + }, + { + "epoch": 1.147840461153732, + "grad_norm": 0.006477775983512402, + "learning_rate": 7.52058295676416e-05, + "loss": 0.0, + "step": 3386 + }, + { + "epoch": 1.1485186284067308, + "grad_norm": 0.0006138100288808346, + "learning_rate": 7.517841852050866e-05, + "loss": 0.0, + "step": 3388 + }, + { + "epoch": 1.1491967956597295, + "grad_norm": 7.73553765611723e-05, + "learning_rate": 7.515099733151177e-05, + "loss": 0.0, + "step": 3390 + }, + { + "epoch": 1.1498749629127283, + "grad_norm": 0.0010425514774397016, + "learning_rate": 7.512356601169615e-05, + "loss": 0.0, + "step": 3392 + }, + { + "epoch": 1.150553130165727, + "grad_norm": 0.0010554770706221461, + "learning_rate": 7.509612457211113e-05, + "loss": 0.0, + "step": 3394 + }, + { + "epoch": 1.151231297418726, + "grad_norm": 0.00047697455738671124, + "learning_rate": 7.506867302381015e-05, + "loss": 0.0, + "step": 3396 + }, + { + "epoch": 1.1519094646717247, + "grad_norm": 0.00013069043052382767, + "learning_rate": 7.504121137785066e-05, + "loss": 0.0, + "step": 3398 + }, + { + "epoch": 1.1525876319247235, + "grad_norm": 0.0029345303773880005, + "learning_rate": 7.501373964529424e-05, + "loss": 0.0, + "step": 3400 + }, + { + "epoch": 1.1532657991777222, + "grad_norm": 0.002666602609679103, + "learning_rate": 7.498625783720646e-05, + "loss": 0.0, + "step": 3402 + }, + { + "epoch": 1.153943966430721, + "grad_norm": 0.0011464230483397841, + "learning_rate": 7.495876596465703e-05, + "loss": 0.0, + "step": 3404 + }, + { + "epoch": 1.1546221336837197, + "grad_norm": 6.778768147341907e-05, + "learning_rate": 7.493126403871964e-05, + "loss": 0.0, + "step": 3406 + }, + { + "epoch": 1.1553003009367184, + "grad_norm": 6.291534373303875e-05, + "learning_rate": 7.490375207047209e-05, + "loss": 0.0, + "step": 3408 + }, + { + "epoch": 1.1559784681897174, + "grad_norm": 0.00035271712113171816, + "learning_rate": 7.487623007099618e-05, + "loss": 0.0, + "step": 3410 + }, + { + "epoch": 1.1566566354427161, + "grad_norm": 0.0012021423317492008, + "learning_rate": 7.484869805137778e-05, + "loss": 0.0, + "step": 3412 + }, + { + "epoch": 1.1573348026957149, + "grad_norm": 4.147122672293335e-05, + "learning_rate": 7.482115602270677e-05, + "loss": 0.0002, + "step": 3414 + }, + { + "epoch": 1.1580129699487136, + "grad_norm": 7.595721399411559e-05, + "learning_rate": 7.479360399607707e-05, + "loss": 0.0, + "step": 3416 + }, + { + "epoch": 1.1586911372017124, + "grad_norm": 9.155207226285711e-05, + "learning_rate": 7.476604198258666e-05, + "loss": 0.0, + "step": 3418 + }, + { + "epoch": 1.159369304454711, + "grad_norm": 7.61376868467778e-05, + "learning_rate": 7.473846999333749e-05, + "loss": 0.0, + "step": 3420 + }, + { + "epoch": 1.1600474717077098, + "grad_norm": 6.941679021110758e-05, + "learning_rate": 7.471088803943557e-05, + "loss": 0.0, + "step": 3422 + }, + { + "epoch": 1.1607256389607086, + "grad_norm": 9.52499030972831e-05, + "learning_rate": 7.468329613199091e-05, + "loss": 0.0, + "step": 3424 + }, + { + "epoch": 1.1614038062137075, + "grad_norm": 8.02976283011958e-05, + "learning_rate": 7.465569428211751e-05, + "loss": 0.0, + "step": 3426 + }, + { + "epoch": 1.1620819734667063, + "grad_norm": 0.00013924592349212617, + "learning_rate": 7.46280825009334e-05, + "loss": 0.0, + "step": 3428 + }, + { + "epoch": 1.162760140719705, + "grad_norm": 6.39605859760195e-05, + "learning_rate": 7.460046079956063e-05, + "loss": 0.0, + "step": 3430 + }, + { + "epoch": 1.1634383079727038, + "grad_norm": 0.000588226830586791, + "learning_rate": 7.457282918912516e-05, + "loss": 0.0, + "step": 3432 + }, + { + "epoch": 1.1641164752257025, + "grad_norm": 0.0006162902573123574, + "learning_rate": 7.454518768075704e-05, + "loss": 0.0, + "step": 3434 + }, + { + "epoch": 1.1647946424787012, + "grad_norm": 7.87221870268695e-05, + "learning_rate": 7.451753628559027e-05, + "loss": 0.0, + "step": 3436 + }, + { + "epoch": 1.1654728097317002, + "grad_norm": 0.00011970425839535892, + "learning_rate": 7.44898750147628e-05, + "loss": 0.0, + "step": 3438 + }, + { + "epoch": 1.166150976984699, + "grad_norm": 6.236095941858366e-05, + "learning_rate": 7.446220387941661e-05, + "loss": 0.0001, + "step": 3440 + }, + { + "epoch": 1.1668291442376977, + "grad_norm": 5.8419889683136716e-05, + "learning_rate": 7.443452289069763e-05, + "loss": 0.0, + "step": 3442 + }, + { + "epoch": 1.1675073114906964, + "grad_norm": 6.64609469822608e-05, + "learning_rate": 7.440683205975574e-05, + "loss": 0.0, + "step": 3444 + }, + { + "epoch": 1.1681854787436952, + "grad_norm": 7.750398071948439e-05, + "learning_rate": 7.437913139774482e-05, + "loss": 0.0, + "step": 3446 + }, + { + "epoch": 1.168863645996694, + "grad_norm": 0.00014469469897449017, + "learning_rate": 7.435142091582268e-05, + "loss": 0.0, + "step": 3448 + }, + { + "epoch": 1.1695418132496926, + "grad_norm": 0.0036504254676401615, + "learning_rate": 7.432370062515113e-05, + "loss": 0.0, + "step": 3450 + }, + { + "epoch": 1.1702199805026914, + "grad_norm": 0.00010814583220053464, + "learning_rate": 7.429597053689585e-05, + "loss": 0.0, + "step": 3452 + }, + { + "epoch": 1.1708981477556903, + "grad_norm": 0.0001561157696414739, + "learning_rate": 7.426823066222656e-05, + "loss": 0.0, + "step": 3454 + }, + { + "epoch": 1.171576315008689, + "grad_norm": 0.00011890140740433708, + "learning_rate": 7.424048101231686e-05, + "loss": 0.0, + "step": 3456 + }, + { + "epoch": 1.1722544822616878, + "grad_norm": 0.009851573035120964, + "learning_rate": 7.42127215983443e-05, + "loss": 0.0, + "step": 3458 + }, + { + "epoch": 1.1729326495146866, + "grad_norm": 0.00021269686112646013, + "learning_rate": 7.418495243149038e-05, + "loss": 0.0, + "step": 3460 + }, + { + "epoch": 1.1736108167676853, + "grad_norm": 0.0004351783136371523, + "learning_rate": 7.415717352294051e-05, + "loss": 0.0, + "step": 3462 + }, + { + "epoch": 1.174288984020684, + "grad_norm": 0.00015459170390386134, + "learning_rate": 7.412938488388404e-05, + "loss": 0.0, + "step": 3464 + }, + { + "epoch": 1.1749671512736828, + "grad_norm": 0.00015582202468067408, + "learning_rate": 7.41015865255142e-05, + "loss": 0.0, + "step": 3466 + }, + { + "epoch": 1.1756453185266817, + "grad_norm": 0.00025736543466337025, + "learning_rate": 7.407377845902823e-05, + "loss": 0.0, + "step": 3468 + }, + { + "epoch": 1.1763234857796805, + "grad_norm": 0.00011491354962345213, + "learning_rate": 7.404596069562715e-05, + "loss": 0.0, + "step": 3470 + }, + { + "epoch": 1.1770016530326792, + "grad_norm": 5.988680277368985e-05, + "learning_rate": 7.401813324651599e-05, + "loss": 0.0, + "step": 3472 + }, + { + "epoch": 1.177679820285678, + "grad_norm": 0.00019077217439189553, + "learning_rate": 7.399029612290362e-05, + "loss": 0.0, + "step": 3474 + }, + { + "epoch": 1.1783579875386767, + "grad_norm": 0.00011350025306455791, + "learning_rate": 7.396244933600285e-05, + "loss": 0.0, + "step": 3476 + }, + { + "epoch": 1.1790361547916755, + "grad_norm": 0.000465821212856099, + "learning_rate": 7.393459289703035e-05, + "loss": 0.0, + "step": 3478 + }, + { + "epoch": 1.1797143220446742, + "grad_norm": 0.0001867363171186298, + "learning_rate": 7.39067268172067e-05, + "loss": 0.0, + "step": 3480 + }, + { + "epoch": 1.180392489297673, + "grad_norm": 0.00012939849693793803, + "learning_rate": 7.387885110775632e-05, + "loss": 0.0, + "step": 3482 + }, + { + "epoch": 1.181070656550672, + "grad_norm": 0.00019843588233925402, + "learning_rate": 7.38509657799076e-05, + "loss": 0.0, + "step": 3484 + }, + { + "epoch": 1.1817488238036706, + "grad_norm": 0.000269518030108884, + "learning_rate": 7.38230708448927e-05, + "loss": 0.0, + "step": 3486 + }, + { + "epoch": 1.1824269910566694, + "grad_norm": 0.00014750401896890253, + "learning_rate": 7.379516631394772e-05, + "loss": 0.0, + "step": 3488 + }, + { + "epoch": 1.1831051583096681, + "grad_norm": 7.503381493734196e-05, + "learning_rate": 7.376725219831258e-05, + "loss": 0.0, + "step": 3490 + }, + { + "epoch": 1.1837833255626669, + "grad_norm": 0.0002959097037091851, + "learning_rate": 7.373932850923111e-05, + "loss": 0.0, + "step": 3492 + }, + { + "epoch": 1.1844614928156656, + "grad_norm": 0.00016742225852794945, + "learning_rate": 7.371139525795095e-05, + "loss": 0.0, + "step": 3494 + }, + { + "epoch": 1.1851396600686646, + "grad_norm": 6.525198114104569e-05, + "learning_rate": 7.368345245572362e-05, + "loss": 0.0, + "step": 3496 + }, + { + "epoch": 1.1858178273216633, + "grad_norm": 8.403266838286072e-05, + "learning_rate": 7.365550011380447e-05, + "loss": 0.0, + "step": 3498 + }, + { + "epoch": 1.186495994574662, + "grad_norm": 4.769510996993631e-05, + "learning_rate": 7.362753824345272e-05, + "loss": 0.0, + "step": 3500 + }, + { + "epoch": 1.1871741618276608, + "grad_norm": 4.9367710744263604e-05, + "learning_rate": 7.359956685593137e-05, + "loss": 0.0, + "step": 3502 + }, + { + "epoch": 1.1878523290806595, + "grad_norm": 3.271954119554721e-05, + "learning_rate": 7.357158596250731e-05, + "loss": 0.0, + "step": 3504 + }, + { + "epoch": 1.1885304963336583, + "grad_norm": 0.00010201849363511428, + "learning_rate": 7.354359557445126e-05, + "loss": 0.0, + "step": 3506 + }, + { + "epoch": 1.189208663586657, + "grad_norm": 4.488110062084161e-05, + "learning_rate": 7.351559570303771e-05, + "loss": 0.0, + "step": 3508 + }, + { + "epoch": 1.1898868308396557, + "grad_norm": 5.867435902473517e-05, + "learning_rate": 7.348758635954503e-05, + "loss": 0.0, + "step": 3510 + }, + { + "epoch": 1.1905649980926545, + "grad_norm": 0.00029293273109942675, + "learning_rate": 7.345956755525539e-05, + "loss": 0.0, + "step": 3512 + }, + { + "epoch": 1.1912431653456534, + "grad_norm": 0.00039945391472429037, + "learning_rate": 7.343153930145473e-05, + "loss": 0.0, + "step": 3514 + }, + { + "epoch": 1.1919213325986522, + "grad_norm": 0.00014375893806573004, + "learning_rate": 7.340350160943284e-05, + "loss": 0.0, + "step": 3516 + }, + { + "epoch": 1.192599499851651, + "grad_norm": 0.00047697001718916, + "learning_rate": 7.33754544904833e-05, + "loss": 0.0, + "step": 3518 + }, + { + "epoch": 1.1932776671046497, + "grad_norm": 4.121005258639343e-05, + "learning_rate": 7.334739795590347e-05, + "loss": 0.0, + "step": 3520 + }, + { + "epoch": 1.1939558343576484, + "grad_norm": 4.2145245970459655e-05, + "learning_rate": 7.331933201699457e-05, + "loss": 0.0, + "step": 3522 + }, + { + "epoch": 1.1946340016106471, + "grad_norm": 3.849615313811228e-05, + "learning_rate": 7.32912566850615e-05, + "loss": 0.0001, + "step": 3524 + }, + { + "epoch": 1.195312168863646, + "grad_norm": 9.348442836198956e-05, + "learning_rate": 7.326317197141304e-05, + "loss": 0.0, + "step": 3526 + }, + { + "epoch": 1.1959903361166448, + "grad_norm": 0.00015607431123498827, + "learning_rate": 7.323507788736167e-05, + "loss": 0.0, + "step": 3528 + }, + { + "epoch": 1.1966685033696436, + "grad_norm": 2.9953756893519312e-05, + "learning_rate": 7.320697444422372e-05, + "loss": 0.0, + "step": 3530 + }, + { + "epoch": 1.1973466706226423, + "grad_norm": 0.08800560981035233, + "learning_rate": 7.317886165331925e-05, + "loss": 0.0002, + "step": 3532 + }, + { + "epoch": 1.198024837875641, + "grad_norm": 0.028783440589904785, + "learning_rate": 7.315073952597205e-05, + "loss": 0.0001, + "step": 3534 + }, + { + "epoch": 1.1987030051286398, + "grad_norm": 0.001528909313492477, + "learning_rate": 7.312260807350975e-05, + "loss": 0.0, + "step": 3536 + }, + { + "epoch": 1.1993811723816385, + "grad_norm": 0.0027066238690167665, + "learning_rate": 7.309446730726367e-05, + "loss": 0.0, + "step": 3538 + }, + { + "epoch": 1.2000593396346373, + "grad_norm": 0.006256472785025835, + "learning_rate": 7.306631723856895e-05, + "loss": 0.0001, + "step": 3540 + }, + { + "epoch": 1.2007375068876363, + "grad_norm": 0.051211562007665634, + "learning_rate": 7.303815787876438e-05, + "loss": 0.0006, + "step": 3542 + }, + { + "epoch": 1.201415674140635, + "grad_norm": 0.08517817407846451, + "learning_rate": 7.300998923919259e-05, + "loss": 0.0012, + "step": 3544 + }, + { + "epoch": 1.2020938413936337, + "grad_norm": 0.04007571563124657, + "learning_rate": 7.298181133119987e-05, + "loss": 0.0004, + "step": 3546 + }, + { + "epoch": 1.2027720086466325, + "grad_norm": 0.07620231807231903, + "learning_rate": 7.29536241661363e-05, + "loss": 0.0007, + "step": 3548 + }, + { + "epoch": 1.2034501758996312, + "grad_norm": 0.03595748171210289, + "learning_rate": 7.292542775535567e-05, + "loss": 0.0003, + "step": 3550 + }, + { + "epoch": 1.20412834315263, + "grad_norm": 0.03171832486987114, + "learning_rate": 7.289722211021546e-05, + "loss": 0.0001, + "step": 3552 + }, + { + "epoch": 1.2048065104056287, + "grad_norm": 0.018091239035129547, + "learning_rate": 7.286900724207694e-05, + "loss": 0.0002, + "step": 3554 + }, + { + "epoch": 1.2054846776586277, + "grad_norm": 0.06366919726133347, + "learning_rate": 7.284078316230504e-05, + "loss": 0.0004, + "step": 3556 + }, + { + "epoch": 1.2061628449116264, + "grad_norm": 0.008984467014670372, + "learning_rate": 7.281254988226842e-05, + "loss": 0.0, + "step": 3558 + }, + { + "epoch": 1.2068410121646251, + "grad_norm": 0.02150563895702362, + "learning_rate": 7.278430741333941e-05, + "loss": 0.0002, + "step": 3560 + }, + { + "epoch": 1.2075191794176239, + "grad_norm": 0.030671702697873116, + "learning_rate": 7.275605576689412e-05, + "loss": 0.0002, + "step": 3562 + }, + { + "epoch": 1.2081973466706226, + "grad_norm": 0.014883170835673809, + "learning_rate": 7.272779495431228e-05, + "loss": 0.0001, + "step": 3564 + }, + { + "epoch": 1.2088755139236214, + "grad_norm": 0.01106690801680088, + "learning_rate": 7.269952498697734e-05, + "loss": 0.0002, + "step": 3566 + }, + { + "epoch": 1.20955368117662, + "grad_norm": 0.010097106918692589, + "learning_rate": 7.267124587627647e-05, + "loss": 0.0001, + "step": 3568 + }, + { + "epoch": 1.2102318484296188, + "grad_norm": 0.019213326275348663, + "learning_rate": 7.264295763360046e-05, + "loss": 0.0, + "step": 3570 + }, + { + "epoch": 1.2109100156826178, + "grad_norm": 0.03620114549994469, + "learning_rate": 7.261466027034383e-05, + "loss": 0.0001, + "step": 3572 + }, + { + "epoch": 1.2115881829356165, + "grad_norm": 0.00409303605556488, + "learning_rate": 7.258635379790474e-05, + "loss": 0.0, + "step": 3574 + }, + { + "epoch": 1.2122663501886153, + "grad_norm": 0.008056599646806717, + "learning_rate": 7.255803822768505e-05, + "loss": 0.0, + "step": 3576 + }, + { + "epoch": 1.212944517441614, + "grad_norm": 0.025245072320103645, + "learning_rate": 7.252971357109025e-05, + "loss": 0.0001, + "step": 3578 + }, + { + "epoch": 1.2136226846946128, + "grad_norm": 0.002902921987697482, + "learning_rate": 7.250137983952952e-05, + "loss": 0.0, + "step": 3580 + }, + { + "epoch": 1.2143008519476115, + "grad_norm": 0.0032983103301376104, + "learning_rate": 7.247303704441568e-05, + "loss": 0.0, + "step": 3582 + }, + { + "epoch": 1.2149790192006105, + "grad_norm": 0.012134423479437828, + "learning_rate": 7.24446851971652e-05, + "loss": 0.0001, + "step": 3584 + }, + { + "epoch": 1.2156571864536092, + "grad_norm": 0.0009648970444686711, + "learning_rate": 7.241632430919822e-05, + "loss": 0.0, + "step": 3586 + }, + { + "epoch": 1.216335353706608, + "grad_norm": 0.0016609688755124807, + "learning_rate": 7.238795439193848e-05, + "loss": 0.0, + "step": 3588 + }, + { + "epoch": 1.2170135209596067, + "grad_norm": 0.0020582436118274927, + "learning_rate": 7.235957545681342e-05, + "loss": 0.0, + "step": 3590 + }, + { + "epoch": 1.2176916882126054, + "grad_norm": 0.0016364112962037325, + "learning_rate": 7.2331187515254e-05, + "loss": 0.0001, + "step": 3592 + }, + { + "epoch": 1.2183698554656042, + "grad_norm": 0.005296518560498953, + "learning_rate": 7.230279057869493e-05, + "loss": 0.0, + "step": 3594 + }, + { + "epoch": 1.219048022718603, + "grad_norm": 0.012611468322575092, + "learning_rate": 7.227438465857448e-05, + "loss": 0.0, + "step": 3596 + }, + { + "epoch": 1.2197261899716016, + "grad_norm": 0.007326039485633373, + "learning_rate": 7.224596976633456e-05, + "loss": 0.0, + "step": 3598 + }, + { + "epoch": 1.2204043572246006, + "grad_norm": 0.00021889481286052614, + "learning_rate": 7.221754591342067e-05, + "loss": 0.0, + "step": 3600 + }, + { + "epoch": 1.2210825244775994, + "grad_norm": 0.000410861597629264, + "learning_rate": 7.218911311128196e-05, + "loss": 0.0, + "step": 3602 + }, + { + "epoch": 1.221760691730598, + "grad_norm": 0.00032909182482399046, + "learning_rate": 7.216067137137112e-05, + "loss": 0.0, + "step": 3604 + }, + { + "epoch": 1.2224388589835968, + "grad_norm": 0.032102398574352264, + "learning_rate": 7.213222070514453e-05, + "loss": 0.0001, + "step": 3606 + }, + { + "epoch": 1.2231170262365956, + "grad_norm": 0.00018162255582865328, + "learning_rate": 7.210376112406205e-05, + "loss": 0.0, + "step": 3608 + }, + { + "epoch": 1.2237951934895943, + "grad_norm": 0.0013201575493440032, + "learning_rate": 7.207529263958726e-05, + "loss": 0.0, + "step": 3610 + }, + { + "epoch": 1.224473360742593, + "grad_norm": 0.0005053295753896236, + "learning_rate": 7.204681526318724e-05, + "loss": 0.0, + "step": 3612 + }, + { + "epoch": 1.225151527995592, + "grad_norm": 0.0003318975795991719, + "learning_rate": 7.201832900633265e-05, + "loss": 0.0, + "step": 3614 + }, + { + "epoch": 1.2258296952485908, + "grad_norm": 0.007706344127655029, + "learning_rate": 7.198983388049778e-05, + "loss": 0.0, + "step": 3616 + }, + { + "epoch": 1.2265078625015895, + "grad_norm": 0.00017018703510984778, + "learning_rate": 7.196132989716045e-05, + "loss": 0.0, + "step": 3618 + }, + { + "epoch": 1.2271860297545882, + "grad_norm": 0.0008728935499675572, + "learning_rate": 7.193281706780206e-05, + "loss": 0.0, + "step": 3620 + }, + { + "epoch": 1.227864197007587, + "grad_norm": 0.00048533343942835927, + "learning_rate": 7.190429540390757e-05, + "loss": 0.0, + "step": 3622 + }, + { + "epoch": 1.2285423642605857, + "grad_norm": 0.00021282030502334237, + "learning_rate": 7.187576491696553e-05, + "loss": 0.0, + "step": 3624 + }, + { + "epoch": 1.2292205315135845, + "grad_norm": 0.004707904066890478, + "learning_rate": 7.184722561846798e-05, + "loss": 0.0, + "step": 3626 + }, + { + "epoch": 1.2298986987665832, + "grad_norm": 0.0026295706629753113, + "learning_rate": 7.181867751991057e-05, + "loss": 0.0, + "step": 3628 + }, + { + "epoch": 1.2305768660195822, + "grad_norm": 0.0006299935048446059, + "learning_rate": 7.179012063279247e-05, + "loss": 0.0, + "step": 3630 + }, + { + "epoch": 1.231255033272581, + "grad_norm": 0.00022165761038195342, + "learning_rate": 7.176155496861638e-05, + "loss": 0.0, + "step": 3632 + }, + { + "epoch": 1.2319332005255796, + "grad_norm": 0.0006338229868561029, + "learning_rate": 7.173298053888855e-05, + "loss": 0.0, + "step": 3634 + }, + { + "epoch": 1.2326113677785784, + "grad_norm": 0.0003351867781020701, + "learning_rate": 7.170439735511876e-05, + "loss": 0.0, + "step": 3636 + }, + { + "epoch": 1.2332895350315771, + "grad_norm": 9.762517584022135e-05, + "learning_rate": 7.167580542882031e-05, + "loss": 0.0, + "step": 3638 + }, + { + "epoch": 1.2339677022845759, + "grad_norm": 0.04705990478396416, + "learning_rate": 7.164720477151002e-05, + "loss": 0.0001, + "step": 3640 + }, + { + "epoch": 1.2346458695375748, + "grad_norm": 0.0005479294923134148, + "learning_rate": 7.161859539470824e-05, + "loss": 0.0, + "step": 3642 + }, + { + "epoch": 1.2353240367905736, + "grad_norm": 0.015779433771967888, + "learning_rate": 7.158997730993884e-05, + "loss": 0.0001, + "step": 3644 + }, + { + "epoch": 1.2360022040435723, + "grad_norm": 0.009688456542789936, + "learning_rate": 7.156135052872915e-05, + "loss": 0.0, + "step": 3646 + }, + { + "epoch": 1.236680371296571, + "grad_norm": 0.0007699349662289023, + "learning_rate": 7.153271506261003e-05, + "loss": 0.001, + "step": 3648 + }, + { + "epoch": 1.2373585385495698, + "grad_norm": 0.02694147825241089, + "learning_rate": 7.150407092311586e-05, + "loss": 0.0009, + "step": 3650 + }, + { + "epoch": 1.2380367058025685, + "grad_norm": 0.054436396807432175, + "learning_rate": 7.147541812178451e-05, + "loss": 0.0004, + "step": 3652 + }, + { + "epoch": 1.2387148730555673, + "grad_norm": 0.011601508595049381, + "learning_rate": 7.14467566701573e-05, + "loss": 0.0004, + "step": 3654 + }, + { + "epoch": 1.239393040308566, + "grad_norm": 0.009782589972019196, + "learning_rate": 7.141808657977907e-05, + "loss": 0.0002, + "step": 3656 + }, + { + "epoch": 1.2400712075615647, + "grad_norm": 0.036703720688819885, + "learning_rate": 7.138940786219813e-05, + "loss": 0.0001, + "step": 3658 + }, + { + "epoch": 1.2407493748145637, + "grad_norm": 0.01584113948047161, + "learning_rate": 7.136072052896625e-05, + "loss": 0.0003, + "step": 3660 + }, + { + "epoch": 1.2414275420675625, + "grad_norm": 0.00839573610574007, + "learning_rate": 7.133202459163872e-05, + "loss": 0.0001, + "step": 3662 + }, + { + "epoch": 1.2421057093205612, + "grad_norm": 0.015938330441713333, + "learning_rate": 7.130332006177421e-05, + "loss": 0.0003, + "step": 3664 + }, + { + "epoch": 1.24278387657356, + "grad_norm": 0.03005986101925373, + "learning_rate": 7.127460695093493e-05, + "loss": 0.0002, + "step": 3666 + }, + { + "epoch": 1.2434620438265587, + "grad_norm": 0.007686430588364601, + "learning_rate": 7.124588527068652e-05, + "loss": 0.0001, + "step": 3668 + }, + { + "epoch": 1.2441402110795574, + "grad_norm": 0.03607560321688652, + "learning_rate": 7.121715503259807e-05, + "loss": 0.0002, + "step": 3670 + }, + { + "epoch": 1.2448183783325564, + "grad_norm": 0.018232079222798347, + "learning_rate": 7.118841624824209e-05, + "loss": 0.0003, + "step": 3672 + }, + { + "epoch": 1.2454965455855551, + "grad_norm": 0.014432893134653568, + "learning_rate": 7.115966892919459e-05, + "loss": 0.0, + "step": 3674 + }, + { + "epoch": 1.2461747128385539, + "grad_norm": 0.019733788445591927, + "learning_rate": 7.113091308703498e-05, + "loss": 0.0001, + "step": 3676 + }, + { + "epoch": 1.2468528800915526, + "grad_norm": 0.013168008998036385, + "learning_rate": 7.110214873334611e-05, + "loss": 0.0001, + "step": 3678 + }, + { + "epoch": 1.2475310473445513, + "grad_norm": 0.02215537056326866, + "learning_rate": 7.107337587971424e-05, + "loss": 0.0001, + "step": 3680 + }, + { + "epoch": 1.24820921459755, + "grad_norm": 0.014840980060398579, + "learning_rate": 7.104459453772909e-05, + "loss": 0.0, + "step": 3682 + }, + { + "epoch": 1.2488873818505488, + "grad_norm": 0.0022126887924969196, + "learning_rate": 7.101580471898377e-05, + "loss": 0.0, + "step": 3684 + }, + { + "epoch": 1.2495655491035476, + "grad_norm": 0.0022282488644123077, + "learning_rate": 7.098700643507485e-05, + "loss": 0.0, + "step": 3686 + }, + { + "epoch": 1.2502437163565463, + "grad_norm": 0.0018177232705056667, + "learning_rate": 7.095819969760221e-05, + "loss": 0.0, + "step": 3688 + }, + { + "epoch": 1.2509218836095453, + "grad_norm": 0.007829871959984303, + "learning_rate": 7.092938451816926e-05, + "loss": 0.0, + "step": 3690 + }, + { + "epoch": 1.251600050862544, + "grad_norm": 0.0006072524702176452, + "learning_rate": 7.090056090838274e-05, + "loss": 0.0, + "step": 3692 + }, + { + "epoch": 1.2522782181155427, + "grad_norm": 0.030747784301638603, + "learning_rate": 7.087172887985276e-05, + "loss": 0.0001, + "step": 3694 + }, + { + "epoch": 1.2529563853685415, + "grad_norm": 0.02318490482866764, + "learning_rate": 7.08428884441929e-05, + "loss": 0.0001, + "step": 3696 + }, + { + "epoch": 1.2536345526215402, + "grad_norm": 0.005606907419860363, + "learning_rate": 7.081403961302006e-05, + "loss": 0.0, + "step": 3698 + }, + { + "epoch": 1.2543127198745392, + "grad_norm": 0.010471644811332226, + "learning_rate": 7.078518239795456e-05, + "loss": 0.0, + "step": 3700 + }, + { + "epoch": 1.254990887127538, + "grad_norm": 0.05316907539963722, + "learning_rate": 7.075631681062007e-05, + "loss": 0.0006, + "step": 3702 + }, + { + "epoch": 1.2556690543805367, + "grad_norm": 0.010512227192521095, + "learning_rate": 7.072744286264365e-05, + "loss": 0.0, + "step": 3704 + }, + { + "epoch": 1.2563472216335354, + "grad_norm": 0.028978761285543442, + "learning_rate": 7.069856056565572e-05, + "loss": 0.0001, + "step": 3706 + }, + { + "epoch": 1.2570253888865341, + "grad_norm": 0.02035476639866829, + "learning_rate": 7.066966993129009e-05, + "loss": 0.0001, + "step": 3708 + }, + { + "epoch": 1.2577035561395329, + "grad_norm": 0.0009529913077130914, + "learning_rate": 7.064077097118385e-05, + "loss": 0.0001, + "step": 3710 + }, + { + "epoch": 1.2583817233925316, + "grad_norm": 0.0010063213994726539, + "learning_rate": 7.061186369697756e-05, + "loss": 0.0, + "step": 3712 + }, + { + "epoch": 1.2590598906455304, + "grad_norm": 0.0006211152067407966, + "learning_rate": 7.058294812031503e-05, + "loss": 0.0, + "step": 3714 + }, + { + "epoch": 1.259738057898529, + "grad_norm": 0.05503276735544205, + "learning_rate": 7.055402425284346e-05, + "loss": 0.0, + "step": 3716 + }, + { + "epoch": 1.260416225151528, + "grad_norm": 0.021889325231313705, + "learning_rate": 7.052509210621337e-05, + "loss": 0.0002, + "step": 3718 + }, + { + "epoch": 1.2610943924045268, + "grad_norm": 0.03572991490364075, + "learning_rate": 7.049615169207864e-05, + "loss": 0.0002, + "step": 3720 + }, + { + "epoch": 1.2617725596575255, + "grad_norm": 0.001351168379187584, + "learning_rate": 7.046720302209646e-05, + "loss": 0.0, + "step": 3722 + }, + { + "epoch": 1.2624507269105243, + "grad_norm": 0.002800681861117482, + "learning_rate": 7.043824610792735e-05, + "loss": 0.0, + "step": 3724 + }, + { + "epoch": 1.263128894163523, + "grad_norm": 0.00028367669438011944, + "learning_rate": 7.040928096123516e-05, + "loss": 0.0003, + "step": 3726 + }, + { + "epoch": 1.2638070614165218, + "grad_norm": 0.0004958737408742309, + "learning_rate": 7.038030759368703e-05, + "loss": 0.0, + "step": 3728 + }, + { + "epoch": 1.2644852286695207, + "grad_norm": 0.0005568963824771345, + "learning_rate": 7.035132601695343e-05, + "loss": 0.0, + "step": 3730 + }, + { + "epoch": 1.2651633959225195, + "grad_norm": 0.04810892790555954, + "learning_rate": 7.032233624270816e-05, + "loss": 0.0008, + "step": 3732 + }, + { + "epoch": 1.2658415631755182, + "grad_norm": 0.08710169792175293, + "learning_rate": 7.029333828262827e-05, + "loss": 0.0001, + "step": 3734 + }, + { + "epoch": 1.266519730428517, + "grad_norm": 0.040873605757951736, + "learning_rate": 7.026433214839416e-05, + "loss": 0.0002, + "step": 3736 + }, + { + "epoch": 1.2671978976815157, + "grad_norm": 0.0023663484025746584, + "learning_rate": 7.023531785168948e-05, + "loss": 0.0, + "step": 3738 + }, + { + "epoch": 1.2678760649345144, + "grad_norm": 0.07860898226499557, + "learning_rate": 7.020629540420118e-05, + "loss": 0.0003, + "step": 3740 + }, + { + "epoch": 1.2685542321875132, + "grad_norm": 0.011849219910800457, + "learning_rate": 7.017726481761951e-05, + "loss": 0.0001, + "step": 3742 + }, + { + "epoch": 1.269232399440512, + "grad_norm": 0.021843140944838524, + "learning_rate": 7.014822610363798e-05, + "loss": 0.0001, + "step": 3744 + }, + { + "epoch": 1.2699105666935107, + "grad_norm": 0.06327859312295914, + "learning_rate": 7.01191792739534e-05, + "loss": 0.0004, + "step": 3746 + }, + { + "epoch": 1.2705887339465096, + "grad_norm": 0.030594058334827423, + "learning_rate": 7.00901243402658e-05, + "loss": 0.0005, + "step": 3748 + }, + { + "epoch": 1.2712669011995084, + "grad_norm": 0.05372355878353119, + "learning_rate": 7.006106131427853e-05, + "loss": 0.0003, + "step": 3750 + }, + { + "epoch": 1.271945068452507, + "grad_norm": 0.03993901237845421, + "learning_rate": 7.003199020769815e-05, + "loss": 0.0005, + "step": 3752 + }, + { + "epoch": 1.2726232357055058, + "grad_norm": 0.06573836505413055, + "learning_rate": 7.000291103223453e-05, + "loss": 0.0003, + "step": 3754 + }, + { + "epoch": 1.2733014029585046, + "grad_norm": 0.01412898674607277, + "learning_rate": 6.997382379960071e-05, + "loss": 0.0002, + "step": 3756 + }, + { + "epoch": 1.2739795702115035, + "grad_norm": 0.0644112154841423, + "learning_rate": 6.994472852151307e-05, + "loss": 0.0004, + "step": 3758 + }, + { + "epoch": 1.2746577374645023, + "grad_norm": 0.7547671794891357, + "learning_rate": 6.991562520969116e-05, + "loss": 0.0154, + "step": 3760 + }, + { + "epoch": 1.275335904717501, + "grad_norm": 0.059369105845689774, + "learning_rate": 6.98865138758578e-05, + "loss": 0.0276, + "step": 3762 + }, + { + "epoch": 1.2760140719704998, + "grad_norm": 0.9872779846191406, + "learning_rate": 6.985739453173903e-05, + "loss": 0.0121, + "step": 3764 + }, + { + "epoch": 1.2766922392234985, + "grad_norm": 0.18768635392189026, + "learning_rate": 6.982826718906412e-05, + "loss": 0.0134, + "step": 3766 + }, + { + "epoch": 1.2773704064764972, + "grad_norm": 3.0843405723571777, + "learning_rate": 6.979913185956556e-05, + "loss": 0.0152, + "step": 3768 + }, + { + "epoch": 1.278048573729496, + "grad_norm": 0.3620230257511139, + "learning_rate": 6.976998855497905e-05, + "loss": 0.0199, + "step": 3770 + }, + { + "epoch": 1.2787267409824947, + "grad_norm": 0.6195115447044373, + "learning_rate": 6.974083728704351e-05, + "loss": 0.0219, + "step": 3772 + }, + { + "epoch": 1.2794049082354935, + "grad_norm": 2.172183036804199, + "learning_rate": 6.971167806750107e-05, + "loss": 0.036, + "step": 3774 + }, + { + "epoch": 1.2800830754884924, + "grad_norm": 2.368955373764038, + "learning_rate": 6.968251090809708e-05, + "loss": 0.1914, + "step": 3776 + }, + { + "epoch": 1.2807612427414912, + "grad_norm": 2.1724870204925537, + "learning_rate": 6.965333582058002e-05, + "loss": 0.0416, + "step": 3778 + }, + { + "epoch": 1.28143940999449, + "grad_norm": 1.5031301975250244, + "learning_rate": 6.962415281670167e-05, + "loss": 0.0268, + "step": 3780 + }, + { + "epoch": 1.2821175772474886, + "grad_norm": 1.1724557876586914, + "learning_rate": 6.95949619082169e-05, + "loss": 0.051, + "step": 3782 + }, + { + "epoch": 1.2827957445004874, + "grad_norm": 0.24994565546512604, + "learning_rate": 6.956576310688382e-05, + "loss": 0.0078, + "step": 3784 + }, + { + "epoch": 1.2834739117534861, + "grad_norm": 0.874370813369751, + "learning_rate": 6.953655642446368e-05, + "loss": 0.0047, + "step": 3786 + }, + { + "epoch": 1.284152079006485, + "grad_norm": 0.08019069582223892, + "learning_rate": 6.950734187272098e-05, + "loss": 0.0028, + "step": 3788 + }, + { + "epoch": 1.2848302462594838, + "grad_norm": 0.18998371064662933, + "learning_rate": 6.947811946342329e-05, + "loss": 0.0032, + "step": 3790 + }, + { + "epoch": 1.2855084135124826, + "grad_norm": 0.08605227619409561, + "learning_rate": 6.944888920834142e-05, + "loss": 0.9699, + "step": 3792 + }, + { + "epoch": 1.2861865807654813, + "grad_norm": 0.1060505360364914, + "learning_rate": 6.941965111924929e-05, + "loss": 0.0012, + "step": 3794 + }, + { + "epoch": 1.28686474801848, + "grad_norm": 0.13712160289287567, + "learning_rate": 6.939040520792402e-05, + "loss": 0.0016, + "step": 3796 + }, + { + "epoch": 1.2875429152714788, + "grad_norm": 0.19173935055732727, + "learning_rate": 6.936115148614584e-05, + "loss": 0.0016, + "step": 3798 + }, + { + "epoch": 1.2882210825244775, + "grad_norm": 70.08195495605469, + "learning_rate": 6.933188996569817e-05, + "loss": 1.3792, + "step": 3800 + }, + { + "epoch": 1.2888992497774763, + "grad_norm": 0.691962480545044, + "learning_rate": 6.930262065836752e-05, + "loss": 0.0204, + "step": 3802 + }, + { + "epoch": 1.289577417030475, + "grad_norm": 0.8632583022117615, + "learning_rate": 6.92733435759436e-05, + "loss": 0.0146, + "step": 3804 + }, + { + "epoch": 1.290255584283474, + "grad_norm": 1.6672307252883911, + "learning_rate": 6.924405873021918e-05, + "loss": 0.0287, + "step": 3806 + }, + { + "epoch": 1.2909337515364727, + "grad_norm": 0.9258363246917725, + "learning_rate": 6.921476613299018e-05, + "loss": 0.0152, + "step": 3808 + }, + { + "epoch": 1.2916119187894715, + "grad_norm": 0.15455372631549835, + "learning_rate": 6.918546579605572e-05, + "loss": 0.0117, + "step": 3810 + }, + { + "epoch": 1.2922900860424702, + "grad_norm": 0.1822269707918167, + "learning_rate": 6.915615773121791e-05, + "loss": 0.0051, + "step": 3812 + }, + { + "epoch": 1.292968253295469, + "grad_norm": 0.20265448093414307, + "learning_rate": 6.912684195028208e-05, + "loss": 0.0054, + "step": 3814 + }, + { + "epoch": 1.293646420548468, + "grad_norm": 0.09864166378974915, + "learning_rate": 6.909751846505656e-05, + "loss": 0.0038, + "step": 3816 + }, + { + "epoch": 1.2943245878014666, + "grad_norm": 0.04846733435988426, + "learning_rate": 6.906818728735291e-05, + "loss": 0.0018, + "step": 3818 + }, + { + "epoch": 1.2950027550544654, + "grad_norm": 0.07136884331703186, + "learning_rate": 6.903884842898569e-05, + "loss": 0.0015, + "step": 3820 + }, + { + "epoch": 1.2956809223074641, + "grad_norm": 0.024747643619775772, + "learning_rate": 6.900950190177263e-05, + "loss": 0.0005, + "step": 3822 + }, + { + "epoch": 1.2963590895604629, + "grad_norm": 0.09419432282447815, + "learning_rate": 6.898014771753443e-05, + "loss": 0.0008, + "step": 3824 + }, + { + "epoch": 1.2970372568134616, + "grad_norm": 0.0712243989109993, + "learning_rate": 6.895078588809502e-05, + "loss": 0.0009, + "step": 3826 + }, + { + "epoch": 1.2977154240664603, + "grad_norm": 0.017767785117030144, + "learning_rate": 6.892141642528134e-05, + "loss": 0.001, + "step": 3828 + }, + { + "epoch": 1.298393591319459, + "grad_norm": 0.041735365986824036, + "learning_rate": 6.889203934092336e-05, + "loss": 0.0006, + "step": 3830 + }, + { + "epoch": 1.2990717585724578, + "grad_norm": 0.04800105467438698, + "learning_rate": 6.88626546468542e-05, + "loss": 0.0009, + "step": 3832 + }, + { + "epoch": 1.2997499258254566, + "grad_norm": 0.06188105419278145, + "learning_rate": 6.883326235491001e-05, + "loss": 0.0006, + "step": 3834 + }, + { + "epoch": 1.3004280930784555, + "grad_norm": 0.03689048811793327, + "learning_rate": 6.880386247692999e-05, + "loss": 0.0005, + "step": 3836 + }, + { + "epoch": 1.3011062603314543, + "grad_norm": 0.009136232547461987, + "learning_rate": 6.877445502475643e-05, + "loss": 0.0004, + "step": 3838 + }, + { + "epoch": 1.301784427584453, + "grad_norm": 0.024388829246163368, + "learning_rate": 6.874504001023461e-05, + "loss": 0.0004, + "step": 3840 + }, + { + "epoch": 1.3024625948374517, + "grad_norm": 0.0197377260774374, + "learning_rate": 6.871561744521293e-05, + "loss": 0.0001, + "step": 3842 + }, + { + "epoch": 1.3031407620904505, + "grad_norm": 0.01399923861026764, + "learning_rate": 6.868618734154276e-05, + "loss": 0.0003, + "step": 3844 + }, + { + "epoch": 1.3038189293434495, + "grad_norm": 0.0143091706559062, + "learning_rate": 6.865674971107858e-05, + "loss": 0.0002, + "step": 3846 + }, + { + "epoch": 1.3044970965964482, + "grad_norm": 0.006448931992053986, + "learning_rate": 6.862730456567786e-05, + "loss": 0.0001, + "step": 3848 + }, + { + "epoch": 1.305175263849447, + "grad_norm": 0.019587090238928795, + "learning_rate": 6.859785191720106e-05, + "loss": 0.0003, + "step": 3850 + } + ], + "logging_steps": 2, + "max_steps": 10000, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.403106472762278e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}